#!/usr/bin/env python2
"""
Generate a .zip file that "patches" the contents of a directory to re-enact
any file rename, move, add or delete operations.
"""
# Usage instructions and known limitations. This text is passed to argparse
# as the epilog, so it is printed verbatim at the end of the --help output.
HOWTO = """
How to use:
 - Before copying the old state away, generate hashes with
     find | xargs sha1sum > old_state.txt
   (or md5sum, sha256sum, or any similar tool)
 - Rename, move, add or delete files in one replica.
 - Generate hashes again with
     find | xargs sha1sum > new_state.txt
 - Run this script:
     ./dirpatch.py -C the_directory
                   --old old_state.txt --new new_state.txt
                   -o patch.zip
 - Copy patch.zip to where the other replicas are
 - Unpack patch.zip into the directory where the files are.
 - Run _apply_patch.bat (Win32) or _apply_patch.sh (other OSes) to
   apply the changes.

Caveats:
 - Does not support double-quotes (") in filenames on Win32 targets.
 - Does not track removal of subdirectories that were already empty
   in the old state.
 - Assumes file names use 1252 encoding in .bat and UTF-8 in .sh;
   in hash files, UTF-8 is tried, and if that fails, 1252 is used.
"""
import sys, os, argparse, collections, cStringIO, zipfile, random

###############################################################################

# Characters that never need quoting when used inside a shell argument.
_SH_SAFE_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "_@%+=:,./-"
)

def enc_sh(x):
    """
    Encode a file name as a single argument for the generated .sh script.

    x is a text (unicode) string; the result is UTF-8 encoded.

    Names made up solely of known-safe ASCII characters are returned as-is.
    Every other name (including the empty one) is wrapped in single quotes,
    inside which the shell treats every character literally. An embedded
    single quote is written as '\\'' (close the quotes, emit an escaped
    quote, reopen the quotes), so names containing both kinds of quotes,
    '$', backticks, backslashes, '&', ';' and so on all survive unchanged.
    """
    if x and all(c in _SH_SAFE_CHARS for c in x):
        return x.encode('utf-8')
    quoted = "'" + x.replace("'", "'\\''") + "'"
    return quoted.encode('utf-8')

# Characters for which cmd.exe needs the argument wrapped in double quotes.
_BAT_QUOTE_CHARS = frozenset(" &()[]{}^=;!'+,`~<>|")

def enc_bat(x):
    """
    Encode a file name as a single argument for the generated .bat script.

    x is a text (unicode) string; the result is windows-1252 encoded, with
    forward slashes turned into backslashes.

    - '%' is doubled, because a batch file reads a single '%' as the start
      of a variable reference.
    - The name is wrapped in double quotes when it contains a space or any
      other character that cmd.exe would otherwise interpret.
    - Characters that windows-1252 cannot represent are replaced by '?'
      ('replace' error handling), so such names are not reproduced exactly.
    - Double quotes inside the name are not escaped (see the caveats in the
      usage text).
    """
    x = x.replace('/', '\\').replace('%', '%%')
    needs_quotes = any(c in _BAT_QUOTE_CHARS for c in x)
    encoded = x.encode('windows-1252', 'replace')
    if needs_quotes:
        return b'"' + encoded + b'"'
    return encoded

_nfnseq = None   # shuffled pool of sequence numbers not handed out yet
_nfnbase = 0     # first sequence number of the next pool to be created

def generate_new_file_name():
    """
    Return a fresh placeholder name of the form "_newNNNN".

    Sequence numbers are handed out in random order from a pool of 10000.
    When a pool is used up, the next one covers the following 10000 numbers
    (the names then simply grow to five or more digits), so a name is never
    returned twice within one run.
    """
    global _nfnseq, _nfnbase
    if not _nfnseq:
        # first call, or the current pool is exhausted: open the next range
        _nfnseq = list(range(_nfnbase, _nfnbase + 10000))
        random.shuffle(_nfnseq)
        _nfnbase += 10000
    return "_new%04d" % _nfnseq.pop()

class Stats(object):
    """Counters for the file operations written into a patch."""

    def __init__(self):
        self.new = 0      # files created from archived content
        self.dels = 0     # files deleted
        self.moves = 0    # files renamed or moved
        self.copies = 0   # files copied

    def dump(self):
        """Write a four-line summary of all counters to stdout."""
        summary = "".join([
            "files created:       %4d\n" % self.new,
            "files renamed/moved: %4d\n" % self.moves,
            "files copied:        %4d\n" % self.copies,
            "files deleted:       %4d\n" % self.dels,
        ])
        sys.stdout.write(summary)

###############################################################################

class FileRecord(object):
    """
    All path names that share one content digest.

    `old` and `new` hold the names carrying this digest in the old and in
    the new state. generate_actions() turns the difference between the two
    into delete / move / copy commands for the .bat and .sh scripts.
    """

    def __init__(self):
        self.old = []          # names in the old state
        self.new = []          # names in the new state
        self.arch_int = None   # name of the content inside the archive
        self.arch_ext = None   # on-disk file that provides the content

    def add_name(self, name, is_new):
        """
        Register a path name for this digest.

        The name is stripped, normalized (os.path.normpath) and always uses
        '/' as separator. Byte strings are decoded as UTF-8, falling back to
        windows-1252; text that is already decoded is stored as-is.
        is_new selects the new-state (True) or old-state (False) list.
        """
        name = os.path.normpath(name.strip().replace('\\', '/')).replace('\\', '/')
        if isinstance(name, bytes):
            try:
                name = name.decode('utf-8')
            except UnicodeDecodeError:
                name = name.decode('windows-1252')
        if is_new:
            self.new.append(name)
        else:
            self.old.append(name)

    def has_changed(self):
        """Return True if the old and new name lists differ (order ignored)."""
        return sorted(self.old) != sorted(self.new)

    def dump(self):
        """Write a "- old" / "+ new" listing plus a blank line to stdout."""
        lines = [u"- %s" % name for name in self.old]
        lines += [u"+ %s" % name for name in self.new]
        text = u"\n".join(lines + [u"", u""])
        if not isinstance(text, str):
            # Python 2: encode explicitly, so that non-ASCII names can not
            # make the output fail when stdout has no encoding (e.g. a pipe)
            encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
            text = text.encode(encoding, 'replace')
        sys.stdout.write(text)

    def generate_actions(self, bat, sh, stats=None):
        """
        Append the commands for this record to both scripts.

        bat, sh -- writable file-like objects for the .bat / .sh script
        stats   -- Stats instance whose counters are updated; when omitted,
                   a private instance is used (and discarded) for this call

        Side effects: self.old and self.new are reduced to the sorted names
        that are NOT common to both states. If the content has to be shipped
        in the archive, self.arch_int / self.arch_ext are set.

        NOTE(review): commands are emitted for this record only; nothing here
        orders them against the commands of other records -- verify in the
        caller that one record's target is not another record's source.
        """
        if stats is None:
            # never use a Stats() default argument: it would be created once
            # and then be shared by every call that omits the parameter
            stats = Stats()

        # eliminate names present in both states
        old_names = set(self.old)
        new_names = set(self.new)
        self.old = sorted(old_names - new_names)
        self.new = sorted(new_names - old_names)

        # delete surplus old names; if there is a target, the first old name
        # is kept because it serves as the source of the move below
        keep = 1 if self.new else 0
        for name in self.old[keep:]:
            bat.write("del /f %s\n" % enc_bat(name))
            sh.write("rm -f %s\n" % enc_sh(name))
            stats.dels += 1

        # if there are no destination files, we're done here
        if not self.new:
            return

        # produce the first target, either by renaming an old file or by
        # moving a file that is unpacked from the archive into place
        target = self.new[0]
        if self.old:
            src = self.old[0]
            stats.moves += 1
        else:
            src = generate_new_file_name()
            self.arch_int = src
            self.arch_ext = target
            stats.new += 1
        bat.write("move /y %s %s\n" % (enc_bat(src), enc_bat(target)))
        sh.write("mv %s %s\n" % (enc_sh(src), enc_sh(target)))

        # every further target is a copy of the first one
        for name in self.new[1:]:
            bat.write("copy /b /y %s %s\n" % (enc_bat(target), enc_bat(name)))
            sh.write("cp %s %s\n" % (enc_sh(target), enc_sh(name)))
            stats.copies += 1

def ParseHashFile(hashes, hashfile, is_new):
    """
    Read a hash list ("<digest> <name>" per line) into `hashes`.

    hashes   -- mapping from digest to a record object with an
                add_name(name, is_new) method; looked up as hashes[digest]
                (the caller passes a defaultdict, so records appear on demand)
    hashfile -- path of the hash file, or '-' to read from standard input
    is_new   -- forwarded to add_name(); tells old and new state apart

    Tabs count as spaces. Empty lines, lines without a space and lines
    starting with '#' are skipped. A single '*' directly in front of the
    name (binary-mode marker of the *sum tools) is removed.

    If the file can not be opened, a message is written to stderr and the
    process exits with status 1.
    """
    use_stdin = (hashfile == '-')
    try:
        f = sys.stdin if use_stdin else open(hashfile, "r")
    except IOError as e:
        sys.stderr.write("Fatal: can not open hash file '%s' - %s\n" % (hashfile, e))
        sys.exit(1)
    try:
        for line in f:
            line = line.replace('\t', ' ').strip()
            if not line or ' ' not in line or line.startswith('#'):
                continue
            digest, name = line.split(' ', 1)
            # drop the marker only; further asterisks belong to the file name
            if name.startswith('*'):
                name = name[1:]
            hashes[digest].add_name(name, is_new)
    finally:
        # only close what was opened here; stdin belongs to the caller
        if not use_stdin:
            f.close()

###############################################################################

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__, epilog=HOWTO,
             formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("-C", "--dir", type=str, metavar="DIR",
                        help="specify working directory [default: current]")
    parser.add_argument("-o", "--output", type=str, default="patch.zip", metavar="ZIP",
                        help="specify output .zip file name [default: %(default)s]")
    parser.add_argument("-a", "--old", type=str, metavar="HASHFILE",
                        help="specify old hash file [mandatory]")
    parser.add_argument("-b", "--new", type=str, metavar="HASHFILE",
                        help="specify new hash file [mandatory, '-' = stdin]")
    parser.add_argument("-n", "--dry-run", action='store_true',
                        help="don't create archive, only show statistics")
    args = parser.parse_args()
    if not(args.old) or not(args.new):
        parser.error("--old and --new need to be specified")

    # import hash lists
    hashes = collections.defaultdict(FileRecord)
    ParseHashFile(hashes, args.old, False)
    ParseHashFile(hashes, args.new, True)
    files = filter(FileRecord.has_changed, hashes.values())
#    for f in files: f.dump()

    # change into output directory
    outfile = os.path.abspath(args.output)
    if args.dir:
        try:
            os.chdir(args.dir)
        except OSError:
            print >>sys.stderr, "Fatal: can not change into working directory '%s' - %s" % (args.dir, e)
            sys.exit(1)

    # create .bat and .sh scripts
    bat = cStringIO.StringIO()
    sh = cStringIO.StringIO()
    print >>bat, "@echo off"
    print >>sh, "#!/bin/sh"

    # generate sanity check for existing files
    src_files = []
    for f in files:
        src_files.extend(f.old)
    random.shuffle(src_files)
    if src_files:
        del src_files[min(5, max(1, len(src_files) / 10)):]
        for f in src_files:
            print >>bat, "if exist %s goto ok" % enc_bat(f)
        print >>sh, "if test %s ; then" % " -a ".join("! -f %s" % enc_sh(f) for f in src_files)
        for msgline in map(str.strip, """
            The source files for this patch seem to be missing.
            Please double-check that the .zip file has been uncompressed
            into the proper directory!
        """.strip().split("\n")):
            print >>bat, "echo", msgline
            print >>sh, "    echo \"%s\"" % msgline
        print >>bat, "pause"
        print >>bat, "exit /b 1"
        print >>bat, ":ok"
        print >>sh, "    exit 1"
        print >>sh, "fi"

    # collect target subdirectory names, create mkdir commands
    subdirs = set()
    for f in files:
        for name in f.new:
            if '/' in name:
                subdirs.add(name.rsplit('/', 1)[0])
    for d in sorted(subdirs):
        print >>bat, "if not exist %s mkdir %s" % tuple(2 * [enc_bat(d)])
        print >>sh, "mkdir -p %s" % enc_sh(d)

    # decide actions
    stats = Stats()
    for f in files:
        f.generate_actions(bat, sh, stats)

    # collect source subdirectory names, create rmdir commands
    subdirs = set()
    for f in files:
        for name in f.old:
            if '/' in name:
                subdirs.add(name.rsplit('/', 1)[0])
    for d in sorted(subdirs, reverse=True):
        print >>bat, "rmdir %s 2>nul" % enc_bat(d)
        print >>sh, "rmdir %s 2>/dev/null" % enc_sh(d)

    # finalize .bat and .sh scripts
    print >>bat, "del /f _apply_patch.sh _apply_patch.bat"
    print >>sh, "rm -f _apply_patch.bat _apply_patch.sh"
    bat = bat.getvalue().replace('\r\n', '\n').replace('\n', '\r\n')
    sh = sh.getvalue().replace('\r\n', '\n')

    # generate archive
    if args.dry_run:
        archive = None
    else:
        archive = zipfile.ZipFile(outfile, "w", zipfile.ZIP_DEFLATED, True)
        for f in files:
            if f.arch_int and f.arch_ext:
                print "archiving:", f.arch_ext, "=>", f.arch_int
                archive.write(f.arch_ext, f.arch_int)
        archive.writestr("_apply_patch.sh", sh)
        archive.writestr("_apply_patch.bat", bat)
        archive.close()

    # finish; print statistics
    stats.dump()
    if not args.dry_run:
        print "patch archive size: %.1f MB" % (os.path.getsize(outfile) / 1E6)
