
@dobrokot
Last active August 29, 2015 14:02

Revisions

  1. dobrokot revised this gist Jun 1, 2014. 1 changed file with 4 additions and 2 deletions.
     6 changes: 4 additions & 2 deletions dobro_backup.py
     @@ -5,9 +5,11 @@
       Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN'
     - Copied result will be in the destination_dir/source1 ... destination_dir/sourceN'
     + Copied result will be in the destination_dir/DATE_TIME/source1 ... destination_dir/DATE_TIME/sourceN'
       Also there will be created files pathN/sourceN/hash_dir_info.txt (write access to source is needed now)'
     - When backup is complete, destination_dir/backup.complete is created, with timestamp info.
     + When backup is complete, destination_dir/DATE_TIME/backup.complete is created, with timestamp info.
     + Hardlinks of duplicated files created to previous directory, to save space/time.
     + Deletion of old backup directories is safe.
       Encryption and compression can be provided by the file-system (TrueCrypt + NTFS, for example).
  2. dobrokot revised this gist Jun 1, 2014. 1 changed file with 1 addition and 1 deletion.
     2 changes: 1 addition & 1 deletion dobro_backup.py
     @@ -3,7 +3,7 @@
       USAGE = '''
       Incremental backup program
     - Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN' % prog_name
     + Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN'
       Copied result will be in the destination_dir/source1 ... destination_dir/sourceN'
       Also there will be created files pathN/sourceN/hash_dir_info.txt (write access to source is needed now)'
  3. dobrokot created this gist Jun 1, 2014.
     486 changes: 486 additions & 0 deletions dobro_backup.py
     @@ -0,0 +1,486 @@
#!/usr/bin/env python

USAGE = '''
Incremental backup program
Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN' % prog_name
Copied result will be in the destination_dir/source1 ... destination_dir/sourceN'
Also there will be created files pathN/sourceN/hash_dir_info.txt (write access to source is needed now)'
When backup is complete, destination_dir/backup.complete is created, with timestamp info.
Encryption and compression can be provided by the file-system (TrueCrypt + NTFS, for example).
Hardlinks to older-than-last backups are not created now (you may manually move/copy older files to last backup)
'''
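
# A quick illustration of the intended invocation (these paths are made up
# for illustration, they are not part of the original gist):
#
#   python dobro_backup.py D:/backups C:/music C:/documents
#
# main() below prefixes the destination with a "backup_%Y-%m-%d_%H-%M-%S"
# directory, so the copies land in D:/backups/backup_<DATE_TIME>/music and
# D:/backups/backup_<DATE_TIME>/documents, with a backup.complete marker
# written on success.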

# Copyright (c) 2014 Ivan Yanikov

# The MIT License (MIT)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.


#------------------------------------------------
# known missing features and issues:
#
# todo: allow to create hardlinks to old backups, to handle changed source set
#   ( ["music", "documents", "video"] -> ["music", "documents"] -> ["music", "documents", "video"] ).
# todo: needless rehashes of internal file hash_dir_info.txt
# todo: catch "source inside destination" errors and vice-versa
# todo: allow to rename sources targets
# todo: allow to store hash_dir_info.txt outside source directories
# todo: allow exclude dirs/mask
# todo: don't spam about known unreadable dirs ( Microsoft\Windows\WER\ReportQueue )
#------------------------------------------------

import hashlib
import os, sys, shutil, stat
import time
import itertools
import re
import traceback

#recursively enumerates files in directory
#returns list of tuples ( relative_path, file_modified_timestamp, file_size )
#cross-platform version for any OS
def _listfiles_py(path, relpath, result):
    if path:
        path += os.sep
    try:
        files = os.listdir(path)
    except EnvironmentError:
        traceback.print_exc()
        print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
        return result

    for fn in files:
        fullfn = path + fn
        relpath2 = (relpath + os.sep if relpath else '') + fn
        try:
            st = os.stat(fullfn)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + fullfn.encode('UTF-8') + '"'
            continue

        file_type = stat.S_IFMT(st.st_mode)
        if file_type == stat.S_IFDIR:
            _listfiles_py(fullfn, relpath2, result)
        elif file_type == stat.S_IFREG:
            result.append((relpath2, int(st.st_mtime), st.st_size))
        #soft-links are skipped
    return result

win32file = None
if (os.name == 'nt'):
    if not hasattr(os, 'link'): # python on windows does not have os.link for hardlinks
        import ctypes
        from ctypes import WinError
        from ctypes.wintypes import BOOL
        CreateHardLink = ctypes.windll.kernel32.CreateHardLinkW
        CreateHardLink.argtypes = [ctypes.c_wchar_p, ctypes.c_wchar_p, ctypes.c_void_p]
        CreateHardLink.restype = BOOL

        def mk_hard_link(src, dst):
            res = CreateHardLink(dst, src, None)
            if res == 0:
                raise WinError()

    FindFirstFile = ctypes.windll.kernel32.FindFirstFileW
    FindNextFile = ctypes.windll.kernel32.FindNextFileW
    FindClose = ctypes.windll.kernel32.FindClose
    GetLastError = ctypes.windll.kernel32.GetLastError
    #optimization for Windows: FindFirstFileW is much faster than (os.listdir + os.stat)
    #enumerates immediate (non-recursive) files and subdirectories in directory
    #returns list of tuples ( file_name, file_modified_timestamp, file_size, is_directory )
    def FindFilesWin32(path):
        # pylint: disable=E0611
        from exceptions import WindowsError
        out = ctypes.wintypes.WIN32_FIND_DATAW()
        fldr = FindFirstFile(unicode(path) + u'\\*', ctypes.byref(out))

        if fldr == -1: #INVALID_HANDLE_VALUE
            raise WindowsError("can't list files in the %s" % repr(path))

        result = []
        try:
            while 1:
                fn = out.cFileName
                if fn != u'.' and fn != u'..':
                    isdir = bool(out.dwFileAttributes & 0x10) #FILE_ATTRIBUTE_DIRECTORY
                    ts = out.ftLastWriteTime
                    win_filetime = (ts.dwHighDateTime << 32) | ts.dwLowDateTime
                    secs_between_epochs = 11644473600 # Seconds between 1.1.1601 and 1.1.1970
                    timestamp = win_filetime / 10000000 - secs_between_epochs
                    size = (out.nFileSizeHigh << 32) | out.nFileSizeLow
                    result.append((out.cFileName, timestamp, size, isdir))

                if not FindNextFile(fldr, ctypes.byref(out)):
                    win_error = GetLastError()
                    if win_error == 18: #ERROR_NO_MORE_FILES
                        break
                    exc = WindowsError(win_error) #todo: test
                    exc.filename = path
                    raise exc
        finally:
            FindClose(fldr)
        return result
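
    # Worked example of the FILETIME conversion above (illustrative numbers):
    # Windows FILETIME counts 100-nanosecond intervals since 1601-01-01, so
    #   win_filetime = 130451040000000000
    #   130451040000000000 / 10000000 = 13045104000 seconds since 1601
    #   13045104000 - 11644473600 = 1400630400 = 2014-05-21 00:00:00 UTC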

    #recursively enumerates files in directory
    #returns list of tuples ( relative_path, file_modified_timestamp, file_size )
    #specialized version for Windows.
    def ListFilesWin32(path, relpath, result):
        if path:
            path += os.sep
        try:
            files = FindFilesWin32(path)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
            return result

        for (fn, ts, sz, isdir) in files:
            fullfn = path + fn
            relpath2 = (relpath + os.sep if relpath else '') + fn
            if isdir:
                ListFilesWin32(fullfn, relpath2, result)
            else:
                result.append((relpath2, ts, sz))
            #soft-links are skipped
        return result

    _listfiles = ListFilesWin32


else: # non-windows version of mk_hard_link and _listfiles
    mk_hard_link = os.link
    _listfiles = _listfiles_py

#recursively enumerates files in directory
#returns list of tuples ( relative_path, file_modified_timestamp, file_size )
def listfiles(path):
    return _listfiles(unicode(path), u'', [])
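
# For example (hypothetical directory and values), listfiles(u'C:/music')
# might return [(u'album/track01.mp3', 1401634523, 7340032)]: paths are
# relative to the argument, mtime is unix seconds, size is in bytes.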

def md5_for_file(filename):
    # pylint: disable=E1101
    md5 = hashlib.md5()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(2**20)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()
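
# Reads in 1MB chunks so memory stays bounded for large files. As a sanity
# check, an empty file hashes to the well-known md5 constant:
#   md5_for_file(empty_file) == 'd41d8cd98f00b204e9800998ecf8427e'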

#read text file, and load it to dictionary {filename -> (md5, file_size, file_modified_timestamp)}
def load_hashes(hf_txt):
    #different systems can have different file-time-modified format
    if not os.path.isfile(hf_txt):
        return {}

    try:
        res = {}
        with open(hf_txt, 'rb') as f:
            for l in f:
                md5, size, mtime, filename = l.rstrip('\r\n').split('\t', 3)
                res[filename.replace('/', os.sep).decode('UTF-8')] = (md5, int(size), int(mtime))
        return res
    except EnvironmentError:
        traceback.print_exc()
        return {}
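
# Each hash_dir_info.txt line is tab-separated "md5<TAB>size<TAB>mtime<TAB>path",
# with '/' as the path separator (see write_hashes below); e.g. a made-up line:
#   0cc175b9c0f1b6a831c399e269772661 <TAB> 7340032 <TAB> 1401634523 <TAB> album/track01.mp3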

# pretty format of file size, 81731298312 -> '76.1GB'
def filesize_fmt(num):
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if num < 1024.0 and num > -1024.0:
            return "%3.1f%s" % (num, x)
        num /= 1024.0
    return "%3.1f%s" % (num, 'TB')

# recalculates hashes of files whose timestamp or size changed;
# also calls 'file_processed_callback' on each file (to
# have fresh data from the OS file-system cache).
def get_hashes(dir, prev_info, file_processed_callback):
    print >> sys.stderr, 'info: traversing', repr(dir)
    current_files = listfiles(dir)

    result = []
    for relpath, mtime, size in current_files:
        prev_file_info = prev_info.get(relpath)
        md5 = None
        if prev_file_info:
            prev_md5, prev_size, prev_mtime = prev_file_info
            if prev_size == size and prev_mtime == mtime:
                md5 = prev_md5
        result.append((relpath, md5, size, mtime))

    recalc_count = 0
    recalc_size = 0
    total_size = 0
    for relpath, md5, size, mtime in result:
        total_size += size
        if md5 == None:
            recalc_count += 1
            recalc_size += size

    print >> sys.stderr, 'info: found %d files of size %s, need to rehash %d files of size %s' % (
        len(result), filesize_fmt(total_size),
        recalc_count, filesize_fmt(recalc_size))

    tmp = []
    #todo: make incremental, to allow restart from interrupted point
    for (relpath, md5, size, mtime) in result:
        if md5 == None:
            md5 = md5_for_file(os.path.join(dir, relpath))
        tmp.append((relpath, md5, size, mtime))
        if file_processed_callback:
            #after md5_for_file file is in the OS file cache, copy it immediately
            file_processed_callback(relpath, md5, size, mtime)
    result = tmp

    result.sort()
    return result

def write_hashes(hashes_data_list, hf_txt):
    with open(hf_txt + '.tmp', 'wb') as f:
        for relpath, md5, size, mtime in hashes_data_list:
            print >> f, '%s\t%s\t%s\t%s' % (md5, size, mtime, relpath.encode('UTF-8').replace(os.sep, '/'))
    if os.path.exists(hf_txt):
        os.unlink(hf_txt)
    os.rename(hf_txt + ".tmp", hf_txt)
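
# Write-to-temp-then-rename keeps hash_dir_info.txt updates close to atomic:
# a crash mid-write leaves the previous file intact. The explicit unlink is
# needed because os.rename cannot overwrite an existing file on Windows.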

# call 'get_hashes' on a directory, and write the fresh hashes to hash_dir_info.txt inside it
# see also 'get_hashes' function
def hash_dir(dir, file_processed_callback = None):
    hf_txt = os.path.join(dir, 'hash_dir_info.txt')
    prev_info = load_hashes(hf_txt)

    with open(hf_txt + '.tmp', 'wb') as _f:
        #check we can open the file to write
        pass

    start = time.time()
    result = get_hashes(dir, prev_info, file_processed_callback)
    print >> sys.stderr, 'info: hashing', repr(dir), int(time.time() - start), 'seconds'

    write_hashes(result, hf_txt)
    return result

# Put md5 hashes of dst/**/* into the file dst/'hash_dir_info.txt'.
# For files present in 'dst_rel_path2md5', the md5 is taken from there;
# otherwise the hash is recalculated.
def write_dst_hashes(dst, dst_rel_path2md5):
    result = []
    for relpath, mtime, size in listfiles(dst):
        md5 = dst_rel_path2md5.get(relpath)
        if md5 == None:
            md5 = md5_for_file(os.path.join(dst, relpath))
        result.append((relpath, md5, size, mtime))
    result.sort()
    write_hashes(result, os.path.join(dst, 'hash_dir_info.txt'))

# Recursively deletes a file tree,
# handles some real-life problems with unremovable files
def my_rmtree(dir):
    def remove_readonly(fn, path, _excinfo):
        os.chmod(path, stat.S_IWRITE) #handle read-only files
        fn(path)
    shutil.rmtree(unicode(dir), onerror=remove_readonly)

# some checks of invalid arguments and access rights
def backup_fast_checks(dst_prev, dst_tmp, srcs):
    assert dst_prev == None or os.path.isdir(dst_prev), dst_prev

    for src in srcs:
        if not os.path.isdir(src):
            print '"%s" is not a directory!' % src
            exit(1)

    open(os.path.join(dst_tmp, "test_dir_is_writable"), "wb").close()
    os.unlink(os.path.join(dst_tmp, "test_dir_is_writable"))

    get_src_basenames = sorted([(get_src_basename(src), src) for src in srcs])
    for src_base, srcs_grouped in itertools.groupby(get_src_basenames, key = lambda (base, s): base):
        srcs_grouped = list(x[1] for x in srcs_grouped)
        if len(srcs_grouped) >= 2:
            raise Exception("name '%s' duplicated for %s" % (src_base, srcs_grouped))

# get basename of file. Tolerate trailing /// slashes and Windows drive letter.
def get_src_basename(x):
    assert os.path.normpath('xxx///') == 'xxx'
    if os.name == 'nt':
        assert os.path.normpath('x://\\//\\') == 'x:\\'
        assert os.path.normpath('xxx//\\//\\') == 'xxx'
    x = os.path.normpath(x).rstrip(os.sep)
    if os.name == 'nt' and x[-1:] == ':':
        x = x[0:-1] #remove ':' from "drive:"
    return os.path.basename(x)

#convert drive path to dir path on Windows
def win_drive_to_dir(x):
    if os.name == 'nt' and x[-1:] == ':':
        # http://stackoverflow.com/questions/7258993/pythons-os-listdir-behaviour-on-windows
        # path 'X:' is interpreted as 'current directory for drive X', which is not 'X:\\'
        x = x + '\\'
    return x

#load mapping from md5 to filename in previous backup (to hardlink them in future)
def load_prev_dst_hashes(dst_prev):
    print >> sys.stderr, 'loading prev_dst="%s" hashes...' % dst_prev
    if (dst_prev == None):
        return {}
    dst_prev_hash_fn = os.path.join(dst_prev, "hash_dir_info.txt")
    if not os.path.isfile(dst_prev_hash_fn):
        print >> sys.stderr, "warning: there is no '%s'" % dst_prev_hash_fn
    dst_prev_hashes = get_hashes(dst_prev, load_hashes(dst_prev_hash_fn), None)

    md5to_old = {}
    # avoid randomization of hardlinks to files with same content - sort by name, select last
    for (relpath, md5, _size, _mtime) in sorted(dst_prev_hashes):
        md5to_old[md5] = os.path.join(dst_prev, relpath)
    return md5to_old

# create hardlinks to dst_prev or copy srcs files to dst_tmp.
# if dst_tmp is 'x/y/z' and sourcedir is 'a/b/c',
# then 'a/b/c' is copied to 'x/y/z/c'
def backup_impl(dst_prev, dst_tmp, srcs):
    srcs = [win_drive_to_dir(src) for src in srcs]
    if not os.path.isdir(dst_tmp):
        os.makedirs(dst_tmp)
    backup_fast_checks(dst_prev, dst_tmp, srcs)

    md5to_old = load_prev_dst_hashes(dst_prev)

    dst_rel_path2md5 = {}
    for src in srcs:
        print >> sys.stderr, 'info: copying src="%s"...' % src
        src_base = get_src_basename(src)
        current_dst = os.path.join(dst_tmp, src_base)

        if os.path.exists(current_dst):
            print >> sys.stderr, 'warning: deleting files from backup', repr(current_dst)
            my_rmtree(current_dst) #todo: don't delete all files, track files already copied to dst_tmp in a previous run in a DB
        if not os.path.isdir(current_dst):
            os.makedirs(current_dst)

        created_dirs = set() # don't check known directories
        def file_processed_callback(relpath, md5, size, _mtime):
            full_dst_fn = os.path.join(current_dst, relpath)
            full_dst_dir = os.path.dirname(full_dst_fn)
            if full_dst_dir not in created_dirs:
                if not os.path.isdir(full_dst_dir):
                    os.makedirs(full_dst_dir)
                created_dirs.add(full_dst_dir)
            try:
                if size == 0:
                    #NTFS hard-links limit = 1023, empty file is most frequent
                    open(full_dst_fn, 'wb').close()
                elif md5 in md5to_old:
                    #print >> sys.stderr, 'verbose: take cached', repr(relpath) #lot of spam
                    mk_hard_link(md5to_old[md5], full_dst_fn)
                else:
                    if dst_prev != None: #a lot of copies, when run first time
                        print >> sys.stderr, 'verbose: full copy', repr(relpath)
                    shutil.copy(os.path.join(src, relpath), full_dst_fn)
                dst_rel_path = os.path.join(src_base, relpath)
                dst_rel_path2md5[dst_rel_path] = md5
            except EnvironmentError:
                traceback.print_exc()
                print >> sys.stderr, 'warning: skipping', repr(src) + repr(relpath)

        hash_dir(src, file_processed_callback)

    write_dst_hashes(dst_tmp, dst_rel_path2md5)
    #todo: start validation here, that dst hard-drive can be read back (protection against bad-blocks).
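
# Layout sketch for backup_impl(dst_prev, 'x/y/z', ['a/b/c']) (hypothetical
# names): each x/y/z/c/<relpath> ends up as a hardlink into dst_prev (when the
# md5 matched), an empty file (size 0), or a full copy; x/y/z/hash_dir_info.txt
# then records md5/size/mtime for everything under x/y/z.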

# calls 'backup_impl' function. See also 'backup_impl'.
# Measures time, creates backup.complete file, moves 'dst.tmp' directory to 'dst'
def backup(dst_prev, dst, srcs):
    st = time.time()
    print >> sys.stderr, 'info: starting backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    if os.path.exists(dst):
        print >> sys.stderr, 'ERROR! There is already a directory "%s". Backup aborted, to avoid old backup corruption.' % dst
        exit(1)

    backup_impl(dst_prev, dst + ".tmp", srcs)
    os.rename(dst + ".tmp", dst)

    backup_complete_file = os.path.join(dst, 'backup.complete')
    with open(backup_complete_file, 'wb') as f:
        print >> f, 'last backup time: "%s"' % time.time()
    print >> sys.stderr, 'info: done backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    print >> sys.stderr, 'info: total time: %s seconds' % (int(time.time() - st))

# select directory with most recent root/*/backup.complete file.
def select_last_previous_backup(root):
    root = win_drive_to_dir(root)
    alternatives = []
    if not os.path.isdir(root):
        print '"%s" is not a directory!' % root
        exit(1)
    for dir in os.listdir(root):
        dst_prev = os.path.join(root, dir)
        if os.path.isdir(dst_prev):
            backup_complete_file = os.path.join(dst_prev, 'backup.complete')
            if os.path.isfile(backup_complete_file):
                with open(backup_complete_file) as f:
                    try:
                        m = re.match('last backup time: "(.*)"$', f.read().strip())
                        backup_time = float(m.group(1))
                        alternatives.append((backup_time, dst_prev))
                    except:
                        traceback.print_exc()
                        print >> sys.stderr, 'warning: corrupted "%s"' % backup_complete_file

    dst_prev = None
    if alternatives:
        _backup_time, dst_prev = max(alternatives)
        print >> sys.stderr, 'info: directory "%s" is used as previous backup' % dst_prev
    else:
        print >> sys.stderr, 'info: no previous backup (with backup.complete file) found at "%s/*/backup.complete", full copy will be performed' % root

    return dst_prev
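
# The marker file parsed above contains the single line written by backup(),
# e.g. (hypothetical timestamp): last backup time: "1401634523.45"
# The directory with the newest valid timestamp wins via max(alternatives).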

def main():
    if len(sys.argv) < 3:
        prog_name = sys.argv[0] if sys.argv else ''
        print >> sys.stderr, USAGE.replace("prog_name", prog_name)
        exit(1)
    root = sys.argv[1]
    srcs = sys.argv[2:]
    dst_basename = time.strftime("backup_%Y-%m-%d_%H-%M-%S")
    backup(select_last_previous_backup(root), os.path.join(win_drive_to_dir(root), dst_basename), srcs)

#compare results of ListFilesWin32 and _listfiles_py, write output to file, and print time.
def my_listdir_functions_test():
    for lf, out in [(ListFilesWin32, 'b.txt'), (_listfiles_py, 'a.txt')]:
        t = time.time()
        with open(out, 'wb') as f:
            path = u"c:"
            for relpath, mtime, size in lf(unicode(path), u'', []):
                relpath = relpath.replace('\\', '/').encode('UTF-8')
                if 'test/string/trunk/bin/backup' not in relpath: #my development source directory
                    print >> f, '%s\t%s\t%s' % (relpath, mtime, size)
        print >> sys.stderr, 'TIME:', lf, time.time() - t

if __name__ == "__main__":
    main()