
@dobrokot
Last active August 29, 2015 14:02

Revisions

  1. dobrokot revised this gist Jun 1, 2014. 1 changed file with 4 additions and 2 deletions.
     6 changes: 4 additions & 2 deletions dobro_backup.py
     @@ -5,9 +5,11 @@
       Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN'
     - Copied result will be in the destination_dir/source1 ... destination_dir/sourceN'
     + Copied result will be in the destination_dir/DATE_TIME/source1 ... destination_dir/DATE_TIME/sourceN'
       Also there will be created files pathN/sourceN/hash_dir_info.txt (write access to source is needed now)'
     - When backup is complete, destination_dir/backup.complete is created, with timestamp info.
     + When backup is complete, destination_dir/DATE_TIME/backup.complete is created, with timestamp info.
     + Hardlinks of duplicated files created to previous directory, to save space/time.
     + Deletion of old backup directories is safe.
       Encryption and compression can be provided by the file-system (TrueCrypt + NTFS, for example).
  2. dobrokot revised this gist Jun 1, 2014. 1 changed file with 1 addition and 1 deletion.
     2 changes: 1 addition & 1 deletion dobro_backup.py
     @@ -3,7 +3,7 @@
       USAGE = '''
       Incremental backup program
     - Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN' % prog_name
     + Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN'
       Copied result will be in the destination_dir/source1 ... destination_dir/sourceN'
       Also there will be created files pathN/sourceN/hash_dir_info.txt (write access to source is needed now)'
  3. dobrokot created this gist Jun 1, 2014.
     486 changes: 486 additions & 0 deletions dobro_backup.py
     @@ -0,0 +1,486 @@
#!/usr/bin/env python

USAGE = '''
Incremental backup program
Usage: python "prog_name" destination_dir path1/source1 ... pathN/sourceN' % prog_name
Copied result will be in the destination_dir/source1 ... destination_dir/sourceN'
Also there will be created files pathN/sourceN/hash_dir_info.txt (write access to source is needed now)'
When backup is complete, destination_dir/backup.complete is created, with timestamp info.
Encryption and compression can be provided by the file-system (TrueCrypt + NTFS, for example).
Hardlinks to older-than-last backups are not created now (you may manually move/copy older files to last backup)
'''
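
# A quick illustration of the intended invocation (these paths are made up
# for illustration, they are not part of the original gist):
#
#   python dobro_backup.py D:/backups C:/music C:/documents
#
# main() below prefixes the destination with a "backup_%Y-%m-%d_%H-%M-%S"
# directory, so the copies land in D:/backups/backup_<DATE_TIME>/music and
# D:/backups/backup_<DATE_TIME>/documents, with a backup.complete marker
# written on success.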

# Copyright (c) 2014 Ivan Yanikov

# The MIT License (MIT)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.


#------------------------------------------------
# known missing features and issues:
#
# todo: allow to create hardlinks to old backups, to handle changed source set
#   ( ["music", "documents", "video"] -> ["music", "documents"] -> ["music", "documents", "video"] ).
# todo: needless rehashes of internal file hash_dir_info.txt
# todo: catch "source inside destination" errors and vice-versa
# todo: allow to rename sources targets
# todo: allow to store hash_dir_info.txt outside source directories
# todo: allow exclude dirs/mask
# todo: don't spam about known unreadable dirs ( Microsoft\Windows\WER\ReportQueue )
#------------------------------------------------

import hashlib
import os, sys, shutil, stat
import time
import itertools
import re
import traceback

#recursively enumerates files in directory
#returns list of tuples ( relative_path, file_modified_timestamp, file_size )
#cross-platform version for any OS
def _listfiles_py(path, relpath, result):
    if path:
        path += os.sep
    try:
        files = os.listdir(path)
    except EnvironmentError:
        traceback.print_exc()
        print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
        return result

    for fn in files:
        fullfn = path + fn
        relpath2 = (relpath + os.sep if relpath else '') + fn
        try:
            st = os.stat(fullfn)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + fullfn.encode('UTF-8') + '"'
            continue

        file_type = stat.S_IFMT(st.st_mode)
        if file_type == stat.S_IFDIR:
            _listfiles_py(fullfn, relpath2, result)
        elif file_type == stat.S_IFREG:
            result.append((relpath2, int(st.st_mtime), st.st_size))
        #soft-links are skipped
    return result

win32file = None
if (os.name == 'nt'):
    if not hasattr(os, 'link'): # python on windows does not have os.link for hardlinks
        import ctypes
        from ctypes import WinError
        from ctypes.wintypes import BOOL
        CreateHardLink = ctypes.windll.kernel32.CreateHardLinkW
        CreateHardLink.argtypes = [ctypes.c_wchar_p, ctypes.c_wchar_p, ctypes.c_void_p]
        CreateHardLink.restype = BOOL

        def mk_hard_link(src, dst):
            res = CreateHardLink(dst, src, None)
            if res == 0:
                raise WinError()

    FindFirstFile = ctypes.windll.kernel32.FindFirstFileW
    FindNextFile = ctypes.windll.kernel32.FindNextFileW
    FindClose = ctypes.windll.kernel32.FindClose
    GetLastError = ctypes.windll.kernel32.GetLastError
    #optimization for Windows: FindFirstFileW is much faster than (os.listdir + os.stat)
    #enumerates immediate (non-recursive) files and subdirectories in directory
    #returns list of tuples ( file_name, file_modified_timestamp, file_size, is_directory )
    def FindFilesWin32(path):
        # pylint: disable=E0611
        from exceptions import WindowsError
        out = ctypes.wintypes.WIN32_FIND_DATAW()
        fldr = FindFirstFile(unicode(path) + u'\\*', ctypes.byref(out))

        if fldr == -1: #INVALID_HANDLE_VALUE
            raise WindowsError("can't list files in the %s" % repr(path))

        result = []
        try:
            while 1:
                fn = out.cFileName
                if fn != u'.' and fn != u'..':
                    isdir = bool(out.dwFileAttributes & 0x10) #FILE_ATTRIBUTE_DIRECTORY
                    ts = out.ftLastWriteTime
                    win_filetime = (ts.dwHighDateTime << 32) | ts.dwLowDateTime
                    secs_between_epochs = 11644473600 # Seconds between 1.1.1601 and 1.1.1970
                    timestamp = win_filetime / 10000000 - secs_between_epochs
                    size = (out.nFileSizeHigh << 32) | out.nFileSizeLow
                    result.append((out.cFileName, timestamp, size, isdir))

                if not FindNextFile(fldr, ctypes.byref(out)):
                    win_error = GetLastError()
                    if win_error == 18: #ERROR_NO_MORE_FILES
                        break
                    exc = WindowsError(win_error) #todo: test
                    exc.filename = path
                    raise exc
        finally:
            FindClose(fldr)
        return result
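
    # Worked example of the FILETIME conversion above (illustrative numbers):
    # Windows FILETIME counts 100-nanosecond intervals since 1601-01-01, so
    #   win_filetime = 130451040000000000
    #   130451040000000000 / 10000000 = 13045104000 seconds since 1601
    #   13045104000 - 11644473600 = 1400630400 = 2014-05-21 00:00:00 UTC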

    #recursively enumerates files in directory
    #returns list of tuples ( relative_path, file_modified_timestamp, file_size )
    #specialized version for Windows.
    def ListFilesWin32(path, relpath, result):
        if path:
            path += os.sep
        try:
            files = FindFilesWin32(path)
        except EnvironmentError:
            traceback.print_exc()
            print >> sys.stderr, 'warning: skipping "' + path.encode('UTF-8') + '"'
            return result

        for (fn, ts, sz, isdir) in files:
            fullfn = path + fn
            relpath2 = (relpath + os.sep if relpath else '') + fn
            if isdir:
                ListFilesWin32(fullfn, relpath2, result)
            else:
                result.append((relpath2, ts, sz))
            #soft-links are skipped
        return result

    _listfiles = ListFilesWin32


else: # non-windows version of mk_hard_link and _listfiles
    mk_hard_link = os.link
    _listfiles = _listfiles_py

#recursively enumerates files in directory
#returns list of tuples ( relative_path, file_modified_timestamp, file_size )
def listfiles(path):
    return _listfiles(unicode(path), u'', [])
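
# For example (hypothetical directory and values), listfiles(u'C:/music')
# might return [(u'album/track01.mp3', 1401634523, 7340032)]: paths are
# relative to the argument, mtime is unix seconds, size is in bytes.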

def md5_for_file(filename):
    # pylint: disable=E1101
    md5 = hashlib.md5()
    with open(filename, 'rb') as f:
        while True:
            data = f.read(2**20)
            if not data:
                break
            md5.update(data)
    return md5.hexdigest()
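
# Reads in 1MB chunks so memory stays bounded for large files. As a sanity
# check, an empty file hashes to the well-known md5 constant:
#   md5_for_file(empty_file) == 'd41d8cd98f00b204e9800998ecf8427e'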

#read text file, and load it to dictionary {filename -> (md5, file_size, file_modified_timestamp)}
def load_hashes(hf_txt):
    #different systems can have different file-time-modified format
    if not os.path.isfile(hf_txt):
        return {}

    try:
        res = {}
        with open(hf_txt, 'rb') as f:
            for l in f:
                md5, size, mtime, filename = l.rstrip('\r\n').split('\t', 3)
                res[filename.replace('/', os.sep).decode('UTF-8')] = (md5, int(size), int(mtime))
        return res
    except EnvironmentError:
        traceback.print_exc()
        return {}
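
# Each hash_dir_info.txt line is tab-separated "md5<TAB>size<TAB>mtime<TAB>path",
# with '/' as the path separator (see write_hashes below); e.g. a made-up line:
#   0cc175b9c0f1b6a831c399e269772661 <TAB> 7340032 <TAB> 1401634523 <TAB> album/track01.mp3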

# pretty format of file size, 81731298312 -> '76.1GB'
def filesize_fmt(num):
    for x in ['bytes', 'KB', 'MB', 'GB']:
        if num < 1024.0 and num > -1024.0:
            return "%3.1f%s" % (num, x)
        num /= 1024.0
    return "%3.1f%s" % (num, 'TB')

# recalculates hashes of files whose timestamp or size changed;
# also calls 'file_processed_callback' on each file (to
# have fresh data from the OS file-system cache).
def get_hashes(dir, prev_info, file_processed_callback):
    print >> sys.stderr, 'info: traversing', repr(dir)
    current_files = listfiles(dir)

    result = []
    for relpath, mtime, size in current_files:
        prev_file_info = prev_info.get(relpath)
        md5 = None
        if prev_file_info:
            prev_md5, prev_size, prev_mtime = prev_file_info
            if prev_size == size and prev_mtime == mtime:
                md5 = prev_md5
        result.append((relpath, md5, size, mtime))

    recalc_count = 0
    recalc_size = 0
    total_size = 0
    for relpath, md5, size, mtime in result:
        total_size += size
        if md5 == None:
            recalc_count += 1
            recalc_size += size

    print >> sys.stderr, 'info: found %d files of size %s, need to rehash %d files of size %s' % (
        len(result), filesize_fmt(total_size),
        recalc_count, filesize_fmt(recalc_size))

    tmp = []
    #todo: make incremental, to allow restart from interrupted point
    for (relpath, md5, size, mtime) in result:
        if md5 == None:
            md5 = md5_for_file(os.path.join(dir, relpath))
        tmp.append((relpath, md5, size, mtime))
        if file_processed_callback:
            #after md5_for_file file is in the OS file cache, copy it immediately
            file_processed_callback(relpath, md5, size, mtime)
    result = tmp

    result.sort()
    return result

def write_hashes(hashes_data_list, hf_txt):
    with open(hf_txt + '.tmp', 'wb') as f:
        for relpath, md5, size, mtime in hashes_data_list:
            print >> f, '%s\t%s\t%s\t%s' % (md5, size, mtime, relpath.encode('UTF-8').replace(os.sep, '/'))
    if os.path.exists(hf_txt):
        os.unlink(hf_txt)
    os.rename(hf_txt + ".tmp", hf_txt)
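
# Write-to-temp-then-rename keeps hash_dir_info.txt updates close to atomic:
# a crash mid-write leaves the previous file intact. The explicit unlink is
# needed because os.rename cannot overwrite an existing file on Windows.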

# call 'get_hashes' on a directory, and write the fresh hashes to hash_dir_info.txt inside it
# see also 'get_hashes' function
def hash_dir(dir, file_processed_callback = None):
    hf_txt = os.path.join(dir, 'hash_dir_info.txt')
    prev_info = load_hashes(hf_txt)

    with open(hf_txt + '.tmp', 'wb') as _f:
        #check we can open the file to write
        pass

    start = time.time()
    result = get_hashes(dir, prev_info, file_processed_callback)
    print >> sys.stderr, 'info: hashing', repr(dir), int(time.time() - start), 'seconds'

    write_hashes(result, hf_txt)
    return result

# Put md5 hashes of dst/**/* into the file dst/'hash_dir_info.txt'.
# For files present in 'dst_rel_path2md5', the md5 is taken from there;
# otherwise the hash is recalculated.
def write_dst_hashes(dst, dst_rel_path2md5):
    result = []
    for relpath, mtime, size in listfiles(dst):
        md5 = dst_rel_path2md5.get(relpath)
        if md5 == None:
            md5 = md5_for_file(os.path.join(dst, relpath))
        result.append((relpath, md5, size, mtime))
    result.sort()
    write_hashes(result, os.path.join(dst, 'hash_dir_info.txt'))

# Recursively deletes a file tree,
# handles some real-life problems with unremovable files
def my_rmtree(dir):
    def remove_readonly(fn, path, _excinfo):
        os.chmod(path, stat.S_IWRITE) #handle read-only files
        fn(path)
    shutil.rmtree(unicode(dir), onerror=remove_readonly)

# some checks of invalid arguments and access rights
def backup_fast_checks(dst_prev, dst_tmp, srcs):
    assert dst_prev == None or os.path.isdir(dst_prev), dst_prev

    for src in srcs:
        if not os.path.isdir(src):
            print '"%s" is not a directory!' % src
            exit(1)

    open(os.path.join(dst_tmp, "test_dir_is_writable"), "wb").close()
    os.unlink(os.path.join(dst_tmp, "test_dir_is_writable"))

    get_src_basenames = sorted([(get_src_basename(src), src) for src in srcs])
    for src_base, srcs_grouped in itertools.groupby(get_src_basenames, key = lambda (base, s): base):
        srcs_grouped = list(x[1] for x in srcs_grouped)
        if len(srcs_grouped) >= 2:
            raise Exception("name '%s' duplicated for %s" % (src_base, srcs_grouped))

# get basename of file. Tolerate trailing /// slashes and Windows drive letter.
def get_src_basename(x):
    assert os.path.normpath('xxx///') == 'xxx'
    if os.name == 'nt':
        assert os.path.normpath('x://\\//\\') == 'x:\\'
        assert os.path.normpath('xxx//\\//\\') == 'xxx'
    x = os.path.normpath(x).rstrip(os.sep)
    if os.name == 'nt' and x[-1:] == ':':
        x = x[0:-1] #remove ':' from "drive:"
    return os.path.basename(x)

#convert drive path to dir path on Windows
def win_drive_to_dir(x):
    if os.name == 'nt' and x[-1:] == ':':
        # http://stackoverflow.com/questions/7258993/pythons-os-listdir-behaviour-on-windows
        # path 'X:' is interpreted as 'current directory for drive X', which is not 'X:\\'
        x = x + '\\'
    return x

#load mapping from md5 to filename in previous backup (to hardlink them in future)
def load_prev_dst_hashes(dst_prev):
    print >> sys.stderr, 'loading prev_dst="%s" hashes...' % dst_prev
    if (dst_prev == None):
        return {}
    dst_prev_hash_fn = os.path.join(dst_prev, "hash_dir_info.txt")
    if not os.path.isfile(dst_prev_hash_fn):
        print >> sys.stderr, "warning: there is no '%s'" % dst_prev_hash_fn
    dst_prev_hashes = get_hashes(dst_prev, load_hashes(dst_prev_hash_fn), None)

    md5to_old = {}
    # avoid randomization of hardlinks to files with same content - sort by name, select last
    for (relpath, md5, _size, _mtime) in sorted(dst_prev_hashes):
        md5to_old[md5] = os.path.join(dst_prev, relpath)
    return md5to_old

# create hardlinks to dst_prev or copy srcs files to dst_tmp.
# if dst_tmp is 'x/y/z' and sourcedir is 'a/b/c',
# then 'a/b/c' is copied to 'x/y/z/c'
def backup_impl(dst_prev, dst_tmp, srcs):
    srcs = [win_drive_to_dir(src) for src in srcs]
    if not os.path.isdir(dst_tmp):
        os.makedirs(dst_tmp)
    backup_fast_checks(dst_prev, dst_tmp, srcs)

    md5to_old = load_prev_dst_hashes(dst_prev)

    dst_rel_path2md5 = {}
    for src in srcs:
        print >> sys.stderr, 'info: copying src="%s"...' % src
        src_base = get_src_basename(src)
        current_dst = os.path.join(dst_tmp, src_base)

        if os.path.exists(current_dst):
            print >> sys.stderr, 'warning: deleting files from backup', repr(current_dst)
            my_rmtree(current_dst) #todo: don't delete all files, track files already copied to dst_tmp in a previous run in a DB
        if not os.path.isdir(current_dst):
            os.makedirs(current_dst)

        created_dirs = set() # don't check known directories
        def file_processed_callback(relpath, md5, size, _mtime):
            full_dst_fn = os.path.join(current_dst, relpath)
            full_dst_dir = os.path.dirname(full_dst_fn)
            if full_dst_dir not in created_dirs:
                if not os.path.isdir(full_dst_dir):
                    os.makedirs(full_dst_dir)
                created_dirs.add(full_dst_dir)
            try:
                if size == 0:
                    #NTFS hard-links limit = 1023, empty file is most frequent
                    open(full_dst_fn, 'wb').close()
                elif md5 in md5to_old:
                    #print >> sys.stderr, 'verbose: take cached', repr(relpath) #lot of spam
                    mk_hard_link(md5to_old[md5], full_dst_fn)
                else:
                    if dst_prev != None: #a lot of copies, when run first time
                        print >> sys.stderr, 'verbose: full copy', repr(relpath)
                    shutil.copy(os.path.join(src, relpath), full_dst_fn)
                dst_rel_path = os.path.join(src_base, relpath)
                dst_rel_path2md5[dst_rel_path] = md5
            except EnvironmentError:
                traceback.print_exc()
                print >> sys.stderr, 'warning: skipping', repr(src) + repr(relpath)

        hash_dir(src, file_processed_callback)

    write_dst_hashes(dst_tmp, dst_rel_path2md5)
    #todo: start validation here, that dst hard-drive can be read back (protection against bad-blocks).
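
# Layout sketch for backup_impl(dst_prev, 'x/y/z', ['a/b/c']) (hypothetical
# names): each x/y/z/c/<relpath> ends up as a hardlink into dst_prev (when the
# md5 matched), an empty file (size 0), or a full copy; x/y/z/hash_dir_info.txt
# then records md5/size/mtime for everything under x/y/z.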

# calls 'backup_impl' function. See also 'backup_impl'.
# Measures time, creates backup.complete file, moves 'dst.tmp' directory to 'dst'
def backup(dst_prev, dst, srcs):
    st = time.time()
    print >> sys.stderr, 'info: starting backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    if os.path.exists(dst):
        print >> sys.stderr, 'ERROR! There is already a directory "%s". Backup aborted, to avoid old backup corruption.' % dst
        exit(1)

    backup_impl(dst_prev, dst + ".tmp", srcs)
    os.rename(dst + ".tmp", dst)

    backup_complete_file = os.path.join(dst, 'backup.complete')
    with open(backup_complete_file, 'wb') as f:
        print >> f, 'last backup time: "%s"' % time.time()
    print >> sys.stderr, 'info: done backup from "%s" to "%s" for sources=%s' % (dst_prev, dst, srcs)
    print >> sys.stderr, 'info: total time: %s seconds' % (int(time.time() - st))

# select directory with most recent root/*/backup.complete file.
def select_last_previous_backup(root):
    root = win_drive_to_dir(root)
    alternatives = []
    if not os.path.isdir(root):
        print '"%s" is not a directory!' % root
        exit(1)
    for dir in os.listdir(root):
        dst_prev = os.path.join(root, dir)
        if os.path.isdir(dst_prev):
            backup_complete_file = os.path.join(dst_prev, 'backup.complete')
            if os.path.isfile(backup_complete_file):
                with open(backup_complete_file) as f:
                    try:
                        m = re.match('last backup time: "(.*)"$', f.read().strip())
                        backup_time = float(m.group(1))
                        alternatives.append((backup_time, dst_prev))
                    except:
                        traceback.print_exc()
                        print >> sys.stderr, 'warning: corrupted "%s"' % backup_complete_file

    dst_prev = None
    if alternatives:
        _backup_time, dst_prev = max(alternatives)
        print >> sys.stderr, 'info: directory "%s" is used as previous backup' % dst_prev
    else:
        print >> sys.stderr, 'info: no previous backup (with backup.complete file) found at "%s/*/backup.complete", full copy will be performed' % root

    return dst_prev
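
# The marker file parsed above contains the single line written by backup(),
# e.g. (hypothetical timestamp): last backup time: "1401634523.45"
# The directory with the newest valid timestamp wins via max(alternatives).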

def main():
    if len(sys.argv) < 3:
        prog_name = sys.argv[0] if sys.argv else ''
        print >> sys.stderr, USAGE.replace("prog_name", prog_name)
        exit(1)
    root = sys.argv[1]
    srcs = sys.argv[2:]
    dst_basename = time.strftime("backup_%Y-%m-%d_%H-%M-%S")
    backup(select_last_previous_backup(root), os.path.join(win_drive_to_dir(root), dst_basename), srcs)

#compare results of ListFilesWin32 and _listfiles_py, write output to file, and print time.
def my_listdir_functions_test():
    for lf, out in [(ListFilesWin32, 'b.txt'), (_listfiles_py, 'a.txt')]:
        t = time.time()
        with open(out, 'wb') as f:
            path = u"c:"
            for relpath, mtime, size in lf(unicode(path), u'', []):
                relpath = relpath.replace('\\', '/').encode('UTF-8')
                if 'test/string/trunk/bin/backup' not in relpath: #my development source directory
                    print >> f, '%s\t%s\t%s' % (relpath, mtime, size)
        print >> sys.stderr, 'TIME:', lf, time.time() - t

if __name__ == "__main__":
    main()