@tfeldmann
Last active August 3, 2025 18:52
Revisions

  1. tfeldmann revised this gist Aug 5, 2020. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions duplicates.py
    @@ -5,9 +5,6 @@
     Based on https://stackoverflow.com/a/36113168/300783
     Modified for Python3 with some small code improvements.
    -The script on stackoverflow has a bug which could lead to false positives. This is fixed
    -here by using a tuple (file_size, hash) as key in the small hash comparison dictionary.
     """
     import os
     import sys
  2. tfeldmann revised this gist Jun 15, 2020. 1 changed file with 6 additions and 3 deletions. (A short sketch of the (file_size, hash) key change introduced here follows the revision list.)
    9 changes: 6 additions & 3 deletions duplicates.py
    @@ -5,6 +5,9 @@
     Based on https://stackoverflow.com/a/36113168/300783
     Modified for Python3 with some small code improvements.
    +The script on stackoverflow has a bug which could lead to false positives. This is fixed
    +here by using a tuple (file_size, hash) as key in the small hash comparison dictionary.
     """
     import os
     import sys
    @@ -38,7 +41,7 @@ def check_for_duplicates(paths):
         files_by_full_hash = dict()

         for path in paths:
    -        for dirpath, dirnames, filenames in os.walk(path):
    +        for dirpath, _, filenames in os.walk(path):
                 for filename in filenames:
                     full_path = os.path.join(dirpath, filename)
                     try:
    @@ -52,7 +55,7 @@ def check_for_duplicates(paths):
                     files_by_size[file_size].append(full_path)

         # For all files with the same file size, get their hash on the first 1024 bytes
    -    for files in files_by_size.values():
    +    for file_size, files in files_by_size.items():
             if len(files) < 2:
                 continue  # this file size is unique, no need to spend cpu cycles on it

    @@ -62,7 +65,7 @@ def check_for_duplicates(paths):
                 except OSError:
                     # the file access might've changed till the exec point got here
                     continue
    -            files_by_small_hash[small_hash].append(filename)
    +            files_by_small_hash[(file_size, small_hash)].append(filename)

         # For all files with the hash on the first 1024 bytes, get their hash on the full
         # file - collisions will be duplicates
  3. tfeldmann revised this gist Nov 27, 2019. 1 changed file with 3 additions and 3 deletions. (A sketch of the renamed hash_algo parameter follows the revision list.)
    6 changes: 3 additions & 3 deletions duplicates.py
    @@ -21,8 +21,8 @@ def chunk_reader(fobj, chunk_size=1024):
             yield chunk


    -def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    -    hashobj = hash()
    +def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    +    hashobj = hash_algo()
         with open(filename, "rb") as f:
             if first_chunk_only:
                 hashobj.update(f.read(1024))
    @@ -32,7 +32,7 @@ def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
         return hashobj.digest()


    -def check_for_duplicates(paths, hash=hashlib.sha1):
    +def check_for_duplicates(paths):
         files_by_size = defaultdict(list)
         files_by_small_hash = defaultdict(list)
         files_by_full_hash = dict()
  4. tfeldmann created this gist Nov 27, 2019. (A usage sketch for the script follows the revision list.)
    92 changes: 92 additions & 0 deletions duplicates.py
    @@ -0,0 +1,92 @@
    #!/usr/bin/env python
    """
    Fast duplicate file finder.
    Usage: duplicates.py <folder> [<folder>...]
    Based on https://stackoverflow.com/a/36113168/300783
    Modified for Python3 with some small code improvements.
    """
    import os
    import sys
    import hashlib
    from collections import defaultdict


    def chunk_reader(fobj, chunk_size=1024):
        """ Generator that reads a file in chunks of bytes """
        while True:
            chunk = fobj.read(chunk_size)
            if not chunk:
                return
            yield chunk


    def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
        hashobj = hash()
        with open(filename, "rb") as f:
            if first_chunk_only:
                hashobj.update(f.read(1024))
            else:
                for chunk in chunk_reader(f):
                    hashobj.update(chunk)
        return hashobj.digest()


    def check_for_duplicates(paths, hash=hashlib.sha1):
        files_by_size = defaultdict(list)
        files_by_small_hash = defaultdict(list)
        files_by_full_hash = dict()

        for path in paths:
            for dirpath, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    full_path = os.path.join(dirpath, filename)
                    try:
                        # if the target is a symlink (soft one), this will
                        # dereference it - change the value to the actual target file
                        full_path = os.path.realpath(full_path)
                        file_size = os.path.getsize(full_path)
                    except OSError:
                        # not accessible (permissions, etc) - pass on
                        continue
                    files_by_size[file_size].append(full_path)

        # For all files with the same file size, get their hash on the first 1024 bytes
        for files in files_by_size.values():
            if len(files) < 2:
                continue  # this file size is unique, no need to spend cpu cycles on it

            for filename in files:
                try:
                    small_hash = get_hash(filename, first_chunk_only=True)
                except OSError:
                    # the file access might've changed till the exec point got here
                    continue
                files_by_small_hash[small_hash].append(filename)

        # For all files with the hash on the first 1024 bytes, get their hash on the full
        # file - collisions will be duplicates
        for files in files_by_small_hash.values():
            if len(files) < 2:
                # the hash of the first 1k bytes is unique -> skip this file
                continue

            for filename in files:
                try:
                    full_hash = get_hash(filename, first_chunk_only=False)
                except OSError:
                    # the file access might've changed till the exec point got here
                    continue

                if full_hash in files_by_full_hash:
                    duplicate = files_by_full_hash[full_hash]
                    print("Duplicate found:\n - %s\n - %s\n" % (filename, duplicate))
                else:
                    files_by_full_hash[full_hash] = filename


    if __name__ == "__main__":
        if sys.argv[1:]:
            check_for_duplicates(sys.argv[1:])
        else:
            print("Usage: %s <folder> [<folder>...]" % sys.argv[0])