Last active: August 3, 2025 18:52

Revisions
tfeldmann revised this gist
Aug 5, 2020. 1 changed file with 0 additions and 3 deletions.
@@ -5,9 +5,6 @@

 Based on https://stackoverflow.com/a/36113168/300783
 Modified for Python3 with some small code improvements.
-The script on stackoverflow has a bug which could lead to false positives.
-This is fixed here by using a tuple (file_size, hash) as key in the small
-hash comparison dictionary.
 """
 import os
 import sys
tfeldmann revised this gist
Jun 15, 2020. 1 changed file with 6 additions and 3 deletions.
@@ -5,6 +5,9 @@

 Based on https://stackoverflow.com/a/36113168/300783
 Modified for Python3 with some small code improvements.
+The script on stackoverflow has a bug which could lead to false positives.
+This is fixed here by using a tuple (file_size, hash) as key in the small
+hash comparison dictionary.
 """
 import os
 import sys
@@ -38,7 +41,7 @@ def check_for_duplicates(paths):
     files_by_full_hash = dict()

     for path in paths:
-        for dirpath, dirnames, filenames in os.walk(path):
+        for dirpath, _, filenames in os.walk(path):
             for filename in filenames:
                 full_path = os.path.join(dirpath, filename)
                 try:
@@ -52,7 +55,7 @@ def check_for_duplicates(paths):
                 files_by_size[file_size].append(full_path)

     # For all files with the same file size, get their hash on the first 1024 bytes
-    for files in files_by_size.values():
+    for file_size, files in files_by_size.items():
         if len(files) < 2:
             continue  # this file size is unique, no need to spend cpu cycles on it

@@ -62,7 +65,7 @@ def check_for_duplicates(paths):
            except OSError:
                # the file access might've changed till the exec point got here
                continue
-            files_by_small_hash[small_hash].append(filename)
+            files_by_small_hash[(file_size, small_hash)].append(filename)

    # For all files with the hash on the first 1024 bytes, get their hash on the full
    # file - collisions will be duplicates
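The tuple key matters because two files of different sizes can begin with the same 1024 bytes (a shared file header, for example), and keying the dictionary by the small hash alone would lump them into one candidate group. A minimal sketch of that scenario, not part of the gist, with made-up file names:

import hashlib
import os
import tempfile

def first_kib_hash(path):
    # Hash only the first 1024 bytes, like get_hash(..., first_chunk_only=True).
    with open(path, "rb") as f:
        return hashlib.sha1(f.read(1024)).digest()

with tempfile.TemporaryDirectory() as tmp:
    header = b"shared header".ljust(1024, b"\x00")  # identical first KiB
    a = os.path.join(tmp, "a.bin")  # hypothetical demo file
    b = os.path.join(tmp, "b.bin")  # hypothetical demo file
    with open(a, "wb") as f:
        f.write(header)                 # exactly 1024 bytes
    with open(b, "wb") as f:
        f.write(header + b"more data")  # same first KiB, different size

    # Keyed by the small hash alone, both files share one bucket ...
    assert first_kib_hash(a) == first_kib_hash(b)
    # ... while the (file_size, small_hash) tuple keeps them apart.
    assert (os.path.getsize(a), first_kib_hash(a)) != (os.path.getsize(b), first_kib_hash(b))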
tfeldmann revised this gist
Nov 27, 2019. 1 changed file with 3 additions and 3 deletions.
@@ -21,8 +21,8 @@ def chunk_reader(fobj, chunk_size=1024):
         yield chunk


-def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
-    hashobj = hash()
+def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
+    hashobj = hash_algo()
     with open(filename, "rb") as f:
         if first_chunk_only:
             hashobj.update(f.read(1024))
@@ -32,7 +32,7 @@ def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
     return hashobj.digest()


-def check_for_duplicates(paths, hash=hashlib.sha1):
+def check_for_duplicates(paths):
     files_by_size = defaultdict(list)
     files_by_small_hash = defaultdict(list)
     files_by_full_hash = dict()
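After this rename, any hashlib constructor can be passed in without shadowing the built-in hash(). A short usage sketch, not part of the gist, assuming the script is saved as duplicates.py:

import hashlib
from duplicates import get_hash

sha1_digest = get_hash("duplicates.py", first_chunk_only=True)  # default: SHA-1
blake_digest = get_hash("duplicates.py", first_chunk_only=True, hash_algo=hashlib.blake2b)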
tfeldmann created this gist
Nov 27, 2019.
@@ -0,0 +1,92 @@
#!/usr/bin/env python
"""
Fast duplicate file finder.
Usage: duplicates.py <folder> [<folder>...]

Based on https://stackoverflow.com/a/36113168/300783
Modified for Python3 with some small code improvements.
"""
import os
import sys
import hashlib
from collections import defaultdict


def chunk_reader(fobj, chunk_size=1024):
    """ Generator that reads a file in chunks of bytes """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    with open(filename, "rb") as f:
        if first_chunk_only:
            hashobj.update(f.read(1024))
        else:
            for chunk in chunk_reader(f):
                hashobj.update(chunk)
    return hashobj.digest()


def check_for_duplicates(paths, hash=hashlib.sha1):
    files_by_size = defaultdict(list)
    files_by_small_hash = defaultdict(list)
    files_by_full_hash = dict()

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc) - pass on
                    continue
                files_by_size[file_size].append(full_path)

    # For all files with the same file size, get their hash on the first 1024 bytes
    for files in files_by_size.values():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend cpu cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
            except OSError:
                # the file access might've changed till the exec point got here
                continue
            files_by_small_hash[small_hash].append(filename)

    # For all files with the hash on the first 1024 bytes, get their hash on the full
    # file - collisions will be duplicates
    for files in files_by_small_hash.values():
        if len(files) < 2:
            # the hash of the first 1k bytes is unique -> skip this file
            continue

        for filename in files:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
            except OSError:
                # the file access might've changed till the exec point got here
                continue

            if full_hash in files_by_full_hash:
                duplicate = files_by_full_hash[full_hash]
                print("Duplicate found:\n - %s\n - %s\n" % (filename, duplicate))
            else:
                files_by_full_hash[full_hash] = filename


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Usage: %s <folder> [<folder>...]" % sys.argv[0])
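The script makes three narrowing passes: it groups files by size, hashes only the first 1024 bytes within same-size groups, and computes full hashes just for the remaining candidates, so most files are never read in full. Besides the command line shown in the docstring (python duplicates.py <folder> [<folder>...]), the function can be called directly; a small sketch with placeholder paths:

from duplicates import check_for_duplicates  # assumes the gist is saved as duplicates.py

# The directories below are placeholders; pass any folders you want scanned.
check_for_duplicates(["/home/user/Downloads", "/home/user/Documents"])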