@tfeldmann
Last active August 3, 2025 18:52
Revisions

  1. tfeldmann revised this gist Aug 5, 2020. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions duplicates.py
    @@ -5,9 +5,6 @@
     Based on https://stackoverflow.com/a/36113168/300783
     Modified for Python3 with some small code improvements.
    -The script on stackoverflow has a bug which could lead to false positives. This is fixed
    -here by using a tuple (file_size, hash) as key in the small hash comparison dictionary.
     """
     import os
     import sys
  2. tfeldmann revised this gist Jun 15, 2020. 1 changed file with 6 additions and 3 deletions. (A short sketch of the (file_size, hash) key change introduced here follows the revision list.)
    9 changes: 6 additions & 3 deletions duplicates.py
    @@ -5,6 +5,9 @@
     Based on https://stackoverflow.com/a/36113168/300783
     Modified for Python3 with some small code improvements.
    +The script on stackoverflow has a bug which could lead to false positives. This is fixed
    +here by using a tuple (file_size, hash) as key in the small hash comparison dictionary.
     """
     import os
     import sys
    @@ -38,7 +41,7 @@ def check_for_duplicates(paths):
         files_by_full_hash = dict()

         for path in paths:
    -        for dirpath, dirnames, filenames in os.walk(path):
    +        for dirpath, _, filenames in os.walk(path):
                 for filename in filenames:
                     full_path = os.path.join(dirpath, filename)
                     try:
    @@ -52,7 +55,7 @@ def check_for_duplicates(paths):
                     files_by_size[file_size].append(full_path)

         # For all files with the same file size, get their hash on the first 1024 bytes
    -    for files in files_by_size.values():
    +    for file_size, files in files_by_size.items():
             if len(files) < 2:
                 continue  # this file size is unique, no need to spend cpu cycles on it

    @@ -62,7 +65,7 @@ def check_for_duplicates(paths):
                 except OSError:
                     # the file access might've changed till the exec point got here
                     continue
    -            files_by_small_hash[small_hash].append(filename)
    +            files_by_small_hash[(file_size, small_hash)].append(filename)

         # For all files with the hash on the first 1024 bytes, get their hash on the full
         # file - collisions will be duplicates
  3. tfeldmann revised this gist Nov 27, 2019. 1 changed file with 3 additions and 3 deletions. (A sketch of the renamed hash_algo parameter follows the revision list.)
    6 changes: 3 additions & 3 deletions duplicates.py
    @@ -21,8 +21,8 @@ def chunk_reader(fobj, chunk_size=1024):
             yield chunk


    -def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    -    hashobj = hash()
    +def get_hash(filename, first_chunk_only=False, hash_algo=hashlib.sha1):
    +    hashobj = hash_algo()
         with open(filename, "rb") as f:
             if first_chunk_only:
                 hashobj.update(f.read(1024))
    @@ -32,7 +32,7 @@ def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
         return hashobj.digest()


    -def check_for_duplicates(paths, hash=hashlib.sha1):
    +def check_for_duplicates(paths):
         files_by_size = defaultdict(list)
         files_by_small_hash = defaultdict(list)
         files_by_full_hash = dict()
  4. tfeldmann created this gist Nov 27, 2019. (A usage sketch for the script follows the revision list.)
    92 changes: 92 additions & 0 deletions duplicates.py
    @@ -0,0 +1,92 @@
    #!/usr/bin/env python
    """
    Fast duplicate file finder.
    Usage: duplicates.py <folder> [<folder>...]
    Based on https://stackoverflow.com/a/36113168/300783
    Modified for Python3 with some small code improvements.
    """
    import os
    import sys
    import hashlib
    from collections import defaultdict


    def chunk_reader(fobj, chunk_size=1024):
        """ Generator that reads a file in chunks of bytes """
        while True:
            chunk = fobj.read(chunk_size)
            if not chunk:
                return
            yield chunk


    def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
        hashobj = hash()
        with open(filename, "rb") as f:
            if first_chunk_only:
                hashobj.update(f.read(1024))
            else:
                for chunk in chunk_reader(f):
                    hashobj.update(chunk)
        return hashobj.digest()


    def check_for_duplicates(paths, hash=hashlib.sha1):
        files_by_size = defaultdict(list)
        files_by_small_hash = defaultdict(list)
        files_by_full_hash = dict()

        for path in paths:
            for dirpath, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    full_path = os.path.join(dirpath, filename)
                    try:
                        # if the target is a symlink (soft one), this will
                        # dereference it - change the value to the actual target file
                        full_path = os.path.realpath(full_path)
                        file_size = os.path.getsize(full_path)
                    except OSError:
                        # not accessible (permissions, etc) - pass on
                        continue
                    files_by_size[file_size].append(full_path)

        # For all files with the same file size, get their hash on the first 1024 bytes
        for files in files_by_size.values():
            if len(files) < 2:
                continue  # this file size is unique, no need to spend cpu cycles on it

            for filename in files:
                try:
                    small_hash = get_hash(filename, first_chunk_only=True)
                except OSError:
                    # the file access might've changed till the exec point got here
                    continue
                files_by_small_hash[small_hash].append(filename)

        # For all files with the hash on the first 1024 bytes, get their hash on the full
        # file - collisions will be duplicates
        for files in files_by_small_hash.values():
            if len(files) < 2:
                # the hash of the first 1k bytes is unique -> skip this file
                continue

            for filename in files:
                try:
                    full_hash = get_hash(filename, first_chunk_only=False)
                except OSError:
                    # the file access might've changed till the exec point got here
                    continue

                if full_hash in files_by_full_hash:
                    duplicate = files_by_full_hash[full_hash]
                    print("Duplicate found:\n - %s\n - %s\n" % (filename, duplicate))
                else:
                    files_by_full_hash[full_hash] = filename


    if __name__ == "__main__":
        if sys.argv[1:]:
            check_for_duplicates(sys.argv[1:])
        else:
            print("Usage: %s <folder> [<folder>...]" % sys.argv[0])