import filecmp
import os
import re
from collections import defaultdict
# Filename patterns that mark a file as a "copy" of another file.
#
# Every pattern exposes two named groups:
#   base - the file name with the copy tag removed (may carry stray
#          whitespace; callers strip it)
#   ext  - the final extension, including the leading dot
#
# Order matters because the first matching pattern wins: the numbered
# "- Copy (n)" forms come before the plain "- Copy" forms, and the generic
# "(n)" suffix patterns come last.
DUP_PATTERNS = [
    # English: "Copy of name.ext", "name - Copy (2).ext", "name - Copy.ext"
    re.compile(r'^Copy of (?P<base>.+?)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy \(\d+\)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # Chinese: "name - 副本 (2).ext", "name - 副本.ext", "副本 name.ext"
    re.compile(r'^(?P<base>.+?) - 副本 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 副本(?P<ext>\.[^.]+)$'),
    re.compile(r'^副本 (?P<base>.+?)(?P<ext>\.[^.]+)$'),
    # Chinese: "name - 复制 (2).ext", "name - 复制.ext"
    re.compile(r'^(?P<base>.+?) - 复制 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 复制(?P<ext>\.[^.]+)$'),
    # Generic numeric suffix: "name (1).ext" and "name(1).ext"
    re.compile(r'^(?P<base>.+?) \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?)\(\d+\)(?P<ext>\.[^.]+)$'),
]
def parse_filename(name):
    """
    Split a file name into its base, its extension and a duplicate flag.

    The name is tried against each entry of ``DUP_PATTERNS`` in order and
    the first match decides the result. When nothing matches, the name is
    split with ``os.path.splitext`` and reported as a non-duplicate.

    Args:
        name: File name without any directory part.

    Returns:
        A ``(base, ext, is_duplicate)`` tuple. ``base`` has surrounding
        whitespace stripped; ``ext`` includes the leading dot (or is the
        empty string when ``splitext`` finds no extension).
    """
    # Generator keeps the search lazy: matching stops at the first hit.
    attempts = (pattern.match(name) for pattern in DUP_PATTERNS)
    hit = next((found for found in attempts if found), None)

    if hit is None:
        stem, suffix = os.path.splitext(name)
        return stem.strip(), suffix, False

    return hit.group('base').strip(), hit.group('ext'), True
def _has_identical_original(candidate, originals):
    """Return True if ``candidate`` is byte-for-byte equal to any of ``originals``."""
    for original in originals:
        try:
            # shallow=False forces a content comparison instead of trusting
            # os.stat() signatures.
            if filecmp.cmp(candidate, original, shallow=False):
                return True
        except OSError as e:
            print(f"Skip comparing {candidate} with {original}: {e}")
    return False


def dedup_files(root_dir):
    """
    Remove tagged duplicate files from a directory tree.

    Every file under ``root_dir`` is grouped by (base name, extension,
    size), where the base name is the file name with any copy tag removed
    by ``parse_filename``. Groups span the whole tree, so a tagged copy in
    one sub-directory can be matched with an original in another.

    Deduplication rules:
    - Files are grouped by: basename + extension + size
    - Supports macOS, Windows (English/Chinese) duplicate patterns
    - Only files with duplicate tags are deleted
    - Original files (without tags) are preserved
    - A tagged file is deleted only if its content is byte-for-byte
      identical to an untagged original in the same group; equal size
      alone is not treated as proof of duplication

    Examples:
    - a.py + a(1).png -> both kept (different extensions)
    - document.pdf + "Copy of document.pdf" -> "Copy of document.pdf" deleted
    - image.jpg + "image - 副本.jpg" -> "image - 副本.jpg" deleted

    Args:
        root_dir: Directory to scan recursively.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    # isdir rather than exists: a regular file would otherwise pass the
    # check and be "scanned" as an empty tree.
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:
                # e.g. broken symlink, or file removed while scanning
                print(f"Skip {full_path}: {e}")
                continue
            base_name, ext, is_duplicate = parse_filename(name)
            file_groups[(base_name, ext, size)].append((full_path, is_duplicate))

    deleted_count = 0
    for files in file_groups.values():
        originals = [path for path, is_dup in files if not is_dup]
        if not originals:
            # Only tagged copies exist; without an original nothing is removed.
            continue
        for path, is_dup in files:
            if not is_dup:
                continue
            if not _has_identical_original(path, originals):
                print(f"Keeping {path}: content differs from the untagged original")
                continue
            print(f"Deleting duplicate: {path}")
            try:
                os.remove(path)
            except OSError as e:
                print(f"Failed to delete {path}: {e}")
            else:
                deleted_count += 1

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")
# Usage: de-duplicate the current user's Downloads folder.
# NOTE(review): this runs whenever the module is executed or imported;
# consider moving it under an `if __name__ == "__main__":` guard.
# expanduser() leaves a path without a leading "~" untouched, so no
# explicit "~" check is needed before calling it.
target_path = os.path.expanduser("~/Downloads/")
dedup_files(target_path)