"""Delete tagged duplicate files ("Copy of x", "x - Copy", "x (1)", ...) from a directory tree."""

import os
import re
from collections import defaultdict

# Duplicate-name patterns. Each one captures the untagged stem as ``base`` and
# the final ".ext" component as ``ext``; parse_filename() reads both groups.
# Order matters: the first matching pattern wins, so the explicit
# "- Copy (N)" / "- 副本 (N)" forms must come before the generic "(N)" forms,
# which would otherwise leave " - Copy" inside the captured base.
DUP_PATTERNS = [
    # English: "Copy of x.ext", "x - Copy (2).ext", "x - Copy.ext"
    re.compile(r'^Copy of (?P<base>.+?)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy \(\d+\)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    re.compile(r'^(?P<base>.+?) - Copy(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # Chinese: 副本 and 复制 both mean "copy"
    re.compile(r'^(?P<base>.+?) - 副本 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 副本(?P<ext>\.[^.]+)$'),
    re.compile(r'^副本 (?P<base>.+?)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 复制 \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?) - 复制(?P<ext>\.[^.]+)$'),
    # Trailing counter: "x (1).ext" and "x(1).ext"
    re.compile(r'^(?P<base>.+?) \(\d+\)(?P<ext>\.[^.]+)$'),
    re.compile(r'^(?P<base>.+?)\(\d+\)(?P<ext>\.[^.]+)$'),
]


def parse_filename(name):
    """Split a file name into ``(base, ext, is_duplicate)``.

    The name is tried against DUP_PATTERNS in order. On the first match the
    duplicate tag is dropped and ``is_duplicate`` is True. Only one tag is
    removed, so "Copy of a (1).txt" yields the base "a (1)".

    Names that match no pattern (including every name without an extension,
    since all patterns require one) are split with ``os.path.splitext`` and
    reported with ``is_duplicate`` False.

    The base is whitespace-stripped in both cases; ``ext`` keeps its leading
    dot and its original case.
    """
    for pat in DUP_PATTERNS:
        m = pat.match(name)
        if m:
            return m.group('base').strip(), m.group('ext'), True
    base, ext = os.path.splitext(name)
    return base.strip(), ext, False


def dedup_files(root_dir):
    """Remove tagged duplicate files from a directory tree.

    Deduplication rules:
    - Files are grouped by: basename (tag removed) + extension + size.
      Groups span the whole tree, so an original in one sub-directory
      covers a tagged copy in another.
    - Recognised tags: "Copy of x", "x - Copy", "x - Copy (N)", the Chinese
      forms "x - 副本", "x - 副本 (N)", "副本 x", "x - 复制", "x - 复制 (N)",
      and a trailing "(N)" counter.
    - Only files with a duplicate tag are deleted, and only when the same
      group also contains at least one untagged file.
    - Original files (without tags) are always preserved.

    NOTE: file contents are not compared; a matching name and byte size is
    treated as sufficient evidence of a duplicate.

    Examples:
    - a.py + a(1).png -> both kept (different extensions)
    - document.pdf + "Copy of document.pdf" -> "Copy of document.pdf" deleted
    - image.jpg + "image - 副本.jpg" -> "image - 副本.jpg" deleted

    Progress and errors are reported with ``print``; nothing is returned.
    Files whose size cannot be read, or that cannot be removed, are
    reported and skipped.
    """
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    # (base, ext, size) -> [(full_path, is_duplicate), ...]
    file_groups = defaultdict(list)
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:
                # e.g. broken symlink or a file removed while walking
                print(f"Skip {full_path}: {e}")
                continue
            base_name, ext, is_duplicate = parse_filename(name)
            file_groups[(base_name, ext, size)].append((full_path, is_duplicate))

    deleted_count = 0
    for files in file_groups.values():
        # Without an untagged original in the group, deleting would leave no
        # copy behind, so the whole group is kept.
        if all(is_dup for _, is_dup in files):
            continue
        for path, is_dup in files:
            if not is_dup:
                continue
            print(f"Deleting duplicate: {path}")
            try:
                os.remove(path)
            except OSError as e:
                print(f"Failed to delete {path}: {e}")
            else:
                deleted_count += 1

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")


# Usage
if __name__ == "__main__":
    # Guarded so that importing this module never deletes anything.
    target_path = os.path.expanduser("~/Downloads/")
    dedup_files(target_path)