import filecmp
import os
import re
from collections import defaultdict

# Filename shapes that mark a file as a copy of another file. Every pattern
# captures the untagged stem as the named group `base` and the final ".xyz"
# suffix as the named group `ext`. Each spec is (regex source, flags); only
# the English tags are compiled case-insensitively. The compiled list keeps
# this order, and parse_filename() stops at the first pattern that matches.
_DUP_PATTERN_SPECS = (
    # "Copy of name.ext"
    (r'^Copy of (?P<base>.+?)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # "name - Copy (2).ext"
    (r'^(?P<base>.+?) - Copy \(\d+\)(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # "name - Copy.ext"
    (r'^(?P<base>.+?) - Copy(?P<ext>\.[^.]+)$', re.IGNORECASE),
    # "name - 副本 (2).ext"
    (r'^(?P<base>.+?) - 副本 \(\d+\)(?P<ext>\.[^.]+)$', 0),
    # "name - 副本.ext"
    (r'^(?P<base>.+?) - 副本(?P<ext>\.[^.]+)$', 0),
    # "副本 name.ext"
    (r'^副本 (?P<base>.+?)(?P<ext>\.[^.]+)$', 0),
    # "name - 复制 (2).ext"
    (r'^(?P<base>.+?) - 复制 \(\d+\)(?P<ext>\.[^.]+)$', 0),
    # "name - 复制.ext"
    (r'^(?P<base>.+?) - 复制(?P<ext>\.[^.]+)$', 0),
    # "name (1).ext"
    (r'^(?P<base>.+?) \(\d+\)(?P<ext>\.[^.]+)$', 0),
    # "name(1).ext"
    (r'^(?P<base>.+?)\(\d+\)(?P<ext>\.[^.]+)$', 0),
)

DUP_PATTERNS = [re.compile(source, flags) for source, flags in _DUP_PATTERN_SPECS]

def parse_filename(name):
    """Split a file name into (base, ext, is_duplicate).

    The name is tried against each entry of DUP_PATTERNS in list order and the
    first pattern that matches decides the result: its `base` group (with
    surrounding whitespace stripped) and its `ext` group are returned together
    with True. When no pattern matches, the name is split with
    os.path.splitext() and the stripped stem and the suffix are returned
    together with False.
    """
    # The generator is lazy, so patterns after the first hit are never tried.
    hit = next(filter(None, (rx.match(name) for rx in DUP_PATTERNS)), None)
    if hit is None:
        stem, suffix = os.path.splitext(name)
        return stem.strip(), suffix, False
    return hit.group('base').strip(), hit.group('ext'), True

def dedup_files(root_dir):
    """
    Remove tagged duplicate files from a directory tree.

    Every file under root_dir is grouped by (basename, extension, size), where
    basename and extension come from parse_filename() with any duplicate tag
    already stripped. Grouping is tree-wide: files in different subdirectories
    land in the same group when their key matches.

    Deduplication rules:
    - Only files whose name carries a duplicate tag are ever deleted
    - A tagged file is deleted only if its group also holds an untagged
      original AND its content is byte-for-byte equal to one of those
      originals; a matching name and size alone is not treated as proof
    - Original files (without tags) are always preserved
    - Files that cannot be sized, compared or removed are reported and skipped

    Examples:
    - a.py + a(1).png -> both kept (different extensions)
    - document.pdf + "Copy of document.pdf" (same bytes) -> copy deleted
    - image.jpg + "image - 副本.jpg" (same size, different bytes) -> both kept

    Args:
        root_dir: Path of the directory to scan recursively. If it is not an
            existing directory an error is printed and nothing is scanned.

    Returns:
        None. Progress messages and a final summary are printed to stdout.
    """
    # isdir() rather than exists(): a regular file would pass exists(), make
    # os.walk() yield nothing, and be misreported as "no duplicates".
    if not os.path.isdir(root_dir):
        print(f"Error: Directory '{root_dir}' does not exist!")
        return

    file_groups = defaultdict(list)

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for name in filenames:
            full_path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(full_path)
            except OSError as e:
                # e.g. broken symlink, permission denied, file vanished mid-scan
                print(f"Skip {full_path}: {e}")
                continue
            base_name, ext, is_duplicate = parse_filename(name)
            file_groups[(base_name, ext, size)].append((full_path, is_duplicate))

    deleted_count = 0
    for files in file_groups.values():
        originals = [f for f, is_dup in files if not is_dup]
        if not originals:
            # Only tagged copies in this group: there is nothing to keep in
            # their place, so leave them all alone.
            continue
        for f, is_dup in files:
            if not is_dup:
                continue
            if not _has_identical_original(f, originals):
                print(f"Keeping {f}: content differs from every original")
                continue
            print(f"Deleting duplicate: {f}")
            try:
                os.remove(f)
            except OSError as e:
                print(f"Failed to delete {f}: {e}")
            else:
                deleted_count += 1

    if deleted_count > 0:
        print(f"Deduplication complete! Deleted {deleted_count} duplicate files.")
    else:
        print("No duplicate files found.")


def _has_identical_original(candidate, originals):
    """Return True if candidate is byte-for-byte equal to any path in originals.

    A pair that cannot be compared (OSError) is reported and counted as a
    non-match, so a candidate is never deleted on an unverified comparison.
    """
    for original in originals:
        try:
            # shallow=False forces a real content comparison instead of
            # trusting the os.stat() signature.
            if filecmp.cmp(candidate, original, shallow=False):
                return True
        except OSError as e:
            print(f"Skip comparing {candidate} with {original}: {e}")
    return False

# Usage: run this file as a script to clean the default download folder.
# expanduser() only rewrites a leading "~" and returns any other path
# unchanged, so it can be called unconditionally.
target_path = os.path.expanduser("~/Downloads/")

if __name__ == "__main__":
    # Guarded so that merely importing this module never deletes files.
    dedup_files(target_path)