Skip to content

Instantly share code, notes, and snippets.

@samehkamaleldin
Created July 21, 2025 20:49
Show Gist options
  • Save samehkamaleldin/28af8ab8393b7d70d4bd5eb29b6cefc0 to your computer and use it in GitHub Desktop.
Save samehkamaleldin/28af8ab8393b7d70d4bd5eb29b6cefc0 to your computer and use it in GitHub Desktop.

Revisions

  1. samehkamaleldin created this gist Jul 21, 2025.
    146 changes: 146 additions & 0 deletions group_data.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,146 @@
    #!/usr/bin/env python3

    import argparse
    import re
    import shutil
    from pathlib import Path
    from typing import Dict, List, Set

    # Default data directory, used when no directory is given on the command line.
    # Replace the placeholder with your actual data directory path.
    DATA_DIRPATH = r"PATH_TO_YOUR_DATA_DIRECTORY"


    def extract_case_id(filename: str) -> str:
        """Extract the case ID from a data file name.

        Three naming schemes are tried in order:

        1. ``<case> _ <rest>``: the ID is a run of characters containing no
           underscore or whitespace, followed by whitespace and an underscore.
        2. ``<case>_<rest>``: the ID is everything before the first underscore.
        3. Anything else: the file name without its final extension.

        Args:
            filename: Bare file name (not a full path).

        Returns:
            The extracted case ID.

        Examples:
            - 888-EAP-029_LVOT.csv -> 888-EAP-029
            - 888-EAP-029 _ CT_Baseline_CT 1.xml -> 888-EAP-029
            - 880-EAP-1 _ CT_Baseline_CT 1.xml -> 880-EAP-1
        """
        # A hyphen already belongs to [^_\s], so a separate "(?:-[^_\s]+)*"
        # repetition matches nothing new; it only makes the pattern ambiguous
        # and lets a hyphen-heavy name with no " _" separator backtrack
        # exponentially. The single character class accepts the same names.
        xml_match = re.match(r'^([^_\s]+)\s+_', filename)
        if xml_match:
            return xml_match.group(1)

        # Plain "<case>_<rest>" names (CSV and others): take the text before
        # the first underscore.
        csv_match = re.match(r'^([^_]+)_', filename)
        if csv_match:
            return csv_match.group(1)

        # Fallback: no recognizable separator, use the name without extension.
        return Path(filename).stem


    def scan_files(data_dir: Path) -> Dict[str, List[str]]:
        """Group the CSV and XML files directly inside ``data_dir`` by case ID.

        Only regular files whose suffix is ``.csv`` or ``.xml`` (compared
        case-insensitively) are considered; subdirectories are not descended into.

        Args:
            data_dir: Directory to scan.

        Returns:
            Mapping of case ID to the names of the files that belong to it, in
            the order the directory listing produced them.
        """
        wanted_suffixes = {'.csv', '.xml'}
        grouped: Dict[str, List[str]] = {}

        for entry in data_dir.iterdir():
            if not entry.is_file():
                continue
            if entry.suffix.lower() not in wanted_suffixes:
                continue
            # Start a new bucket the first time a case ID is seen.
            grouped.setdefault(extract_case_id(entry.name), []).append(entry.name)

        return grouped


    def create_case_directories(data_dir: Path, case_ids: Set[str], dry_run: bool = False) -> None:
        """Ensure a sub-directory of ``data_dir`` exists for every case ID.

        Paths that already exist are left alone and produce no output.

        Args:
            data_dir: Parent directory that receives the case directories.
            case_ids: Case IDs to create directories for.
            dry_run: When true, only report what would be created.
        """
        for case_id in case_ids:
            target_dir = data_dir / case_id

            if target_dir.exists():
                continue

            if dry_run:
                print(f"Would create directory: {target_dir}")
                continue

            target_dir.mkdir(parents=True, exist_ok=True)
            print(f"Created directory: {target_dir}")


    def move_files(data_dir: Path, case_files: Dict[str, List[str]], dry_run: bool = False) -> None:
        """Move every listed file from ``data_dir`` into its case sub-directory.

        A file is moved only when it exists in ``data_dir`` and no file of the
        same name is already present in the case directory. If the target
        already exists the file is reported as skipped; if neither source nor
        target exists the entry is ignored silently. Any exception raised while
        handling a file is reported and counted, and processing continues.

        Args:
            data_dir: Directory that holds the files and the case directories.
            case_files: Mapping of case ID to the file names belonging to it.
            dry_run: When true, only report the moves that would happen.
        """
        moved_count = 0
        error_count = 0

        for case_id, files in case_files.items():
            destination_dir = data_dir / case_id

            for filename in files:
                src = data_dir / filename
                dst = destination_dir / filename

                try:
                    # Probe the source first, then the target.
                    src_present = src.exists()
                    dst_present = dst.exists()

                    if dst_present:
                        print(f"Skipped (already exists): {filename}")
                        continue
                    if not src_present:
                        continue

                    if dry_run:
                        print(f"Would move: {filename} -> {case_id}/")
                    else:
                        shutil.move(str(src), str(dst))
                    moved_count += 1
                except Exception as e:
                    print(f"Error moving {filename}: {e}")
                    error_count += 1

        print(f"Summary: {moved_count} files {'would be ' if dry_run else ''}moved, {error_count} errors")


    def display_summary(case_files: Dict[str, List[str]]) -> None:
        """Print one line per case, followed by overall case and file totals.

        Cases are listed in sorted order of their ID; each line shows the number
        of files and the sorted set of lower-cased file extensions.

        Args:
            case_files: Mapping of case ID to the file names belonging to it.
        """
        total_files = 0

        for case_id in sorted(case_files):
            names = case_files[case_id]
            total_files += len(names)
            extensions = sorted({Path(name).suffix.lower() for name in names})
            print(f"{case_id}: {len(names)} files ({', '.join(extensions)})")

        print(f"\nTotal cases: {len(case_files)}")
        print(f"Total files: {total_files}")


    def main():
        """Group case files into directories by case ID.

        Scans the data directory (the positional command-line argument, or
        DATA_DIRPATH when it is omitted) for CSV and XML files, extracts case IDs
        from the file names, creates a directory per case, and moves the files
        into their respective case directories.

        Command-line options:
            data_dir        Data directory path (optional).
            --dry-run, -n   Show what would be done without doing it.
            --verbose, -v   Print the per-case summary before acting.

        Examples:
            - 888-EAP-029_LVOT.csv -> 888-EAP-029/888-EAP-029_LVOT.csv
            - 888-EAP-029 _ CT_Baseline_CT 1.xml -> 888-EAP-029/888-EAP-029 _ CT_Baseline_CT 1.xml

        Raises:
            SystemExit: On invalid arguments, or when the data directory does
                not exist or is not a directory.
        """
        parser = argparse.ArgumentParser(description="Group case files into directories by case ID")
        parser.add_argument('data_dir', nargs='?', default=DATA_DIRPATH, help='Data directory path')
        parser.add_argument('--dry-run', '-n', action='store_true', help='Show what would be done without actually doing it')
        parser.add_argument('--verbose', '-v', action='store_true', help='Enable verbose output')

        args = parser.parse_args()
        data_dir = Path(args.data_dir)

        # Fail with a usage error instead of an uncaught traceback from the
        # directory scan when the path is missing, is a file, or is still the
        # unedited placeholder default.
        if not data_dir.is_dir():
            parser.error(f"Data directory does not exist or is not a directory: {data_dir}")

        print(f"Scanning directory: {data_dir}")

        if args.dry_run:
            print("DRY RUN MODE - No files will be moved")

        # Scan files and group by case ID
        case_files = scan_files(data_dir)

        if not case_files:
            print("No CSV or XML files found in the directory")
            return

        # Display summary
        if args.verbose or args.dry_run:
            display_summary(case_files)

        # Create directories
        print(f"\nCreating directories for {len(case_files)} cases...")
        create_case_directories(data_dir, set(case_files.keys()), args.dry_run)

        # Move files
        move_files(data_dir, case_files, args.dry_run)

        # NOTE(review): move_files reports per-file errors itself and returns
        # nothing, so this closing message is printed even if some moves failed.
        if not args.dry_run:
            print("Grouping completed successfully!")
        else:
            print("Dry run completed. Use without --dry-run to perform actual operations.")


    # Run the command-line entry point only when executed as a script,
    # not when this module is imported.
    if __name__ == '__main__':
        main()