github_to_txt.py

#!/usr/bin/env python3
"""
Enhanced GitHub Repository to Text File Converter

Downloads a GitHub repository and combines all text files into a single output file.
Uses a robust "blacklist and inspect" method with additional features.
"""

import os
import sys
import shutil
import tempfile
import argparse
from pathlib import Path
from urllib.parse import urlparse
import subprocess
from datetime import datetime
import hashlib

try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: Install 'tqdm' for progress bars: pip install tqdm")

# --- Default Configuration ---

# Blacklist of extensions for files that are almost certainly not text.
BINARY_EXTENSIONS = {
    # Images
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', '.tiff', '.tif', '.svg',
    '.psd', '.ai', '.sketch', '.fig', '.xd',
    # Audio/Video
    '.mp3', '.wav', '.ogg', '.flac', '.mp4', '.avi', '.mov', '.mkv', '.webm', '.aac',
    '.m4a', '.wmv', '.flv', '.mpg', '.mpeg', '.3gp',
    # Compressed archives
    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.iso', '.dmg', '.xz', '.tgz',
    # Fonts
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
    # Binaries/Executables
    '.exe', '.dll', '.so', '.o', '.a', '.jar', '.pyc', '.class', '.com', '.app', '.deb', '.rpm',
    # Database files
    '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb',
    # Other common binary formats
    '.lock', '.bin', '.dat', '.pkl', '.model', '.h5', '.joblib', '.npy', '.npz',
    # IDE/Editor specific
    '.suo', '.user', '.userosscache', '.sln.docstates',
    # Package files
    '.whl', '.egg', '.gem',
}

# Default files/directories to exclude
DEFAULT_EXCLUDE_PATTERNS = {
    '.git', '.svn', '.hg', 'node_modules', '__pycache__', '.pytest_cache',
    'venv', 'env', '.env', 'dist', 'build', 'target', '.idea', '.vscode',
    '*.pyc', '*.pyo', '*.pyd', '.DS_Store', 'Thumbs.db', '*.swp', '*.swo',
    'coverage', '.coverage', 'htmlcov', '.tox', '.nox', '.hypothesis',
    'vendor', 'bower_components', '.sass-cache', '.gradle', '.m2',
}

def format_size(bytes_size):
    """Format bytes into a human-readable size."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.2f} TB"

def is_likely_text_file(filepath, sample_size=8192):
    """
    Determine if a file is likely text-based by checking for a binary extension
    and inspecting content for null bytes and decodability.
    """
    path = Path(filepath)

    # Fast check: binary extension blacklist
    if path.suffix.lower() in BINARY_EXTENSIONS:
        return False

    # Content check with a larger sample
    try:
        with open(filepath, 'rb') as f:
            chunk = f.read(sample_size)

        # Check for null bytes (strong binary indicator)
        if b'\0' in chunk:
            return False

        # Try to decode as UTF-8
        try:
            chunk.decode('utf-8')
            return True
        except UnicodeDecodeError:
            # Try other common encodings
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    chunk.decode(encoding)
                    return True
                except UnicodeDecodeError:
                    continue
            return False

    except (IOError, PermissionError):
        return False

    return True
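
# A quick sketch of the decision above (hypothetical file names):
#   is_likely_text_file('logo.png')  -> False  (extension is blacklisted; content never read)
#   is_likely_text_file('README.md') -> True   (no null bytes, decodes as UTF-8)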

def should_exclude(filepath, exclude_set):
    """Check whether a file/directory should be excluded."""
    path = Path(filepath)
    parts = path.parts

    for pattern in exclude_set:
        if pattern.startswith('*'):  # Glob pattern
            if path.match(pattern):
                return True
        elif pattern in parts:  # Directory or filename
            return True

    return False
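
# Illustrative matches against DEFAULT_EXCLUDE_PATTERNS (hypothetical paths):
#   should_exclude('src/__pycache__/mod.pyc', DEFAULT_EXCLUDE_PATTERNS) -> True  ('__pycache__' in parts)
#   should_exclude('notes.swp', DEFAULT_EXCLUDE_PATTERNS)               -> True  (matches glob '*.swp')
#   should_exclude('src/main.py', DEFAULT_EXCLUDE_PATTERNS)             -> False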

def get_file_encoding(filepath):
    """Try to detect the file's encoding."""
    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']

    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                f.read(1024)  # Test read
            return encoding
        except (UnicodeDecodeError, UnicodeError):
            continue

    return 'utf-8'  # Fallback; callers read with errors='ignore'
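
# Note: 'latin-1' maps all 256 byte values, so detection effectively stops
# there for non-UTF-8 files; 'cp1252', 'iso-8859-1', and 'utf-16' are rarely reached.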

def clone_repository(repo_url, temp_dir, branch=None):
    """Clone the GitHub repository using git."""
    print(f"Cloning repository: {repo_url}")

    cmd = ['git', 'clone', '--depth', '1']
    if branch:
        cmd.extend(['-b', branch])
    cmd.extend([repo_url, temp_dir])

    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(f"Successfully cloned to {temp_dir}")
        return True
    except FileNotFoundError:
        print("\nERROR: git is not installed or not in your PATH.")
        print("Please install git: https://git-scm.com/downloads")
        return False
    except subprocess.CalledProcessError as e:
        print("\nERROR: Failed to clone repository.")
        if "Repository not found" in e.stderr:
            print("The repository may be private or may not exist.")
        else:
            print(f"Git error: {e.stderr}")
        return False
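
# The shallow-clone command assembled above looks like, e.g.:
#   git clone --depth 1 -b main https://github.com/owner/repo.git /tmp/github_repo_XXXX
# ('main', 'owner/repo', and the temp path are illustrative.)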

def process_repository(repo_path, files_to_process, output_file, options):
    """Process files and write them to the output file with enhanced formatting."""
    processed_files = 0
    error_files = 0

    # Track content hashes for duplicate detection
    content_hashes = {}
    duplicate_files = []

    with open(output_file, 'w', encoding='utf-8') as out:
        # Write header with metadata
        out.write("=" * 80 + "\n")
        out.write("GitHub Repository Contents\n")
        out.write(f"Repository: {repo_path.name}\n")
        out.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        out.write(f"Total files to process: {len(files_to_process)}\n")
        out.write("=" * 80 + "\n\n")

        # Write table of contents if requested
        if options.get('toc', False):
            out.write("TABLE OF CONTENTS\n")
            out.write("-" * 40 + "\n")
            for i, filepath in enumerate(files_to_process, 1):
                rel_path = filepath.relative_to(repo_path)
                out.write(f"{i:4}. {rel_path.as_posix()}\n")
            out.write("\n" + "=" * 80 + "\n\n")

        # Process files with a progress indicator
        iterator = tqdm(files_to_process, desc="Processing", unit="file", ncols=100) if HAS_TQDM else files_to_process

        for filepath in iterator:
            # Compute rel_path outside the try block so the error handler can use it
            rel_path = filepath.relative_to(repo_path)
            try:
                file_size = filepath.stat().st_size

                # Detect encoding
                encoding = get_file_encoding(filepath)

                # Read content
                with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
                    content = f.read()

                # Check for duplicates
                content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
                if content_hash in content_hashes:
                    duplicate_files.append((rel_path, content_hashes[content_hash]))
                    if options.get('skip_duplicates', False):
                        continue
                content_hashes[content_hash] = rel_path

                # Determine file type/language for a syntax hint
                extension = filepath.suffix.lower()

                # Write file header
                out.write("#" * 80 + "\n")
                out.write(f"# File: {rel_path.as_posix()}\n")
                out.write(f"# Size: {format_size(file_size)}\n")
                out.write(f"# Encoding: {encoding}\n")
                if extension:
                    out.write(f"# Type: {extension[1:]}\n")
                out.write("#" * 80 + "\n\n")

                # Add a language hint for potential syntax highlighting
                if options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}:
                    out.write(f"```{extension[1:]}\n")

                # Write content with optional line numbers
                if options.get('line_numbers', False):
                    lines = content.splitlines()
                    width = len(str(len(lines)))
                    for i, line in enumerate(lines, 1):
                        out.write(f"{i:>{width}} | {line}\n")
                else:
                    out.write(content)
                    if not content.endswith('\n'):
                        out.write('\n')

                if options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}:
                    out.write("```\n")

                out.write("\n\n")
                processed_files += 1

            except Exception as e:
                error_files += 1
                out.write(f"ERROR: Could not read {rel_path.as_posix()}: {e}\n\n")

        # Write summary
        out.write("=" * 80 + "\n")
        out.write("PROCESSING SUMMARY\n")
        out.write("=" * 80 + "\n")
        out.write(f"Files processed successfully: {processed_files}\n")
        out.write(f"Files with errors: {error_files}\n")

        if duplicate_files:
            out.write(f"\nDuplicate files detected: {len(duplicate_files)}\n")
            for dup, original in duplicate_files[:5]:  # Show the first 5
                out.write(f"  - {dup} (duplicate of {original})\n")
            if len(duplicate_files) > 5:
                out.write(f"  ... and {len(duplicate_files) - 5} more\n")

        out.write("=" * 80 + "\n")

    return processed_files, error_files

def parse_github_url(url):
    """Parse various GitHub URL formats into a cloneable HTTPS URL."""
    url = url.strip()

    # Handle SSH URLs
    if url.startswith('git@github.com:'):
        url = url.replace('git@github.com:', 'https://github.com/')

    # Remove a .git suffix
    if url.endswith('.git'):
        url = url[:-4]

    # Handle the short format (owner/repo)
    if '/' in url and not url.startswith(('http', 'git@')):
        return f"https://github.com/{url}.git"

    # Handle full URLs
    if 'github.com' in url:
        return f"{url}.git" if not url.endswith('.git') else url

    return url
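
# Illustrative conversions (hypothetical repos):
#   'owner/repo'                    -> 'https://github.com/owner/repo.git'
#   'https://github.com/owner/repo' -> 'https://github.com/owner/repo.git'
#   'git@github.com:owner/repo.git' -> 'https://github.com/owner/repo.git'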

def main():
    parser = argparse.ArgumentParser(
        description='Download a GitHub repository and combine all text files into one output file.',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('repo', help='GitHub repository (owner/repo or URL)')
    parser.add_argument('-o', '--output', default='repository_contents.txt',
                        help='Output file name (default: repository_contents.txt)')
    parser.add_argument('-b', '--branch', help='Specific branch to clone')
    parser.add_argument('--max-file-size', type=int, default=10,
                        help='Max file size in MB (default: 10)')
    parser.add_argument('--total-size-limit', type=int, default=100,
                        help='Warning threshold for total size in MB (default: 100)')
    parser.add_argument('--line-numbers', action='store_true',
                        help='Add line numbers to output')
    parser.add_argument('--toc', action='store_true',
                        help='Add a table of contents at the beginning')
    parser.add_argument('--markdown', action='store_true',
                        help='Add markdown code blocks for better formatting')
    parser.add_argument('--skip-duplicates', action='store_true',
                        help='Skip duplicate files (same content)')
    parser.add_argument('--exclude-dir', action='append', default=[],
                        help='Directory to exclude (repeatable)')
    parser.add_argument('--exclude-ext', action='append', default=[],
                        help='File extension to exclude (repeatable)')
    parser.add_argument('--include-only-ext', action='append', default=[],
                        help='Process ONLY these extensions (repeatable)')
    parser.add_argument('--keep-temp', action='store_true',
                        help='Keep the temporary clone after processing')
    parser.add_argument('--stats', action='store_true',
                        help='Show detailed statistics after processing')

    args = parser.parse_args()

    # Setup
    repo_url = parse_github_url(args.repo)
    max_file_size_bytes = args.max_file_size * 1024 * 1024
    total_size_limit_bytes = args.total_size_limit * 1024 * 1024

    # Build exclusion patterns
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS.copy()
    for d in args.exclude_dir:
        exclude_patterns.add(d)
    for ext in args.exclude_ext:
        exclude_patterns.add(f"*{ext if ext.startswith('.') else '.' + ext}")

    temp_dir = tempfile.mkdtemp(prefix='github_repo_')

    try:
        # Clone repository
        if not clone_repository(repo_url, temp_dir, args.branch):
            return 1

        repo_path = Path(temp_dir)

        # Scan repository
        print("\nScanning repository...")
        files_to_process = []
        total_size = 0
        skipped_count = 0
        stats = {'by_extension': {}, 'by_size': {'<1KB': 0, '1-10KB': 0, '10-100KB': 0, '100KB-1MB': 0, '>1MB': 0}}

        all_files = [f for f in repo_path.rglob('*') if f.is_file()]

        iterator = tqdm(all_files, desc="Scanning", unit="file", ncols=100) if HAS_TQDM else all_files

        for filepath in iterator:
            rel_path = filepath.relative_to(repo_path)
            file_size = filepath.stat().st_size

            # Track statistics
            ext = filepath.suffix.lower() or 'no_extension'
            stats['by_extension'][ext] = stats['by_extension'].get(ext, 0) + 1

            if file_size < 1024:
                stats['by_size']['<1KB'] += 1
            elif file_size < 10240:
                stats['by_size']['1-10KB'] += 1
            elif file_size < 102400:
                stats['by_size']['10-100KB'] += 1
            elif file_size < 1048576:
                stats['by_size']['100KB-1MB'] += 1
            else:
                stats['by_size']['>1MB'] += 1

            # Apply filters
            if should_exclude(rel_path, exclude_patterns):
                skipped_count += 1
                continue

            if file_size > max_file_size_bytes:
                skipped_count += 1
                continue

            # Include-only filter
            if args.include_only_ext:
                ext = filepath.suffix.lower()
                if not any(ext == (e if e.startswith('.') else f'.{e}') for e in args.include_only_ext):
                    skipped_count += 1
                    continue

            if not is_likely_text_file(filepath):
                skipped_count += 1
                continue

            files_to_process.append(filepath)
            total_size += file_size

        print(f"\nFound {len(all_files)} total files")
        print(f"Will process {len(files_to_process)} text files")
        print(f"Skipped {skipped_count} files (binary/excluded/oversized)")
        print(f"Total size to process: {format_size(total_size)}")

        # Show statistics if requested
        if args.stats:
            print("\n" + "=" * 40)
            print("FILE STATISTICS")
            print("=" * 40)
            print("\nTop 10 extensions by count:")
            sorted_exts = sorted(stats['by_extension'].items(), key=lambda x: x[1], reverse=True)
            for ext, count in sorted_exts[:10]:
                print(f"  {ext:15} {count:5} files")
            print("\nFile size distribution:")
            for size_range, count in stats['by_size'].items():
                print(f"  {size_range:15} {count:5} files")

        # Size warning
        if total_size > total_size_limit_bytes:
            print(f"\n⚠️  WARNING: Total size ({format_size(total_size)}) exceeds limit ({format_size(total_size_limit_bytes)})")
            if input("Continue anyway? (y/n): ").lower() != 'y':
                print("Cancelled.")
                return 1

        # Process files
        if files_to_process:
            print(f"\nWriting to {args.output}...")
            options = {
                'line_numbers': args.line_numbers,
                'toc': args.toc,
                'markdown': args.markdown,
                'skip_duplicates': args.skip_duplicates,
            }
            processed, errors = process_repository(repo_path, files_to_process, args.output, options)

            # Summary
            print(f"\n{'=' * 50}")
            print("✅ COMPLETE!")
            print(f"  Processed: {processed} files")
            print(f"  Errors: {errors} files")
            print(f"  Output: {args.output}")
            print(f"  Size: {format_size(Path(args.output).stat().st_size)}")
            print(f"{'=' * 50}")
        else:
            print("\n⚠️  No text files found to process!")

        return 0

    except KeyboardInterrupt:
        print("\n\n❌ Cancelled by user")
        return 1
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        if not args.keep_temp and os.path.exists(temp_dir):
            print("\nCleaning up...")
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    sys.exit(main())