github_to_txt.py

#!/usr/bin/env python3
"""
Enhanced GitHub Repository to Text File Converter

Downloads a GitHub repository and combines all text files into a single output file.
Uses a robust "blacklist and inspect" method with additional features.
"""

import os
import sys
import shutil
import tempfile
import argparse
from pathlib import Path
from urllib.parse import urlparse
import subprocess
from datetime import datetime
import hashlib

try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: Install 'tqdm' for progress bars: pip install tqdm")

# --- Default Configuration ---

# Blacklist of extensions for files that are almost certainly not text.
BINARY_EXTENSIONS = {
    # Images
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.webp', '.tiff', '.tif', '.svg',
    '.psd', '.ai', '.sketch', '.fig', '.xd',
    # Audio/Video
    '.mp3', '.wav', '.ogg', '.flac', '.mp4', '.avi', '.mov', '.mkv', '.webm', '.aac',
    '.m4a', '.wmv', '.flv', '.mpg', '.mpeg', '.3gp',
    # Compressed archives
    '.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.iso', '.dmg', '.xz', '.tgz',
    # Fonts
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    # Documents
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
    # Binaries/Executables
    '.exe', '.dll', '.so', '.o', '.a', '.jar', '.pyc', '.class', '.com', '.app', '.deb', '.rpm',
    # Database files
    '.db', '.sqlite', '.sqlite3', '.mdb', '.accdb',
    # Other common binary formats
    '.lock', '.bin', '.dat', '.pkl', '.model', '.h5', '.joblib', '.npy', '.npz',
    # IDE/Editor specific
    '.suo', '.user', '.userosscache', '.sln.docstates',
    # Package files
    '.whl', '.egg', '.gem',
}

# Default files/directories to exclude
DEFAULT_EXCLUDE_PATTERNS = {
    '.git', '.svn', '.hg', 'node_modules', '__pycache__', '.pytest_cache',
    'venv', 'env', '.env', 'dist', 'build', 'target', '.idea', '.vscode',
    '*.pyc', '*.pyo', '*.pyd', '.DS_Store', 'Thumbs.db', '*.swp', '*.swo',
    'coverage', '.coverage', 'htmlcov', '.tox', '.nox', '.hypothesis',
    'vendor', 'bower_components', '.sass-cache', '.gradle', '.m2',
}

def format_size(bytes_size):
    """Format bytes into a human-readable size."""
    for unit in ['B', 'KB', 'MB', 'GB']:
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {unit}"
        bytes_size /= 1024.0
    return f"{bytes_size:.2f} TB"

def is_likely_text_file(filepath, sample_size=8192):
    """
    Determine if a file is likely text-based by checking for a binary extension
    and inspecting content for null bytes and decodability.
    """
    path = Path(filepath)

    # Fast check: binary extension blacklist
    if path.suffix.lower() in BINARY_EXTENSIONS:
        return False

    # Content check with a larger sample
    try:
        with open(filepath, 'rb') as f:
            chunk = f.read(sample_size)

        # Check for null bytes (strong binary indicator)
        if b'\0' in chunk:
            return False

        # Try to decode as UTF-8
        try:
            chunk.decode('utf-8')
            return True
        except UnicodeDecodeError:
            # Try other common encodings
            for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                try:
                    chunk.decode(encoding)
                    return True
                except UnicodeDecodeError:
                    continue
            return False

    except (IOError, PermissionError):
        return False

    return True
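
# A quick sketch of the decision above (hypothetical file names):
#   is_likely_text_file('logo.png')  -> False  (extension is blacklisted; content never read)
#   is_likely_text_file('README.md') -> True   (no null bytes, decodes as UTF-8)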

def should_exclude(filepath, exclude_set):
    """Check whether a file/directory should be excluded."""
    path = Path(filepath)
    parts = path.parts

    for pattern in exclude_set:
        if pattern.startswith('*'):  # Glob pattern
            if path.match(pattern):
                return True
        elif pattern in parts:  # Directory or filename
            return True

    return False
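
# Illustrative matches against DEFAULT_EXCLUDE_PATTERNS (hypothetical paths):
#   should_exclude('src/__pycache__/mod.pyc', DEFAULT_EXCLUDE_PATTERNS) -> True  ('__pycache__' in parts)
#   should_exclude('notes.swp', DEFAULT_EXCLUDE_PATTERNS)               -> True  (matches glob '*.swp')
#   should_exclude('src/main.py', DEFAULT_EXCLUDE_PATTERNS)             -> False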

def get_file_encoding(filepath):
    """Try to detect the file's encoding."""
    encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']

    for encoding in encodings:
        try:
            with open(filepath, 'r', encoding=encoding) as f:
                f.read(1024)  # Test read
            return encoding
        except (UnicodeDecodeError, UnicodeError):
            continue

    return 'utf-8'  # Fallback; callers read with errors='ignore'
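
# Note: 'latin-1' maps all 256 byte values, so detection effectively stops
# there for non-UTF-8 files; 'cp1252', 'iso-8859-1', and 'utf-16' are rarely reached.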

def clone_repository(repo_url, temp_dir, branch=None):
    """Clone the GitHub repository using git."""
    print(f"Cloning repository: {repo_url}")

    cmd = ['git', 'clone', '--depth', '1']
    if branch:
        cmd.extend(['-b', branch])
    cmd.extend([repo_url, temp_dir])

    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(f"Successfully cloned to {temp_dir}")
        return True
    except FileNotFoundError:
        print("\nERROR: git is not installed or not in your PATH.")
        print("Please install git: https://git-scm.com/downloads")
        return False
    except subprocess.CalledProcessError as e:
        print("\nERROR: Failed to clone repository.")
        if "Repository not found" in e.stderr:
            print("The repository may be private or may not exist.")
        else:
            print(f"Git error: {e.stderr}")
        return False
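
# The shallow-clone command assembled above looks like, e.g.:
#   git clone --depth 1 -b main https://github.com/owner/repo.git /tmp/github_repo_XXXX
# ('main', 'owner/repo', and the temp path are illustrative.)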

def process_repository(repo_path, files_to_process, output_file, options):
    """Process files and write them to the output file with enhanced formatting."""
    processed_files = 0
    error_files = 0

    # Track content hashes for duplicate detection
    content_hashes = {}
    duplicate_files = []

    with open(output_file, 'w', encoding='utf-8') as out:
        # Write header with metadata
        out.write("=" * 80 + "\n")
        out.write("GitHub Repository Contents\n")
        out.write(f"Repository: {repo_path.name}\n")
        out.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        out.write(f"Total files to process: {len(files_to_process)}\n")
        out.write("=" * 80 + "\n\n")

        # Write table of contents if requested
        if options.get('toc', False):
            out.write("TABLE OF CONTENTS\n")
            out.write("-" * 40 + "\n")
            for i, filepath in enumerate(files_to_process, 1):
                rel_path = filepath.relative_to(repo_path)
                out.write(f"{i:4}. {rel_path.as_posix()}\n")
            out.write("\n" + "=" * 80 + "\n\n")

        # Process files with a progress indicator
        iterator = tqdm(files_to_process, desc="Processing", unit="file", ncols=100) if HAS_TQDM else files_to_process

        for filepath in iterator:
            # Compute rel_path outside the try block so the error handler can use it
            rel_path = filepath.relative_to(repo_path)
            try:
                file_size = filepath.stat().st_size

                # Detect encoding
                encoding = get_file_encoding(filepath)

                # Read content
                with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
                    content = f.read()

                # Check for duplicates
                content_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
                if content_hash in content_hashes:
                    duplicate_files.append((rel_path, content_hashes[content_hash]))
                    if options.get('skip_duplicates', False):
                        continue
                content_hashes[content_hash] = rel_path

                # Determine file type/language for a syntax hint
                extension = filepath.suffix.lower()

                # Write file header
                out.write("#" * 80 + "\n")
                out.write(f"# File: {rel_path.as_posix()}\n")
                out.write(f"# Size: {format_size(file_size)}\n")
                out.write(f"# Encoding: {encoding}\n")
                if extension:
                    out.write(f"# Type: {extension[1:]}\n")
                out.write("#" * 80 + "\n\n")

                # Add a language hint for potential syntax highlighting
                if options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}:
                    out.write(f"```{extension[1:]}\n")

                # Write content with optional line numbers
                if options.get('line_numbers', False):
                    lines = content.splitlines()
                    width = len(str(len(lines)))
                    for i, line in enumerate(lines, 1):
                        out.write(f"{i:>{width}} | {line}\n")
                else:
                    out.write(content)
                    if not content.endswith('\n'):
                        out.write('\n')

                if options.get('markdown', False) and extension in {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs'}:
                    out.write("```\n")

                out.write("\n\n")
                processed_files += 1

            except Exception as e:
                error_files += 1
                out.write(f"ERROR: Could not read {rel_path.as_posix()}: {e}\n\n")

        # Write summary
        out.write("=" * 80 + "\n")
        out.write("PROCESSING SUMMARY\n")
        out.write("=" * 80 + "\n")
        out.write(f"Files processed successfully: {processed_files}\n")
        out.write(f"Files with errors: {error_files}\n")

        if duplicate_files:
            out.write(f"\nDuplicate files detected: {len(duplicate_files)}\n")
            for dup, original in duplicate_files[:5]:  # Show the first 5
                out.write(f"  - {dup} (duplicate of {original})\n")
            if len(duplicate_files) > 5:
                out.write(f"  ... and {len(duplicate_files) - 5} more\n")

        out.write("=" * 80 + "\n")

    return processed_files, error_files

def parse_github_url(url):
    """Parse various GitHub URL formats into a cloneable HTTPS URL."""
    url = url.strip()

    # Handle SSH URLs
    if url.startswith('git@github.com:'):
        url = url.replace('git@github.com:', 'https://github.com/')

    # Remove a .git suffix
    if url.endswith('.git'):
        url = url[:-4]

    # Handle the short format (owner/repo)
    if '/' in url and not url.startswith(('http', 'git@')):
        return f"https://github.com/{url}.git"

    # Handle full URLs
    if 'github.com' in url:
        return f"{url}.git" if not url.endswith('.git') else url

    return url
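
# Illustrative conversions (hypothetical repos):
#   'owner/repo'                    -> 'https://github.com/owner/repo.git'
#   'https://github.com/owner/repo' -> 'https://github.com/owner/repo.git'
#   'git@github.com:owner/repo.git' -> 'https://github.com/owner/repo.git'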

def main():
    parser = argparse.ArgumentParser(
        description='Download a GitHub repository and combine all text files into one output file.',
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('repo', help='GitHub repository (owner/repo or URL)')
    parser.add_argument('-o', '--output', default='repository_contents.txt',
                        help='Output file name (default: repository_contents.txt)')
    parser.add_argument('-b', '--branch', help='Specific branch to clone')
    parser.add_argument('--max-file-size', type=int, default=10,
                        help='Max file size in MB (default: 10)')
    parser.add_argument('--total-size-limit', type=int, default=100,
                        help='Warning threshold for total size in MB (default: 100)')
    parser.add_argument('--line-numbers', action='store_true',
                        help='Add line numbers to output')
    parser.add_argument('--toc', action='store_true',
                        help='Add a table of contents at the beginning')
    parser.add_argument('--markdown', action='store_true',
                        help='Add markdown code blocks for better formatting')
    parser.add_argument('--skip-duplicates', action='store_true',
                        help='Skip duplicate files (same content)')
    parser.add_argument('--exclude-dir', action='append', default=[],
                        help='Directory to exclude (repeatable)')
    parser.add_argument('--exclude-ext', action='append', default=[],
                        help='File extension to exclude (repeatable)')
    parser.add_argument('--include-only-ext', action='append', default=[],
                        help='Process ONLY these extensions (repeatable)')
    parser.add_argument('--keep-temp', action='store_true',
                        help='Keep the temporary clone after processing')
    parser.add_argument('--stats', action='store_true',
                        help='Show detailed statistics after processing')

    args = parser.parse_args()

    # Setup
    repo_url = parse_github_url(args.repo)
    max_file_size_bytes = args.max_file_size * 1024 * 1024
    total_size_limit_bytes = args.total_size_limit * 1024 * 1024

    # Build exclusion patterns
    exclude_patterns = DEFAULT_EXCLUDE_PATTERNS.copy()
    for d in args.exclude_dir:
        exclude_patterns.add(d)
    for ext in args.exclude_ext:
        exclude_patterns.add(f"*{ext if ext.startswith('.') else '.' + ext}")

    temp_dir = tempfile.mkdtemp(prefix='github_repo_')

    try:
        # Clone repository
        if not clone_repository(repo_url, temp_dir, args.branch):
            return 1

        repo_path = Path(temp_dir)

        # Scan repository
        print("\nScanning repository...")
        files_to_process = []
        total_size = 0
        skipped_count = 0
        stats = {'by_extension': {}, 'by_size': {'<1KB': 0, '1-10KB': 0, '10-100KB': 0, '100KB-1MB': 0, '>1MB': 0}}

        all_files = [f for f in repo_path.rglob('*') if f.is_file()]

        iterator = tqdm(all_files, desc="Scanning", unit="file", ncols=100) if HAS_TQDM else all_files

        for filepath in iterator:
            rel_path = filepath.relative_to(repo_path)
            file_size = filepath.stat().st_size

            # Track statistics
            ext = filepath.suffix.lower() or 'no_extension'
            stats['by_extension'][ext] = stats['by_extension'].get(ext, 0) + 1

            if file_size < 1024:
                stats['by_size']['<1KB'] += 1
            elif file_size < 10240:
                stats['by_size']['1-10KB'] += 1
            elif file_size < 102400:
                stats['by_size']['10-100KB'] += 1
            elif file_size < 1048576:
                stats['by_size']['100KB-1MB'] += 1
            else:
                stats['by_size']['>1MB'] += 1

            # Apply filters
            if should_exclude(rel_path, exclude_patterns):
                skipped_count += 1
                continue

            if file_size > max_file_size_bytes:
                skipped_count += 1
                continue

            # Include-only filter
            if args.include_only_ext:
                ext = filepath.suffix.lower()
                if not any(ext == (e if e.startswith('.') else f'.{e}') for e in args.include_only_ext):
                    skipped_count += 1
                    continue

            if not is_likely_text_file(filepath):
                skipped_count += 1
                continue

            files_to_process.append(filepath)
            total_size += file_size

        print(f"\nFound {len(all_files)} total files")
        print(f"Will process {len(files_to_process)} text files")
        print(f"Skipped {skipped_count} files (binary/excluded/oversized)")
        print(f"Total size to process: {format_size(total_size)}")

        # Show statistics if requested
        if args.stats:
            print("\n" + "=" * 40)
            print("FILE STATISTICS")
            print("=" * 40)
            print("\nTop 10 extensions by count:")
            sorted_exts = sorted(stats['by_extension'].items(), key=lambda x: x[1], reverse=True)
            for ext, count in sorted_exts[:10]:
                print(f"  {ext:15} {count:5} files")
            print("\nFile size distribution:")
            for size_range, count in stats['by_size'].items():
                print(f"  {size_range:15} {count:5} files")

        # Size warning
        if total_size > total_size_limit_bytes:
            print(f"\n⚠️  WARNING: Total size ({format_size(total_size)}) exceeds limit ({format_size(total_size_limit_bytes)})")
            if input("Continue anyway? (y/n): ").lower() != 'y':
                print("Cancelled.")
                return 1

        # Process files
        if files_to_process:
            print(f"\nWriting to {args.output}...")
            options = {
                'line_numbers': args.line_numbers,
                'toc': args.toc,
                'markdown': args.markdown,
                'skip_duplicates': args.skip_duplicates,
            }
            processed, errors = process_repository(repo_path, files_to_process, args.output, options)

            # Summary
            print(f"\n{'=' * 50}")
            print("✅ COMPLETE!")
            print(f"  Processed: {processed} files")
            print(f"  Errors: {errors} files")
            print(f"  Output: {args.output}")
            print(f"  Size: {format_size(Path(args.output).stat().st_size)}")
            print(f"{'=' * 50}")
        else:
            print("\n⚠️  No text files found to process!")

        return 0

    except KeyboardInterrupt:
        print("\n\n❌ Cancelled by user")
        return 1
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        if not args.keep_temp and os.path.exists(temp_dir):
            print("\nCleaning up...")
            shutil.rmtree(temp_dir)


if __name__ == "__main__":
    sys.exit(main())