Last active
May 12, 2025 21:11
-
-
Save Paradiddle131/f3bf9cf0949d60d9d377949fc4484f35 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# --- Runtime options (overridden by command-line flags) ---
OUTPUT_FILE=""
VERBOSE=false
DRY_RUN=false
ESTIMATE_TOKENS=true

# --- Paths and filters collected from the command line ---
INPUT_PATHS=()
EXPLICIT_EXCLUDE_PATHS=()   # --exclude <path>
INCLUDE_PATTERNS=()         # --include-pattern <glob>
EXCLUDE_SHELL_PATTERNS=()   # --exclude-pattern <glob>
INCLUDE_REGEX_PATTERNS=()   # --include-regex <regex>
EXCLUDE_REGEX_PATTERNS=()   # --exclude-regex <regex>

# --- Built-in excludes, applied before any glob/regex filtering ---
# Directory names that are never searched.
EXCLUDE_DIRS_PATTERN=(
  ".git"
  "node_modules"
  "dist"
  "build"
  ".venv"
  "venv"
  "env"
  "bin"
  "obj"
  "__pycache__"
  ".pytest_cache"
  ".mypy_cache"
  ".ruff_cache"
  ".idea"
  ".vscode"
  "target"
  "out"
  "logs"
)

# File-name globs that are never ingested.
EXCLUDE_FILES_PATTERN=(
  "*.pyc"
  "*.pyo"
  "*.log"
  "*.swp"
  "*.swo"
  "*.lock"
  "poetry.lock"
  "package-lock.json"
  "yarn.lock"
  "Pipfile.lock"
  "uv.lock"
  "*.DS_Store"
  "Thumbs.db"
  "*.class"
  "*.jar"
  "*.war"
  "*.ear"
)
#######################################
# Print a diagnostic line to stderr when verbose mode is enabled.
# Globals:   VERBOSE (read) - message is emitted only when it equals "true"
# Arguments: message words; they are joined with single spaces
# Outputs:   "[VERBOSE] <message>" on stderr
#######################################
log_verbose() {
  if [[ "$VERBOSE" == true ]]; then
    # Join the arguments into ONE word. Embedding "$@" inside a larger string
    # mixes string and array expansion (ShellCheck SC2145).
    local IFS=' '
    printf '[VERBOSE] %s\n' "$*" >&2
  fi
}
#######################################
# Map a file name to the language tag used on its Markdown code fence.
# Arguments: $1 - file name or path
# Outputs:   the language tag on stdout, or an empty line when the extension
#            is unknown or the file has no extension
#######################################
get_lang_hint() {
  local filename="$1"
  local base="${filename##*/}"

  # Without a dot, "${base##*.}" would yield the whole name, so a file that
  # is literally called "sh" or "go" would be tagged as that language.
  if [[ "$base" != *.* ]]; then
    echo ""
    return 0
  fi

  local ext="${base##*.}"
  case "$ext" in
    py) echo "python" ;;
    js) echo "javascript" ;;
    ts) echo "typescript" ;;
    java) echo "java" ;;
    c) echo "c" ;;
    cpp | cxx | h | hpp) echo "cpp" ;;
    cs) echo "csharp" ;;
    go) echo "go" ;;
    rb) echo "ruby" ;;
    php) echo "php" ;;
    html | htm) echo "html" ;;
    css) echo "css" ;;
    scss | sass) echo "scss" ;;
    sh | bash) echo "bash" ;;
    zsh) echo "zsh" ;;
    sql) echo "sql" ;;
    md | markdown) echo "markdown" ;;
    json) echo "json" ;;
    yaml | yml) echo "yaml" ;;
    xml) echo "xml" ;;
    *) echo "" ;;
  esac
}
#######################################
# Append one file to the output as a header plus a fenced code block.
# Arguments: $1 - path of the file to read
#            $2 - path shown in the "FILE:" header
#            $3 - output file to append to
# Outputs:   appends to $3; warnings go to stderr
# Returns:   0 when the file was written; 1 when it was skipped (unreadable,
#            or it is the output file itself) or when writing/reading failed
#######################################
process_file() {
  local filepath="$1"
  local relative_path="$2"
  local output_target="$3"

  if [[ ! -f "$filepath" || ! -r "$filepath" ]]; then
    echo "Warning: Cannot read file: $filepath. Skipping." >&2
    return 1
  fi

  # The output file is created before discovery runs, so scanning its
  # directory finds it. Appending a file to itself cannot work, so skip it
  # before any header is written.
  if [[ "$filepath" -ef "$output_target" ]]; then
    log_verbose "Skipping output file itself: $filepath"
    return 1
  fi

  log_verbose "Processing file: $filepath (as $relative_path)"

  # Declared separately so `local` does not mask the substitution's status.
  local lang_hint
  lang_hint=$(get_lang_hint "$filepath")

  local exit_code=0
  {
    printf '%s\n' "================================================"
    printf 'FILE: %s\n' "$relative_path"
    printf '%s\n' "================================================"
    printf '```%s\n' "$lang_hint"
    cat -- "$filepath" || exit_code=$?
    # The closing fence is written even if cat failed, so the output stays
    # well-formed for the files that follow.
    printf '```\n\n'
  } >> "$output_target" || exit_code=1

  if [[ "$exit_code" -ne 0 ]]; then
    echo "Warning: Error reading file content: $filepath" >&2
    return 1
  fi
  return 0
}
#######################################
# Test a string against a list of shell glob patterns.
# Arguments: $1   - string to test
#            $2.. - glob patterns
# Returns:   0 if any pattern matches, 1 otherwise (including no patterns)
#######################################
is_match_by_pattern() {
  local name_to_check="$1"
  shift

  # Keep the loop variable local so callers' variables are not clobbered.
  local pattern
  for pattern in "$@"; do
    # The right-hand side is deliberately unquoted: it must act as a glob.
    if [[ "$name_to_check" == $pattern ]]; then
      return 0
    fi
  done
  return 1
}
#######################################
# Test a string (typically a path) against a list of regular expressions.
# Arguments: $1   - string to test
#            $2.. - regular expressions, as accepted by bash's =~ operator
# Returns:   0 if any expression matches, 1 otherwise (including no patterns)
#######################################
is_match_by_regex() {
  local string_to_check="$1"
  shift

  # Keep the loop variable local so callers' variables are not clobbered.
  local regex
  for regex in "$@"; do
    # The right-hand side is deliberately unquoted: quoting would force a
    # literal string comparison instead of a regex match.
    if [[ "$string_to_check" =~ $regex ]]; then
      return 0
    fi
  done
  return 1
}
#######################################
# Decide whether a path was excluded via --exclude.
# Globals:   EXPLICIT_EXCLUDE_PATHS (read) - normalised paths to exclude
# Arguments: $1 - candidate path (file or directory)
# Returns:   0 if the candidate equals an excluded path or lies inside an
#            excluded directory, 1 otherwise
#######################################
is_explicitly_excluded() {
  local candidate_path="$1"

  # This runs once per discovered file; without any --exclude entries there
  # is nothing to compare, so avoid forking realpath at all.
  if (( ${#EXPLICIT_EXCLUDE_PATHS[@]} == 0 )); then
    return 1
  fi

  # Normalise the same way the --exclude values were normalised at parse
  # time; fall back to the raw path if realpath is unavailable or fails.
  local abs_candidate_path
  abs_candidate_path=$(realpath -sm -- "$candidate_path" 2>/dev/null || echo "$candidate_path")

  local excluded_path_entry
  for excluded_path_entry in "${EXPLICIT_EXCLUDE_PATHS[@]}"; do
    if [[ "$abs_candidate_path" == "$excluded_path_entry" ]]; then
      log_verbose "Explicitly excluding '$candidate_path' (abs: '$abs_candidate_path') due to --exclude exact match with '$excluded_path_entry'"
      return 0
    fi
    # The trailing "/" stops "/a/sub" from also excluding "/a/subx".
    if [[ -d "$excluded_path_entry" && "$abs_candidate_path" == "$excluded_path_entry/"* ]]; then
      log_verbose "Explicitly excluding '$candidate_path' (abs: '$abs_candidate_path') because it's inside --exclude directory '$excluded_path_entry'"
      return 0
    fi
  done
  return 1
}
#######################################
# List the files under a directory that survive the default excludes and the
# user's glob or regex filters.
# Globals:   EXCLUDE_DIRS_PATTERN, EXCLUDE_FILES_PATTERN (read) - default prunes
#            INCLUDE_REGEX_PATTERNS, EXCLUDE_REGEX_PATTERNS (read)
#            INCLUDE_PATTERNS, EXCLUDE_SHELL_PATTERNS (read)
# Arguments: $1 - directory to search
# Outputs:   NUL-terminated paths on stdout, relative to $1 (find's %P)
# Notes:     Relies on find's -printf, -regextype and -not primaries.
#            If any regex filter is set, glob filters are ignored.
#            -regex and -path are matched by find against the whole path,
#            which starts with $1.
#######################################
run_find_in_dir() {
  local search_path="$1"
  local find_cmd_args=("$search_path")
  local p regex

  # --- Initial Pruning (always applied) ---
  # Builds:  ( ( -name d1 -o ... ) -type d -o ( -name f1 -o ... ) -type f ) -prune -o
  # The outer parentheses matter: -a binds tighter than -o, so without them
  # -prune would attach only to the file branch and excluded directories
  # would still be descended into.
  local prune_expr=()
  if (( ${#EXCLUDE_DIRS_PATTERN[@]} > 0 )); then
    prune_expr+=("(")
    for p in "${EXCLUDE_DIRS_PATTERN[@]}"; do
      prune_expr+=("-name" "$p" "-o")
    done
    prune_expr[${#prune_expr[@]}-1]=")"   # overwrite the trailing -o
    prune_expr+=("-type" "d")
  fi
  if (( ${#EXCLUDE_FILES_PATTERN[@]} > 0 )); then
    if (( ${#prune_expr[@]} > 0 )); then
      prune_expr+=("-o")
    fi
    prune_expr+=("(")
    for p in "${EXCLUDE_FILES_PATTERN[@]}"; do
      prune_expr+=("-name" "$p" "-o")
    done
    prune_expr[${#prune_expr[@]}-1]=")"
    prune_expr+=("-type" "f")
  fi
  if (( ${#prune_expr[@]} > 0 )); then
    find_cmd_args+=("(" "${prune_expr[@]}" ")" "-prune" "-o")
  fi

  # --- Main Filtering (Glob or Regex based) ---
  # The groups below are joined by find's implicit AND. An explicit -and must
  # not be emitted here: directly after the "-o" above it would be a binary
  # operator with no left operand.
  if (( ${#INCLUDE_REGEX_PATTERNS[@]} > 0 || ${#EXCLUDE_REGEX_PATTERNS[@]} > 0 )); then
    log_verbose "Using REGEX filtering for find."
    find_cmd_args+=("-regextype" "posix-extended")

    # Include Regexes
    if (( ${#INCLUDE_REGEX_PATTERNS[@]} > 0 )); then
      find_cmd_args+=("(")
      for regex in "${INCLUDE_REGEX_PATTERNS[@]}"; do
        find_cmd_args+=("-regex" "$regex" "-o")
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi

    # Exclude Regexes
    if (( ${#EXCLUDE_REGEX_PATTERNS[@]} > 0 )); then
      find_cmd_args+=("-not" "(")
      for regex in "${EXCLUDE_REGEX_PATTERNS[@]}"; do
        find_cmd_args+=("-regex" "$regex" "-o")
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi
  else
    log_verbose "Using GLOB filtering for find."

    # Include Patterns (Globs): patterns containing "/" match the path,
    # others match the file name only.
    if (( ${#INCLUDE_PATTERNS[@]} > 0 )); then
      find_cmd_args+=("(")
      for p in "${INCLUDE_PATTERNS[@]}"; do
        if [[ "$p" == *"/"* ]]; then
          find_cmd_args+=("-path" "$p" "-o")
        else
          find_cmd_args+=("-name" "$p" "-o")
        fi
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi

    # Exclude Patterns (Globs)
    if (( ${#EXCLUDE_SHELL_PATTERNS[@]} > 0 )); then
      find_cmd_args+=("-not" "(")
      for p in "${EXCLUDE_SHELL_PATTERNS[@]}"; do
        if [[ "$p" == *"/"* ]]; then
          find_cmd_args+=("-path" "$p" "-o")
        else
          find_cmd_args+=("-name" "$p" "-o")
        fi
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi
  fi

  find_cmd_args+=("-type" "f" "-printf" "%P\0")
  log_verbose "Executing find: find ${find_cmd_args[*]}"
  # find's own diagnostics (e.g. unreadable directories) are discarded.
  find "${find_cmd_args[@]}" 2>/dev/null
}
#######################################
# Print a rough token estimate (bytes / 4) for a byte count.
# Arguments: $1 - total number of bytes (surrounding whitespace is ignored)
#            $2 - label for what was measured; "processed files" if empty
# Outputs:   one summary line on stdout; nothing when the count is zero or
#            not a number (a verbose note is logged instead)
#######################################
calculate_and_print_token_estimate() {
  local total_bytes="$1"
  local output_target="$2"

  # Callers pass raw `wc -c` output, which some implementations pad with
  # leading spaces; strip whitespace so the digits-only check still passes.
  total_bytes="${total_bytes//[[:space:]]/}"

  if [[ ! "$total_bytes" =~ ^[0-9]+$ ]] || (( 10#$total_bytes == 0 )); then
    log_verbose "No bytes processed, cannot estimate tokens."
    return
  fi
  # Force base 10 so a value with leading zeros is not read as octal.
  total_bytes=$(( 10#$total_bytes ))

  local estimated_tokens=$(( total_bytes / 4 ))
  local formatted_tokens
  if (( estimated_tokens >= 1000000 )); then
    # One decimal place, truncated (not rounded): 1550000 -> "1.5M".
    local m=$(( estimated_tokens / 1000000 ))
    local rk=$(( (estimated_tokens % 1000000) / 100000 ))
    formatted_tokens="${m}.${rk}M"
  elif (( estimated_tokens >= 1000 )); then
    local k=$(( estimated_tokens / 1000 ))
    local rh=$(( (estimated_tokens % 1000) / 100 ))
    formatted_tokens="${k}.${rh}K"
  else
    formatted_tokens="$estimated_tokens"
  fi

  echo "Rough Token Estimate (~chars/4) for ${output_target:-processed files}: ${formatted_tokens} tokens (${total_bytes} bytes)"
}
#######################################
# Print the help text to stderr and terminate the script.
# Outputs:   usage summary on stderr
# Returns:   never returns; exits with status 1
#######################################
usage() {
  # Only used to show what the default output name would look like.
  local default_name_example
  default_name_example="$(basename "$(pwd)")-ingest.txt"

  cat >&2 <<EOF
Usage: $0 [options] [--] [file_or_dir ...]
Options:
 -o <filename.txt>: Output file (default: '$default_name_example').
 -v, --verbose: Enable verbose logging.
 -d, --dry-run: List files to include, don't create output.
 --estimate-tokens: Show rough token estimate (~chars/4) at the end.
 --exclude <path>: Specific file/dir to exclude (shell wildcards expanded by shell).
 --include-pattern <glob>: Glob pattern ('*.py') to include in search. Repeatable.
 --exclude-pattern <glob>: Glob pattern ('*.log') to exclude from search. Repeatable.
 --include-regex <regex>: POSIX ERE regex to include files (matches full relative path). Repeatable.
 --exclude-regex <regex>: POSIX ERE regex to exclude files (matches full relative path). Repeatable.
 --: Use if a filename starts with '-'.
Input: Files or directories. Default '.' if using --include-pattern/regex and no paths given.
EOF
  exit 1
}
# --- Command-line parsing ---
# With no arguments at all there is nothing to do: show the help and exit.
if (( $# == 0 )); then
  usage
fi

while (( $# > 0 )); do
  case "$1" in
    -o)
      # The next word must exist and must not look like another option.
      if [[ -z "$2" || "$2" == -* ]]; then
        echo "Error: -o requires filename." >&2
        usage
      fi
      OUTPUT_FILE="$2"
      shift 2
      ;;
    -v|--verbose)
      VERBOSE=true
      shift
      ;;
    -d|--dry-run)
      DRY_RUN=true
      shift
      ;;
    --estimate-tokens)
      ESTIMATE_TOKENS=true
      shift
      ;;
    --exclude)
      # Consumes every following word up to the next option; each path is
      # normalised now so later comparisons are plain string matches.
      shift
      while [[ $# -gt 0 && "$1" != -* ]]; do
        EXPLICIT_EXCLUDE_PATHS+=("$(realpath -sm -- "$1" 2>/dev/null || echo "$1")")
        shift
      done
      ;;
    --include-pattern)
      shift
      while [[ $# -gt 0 && "$1" != -* ]]; do
        INCLUDE_PATTERNS+=("$1")
        shift
      done
      ;;
    --exclude-pattern)
      shift
      while [[ $# -gt 0 && "$1" != -* ]]; do
        EXCLUDE_SHELL_PATTERNS+=("$1")
        shift
      done
      ;;
    --include-regex)
      shift
      while [[ $# -gt 0 && "$1" != -* ]]; do
        INCLUDE_REGEX_PATTERNS+=("$1")
        shift
      done
      ;;
    --exclude-regex)
      shift
      while [[ $# -gt 0 && "$1" != -* ]]; do
        EXCLUDE_REGEX_PATTERNS+=("$1")
        shift
      done
      ;;
    --)
      # Everything after "--" is an input path, even if it starts with "-".
      shift
      INPUT_PATHS+=("$@")
      shift "$#"
      break
      ;;
    -*)
      echo "Error: Unknown option '$1'." >&2
      usage
      ;;
    *)
      INPUT_PATHS+=("$1")
      shift
      ;;
  esac
done
# --- Set Default Input/Output ---
# Include patterns/regexes without any path imply a search of ".";
# otherwise at least one input path is mandatory.
if (( ${#INPUT_PATHS[@]} == 0 )); then
  if (( ${#INCLUDE_PATTERNS[@]} > 0 || ${#INCLUDE_REGEX_PATTERNS[@]} > 0 )); then
    INPUT_PATHS+=(".")
    log_verbose "Defaulting input path to '.' with include patterns/regex."
  else
    echo "Error: No input or include patterns/regex specified." >&2
    usage
  fi
fi

# Without -o, the output is named after the current directory.
if [[ -z "$OUTPUT_FILE" ]]; then
  current_dir_name=$(basename "$(pwd)")
  OUTPUT_FILE="${current_dir_name}-ingest.txt"
  log_verbose "Output file not specified, defaulting to: $OUTPUT_FILE"
fi

# --- Initialization ---
total_bytes_processed=0
processed_count=0
files_to_process=()

# --- Prepare Output / Dry Run Start ---
if [[ "$DRY_RUN" == true ]]; then
  log_verbose "Starting dry run..."
  echo "--- Files that would be included ---"
else
  log_verbose "Starting code ingestion..."
  log_verbose "Output file: $OUTPUT_FILE"
  # Create or truncate the output up front so an unwritable path fails
  # before any scanning happens.
  if ! : > "$OUTPUT_FILE"; then
    echo "Error: Could not create/clear output file: $OUTPUT_FILE" >&2
    exit 1
  fi
fi

# --- Verbose summary of the effective configuration ---
log_verbose "Input paths: ${INPUT_PATHS[*]}"
log_verbose "Default Exclude Dirs Pattern: ${EXCLUDE_DIRS_PATTERN[*]}"
log_verbose "Default Exclude Files Pattern: ${EXCLUDE_FILES_PATTERN[*]}"
if (( ${#EXPLICIT_EXCLUDE_PATHS[@]} > 0 )); then
  log_verbose "Explicit Exclude Paths (--exclude): ${EXPLICIT_EXCLUDE_PATHS[*]}"
fi
if (( ${#INCLUDE_PATTERNS[@]} > 0 )); then
  log_verbose "Include Patterns (Glob): ${INCLUDE_PATTERNS[*]}"
fi
if (( ${#EXCLUDE_SHELL_PATTERNS[@]} > 0 )); then
  log_verbose "Exclude Patterns (Glob): ${EXCLUDE_SHELL_PATTERNS[*]}"
fi
if (( ${#INCLUDE_REGEX_PATTERNS[@]} > 0 )); then
  log_verbose "Include Regex: ${INCLUDE_REGEX_PATTERNS[*]}"
fi
if (( ${#EXCLUDE_REGEX_PATTERNS[@]} > 0 )); then
  log_verbose "Exclude Regex: ${EXCLUDE_REGEX_PATTERNS[*]}"
fi

# Any regex option switches single-file checks from glob to regex filtering.
if (( ${#INCLUDE_REGEX_PATTERNS[@]} > 0 || ${#EXCLUDE_REGEX_PATTERNS[@]} > 0 )); then
  USING_REGEX_FOR_SINGLE_CHECKS=true
else
  USING_REGEX_FOR_SINGLE_CHECKS=false
fi
# --- File Discovery and Filtering ---
# Each input path is either a single file (filtered here) or a directory
# (searched with run_find_in_dir). Survivors are collected in
# files_to_process in the order they are found.
for item in "${INPUT_PATHS[@]}"; do
  # Regex filters on a single file are matched against this form of the
  # path: "./" is prepended unless the path is absolute or already has it.
  current_item_relative_path="$item"
  if [[ "$item" != /* && "$item" != "./"* ]]; then
    current_item_relative_path="./$item"
  fi

  if [[ ! -e "$item" ]]; then
    echo "Warning: Input path does not exist: '$item'. Skipping." >&2
    continue
  fi

  if is_explicitly_excluded "$item"; then
    continue
  fi

  # "--" keeps a path that starts with "-" from being read as an option.
  item_basename=$(basename -- "$item")

  if [[ -f "$item" ]]; then
    # Default file pattern excludes (e.g. *.pyc)
    if is_match_by_pattern "$item_basename" "${EXCLUDE_FILES_PATTERN[@]}"; then
      log_verbose "Excluding single file (default pattern): $item"
      continue
    fi

    if [[ "$USING_REGEX_FOR_SINGLE_CHECKS" == true ]]; then
      # With include regexes present, the file must match at least one.
      if (( ${#INCLUDE_REGEX_PATTERNS[@]} > 0 )) \
        && ! is_match_by_regex "$current_item_relative_path" "${INCLUDE_REGEX_PATTERNS[@]}"; then
        log_verbose "Excluding single file '$item' (no include regex match)"
        continue
      fi
      if (( ${#EXCLUDE_REGEX_PATTERNS[@]} > 0 )) \
        && is_match_by_regex "$current_item_relative_path" "${EXCLUDE_REGEX_PATTERNS[@]}"; then
        log_verbose "Excluding single file '$item' (exclude regex match)"
        continue
      fi
    else # Using Glob filtering
      # Globs are tried against the base name first, then the path as given.
      if (( ${#EXCLUDE_SHELL_PATTERNS[@]} > 0 )); then
        if is_match_by_pattern "$item_basename" "${EXCLUDE_SHELL_PATTERNS[@]}"; then
          log_verbose "Excluding single file '$item' (glob --exclude-pattern)"
          continue
        fi
        if is_match_by_pattern "$item" "${EXCLUDE_SHELL_PATTERNS[@]}"; then
          log_verbose "Excluding single file '$item' (path glob --exclude-pattern)"
          continue
        fi
      fi
      if (( ${#INCLUDE_PATTERNS[@]} > 0 )) \
        && ! is_match_by_pattern "$item_basename" "${INCLUDE_PATTERNS[@]}" \
        && ! is_match_by_pattern "$item" "${INCLUDE_PATTERNS[@]}"; then
        log_verbose "Excluding single file '$item' (no include glob match)"
        continue
      fi
    fi
    files_to_process+=("$item")

  elif [[ -d "$item" ]]; then
    if is_match_by_pattern "$item_basename" "${EXCLUDE_DIRS_PATTERN[@]}"; then
      log_verbose "Excluding directory (default pattern): $item"
      continue
    fi
    if [[ "$USING_REGEX_FOR_SINGLE_CHECKS" != true ]] && (( ${#EXCLUDE_SHELL_PATTERNS[@]} > 0 )); then
      if is_match_by_pattern "$item_basename" "${EXCLUDE_SHELL_PATTERNS[@]}"; then
        log_verbose "Excluding directory '$item' (glob --exclude-pattern)"
        continue
      fi
    fi

    log_verbose "Searching directory: $item"
    mapfile -d '' -t found_files < <(run_find_in_dir "$item")
    for relative_path_from_find in "${found_files[@]}"; do
      if [[ -z "$relative_path_from_find" ]]; then
        continue
      fi
      full_path="$item/$relative_path_from_find"
      # Collapse "//" (e.g. from an input like "src/") in the shell itself
      # instead of forking `echo | sed` once per discovered file.
      full_path="${full_path//\/\///}"
      if is_explicitly_excluded "$full_path"; then # Check --exclude specific path again
        continue
      fi
      files_to_process+=("$full_path")
    done
  else
    echo "Warning: Input path '$item' is not a file or directory. Skipping." >&2
  fi
done
# --- Processing Phase ---
# Real run: append each file to the output and count the successes.
# Dry run: only list each file, optionally summing sizes for the estimate.
for file_path in "${files_to_process[@]}"; do
  relative_display_path="$file_path" # Simple approach for now

  if [[ "$DRY_RUN" != true ]]; then
    if process_file "$file_path" "$relative_display_path" "$OUTPUT_FILE"; then
      (( processed_count += 1 ))
    fi
    continue
  fi

  echo "$file_path"
  if [[ "$ESTIMATE_TOKENS" == true ]]; then
    # A file that cannot be read counts as 0 bytes.
    file_bytes=$(wc -c < "$file_path" 2>/dev/null || echo 0)
    (( total_bytes_processed += file_bytes ))
  fi
  (( processed_count += 1 ))
done
# --- Final Output / Summary ---
case "$DRY_RUN" in
  true)
    echo "--- End of dry run list ---"
    echo "Dry run complete. Would include $processed_count file(s)."
    if [[ "$ESTIMATE_TOKENS" == true ]]; then
      calculate_and_print_token_estimate "$total_bytes_processed" "files listed above"
    fi
    ;;
  *)
    log_verbose "Finished processing $processed_count files."
    echo "Code ingested into: $OUTPUT_FILE"
    if [[ "$ESTIMATE_TOKENS" == true ]]; then
      # Measured from the finished output file, so headers and code fences
      # are part of the estimate.
      final_bytes=$(wc -c < "$OUTPUT_FILE" 2>/dev/null || echo 0)
      calculate_and_print_token_estimate "$final_bytes" "$OUTPUT_FILE"
    fi
    ;;
esac

exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment