Skip to content

Instantly share code, notes, and snippets.

@Paradiddle131
Last active May 12, 2025 21:11
Show Gist options
  • Select an option

  • Save Paradiddle131/f3bf9cf0949d60d9d377949fc4484f35 to your computer and use it in GitHub Desktop.

Select an option

Save Paradiddle131/f3bf9cf0949d60d9d377949fc4484f35 to your computer and use it in GitHub Desktop.
#!/bin/bash
# ---------------------------------------------------------------------------
# Run-time settings. The scalar flags and the list variables below are filled
# in by the command-line parser further down in this file.
# ---------------------------------------------------------------------------
OUTPUT_FILE=""       # set by -o; when left empty a default name is derived later
VERBOSE=false        # -v / --verbose
DRY_RUN=false        # -d / --dry-run
ESTIMATE_TOKENS=true # already on by default, so --estimate-tokens does not change it

INPUT_PATHS=()            # positional files/directories to ingest
EXPLICIT_EXCLUDE_PATHS=() # values of --exclude (specific paths)
INCLUDE_PATTERNS=()       # values of --include-pattern (shell globs)
EXCLUDE_SHELL_PATTERNS=() # values of --exclude-pattern (shell globs)
INCLUDE_REGEX_PATTERNS=() # values of --include-regex
EXCLUDE_REGEX_PATTERNS=() # values of --exclude-regex

# Built-in exclusions, applied before any user-supplied glob/regex filtering.
# Directory names that are skipped:
EXCLUDE_DIRS_PATTERN=(
  ".git" "node_modules" "dist" "build"
  ".venv" "venv" "env"
  "bin" "obj"
  "__pycache__" ".pytest_cache" ".mypy_cache" ".ruff_cache"
  ".idea" ".vscode"
  "target" "out" "logs"
)
# File-name globs that are skipped:
EXCLUDE_FILES_PATTERN=(
  "*.pyc" "*.pyo"
  "*.log"
  "*.swp" "*.swo"
  "*.lock" "poetry.lock" "package-lock.json" "yarn.lock" "Pipfile.lock" "uv.lock"
  "*.DS_Store" "Thumbs.db"
  "*.class" "*.jar" "*.war" "*.ear"
)
# Print a diagnostic message on stderr, but only in verbose mode.
# Globals:   VERBOSE (read) - output happens only when it equals "true"
# Arguments: $@ - message words; joined with the first character of IFS
#                 (a space by default)
# Outputs:   "[VERBOSE] <message>" on stderr
log_verbose() {
  if [[ "${VERBOSE:-false}" == true ]]; then
    # "$*" keeps the whole message in a single word. Embedding "$@" inside a
    # larger string splits it into several words (ShellCheck SC2145).
    printf '[VERBOSE] %s\n' "$*" >&2
  fi
}
# Map a file name to the language tag used after the opening code fence.
# Arguments: $1 - file name or path
# Outputs:   the language tag on stdout, or an empty line when the extension
#            is unknown or the name has no extension at all
get_lang_hint() {
  # Only the last path component matters; a dot in a directory name must not
  # be mistaken for an extension separator.
  local filename="${1##*/}"
  local ext

  # Without a dot there is no extension. "${filename##*.}" would then return
  # the whole name, so a file called "go" or "c" would wrongly get a hint.
  if [[ "$filename" != *.* ]]; then
    echo ""
    return 0
  fi

  ext="${filename##*.}"
  case "$ext" in
    py) echo "python" ;;
    js) echo "javascript" ;;
    ts) echo "typescript" ;;
    java) echo "java" ;;
    c) echo "c" ;;
    cpp | cxx | h | hpp) echo "cpp" ;;
    cs) echo "csharp" ;;
    go) echo "go" ;;
    rb) echo "ruby" ;;
    php) echo "php" ;;
    html | htm) echo "html" ;;
    css) echo "css" ;;
    scss | sass) echo "scss" ;;
    sh | bash) echo "bash" ;;
    zsh) echo "zsh" ;;
    sql) echo "sql" ;;
    md | markdown) echo "markdown" ;;
    json) echo "json" ;;
    yaml | yml) echo "yaml" ;;
    xml) echo "xml" ;;
    *) echo "" ;;
  esac
}
# Append one file to the output as a headed, fenced section.
# Arguments: $1 - path of the file to read
#            $2 - path shown in the "FILE:" header line
#            $3 - output file that is appended to
# Outputs:   the section is appended to $3; warnings go to stderr
# Returns:   0 on success; 1 when the file is not a readable regular file,
#            when reading it fails, or when the output cannot be opened
process_file() {
  local filepath="$1"
  local relative_path="$2"
  local output_target="$3"
  local lang_hint
  local exit_code=0

  if [[ ! -f "$filepath" || ! -r "$filepath" ]]; then
    echo "Warning: Cannot read file: $filepath. Skipping." >&2
    return 1
  fi
  log_verbose "Processing file: $filepath (as $relative_path)"
  # Declared separately above so that 'local' does not mask the exit status.
  lang_hint=$(get_lang_hint "$filepath")

  # One redirection for the whole section instead of reopening the output
  # file for every line.
  {
    printf '%s\n' "================================================"
    printf 'FILE: %s\n' "$relative_path"
    printf '%s\n' "================================================"
    printf '```%s\n' "$lang_hint"
    cat -- "$filepath" || exit_code=$?
    # If the content does not end in a newline, the closing fence would be
    # glued onto the last content line. wc -l reports 0 when the final byte
    # is not a newline.
    if [[ -s "$filepath" ]] && (( $(tail -c 1 -- "$filepath" 2>/dev/null | wc -l) == 0 )); then
      printf '\n'
    fi
    printf '```\n\n'
  } >> "$output_target" || exit_code=$?

  if (( exit_code != 0 )); then
    echo "Warning: Error reading file content: $filepath" >&2
    return 1
  fi
  return 0
}
# Test a string against a list of shell glob patterns.
# Arguments: $1    - string to test (typically a file or directory name)
#            $2... - glob patterns
# Returns:   0 if at least one pattern matches, 1 otherwise (also when no
#            patterns are given)
is_match_by_pattern() {
  local name_to_check="$1"
  shift
  # Kept local so the loop does not overwrite a caller's variable of this name.
  local pattern
  for pattern in "$@"; do
    # The right-hand side is deliberately unquoted so that it acts as a glob.
    # shellcheck disable=SC2053
    if [[ "$name_to_check" == $pattern ]]; then
      return 0
    fi
  done
  return 1
}
# Test a string (typically a path) against a list of regular expressions
# using the [[ =~ ]] operator, i.e. an unanchored search unless the
# expression itself carries anchors.
# Arguments: $1    - string to test
#            $2... - regular expressions
# Returns:   0 if at least one expression matches, 1 otherwise (also when no
#            expressions are given)
is_match_by_regex() {
  local string_to_check="$1"
  shift
  # Kept local so the loop does not overwrite a caller's variable of this name.
  local regex
  for regex in "$@"; do
    # The right-hand side is deliberately unquoted so that it acts as a regex.
    if [[ "$string_to_check" =~ $regex ]]; then
      return 0
    fi
  done
  return 1
}
# Decide whether a path was ruled out by a --exclude entry.
# Globals:   EXPLICIT_EXCLUDE_PATHS (read) - entries are compared as stored
# Arguments: $1 - candidate path
# Returns:   0 when the candidate equals an entry, or lies below an entry
#            that is an existing directory; 1 otherwise
is_explicitly_excluded() {
  local candidate_path="$1"
  local abs_candidate_path
  # Kept local so the loop does not leak its variable into the global scope.
  local excluded_path_entry

  # This runs once per discovered file; without any --exclude entry there is
  # nothing to compare, so skip the realpath call.
  if (( ${#EXPLICIT_EXCLUDE_PATHS[@]} == 0 )); then
    return 1
  fi

  # Fall back to the path as given when realpath is unavailable or fails.
  abs_candidate_path=$(realpath -sm -- "$candidate_path" 2>/dev/null || echo "$candidate_path")
  for excluded_path_entry in "${EXPLICIT_EXCLUDE_PATHS[@]}"; do
    if [[ "$abs_candidate_path" == "$excluded_path_entry" ]]; then
      log_verbose "Explicitly excluding '$candidate_path' (abs: '$abs_candidate_path') due to --exclude exact match with '$excluded_path_entry'"
      return 0
    fi
    # The trailing "/" keeps "/a/bc" from matching an excluded "/a/b".
    if [[ -d "$excluded_path_entry" && "$abs_candidate_path" == "$excluded_path_entry/"* ]]; then
      log_verbose "Explicitly excluding '$candidate_path' (abs: '$abs_candidate_path') because it's inside --exclude directory '$excluded_path_entry'"
      return 0
    fi
  done
  return 1
}
# List the files below a directory that survive the default exclusions and
# the user-supplied glob or regex filters.
# Globals:   EXCLUDE_DIRS_PATTERN, EXCLUDE_FILES_PATTERN (read) - always pruned
#            INCLUDE_REGEX_PATTERNS, EXCLUDE_REGEX_PATTERNS (read)
#            INCLUDE_PATTERNS, EXCLUDE_SHELL_PATTERNS (read) - used only when
#            no regex filter is set
# Arguments: $1 - directory to search
# Outputs:   NUL-terminated paths relative to $1 on stdout (find's %P)
# Returns:   the exit status of find
# Notes:     relies on find supporting -regextype and -printf.
run_find_in_dir() {
  local search_path="$1"
  local p regex # loop variables, kept out of the global scope
  local find_cmd_args=()
  local prune_conditions=()

  # A start point beginning with "-" would be parsed by find as an option.
  # %P strips the start point from the output, so the prefix is not visible.
  if [[ "$search_path" == -* ]]; then
    search_path="./$search_path"
  fi
  find_cmd_args+=("$search_path")

  # --- Initial Pruning (always applied) ---
  if [ ${#EXCLUDE_DIRS_PATTERN[@]} -gt 0 ]; then
    prune_conditions+=("(")
    for p in "${EXCLUDE_DIRS_PATTERN[@]}"; do
      prune_conditions+=("-name" "$p" "-o")
    done
    # Replace the dangling "-o" with the closing parenthesis.
    prune_conditions[${#prune_conditions[@]}-1]=")"
    prune_conditions+=("-type" "d")
  fi
  if [ ${#EXCLUDE_FILES_PATTERN[@]} -gt 0 ]; then
    if [ ${#prune_conditions[@]} -gt 0 ]; then
      prune_conditions+=("-o")
    fi
    prune_conditions+=("(")
    for p in "${EXCLUDE_FILES_PATTERN[@]}"; do
      prune_conditions+=("-name" "$p" "-o")
    done
    prune_conditions[${#prune_conditions[@]}-1]=")"
    prune_conditions+=("-type" "f")
  fi
  if [ ${#prune_conditions[@]} -gt 0 ]; then
    # The outer parentheses are essential: AND binds tighter than -o, so
    # without them -prune would attach only to the file group and excluded
    # directories would still be descended into.
    find_cmd_args+=("(" "${prune_conditions[@]}" ")" "-prune" "-o")
  fi

  # --- Main Filtering (Glob or Regex based) ---
  # Consecutive tests are joined by find's implicit AND. An explicit "-and"
  # directly after the "-o" above is rejected by find as a binary operator
  # with nothing before it.
  if [ ${#INCLUDE_REGEX_PATTERNS[@]} -gt 0 ] || [ ${#EXCLUDE_REGEX_PATTERNS[@]} -gt 0 ]; then
    log_verbose "Using REGEX filtering for find."
    find_cmd_args+=("-regextype" "posix-extended")
    # Include Regexes
    if [ ${#INCLUDE_REGEX_PATTERNS[@]} -gt 0 ]; then
      find_cmd_args+=("(")
      for regex in "${INCLUDE_REGEX_PATTERNS[@]}"; do
        find_cmd_args+=("-regex" "$regex" "-o")
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi
    # Exclude Regexes
    if [ ${#EXCLUDE_REGEX_PATTERNS[@]} -gt 0 ]; then
      find_cmd_args+=("-not" "(")
      for regex in "${EXCLUDE_REGEX_PATTERNS[@]}"; do
        find_cmd_args+=("-regex" "$regex" "-o")
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi
  else
    log_verbose "Using GLOB filtering for find."
    # Include Patterns (Globs): patterns containing "/" are matched with
    # -path, all others with -name.
    if [ ${#INCLUDE_PATTERNS[@]} -gt 0 ]; then
      find_cmd_args+=("(")
      for p in "${INCLUDE_PATTERNS[@]}"; do
        if [[ "$p" == *"/"* ]]; then
          find_cmd_args+=("-path" "$p" "-o")
        else
          find_cmd_args+=("-name" "$p" "-o")
        fi
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi
    # Exclude Patterns (Globs)
    if [ ${#EXCLUDE_SHELL_PATTERNS[@]} -gt 0 ]; then
      find_cmd_args+=("-not" "(")
      for p in "${EXCLUDE_SHELL_PATTERNS[@]}"; do
        if [[ "$p" == *"/"* ]]; then
          find_cmd_args+=("-path" "$p" "-o")
        else
          find_cmd_args+=("-name" "$p" "-o")
        fi
      done
      find_cmd_args[${#find_cmd_args[@]}-1]=")"
    fi
  fi

  find_cmd_args+=("-type" "f" "-printf" "%P\0")
  log_verbose "Executing find: find ${find_cmd_args[*]}"
  # stderr is discarded, as before, to keep traversal noise out of the output.
  find "${find_cmd_args[@]}" 2>/dev/null
}
# Print a rough token estimate, taken as one token per four bytes.
# Arguments: $1 - byte count
#            $2 - label for what was measured; "processed files" when empty
# Outputs:   one summary line on stdout; nothing when $1 is not a positive
#            run of digits (a verbose note is logged instead)
calculate_and_print_token_estimate() {
  local total_bytes="$1"
  local output_target="$2"
  local token_count display whole tenth

  # Reject anything that is not a run of digits, and a count of zero.
  if [[ ! "$total_bytes" =~ ^[0-9]+$ ]] || [ "$total_bytes" -eq 0 ]; then
    log_verbose "No bytes processed, cannot estimate tokens."
    return
  fi

  token_count=$((total_bytes / 4))

  # Scale to K / M with one truncated (not rounded) decimal digit.
  if [ "$token_count" -lt 1000 ]; then
    display="$token_count"
  elif [ "$token_count" -lt 1000000 ]; then
    whole=$((token_count / 1000))
    tenth=$(((token_count % 1000) / 100))
    display="${whole}.${tenth}K"
  else
    whole=$((token_count / 1000000))
    tenth=$(((token_count % 1000000) / 100000))
    display="${whole}.${tenth}M"
  fi

  echo "Rough Token Estimate (~chars/4) for ${output_target:-processed files}: ${display} tokens (${total_bytes} bytes)"
}
# Print the help text on stderr and terminate the script with status 1.
# The default output name shown is derived from the current directory and is
# computed here for the message only.
usage() {
  local default_name_example
  default_name_example="$(basename "$(pwd)")-ingest.txt"
  cat >&2 <<EOF
Usage: $0 [options] [--] [file_or_dir ...]
Options:
 -o <filename.txt>: Output file (default: '$default_name_example').
 -v, --verbose: Enable verbose logging.
 -d, --dry-run: List files to include, don't create output.
 --estimate-tokens: Show rough token estimate (~chars/4) at the end.
 --exclude <path>: Specific file/dir to exclude (shell wildcards expanded by shell).
 --include-pattern <glob>: Glob pattern ('*.py') to include in search. Repeatable.
 --exclude-pattern <glob>: Glob pattern ('*.log') to exclude from search. Repeatable.
 --include-regex <regex>: POSIX ERE regex to include files (matches full relative path). Repeatable.
 --exclude-regex <regex>: POSIX ERE regex to exclude files (matches full relative path). Repeatable.
 --: Use if a filename starts with '-'.
Input: Files or directories. Default '.' if using --include-pattern/regex and no paths given.
EOF
  exit 1
}
# --- Command-line parsing ---
# Called without any argument: show the help text (usage terminates the script).
if (( $# == 0 )); then
  usage
fi

# List-valued options take every following word up to the next word that
# starts with "-". Everything after a literal "--" is an input path.
while (( $# > 0 )); do
  case "$1" in
    -o)
      if [[ -z "$2" || "$2" == -* ]]; then
        echo "Error: -o requires filename." >&2
        usage
      fi
      OUTPUT_FILE="$2"
      shift 2
      ;;
    -v | --verbose)
      VERBOSE=true
      shift
      ;;
    -d | --dry-run)
      DRY_RUN=true
      shift
      ;;
    --estimate-tokens)
      ESTIMATE_TOKENS=true
      shift
      ;;
    --exclude)
      shift
      while (( $# > 0 )) && [[ "$1" != -* ]]; do
        # Stored in normalized form; the raw word is kept if realpath fails.
        EXPLICIT_EXCLUDE_PATHS+=("$(realpath -sm -- "$1" 2>/dev/null || echo "$1")")
        shift
      done
      ;;
    --include-pattern)
      shift
      while (( $# > 0 )) && [[ "$1" != -* ]]; do
        INCLUDE_PATTERNS+=("$1")
        shift
      done
      ;;
    --exclude-pattern)
      shift
      while (( $# > 0 )) && [[ "$1" != -* ]]; do
        EXCLUDE_SHELL_PATTERNS+=("$1")
        shift
      done
      ;;
    --include-regex)
      shift
      while (( $# > 0 )) && [[ "$1" != -* ]]; do
        INCLUDE_REGEX_PATTERNS+=("$1")
        shift
      done
      ;;
    --exclude-regex)
      shift
      while (( $# > 0 )) && [[ "$1" != -* ]]; do
        EXCLUDE_REGEX_PATTERNS+=("$1")
        shift
      done
      ;;
    --)
      shift
      # All remaining words are input paths, even those starting with "-".
      INPUT_PATHS+=("$@")
      shift $#
      break
      ;;
    -*)
      echo "Error: Unknown option '$1'." >&2
      usage
      ;;
    *)
      INPUT_PATHS+=("$1")
      shift
      ;;
  esac
done
# --- Set Default Input/Output ---
# Without explicit input paths, an include glob/regex implies searching the
# current directory; with neither there is nothing to do.
if (( ${#INPUT_PATHS[@]} == 0 )); then
  if (( ${#INCLUDE_PATTERNS[@]} > 0 || ${#INCLUDE_REGEX_PATTERNS[@]} > 0 )); then
    INPUT_PATHS+=(".")
    log_verbose "Defaulting input path to '.' with include patterns/regex."
  else
    echo "Error: No input or include patterns/regex specified." >&2
    usage
  fi
fi

# No -o given: name the output after the current directory.
if [[ -z "$OUTPUT_FILE" ]]; then
  current_dir_name=$(basename "$(pwd)")
  OUTPUT_FILE="${current_dir_name}-ingest.txt"
  log_verbose "Output file not specified, defaulting to: $OUTPUT_FILE"
fi
# --- Initialization ---
total_bytes_processed=0
processed_count=0
files_to_process=()

# --- Prepare Output / Dry Run Start ---
# A real run creates or truncates the output file up front; a dry run only
# prints the list header.
if [[ "$DRY_RUN" != true ]]; then
  log_verbose "Starting code ingestion..."
  log_verbose "Output file: $OUTPUT_FILE"
  if ! : > "$OUTPUT_FILE"; then
    echo "Error: Could not create/clear output file: $OUTPUT_FILE" >&2
    exit 1
  fi
else
  log_verbose "Starting dry run..."
  echo "--- Files that would be included ---"
fi

# Echo the effective configuration in verbose mode; empty lists are omitted.
log_verbose "Input paths: ${INPUT_PATHS[*]}"
log_verbose "Default Exclude Dirs Pattern: ${EXCLUDE_DIRS_PATTERN[*]}"
log_verbose "Default Exclude Files Pattern: ${EXCLUDE_FILES_PATTERN[*]}"
if (( ${#EXPLICIT_EXCLUDE_PATHS[@]} > 0 )); then
  log_verbose "Explicit Exclude Paths (--exclude): ${EXPLICIT_EXCLUDE_PATHS[*]}"
fi
if (( ${#INCLUDE_PATTERNS[@]} > 0 )); then
  log_verbose "Include Patterns (Glob): ${INCLUDE_PATTERNS[*]}"
fi
if (( ${#EXCLUDE_SHELL_PATTERNS[@]} > 0 )); then
  log_verbose "Exclude Patterns (Glob): ${EXCLUDE_SHELL_PATTERNS[*]}"
fi
if (( ${#INCLUDE_REGEX_PATTERNS[@]} > 0 )); then
  log_verbose "Include Regex: ${INCLUDE_REGEX_PATTERNS[*]}"
fi
if (( ${#EXCLUDE_REGEX_PATTERNS[@]} > 0 )); then
  log_verbose "Exclude Regex: ${EXCLUDE_REGEX_PATTERNS[*]}"
fi

# Any regex option switches the single-path checks from glob to regex mode.
USING_REGEX_FOR_SINGLE_CHECKS=false
if (( ${#INCLUDE_REGEX_PATTERNS[@]} > 0 || ${#EXCLUDE_REGEX_PATTERNS[@]} > 0 )); then
  USING_REGEX_FOR_SINGLE_CHECKS=true
fi
# --- File Discovery and Filtering ---
# Walk every input path and append the files that pass all filters to
# files_to_process. Single files are filtered here in the shell; directories
# are expanded through run_find_in_dir.
for item in "${INPUT_PATHS[@]}"; do
  # Form used for regex checks on single files: relative inputs get a "./"
  # prefix, absolute paths and paths already starting with "./" stay as given.
  current_item_relative_path="$item"
  if [[ "$item" != /* && "$item" != "./"* ]]; then
    current_item_relative_path="./$item"
  fi
  if [ ! -e "$item" ]; then
    echo "Warning: Input path does not exist: '$item'. Skipping." >&2
    continue
  fi
  if is_explicitly_excluded "$item"; then
    continue
  fi
  # "--" keeps an input that starts with "-" from being read as an option.
  item_basename=$(basename -- "$item")
  if [ -f "$item" ]; then
    # Default file pattern excludes (e.g. *.pyc)
    if is_match_by_pattern "$item_basename" "${EXCLUDE_FILES_PATTERN[@]}"; then
      log_verbose "Excluding single file (default pattern): $item"
      continue
    fi
    if [ "$USING_REGEX_FOR_SINGLE_CHECKS" = true ]; then
      # With include regexes present the file must match one of them.
      passes_include_regex=true
      if [ ${#INCLUDE_REGEX_PATTERNS[@]} -gt 0 ]; then
        passes_include_regex=false
        if is_match_by_regex "$current_item_relative_path" "${INCLUDE_REGEX_PATTERNS[@]}"; then
          passes_include_regex=true
        fi
      fi
      if ! $passes_include_regex; then
        log_verbose "Excluding single file '$item' (no include regex match)"
        continue
      fi
      if [ ${#EXCLUDE_REGEX_PATTERNS[@]} -gt 0 ]; then
        if is_match_by_regex "$current_item_relative_path" "${EXCLUDE_REGEX_PATTERNS[@]}"; then
          log_verbose "Excluding single file '$item' (exclude regex match)"
          continue
        fi
      fi
    else # Using Glob filtering
      # Globs are tried against the basename first and then the path as given.
      if [ ${#EXCLUDE_SHELL_PATTERNS[@]} -gt 0 ]; then
        if is_match_by_pattern "$item_basename" "${EXCLUDE_SHELL_PATTERNS[@]}"; then
          log_verbose "Excluding single file '$item' (glob --exclude-pattern)"
          continue
        fi
        if is_match_by_pattern "$item" "${EXCLUDE_SHELL_PATTERNS[@]}"; then
          log_verbose "Excluding single file '$item' (path glob --exclude-pattern)"
          continue
        fi
      fi
      passes_include_glob=true
      if [ ${#INCLUDE_PATTERNS[@]} -gt 0 ]; then
        passes_include_glob=false
        if is_match_by_pattern "$item_basename" "${INCLUDE_PATTERNS[@]}"; then
          passes_include_glob=true
        fi
        if ! $passes_include_glob && is_match_by_pattern "$item" "${INCLUDE_PATTERNS[@]}"; then
          passes_include_glob=true
        fi
      fi
      if ! $passes_include_glob; then
        log_verbose "Excluding single file '$item' (no include glob match)"
        continue
      fi
    fi
    files_to_process+=("$item")
  elif [ -d "$item" ]; then
    if is_match_by_pattern "$item_basename" "${EXCLUDE_DIRS_PATTERN[@]}"; then
      log_verbose "Excluding directory (default pattern): $item"
      continue
    fi
    if ! $USING_REGEX_FOR_SINGLE_CHECKS && [ ${#EXCLUDE_SHELL_PATTERNS[@]} -gt 0 ]; then
      if is_match_by_pattern "$item_basename" "${EXCLUDE_SHELL_PATTERNS[@]}"; then
        log_verbose "Excluding directory '$item' (glob --exclude-pattern)"
        continue
      fi
    fi
    log_verbose "Searching directory: $item"
    # run_find_in_dir emits NUL-terminated paths relative to $item.
    mapfile -d $'\0' -t found_files < <(run_find_in_dir "$item")
    for relative_path_from_find in "${found_files[@]}"; do
      if [ -z "$relative_path_from_find" ]; then
        continue
      fi
      full_path="$item/$relative_path_from_find"
      # Collapse "//" (e.g. from an input written as "dir/") with a parameter
      # expansion. This avoids forking echo and sed for every file and keeps
      # file names intact that a command substitution would have altered.
      full_path="${full_path//\/\//\/}"
      if is_explicitly_excluded "$full_path"; then # Check --exclude specific path again
        continue
      fi
      files_to_process+=("$full_path")
    done
  else
    echo "Warning: Input path '$item' is not a file or directory. Skipping." >&2
  fi
done
# --- Processing Phase ---
# A dry run prints each collected path (and sums the file sizes for the token
# estimate); a real run appends each file to OUTPUT_FILE via process_file.
for file_path in "${files_to_process[@]}"; do
  # Never ingest the output file into itself. It is created before discovery
  # runs, so it is picked up whenever it lives inside an input directory,
  # which is the default situation when "." is searched. -ef compares
  # device and inode, so differently spelled paths are recognised as well.
  if [[ -n "$OUTPUT_FILE" && "$file_path" -ef "$OUTPUT_FILE" ]]; then
    log_verbose "Skipping output file itself: $file_path"
    continue
  fi
  relative_display_path="$file_path" # Simple approach for now
  if [ "$DRY_RUN" = true ]; then
    echo "$file_path"
    if [ "$ESTIMATE_TOKENS" = true ]; then
      # A file that cannot be read counts as 0 bytes.
      file_bytes=$(wc -c < "$file_path" 2>/dev/null || echo 0)
      total_bytes_processed=$((total_bytes_processed + file_bytes))
    fi
    processed_count=$((processed_count + 1))
  else
    # Only files that were written successfully are counted.
    if process_file "$file_path" "$relative_display_path" "$OUTPUT_FILE"; then
      processed_count=$((processed_count + 1))
    fi
  fi
done
# --- Final Output / Summary ---
# Report what was done. A real run bases its token estimate on the size of
# the written output file; a dry run uses the byte total gathered while listing.
if [[ "$DRY_RUN" != true ]]; then
  log_verbose "Finished processing $processed_count files."
  echo "Code ingested into: $OUTPUT_FILE"
  if [[ "$ESTIMATE_TOKENS" == true ]]; then
    final_bytes=$(wc -c < "$OUTPUT_FILE" 2>/dev/null || echo 0)
    calculate_and_print_token_estimate "$final_bytes" "$OUTPUT_FILE"
  fi
else
  echo "--- End of dry run list ---"
  echo "Dry run complete. Would include $processed_count file(s)."
  if [[ "$ESTIMATE_TOKENS" == true ]]; then
    calculate_and_print_token_estimate "$total_bytes_processed" "files listed above"
  fi
fi
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment