Skip to content

Instantly share code, notes, and snippets.

@and0x00
Last active March 25, 2025 01:40
Show Gist options
  • Select an option

  • Save and0x00/dd5fb8ede37325c80d84f40ba7f94707 to your computer and use it in GitHub Desktop.

Select an option

Save and0x00/dd5fb8ede37325c80d84f40ba7f94707 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Script to split files into chunks, process each line with "subfy",
# and consolidate results while avoiding duplicate processing via hashes.
# --- Usage ---------------------------------------------------------------
# Print the command-line help and terminate the script.
# Globals:   HOME (read; expanded into the default paths shown)
# Arguments: none
# Outputs:   help text on stderr, so that it never mixes with results that
#            are written to stdout (e.g. when --terminal is used in a pipe)
# Returns:   does not return; exits the shell with status 1
usage() {
  cat >&2 <<EOF
Usage: $0 [options] <file1> [file2 ... fileN]
Options:
--no-hash Skip hash file creation/verification.
--terminal Display output on terminal; skip hash file operations.
--debug Enable debug output.
-o <output> Output file (default: ${HOME}/.subdomains_db).
--hash-file <log> Hash log file (default: ${HOME}/.processed_hashes.log).
--base-dir <dir> Base directory for temporary files (default: /tmp).
EOF
  exit 1
}
# --- Global Variables ----------------------------------------------------
# Default settings; parse_args overrides most of them from the command line.
declare -a INPUT_FILES=()               # input paths collected by parse_args
MAX_FILE_SIZE=$((1024 * 1024))          # 1MB; size at which a temp file is rotated
BASE_DIR=/tmp                           # parent directory for the temp directory
HASH_LOG="$HOME/.processed_hashes.log"  # log of hashes of processed files
OUTPUT="$HOME/.subdomains_db"           # consolidated result file
DEBUG=false
NO_HASH=false
# --- Parse Arguments -----------------------------------------------------
# Parse the command line into the global settings.
# Globals:   OUTPUT, HASH_LOG, BASE_DIR, NO_HASH, DEBUG, INPUT_FILES (written)
# Arguments: the script's command-line arguments
# Outputs:   a diagnostic on stderr for an unknown option or for an option
#            that is missing its value
# Returns:   0 when at least one input file was given; otherwise calls
#            usage, which terminates the script
parse_args() {
  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      -o | --hash-file | --base-dir)
        # These options consume the next word. Without this check a trailing
        # "-o" would silently store an empty value (and, for -o, switch the
        # script into terminal mode).
        if [[ "$#" -lt 2 ]]; then
          echo "Option $1 requires an argument" >&2
          usage
        fi
        case "$1" in
          -o) OUTPUT="$2" ;;
          --hash-file) HASH_LOG="$2" ;;
          --base-dir) BASE_DIR="$2" ;;
        esac
        shift
        ;;
      --no-hash)
        NO_HASH=true
        ;;
      --terminal)
        # Terminal mode: no hash bookkeeping and no output file.
        NO_HASH=true
        OUTPUT=""
        ;;
      --debug)
        DEBUG=true
        ;;
      -*)
        echo "Unknown option: $1" >&2
        usage
        ;;
      *)
        INPUT_FILES+=("$1")
        ;;
    esac
    shift
  done
  [[ ${#INPUT_FILES[@]} -ge 1 ]] || usage
}
# --- Setup Temporary Directory -------------------------------------------
# Create a private working directory below BASE_DIR.
# Globals:   BASE_DIR, DEBUG (read); TEMP_DIR, TEMP_PREFIX (written)
# Arguments: none
# Outputs:   the directory name on stdout when DEBUG is true; an error on
#            stderr when the directory cannot be created
# Returns:   0 on success; exits the script with status 1 on failure
setup_temp_dir() {
  # Abort on failure: continuing with an empty TEMP_DIR would make
  # TEMP_PREFIX "/tmp_" and scatter temporary files in the root directory.
  if ! TEMP_DIR=$(mktemp -d -p "$BASE_DIR" tmp.XXXXXX); then
    echo "Failed to create a temporary directory under '$BASE_DIR'." >&2
    exit 1
  fi
  if [[ "$DEBUG" == true ]]; then
    echo "Temporary directory: $TEMP_DIR"
  fi
  TEMP_PREFIX="${TEMP_DIR}/tmp_"
}
# --- Cleanup -------------------------------------------------------------
# Remove the temporary working directory; installed as the EXIT handler.
# Globals:   TEMP_DIR, DEBUG (read)
# Arguments: none
# Outputs:   a debug message on stdout when DEBUG is true
# Returns:   0 when there is nothing to remove, otherwise the status of rm
cleanup() {
  [[ "$DEBUG" == true ]] && echo "Cleaning up temporary files..."
  # Nothing was created if the script stops before setup_temp_dir ran.
  [[ -n "${TEMP_DIR:-}" ]] || return 0
  rm -rf -- "$TEMP_DIR"
}
# Start from a known-empty value: a TEMP_DIR inherited from the caller's
# environment must never be handed to "rm -rf" by the EXIT handler.
TEMP_DIR=""
trap cleanup EXIT
# --- Hash Verification ---------------------------------------------------
# Tell whether a file hash is already recorded in the hash log.
# Globals:   HASH_LOG (read)
# Arguments: $1 - hash to look up
# Outputs:   none
# Returns:   0 if a log line starts with the hash followed by a space,
#            non-zero otherwise (including when the log does not exist yet)
file_processed() {
  local hash="$1"
  # On the first run there is no log yet; report "not processed" quietly
  # instead of letting grep complain about the missing file.
  [[ -f "$HASH_LOG" ]] || return 1
  grep -q -- "^${hash} " "$HASH_LOG"
}
# --- Generate Random String ----------------------------------------------
# Print 16 random hexadecimal characters (8 random bytes).
# Globals:   none
# Arguments: none
# Outputs:   the hex string followed by a newline on stdout
# Returns:   0 on success, 1 if no source of randomness was usable
gen_rand() {
  local hex
  if hex=$(openssl rand -hex 8 2>/dev/null) && [[ -n "$hex" ]]; then
    printf '%s\n' "$hex"
    return 0
  fi
  # openssl is missing or failed: read the bytes from the kernel instead.
  hex=$(od -An -N8 -tx1 /dev/urandom 2>/dev/null | tr -d ' \n')
  [[ -n "$hex" ]] || return 1
  printf '%s\n' "$hex"
}
# --- Process File --------------------------------------------------------
# Split one input file into 1000-line chunks and feed every line to subfy,
# appending the sorted, de-duplicated output of each line to temporary
# result files that are rotated once they reach MAX_FILE_SIZE bytes.
# Globals:   TEMP_DIR, TEMP_PREFIX, MAX_FILE_SIZE, DEBUG (read)
#            SUBFY_BIN (read, optional) - subfy command to run; defaults to
#            /usr/local/bin/subfy
# Arguments: $1 - path of the file to process
# Outputs:   debug messages on stdout when DEBUG is true
# Returns:   0 when all chunks were handled, 1 if the file could not be split
process_file() {
  local file="$1"
  local subfy_bin="${SUBFY_BIN:-/usr/local/bin/subfy}"
  local chunk line rand_str tmp_file size
  local -a chunks
  local counter=1

  [[ "$DEBUG" == true ]] && echo "Splitting '$file' into chunks..."
  # Split file into 1000-line chunks
  if ! split -d -a 5 -l 1000 -- "$file" "${TEMP_DIR}/chunk_"; then
    echo "Failed to split '$file' into chunks." >&2
    # Do not leave partial chunks behind for the next input file.
    rm -f -- "${TEMP_DIR}/chunk_"*
    return 1
  fi

  # nullglob: an input that produced no chunk yields an empty list rather
  # than the literal pattern.
  shopt -s nullglob
  chunks=("${TEMP_DIR}/chunk_"*)
  shopt -u nullglob
  [[ "$DEBUG" == true ]] && echo "File '$file' split into ${#chunks[@]} chunks."

  for chunk in "${chunks[@]}"; do
    rand_str=$(gen_rand)
    tmp_file="${TEMP_PREFIX}${rand_str}_${counter}.tmp"
    [[ "$DEBUG" == true ]] && echo "Creating temp file: $tmp_file"

    # "|| [[ -n ... ]]" keeps a final line that has no trailing newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
      # Process line with subfy; sort and remove duplicates per line.
      # printf (not echo) so that a line such as "-n" is passed on verbatim.
      printf '%s\n' "$line" \
        | "$subfy_bin" -f /dev/stdin \
        | sort -u -T "$TEMP_DIR" -S 50% >> "$tmp_file"

      # Check file size and start a new file if needed.
      if [[ -f "$tmp_file" ]]; then
        size=$(stat -c%s "$tmp_file")
        if [[ "$size" -ge "$MAX_FILE_SIZE" ]]; then
          counter=$((counter + 1))
          rand_str=$(gen_rand)
          tmp_file="${TEMP_PREFIX}${rand_str}_${counter}.tmp"
          [[ "$DEBUG" == true ]] && echo "Max size reached. New temp file: $tmp_file"
        fi
      fi
    done < "$chunk"

    rm -f -- "$chunk"
    [[ "$DEBUG" == true ]] && echo "Removed chunk: $chunk"
  done
  return 0
}
# --- Merge Output --------------------------------------------------------
# Merge every temporary result file into one sorted, duplicate-free file.
# Globals:   TEMP_PREFIX, DEBUG (read); TEMP_DIR (read, used as sort's
#            scratch directory, falling back to /tmp when unset)
# Arguments: $1 - path of the merged file to create
# Outputs:   a debug message on stdout when DEBUG is true
# Returns:   0 on success, 1 if sort failed
merge_temp_files() {
  local output_file="$1"
  local -a parts

  # nullglob: when nothing was produced the list is empty instead of holding
  # the literal pattern, which sort would report as a missing file.
  shopt -s nullglob
  parts=("${TEMP_PREFIX}"*.tmp)
  shopt -u nullglob

  if [[ ${#parts[@]} -eq 0 ]]; then
    # No results in this session: still hand the caller an (empty) file.
    : > "$output_file"
  else
    sort -T "${TEMP_DIR:-/tmp}" -S 50% -u -- "${parts[@]}" > "$output_file" \
      || return 1
  fi

  if [[ "$DEBUG" == true ]]; then
    echo "Merged temporary files into: $output_file"
  fi
  return 0
}
# --- Merge With Existing Output ------------------------------------------
# Fold the results of this session into the persistent output file.
# Globals:   OUTPUT, DEBUG (read); TEMP_DIR (read, used as sort's scratch
#            directory, falling back to /tmp when unset)
# Arguments: $1 - file holding the sorted results of this session
# Outputs:   a debug message with the file size(s) in bytes on stdout when
#            DEBUG is true; an error on stderr if the merge fails
# Returns:   0 on success, 1 on failure (an existing OUTPUT is left as it was)
merge_with_existing() {
  local new_output="$1"
  local before_bytes after_bytes

  if [[ -f "$OUTPUT" ]]; then
    before_bytes=$(($(wc -c < "$OUTPUT")))
    # Build the merged file next to OUTPUT and replace OUTPUT only after sort
    # succeeded, so a failed merge cannot truncate the existing data.
    if ! sort -T "${TEMP_DIR:-/tmp}" -S 50% -u -- "$OUTPUT" "$new_output" \
      > "${OUTPUT}.tmp"; then
      rm -f -- "${OUTPUT}.tmp"
      echo "Failed to merge new results into '$OUTPUT'; file left unchanged." >&2
      return 1
    fi
    mv -- "${OUTPUT}.tmp" "$OUTPUT" || return 1
    after_bytes=$(($(wc -c < "$OUTPUT")))
    if [[ "$DEBUG" == true ]]; then
      echo "Output updated: $before_bytes -> $after_bytes bytes."
    fi
  else
    mv -- "$new_output" "$OUTPUT" || return 1
    # Report bytes (wc -c), matching the unit named in the message.
    after_bytes=$(($(wc -c < "$OUTPUT")))
    if [[ "$DEBUG" == true ]]; then
      echo "Created output file '$OUTPUT' with $after_bytes bytes."
    fi
  fi
  return 0
}
# --- Main ----------------------------------------------------------------
# Entry point: parse the options, process every new input file, then merge
# the session's results into OUTPUT (or print them when OUTPUT is empty).
# Globals:   INPUT_FILES, NO_HASH, DEBUG, OUTPUT, HASH_LOG, TEMP_DIR,
#            TEMP_PREFIX (read); HASH_LOG file (appended to)
# Arguments: the script's command-line arguments
# Outputs:   the merged results on stdout when OUTPUT is empty; debug
#            messages on stdout when DEBUG is true; warnings on stderr
# Returns:   0 once the run has completed
main() {
  parse_args "$@"
  setup_temp_dir

  local file file_hash
  for file in "${INPUT_FILES[@]}"; do
    # Skip inputs that cannot be read, so they are neither "processed" nor
    # written to the hash log with an empty hash.
    if [[ ! -r "$file" || -d "$file" ]]; then
      echo "Skipping unreadable input file: $file" >&2
      continue
    fi

    # An empty file_hash means "do not record this file in the hash log".
    file_hash=""
    if [[ "$NO_HASH" == false ]]; then
      # Hash once, reading through stdin so that the file name can never
      # alter sha256sum's output format.
      file_hash=$(sha256sum < "$file" | awk '{print $1}')
      if [[ -n "$file_hash" ]] && file_processed "$file_hash"; then
        [[ "$DEBUG" == true ]] && echo "Skipping already processed file: $file"
        continue
      fi
    fi

    [[ "$DEBUG" == true ]] && echo "Processing file: $file"
    process_file "$file"

    if [[ -n "$file_hash" ]]; then
      echo "$file_hash $file" >> "$HASH_LOG"
    fi
  done

  # Merge all temporary results into a final output file.
  local session_output="${TEMP_DIR}/session_output.txt"
  merge_temp_files "$session_output"
  if [[ -n "$OUTPUT" ]]; then
    merge_with_existing "$session_output"
    [[ "$DEBUG" == true ]] && echo "Processing complete. Output saved to '$OUTPUT'."
  else
    cat "$session_output"
  fi

  # Optional debug report.
  if [[ "$DEBUG" == true ]]; then
    local tmp_count total_size=""
    local -a tmp_files
    shopt -s nullglob
    tmp_files=("${TEMP_PREFIX}"*.tmp)
    shopt -u nullglob
    tmp_count=${#tmp_files[@]}
    if [[ "$tmp_count" -gt 0 ]]; then
      # The last line printed by "du -c" is the grand total.
      total_size=$(du -ch -- "${tmp_files[@]}" 2>/dev/null | tail -n 1 | awk '{print $1}')
    fi
    echo "Temporary files removed: $tmp_count, totaling ~$total_size."
  fi

  # Explicit success: otherwise a false "[[ ... ]] && echo" debug test would
  # become the exit status of the whole script.
  return 0
}
# --- Execute -------------------------------------------------------------
# Run the entry point, forwarding every command-line argument unchanged.
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment