#!/usr/bin/env python3
import sys, re

# Each search requires the next MIN_MATCH target letters and opportunistically
# extends the match up to MAX_MATCH letters.
MIN_MATCH = 3
MAX_MATCH = 9

# The lyrics of "Wet Ass Pussy" by Cardi B
# (read inside a context manager so the file handle is closed promptly)
with open(sys.argv[1]) as target_file:
    target = target_file.read()

# United States House Resolution 2617, Omnibus spending bill for FY2023
# Lowercased on load; the target is lowercased as well before matching.
with open(sys.argv[2]) as reference_file:
    reference = reference_file.read().lower()

def sanitize(s: str) -> str:
    """Return *s* lowercased with all non-word characters and underscores removed.

    Word characters other than the underscore (letters and digits) are kept,
    so ``"Hello, World_1!"`` becomes ``"helloworld1"``.
    """
    # Raw string: the backslash escape belongs to the regex, not to Python.
    # The non-raw spelling is an invalid string escape and triggers a
    # SyntaxWarning on recent Python versions.
    return re.sub(r'[\W_]+', '', s.lower())

# Sanitize the input by converting to lowercase and removing all non-letters.
# Only a-z may remain: the matching loop builds its patterns from, and counts
# its progress in, ASCII letters, so digits or accented characters left in the
# target would be miscounted or stall the search.
target = re.sub(r'[^a-z]+', '', target.lower())

# Regex fragments that can be appended to each letter of a pattern
# (see regex_modify).
# WITH_NOISE: after the letter, tolerate any run of characters that are
# neither lowercase ASCII letters nor newlines (spaces, digits, punctuation).
WITH_NOISE = '[^a-z\n]*'
# OPTIONAL_WITH_NOISE: same, but the letter itself becomes optional.
OPTIONAL_WITH_NOISE = '?' + WITH_NOISE
def regex_modify(s: str, option: str) -> str:
    """Return *s* with *option* inserted after every lowercase ASCII letter.

    Args:
        s: Text whose letters form the skeleton of a regex.
        option: Regex fragment appended verbatim after each ``a``-``z``
            character of *s*; other characters are left untouched.

    Returns:
        The resulting pattern string.
    """
    # Use a function replacement so *option* is appended literally. Splicing
    # it into a replacement template would make re.sub interpret any
    # backslashes in it as template escapes (and reject unknown ones).
    return re.sub('[a-z]', lambda letter: letter.group(0) + option, s)

# ANSI terminal escape sequences used to style the printed output
_CSI = '\x1b['  # ESC followed by '[' starts every sequence below
BOLD = _CSI + '1m'
PURPLE = _CSI + '95m'
UNDERLINE = _CSI + '4m'
END = _CSI + '0m'  # resets all styling

# Base offset added to match positions when they are reported; it is advanced
# by the matching loop below as the reference text is consumed.
offset = 0

def longest_match_regex(s):
    """Build a regex of nested optional groups matching a leading run of s.

    Every character of s opens a capturing group that matches the character
    followed by any amount of noise (anything except lowercase ASCII letters
    and newlines). The groups are nested and each one is optional, so the
    pattern matches as long a prefix of s as is available, including none.
    """
    opening = "".join(f"({letter}[^a-z\n]*" for letter in s)
    closing = ")?" * len(s)
    return opening + closing

# Consume the target greedily: each pass finds a reference line containing the
# next run of target letters (in order, possibly separated by non-letter
# noise), reports it, and continues with what follows the match in both texts.
while reference and target:
    # The first MIN_MATCH letters are mandatory; the following letters, up to
    # MAX_MATCH in total, are matched only as far as the reference allows.
    searchstr = regex_modify(target[:MIN_MATCH], WITH_NOISE)
    searchstr += longest_match_regex(target[MIN_MATCH:MAX_MATCH])
    # Group 0 is the whole surrounding line (for display), group 1 the matched
    # run itself. NOTE: because the pattern starts with a newline, text before
    # the first newline of the remaining reference is never searched.
    line = re.search(f"\n.*({searchstr}).*\n", reference, re.IGNORECASE)
    if line is None:
        break
    # Find boundaries of word match within the line
    mstart, mend = line.span(1)
    # Highlight exactly the span that was consumed. Re-running the pattern
    # over the line could style a different occurrence than the one matched.
    line_text = line.group(0)
    line_start = line.start()
    bolded_line = (
        line_text[:mstart - line_start]
        + BOLD + PURPLE + UNDERLINE + line.group(1) + END
        + line_text[mend - line_start:]
    ).strip()
    # Only letters count as progress through the target; noise is skipped.
    matched_chars = len(re.sub('[^a-zA-Z]', '', line.group(1)))
    print(f"Omnibus spending bill, characters {offset + mstart} to {offset + mend}")
    print("\t" + bolded_line)
    target = target[matched_chars:]
    reference = reference[mend:]
    # The reference was trimmed by mend characters, so the base offset must
    # advance by the same amount to keep reported positions absolute.
    offset += mend