#!/usr/bin/env python3
"""Spell out a target text using fragments found, in order, in a reference text.

Usage: <script> TARGET_FILE REFERENCE_FILE

The target text is reduced to lowercase word characters.  The script then
repeatedly looks for the next MIN_MATCH..MAX_MATCH characters of the target
inside the (lowercased) reference, allowing non-letter "noise" between the
letters, prints where the fragment was found together with the containing
line (fragment highlighted with ANSI escapes), and continues after it.
"""
import re
import sys

# Every fragment must cover at least MIN_MATCH and at most MAX_MATCH
# characters of the target.
MIN_MATCH = 3
MAX_MATCH = 9

# Modifiers that can be added to regex letters
OPTIONAL_WITH_NOISE = '?[^a-z\n]*'
WITH_NOISE = '[^a-z\n]*'

# ANSI output modifiers
BOLD = '\033[1m'
PURPLE = '\033[95m'
UNDERLINE = '\033[4m'
END = '\033[0m'


def sanitize(s):
    """Return *s* lowercased with all non-word characters and underscores removed."""
    return re.sub(r'[\W_]+', '', s.lower())


def regex_modify(s, option):
    """Return *s* with *option* appended after every lowercase ASCII letter."""
    return re.sub('([a-z])', r'\1' + option, s)


def longest_match_regex(s):
    """Return a regex that matches any number of leading letters from s.

    Each character opens a nested optional group, so the regex matches the
    longest prefix of *s* that is present, with noise allowed after each
    character.
    """
    return "".join(f"({c}{WITH_NOISE}" for c in s) + ")?" * len(s)


def find_matches(target, reference):
    """Yield ``(start, end, highlighted_line)`` for successive fragments.

    *target* is expected to be sanitized already (see ``sanitize``).  ``start``
    and ``end`` are absolute indices into *reference* of the matched fragment;
    ``highlighted_line`` is the stripped line containing it, with exactly that
    fragment wrapped in ANSI highlighting.

    Searching resumes right after the previous fragment, but a new fragment
    must lie on a later line: ``^`` only matches at a real line start.  Within
    a line the greedy leading ``.*`` selects the right-most occurrence.
    """
    highlight = BOLD + PURPLE + UNDERLINE
    pos = 0
    while target and pos < len(reference):
        fragment = regex_modify(target[:MIN_MATCH], WITH_NOISE)
        fragment += longest_match_regex(target[MIN_MATCH:MAX_MATCH])
        # MULTILINE anchors (rather than literal newlines) let the first line
        # and an unterminated last line of the reference match as well.
        pattern = re.compile(f"^.*({fragment}).*$", re.IGNORECASE | re.MULTILINE)
        # Searching from `pos` keeps every span absolute, so no offset
        # bookkeeping (and no re-slicing of the reference) is needed.
        line = pattern.search(reference, pos)
        if line is None:
            break

        # Boundaries of the fragment and of the line that contains it
        mstart, mend = line.span(1)
        lstart, lend = line.span(0)
        highlighted = (
            reference[lstart:mstart]
            + highlight + reference[mstart:mend] + END
            + reference[mend:lend]
        ).strip()
        yield mstart, mend, highlighted

        # NOTE(review): only ASCII letters count as consumed target characters;
        # digits or non-ASCII letters kept by sanitize() are not counted.
        matched_chars = len(re.findall('[a-zA-Z]', line.group(1)))
        target = target[matched_chars:]
        pos = mend


def main(argv=None):
    """Run the script; return a process exit status.

    ``argv[1]`` is the file with the text to spell out (the original author
    used the lyrics of "WAP" by Cardi B); ``argv[2]`` is the reference file
    (United States House Resolution 2617, the omnibus spending bill for
    FY2023).
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else "script"
        print(f"usage: {prog} TARGET_FILE REFERENCE_FILE", file=sys.stderr)
        return 2

    with open(argv[1]) as target_file:
        target = sanitize(target_file.read())
    with open(argv[2]) as reference_file:
        reference = reference_file.read().lower()

    for start, end, highlighted in find_matches(target, reference):
        print(f"Omnibus spending bill, characters {start} to {end}")
        print("\t" + highlighted)
    return 0


if __name__ == "__main__":
    sys.exit(main())