Created
December 26, 2022 22:37
-
-
Save reallymemorable/18b4788f84841a3e990e841fa70e60bc to your computer and use it in GitHub Desktop.
Revisions
-
reallymemorable created this gist
Dec 26, 2022. There are no files selected for viewing
# Notice carried over from the page this script was copied from:
#   "This file contains hidden or bidirectional Unicode text that may be
#   interpreted or compiled differently than what appears below. To review,
#   open the file in an editor that reveals hidden Unicode characters."
#   (diff hunk header on that page: @@ -0,0 +1,71 @@)
"""Extract a cleaned list of text lines from a receipt image using Tesseract OCR.

Command-line use: ``python <script> process <image_file>``.
"""
import sys

import pytesseract
from PIL import Image  # not referenced by the live code; kept from the original file

# Tesseract misreads seen in the OCR output (an "M" or "N" read as "H"),
# mapped to the intended word. Matching is case-sensitive and is applied
# before title-casing.
_OCR_CORRECTIONS = {
    "HUHHUS": "HUMMUS",
    "EDAMAHE": "EDAMAME",
    "BAHAHAS": "BANANAS",
}

# Title-cased substrings that mark a line to be dropped from the result.
_EXCLUDED_KEYWORDS = ("Tax", "Total", "Subtotal", "Visa", "Manager", "Walmart")


def _clean_line(raw_line):
    """Normalise one OCR line; return ``None`` when nothing useful remains."""
    # Keep spaces and alphanumerics, but drop digits and every other symbol.
    kept = "".join(
        ch for ch in raw_line
        if ch == " " or (ch.isalnum() and not ch.isdigit())
    )
    # Drop one-character words; joining on single spaces also trims the ends.
    text = " ".join(word for word in kept.split() if len(word) > 1)
    for misread, intended in _OCR_CORRECTIONS.items():
        text = text.replace(misread, intended)
    if not text:
        return None
    text = text.title()
    # A line that is only two characters long is discarded.
    if len(text) == 2:
        return None
    return text


def _clean_lines(raw_lines):
    """Clean every raw OCR line and drop empty or excluded-keyword lines."""
    cleaned = (_clean_line(raw_line) for raw_line in raw_lines)
    return [
        line for line in cleaned
        if line and not any(keyword in line for keyword in _EXCLUDED_KEYWORDS)
    ]


def process(image_file, *, timeout=2):
    """Run Tesseract on *image_file* and return its cleaned text lines.

    Args:
        image_file: Value handed straight to ``pytesseract.image_to_string``
            (the command-line entry point passes a file path string).
        timeout: Seconds after which the Tesseract job is terminated.

    Returns:
        A list of title-cased lines with digits, punctuation, one-character
        words, two-character lines and lines containing an excluded keyword
        removed; or ``None`` when the OCR call raises ``RuntimeError``.
    """
    # If the tesseract executable is not on your PATH, point this at it, e.g.
    # r'/opt/homebrew/bin/tesseract' or
    # r'C:\Program Files (x86)\Tesseract-OCR\tesseract'.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'

    # Only the OCR call is guarded: it is the step that is terminated on timeout.
    try:
        ocr_output = pytesseract.image_to_string(image_file, timeout=timeout)
    except RuntimeError as ocr_error:
        # Tesseract processing was terminated. Report it instead of failing
        # silently, and keep returning None as before.
        print(f"OCR failed for {image_file!r}: {ocr_error}", file=sys.stderr)
        return None

    return _clean_lines(ocr_output.split('\n'))


def _main(argv):
    """Dispatch ``<script> <function_name> <argument>`` to a module-level function."""
    if len(argv) < 3:
        sys.exit(f"usage: {argv[0]} <function_name> <image_file>")
    function_name, argument = argv[1], argv[2]
    handler = globals().get(function_name)
    # Only public callables are dispatchable; private helpers stay internal.
    if function_name.startswith("_") or not callable(handler):
        sys.exit(f"unknown function: {function_name!r}")
    # NOTE(review): the return value is discarded, exactly as in the original
    # dispatch. Confirm whether the result should be printed.
    handler(argument)


if __name__ == '__main__':
    _main(sys.argv)