Created
December 27, 2022 01:35
-
-
Save reallymemorable/b832fba7e59d36d0afe015ec8725d163 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from PIL import Image | |
| import pytesseract | |
| import sys | |
# Known OCR misreads and their corrections. They are applied before the line
# is title-cased, so only the upper-case spellings are matched.
_OCR_CORRECTIONS = (
    ("HUHHUS", "HUMMUS"),
    ("EDAMAHE", "EDAMAME"),
    ("BAHAHAS", "BANANAS"),
)

# A line containing any of these substrings (case-sensitive, checked after
# title-casing) is dropped from the result.
_EXCLUDED_SUBSTRINGS = (
    "Tax", "Total", "Subtotal", "Visa", "Manager", "Walmart",
)


def _clean_line(raw_line):
    """Return the normalized form of one OCR text line, or None to drop it."""
    # Keep spaces and alphanumeric characters that are not digits; everything
    # else (punctuation, tabs, digits, ...) is discarded.
    kept = ''.join(
        ch for ch in raw_line
        if ch == ' ' or (ch.isalnum() and not ch.isdigit())
    )
    # Drop one-character words. Re-joining on single spaces also collapses
    # runs of spaces and trims both ends, so no separate strip is needed.
    line = ' '.join(word for word in kept.split() if len(word) > 1)
    for misread, correction in _OCR_CORRECTIONS:
        line = line.replace(misread, correction)
    if not line:
        return None
    line = line.title()
    # A line that is exactly two characters long is treated as noise.
    if len(line) == 2:
        return None
    if any(marker in line for marker in _EXCLUDED_SUBSTRINGS):
        return None
    return line


def process(image_file):
    """Run tesseract OCR on an image and return the cleaned text lines.

    Args:
        image_file: The image to read; passed unchanged to
            ``pytesseract.image_to_string``.

    Returns:
        A list of cleaned, title-cased, non-empty lines in their original
        order, or ``None`` when the OCR call raises ``RuntimeError`` (the
        call is given a 2 second timeout).
    """
    # If you don't have the tesseract executable in your PATH, point this at
    # it instead, e.g. r'/opt/homebrew/bin/tesseract' or
    # r'C:\Program Files (x86)\Tesseract-OCR\tesseract'.
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'

    # Only the OCR call is guarded: the clean-up below does not raise
    # RuntimeError, so keeping it outside the try avoids masking other bugs.
    try:
        ocr_output = pytesseract.image_to_string(image_file, timeout=2)
    except RuntimeError:
        # Tesseract processing was terminated (e.g. the timeout elapsed).
        # Callers receive None, exactly as before, but now explicitly.
        return None

    cleaned = (_clean_line(raw_line) for raw_line in ocr_output.split('\n'))
    return [line for line in cleaned if line is not None]
if __name__ == '__main__':
    # Command-line dispatch: the first argument names a module-level callable
    # and the second argument is handed to it as its only parameter. The
    # callable's return value is not used here.
    _entry_point = globals()[sys.argv[1]]
    _entry_point(sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment