Skip to content

Instantly share code, notes, and snippets.

@reallymemorable
Created December 26, 2022 22:37
Show Gist options
  • Save reallymemorable/18b4788f84841a3e990e841fa70e60bc to your computer and use it in GitHub Desktop.
Save reallymemorable/18b4788f84841a3e990e841fa70e60bc to your computer and use it in GitHub Desktop.

Revisions

  1. reallymemorable created this gist Dec 26, 2022.
    71 changes: 71 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,71 @@
    from PIL import Image
    import pytesseract
    import sys


    # Known OCR misreads (upper-case, as they appear before title-casing) and
    # the text they should be replaced with. Applied in this order.
    _OCR_CORRECTIONS = (
        ("HUHHUS", "HUMMUS"),
        ("EDAMAHE", "EDAMAME"),
        ("BAHAHAS", "BANANAS"),
    )

    # A cleaned (title-cased) line containing any of these substrings is dropped.
    # The match is case-sensitive, so "Total" does not match inside "Subtotal",
    # which is why both are listed.
    _EXCLUDED_KEYWORDS = ("Tax", "Total", "Subtotal", "Visa", "Manager", "Walmart")


    def _clean_line(raw_line):
        """Normalize one line of OCR text; return None if nothing usable remains."""
        # Keep only non-digit alphanumerics and plain spaces. Every other
        # character (punctuation, tabs, form feeds, ...) is removed outright,
        # not replaced by a space.
        kept = ''.join(
            ch for ch in raw_line
            if (ch.isalnum() or ch == ' ') and not ch.isdigit()
        )

        # Drop single-character words; joining the rest also collapses runs of
        # spaces and removes leading/trailing whitespace.
        text = ' '.join(word for word in kept.split() if len(word) > 1)

        for misread, corrected in _OCR_CORRECTIONS:
            text = text.replace(misread, corrected)

        if not text:
            return None

        text = text.title()

        # A line that is only two characters long is discarded as noise.
        if len(text) == 2:
            return None
        return text


    def _is_excluded(line):
        """Return True if the cleaned line contains any excluded keyword."""
        return any(keyword in line for keyword in _EXCLUDED_KEYWORDS)


    def process(image_file):
        """Run Tesseract OCR on an image and return the cleaned text lines.

        Each OCR line has digits and non-alphanumeric characters removed,
        single-character words dropped, a few known misreads corrected, and is
        then title-cased. Empty lines, two-character lines, and lines containing
        any of the excluded keywords ("Tax", "Total", "Subtotal", "Visa",
        "Manager", "Walmart") are omitted from the result.

        Args:
            image_file: The image to OCR; passed unchanged to
                ``pytesseract.image_to_string``.

        Returns:
            A list of cleaned, title-cased strings, or ``None`` if Tesseract
            raised ``RuntimeError`` (which is how pytesseract reports that the
            2-second timeout expired).
        """
        # If you don't have the tesseract executable in your PATH, point this at
        # it instead, e.g. r'/opt/homebrew/bin/tesseract' or
        # r'C:\Program Files (x86)\Tesseract-OCR\tesseract'.
        pytesseract.pytesseract.tesseract_cmd = 'tesseract'

        # Only the OCR call is guarded, so an unrelated RuntimeError raised by
        # the cleanup below can never be mistaken for a timeout.
        try:
            ocr_output = pytesseract.image_to_string(image_file, timeout=2)
        except RuntimeError:
            # Tesseract was terminated after the timeout; keep the original
            # best-effort contract of yielding no result rather than raising.
            return None

        cleaned_lines = (_clean_line(line) for line in ocr_output.split('\n'))

        # Building a new list (instead of overwriting entries with None and
        # filtering afterwards) removes the need for a second filter pass.
        return [
            line for line in cleaned_lines
            if line is not None and not _is_excluded(line)
        ]

    if __name__ == '__main__':
        # Command-line dispatch: the first argument names a module-level
        # callable and the second is passed to it as its only argument,
        # e.g. `python script.py process receipt.png`.
        # The callable's return value is discarded, as before.
        if len(sys.argv) < 3:
            sys.exit("usage: {} <function> <argument>".format(sys.argv[0]))

        target = globals().get(sys.argv[1])
        if not callable(target):
            # Covers both unknown names and globals that are not functions
            # (e.g. imported modules), which previously surfaced as a raw
            # KeyError/TypeError traceback.
            sys.exit("unknown function: {}".format(sys.argv[1]))

        target(sys.argv[2])