Skip to content

Instantly share code, notes, and snippets.

@reallymemorable
Created December 27, 2022 01:35
Show Gist options
  • Save reallymemorable/b832fba7e59d36d0afe015ec8725d163 to your computer and use it in GitHub Desktop.
Save reallymemorable/b832fba7e59d36d0afe015ec8725d163 to your computer and use it in GitHub Desktop.
from PIL import Image
import pytesseract
import sys
# Known OCR misreads and their corrections. They are applied to the cleaned
# text *before* title-casing, so the keys are matched exactly as written here.
_OCR_CORRECTIONS = (
    ("HUHHUS", "HUMMUS"),
    ("EDAMAHE", "EDAMAME"),
    ("BAHAHAS", "BANANAS"),
)

# A cleaned (title-cased) line containing any of these substrings is dropped.
# Matching is case-sensitive, which is why "Subtotal" is listed separately:
# "Total" does not occur inside "Subtotal".
_EXCLUDED_KEYWORDS = ("Tax", "Total", "Subtotal", "Visa", "Manager", "Walmart")


def _clean_ocr_line(raw_line):
    """Normalize one line of OCR text; return None when nothing usable is left."""
    # Keep spaces and alphanumeric characters that are not digits; every other
    # character (punctuation, tabs, control characters, ...) is removed.
    kept = ''.join(
        ch for ch in raw_line
        if ch == ' ' or (ch.isalnum() and not ch.isdigit())
    )
    # Drop single-character words; joining on ' ' also collapses repeated
    # spaces and trims leading/trailing ones.
    text = ' '.join(word for word in kept.split() if len(word) > 1)
    for misread, correction in _OCR_CORRECTIONS:
        text = text.replace(misread, correction)
    if not text:
        return None
    text = text.title()
    # A line that is only two characters long after title-casing is discarded.
    if len(text) == 2:
        return None
    return text


def process(image_file, timeout=2):
    """Run Tesseract OCR on an image and return the cleaned text lines.

    Each line of the OCR output has its digits and non-alphanumeric
    characters removed, single-character words dropped, known misreads
    corrected, and is then title-cased. Empty lines, two-character lines and
    lines containing any of ``_EXCLUDED_KEYWORDS`` are left out of the result.

    Args:
        image_file: The image to read. It is passed unchanged to
            ``pytesseract.image_to_string``; the command-line entry point of
            this file supplies a path string.
        timeout: Seconds to wait before the Tesseract job is terminated.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        A list of cleaned strings in their original order, or ``None`` when
        the OCR call raised ``RuntimeError`` (the Tesseract job was
        terminated, e.g. on timeout).
    """
    # If the tesseract executable is not in your PATH, point this at it, e.g.
    # r'/opt/homebrew/bin/tesseract' or
    # r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'

    # Only the OCR call itself is guarded; the text clean-up below is plain
    # string handling and is kept outside the try block.
    try:
        ocr_output = pytesseract.image_to_string(image_file, timeout=timeout)
    except RuntimeError:
        # Tesseract processing was terminated; callers receive None, exactly
        # as they did when this handler simply fell through.
        return None

    cleaned_lines = (_clean_ocr_line(line) for line in ocr_output.split('\n'))
    return [
        line for line in cleaned_lines
        if line is not None
        and not any(keyword in line for keyword in _EXCLUDED_KEYWORDS)
    ]
if __name__ == '__main__':
    # Command-line dispatch: python <script> <function_name> <argument>
    # Looks up <function_name> among this module's globals and calls it with
    # <argument> as its only parameter. Extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit('usage: {} <function_name> <argument>'.format(sys.argv[0]))
    _target = globals().get(sys.argv[1])
    if not callable(_target):
        sys.exit('unknown function: {!r}'.format(sys.argv[1]))
    # NOTE(review): the name comes straight from the command line, so any
    # callable global can be invoked; restrict this if the script is ever
    # driven by untrusted input.
    # NOTE(review): the return value is discarded, so nothing is printed;
    # confirm whether the result should be written to stdout.
    _target(sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment