reallymemorable · December 26, 2022 22:37
diff --git a/gistfile1.txt b/gistfile1.txt
 from PIL import Image
 import pytesseract
 import sys


 # Known OCR misreads and their corrections. They are matched case-sensitively
 # against the text *before* it is title-cased.
 _OCR_CORRECTIONS = (
    ("HUHHUS", "HUMMUS"),
    ("EDAMAHE", "EDAMAME"),
    ("BAHAHAS", "BANANAS"),
 )

 # A cleaned (title-cased) line containing any of these substrings is dropped.
 _EXCLUDED_KEYWORDS = ("Tax", "Total", "Subtotal", "Visa", "Manager", "Walmart")


 def _clean_ocr_line(raw_line):
    """Normalize one line of OCR text; return None if the line should be dropped."""
    # Keep only non-digit alphanumeric characters and plain spaces. Every
    # other character (punctuation, tabs, form feeds, ...) is removed.
    kept = ''.join(
        ch for ch in raw_line
        if not ch.isdigit() and (ch.isalnum() or ch == ' ')
    )
    # Drop one-character words. split()/join also trims the ends and
    # collapses runs of spaces, so no separate strip() is needed.
    text = ' '.join(word for word in kept.split() if len(word) > 1)

    for misread, correction in _OCR_CORRECTIONS:
        text = text.replace(misread, correction)

    if not text:
        return None

    text = text.title()
    # Lines that are exactly two characters long are discarded.
    if len(text) == 2:
        return None
    if any(keyword in text for keyword in _EXCLUDED_KEYWORDS):
        return None
    return text


 def process(image_file):
    """Run Tesseract OCR on an image and return its cleaned-up text lines.

    Args:
        image_file: Passed unchanged to ``pytesseract.image_to_string``.
            When run from the command line this is the string given as the
            second argument (presumably an image path).

    Returns:
        A list of cleaned lines, in their original order: digits and
        non-alphanumeric characters removed, one-character words removed,
        known misreads corrected, title-cased, and with empty, two-character
        and keyword-matching lines dropped. Returns ``None`` if the OCR call
        raises ``RuntimeError``, which is how the original code detected the
        2-second timeout being hit.
    """
    # If you don't have tesseract executable in your PATH, point this at it:
    # pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'
    # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'

    # Only the OCR call sits inside the try, so a RuntimeError can only come
    # from Tesseract and not from the text clean-up below.
    try:
        ocr_output = pytesseract.image_to_string(image_file, timeout=2)  # Timeout after 2 seconds
    except RuntimeError:
        # Tesseract processing was terminated; keep the historical
        # behaviour of returning None, but do so explicitly.
        return None

    # Filtering happens in one pass, so no second "remove the Nones" sweep
    # is needed after the keyword check.
    cleaned_lines = (_clean_ocr_line(line) for line in ocr_output.split('\n'))
    return [line for line in cleaned_lines if line is not None]

 if __name__ == '__main__':
    # Command-line entry point: <script> <function_name> <argument>
    # e.g. <script> process receipt.png
    # The named module-level function is looked up in globals() and called
    # with the single argument. Extra arguments are ignored.
    if len(sys.argv) < 3:
        # Previously this surfaced as a bare IndexError traceback.
        sys.exit('usage: {} <function_name> <argument>'.format(sys.argv[0]))

    entry_point = globals().get(sys.argv[1])
    if not callable(entry_point):
        # Previously a KeyError (unknown name) or TypeError (not callable).
        sys.exit('unknown function: {}'.format(sys.argv[1]))

    # NOTE: the return value is discarded here, as it always was.
    entry_point(sys.argv[2])
	from PIL import Image
	import pytesseract
	import sys


	# Known OCR misreads and their corrections. They are matched case-sensitively
	# against the text *before* it is title-cased.
	_OCR_CORRECTIONS = (
	    ("HUHHUS", "HUMMUS"),
	    ("EDAMAHE", "EDAMAME"),
	    ("BAHAHAS", "BANANAS"),
	)

	# A cleaned (title-cased) line containing any of these substrings is dropped.
	_EXCLUDED_KEYWORDS = ("Tax", "Total", "Subtotal", "Visa", "Manager", "Walmart")


	def _clean_ocr_line(raw_line):
	    """Normalize one line of OCR text; return None if the line should be dropped."""
	    # Keep only non-digit alphanumeric characters and plain spaces. Every
	    # other character (punctuation, tabs, form feeds, ...) is removed.
	    kept = ''.join(
	        ch for ch in raw_line
	        if not ch.isdigit() and (ch.isalnum() or ch == ' ')
	    )
	    # Drop one-character words. split()/join also trims the ends and
	    # collapses runs of spaces, so no separate strip() is needed.
	    text = ' '.join(word for word in kept.split() if len(word) > 1)

	    for misread, correction in _OCR_CORRECTIONS:
	        text = text.replace(misread, correction)

	    if not text:
	        return None

	    text = text.title()
	    # Lines that are exactly two characters long are discarded.
	    if len(text) == 2:
	        return None
	    if any(keyword in text for keyword in _EXCLUDED_KEYWORDS):
	        return None
	    return text


	def process(image_file):
	    """Run Tesseract OCR on an image and return its cleaned-up text lines.

	    Args:
	        image_file: Passed unchanged to ``pytesseract.image_to_string``.
	            When run from the command line this is the string given as the
	            second argument (presumably an image path).

	    Returns:
	        A list of cleaned lines, in their original order: digits and
	        non-alphanumeric characters removed, one-character words removed,
	        known misreads corrected, title-cased, and with empty, two-character
	        and keyword-matching lines dropped. Returns ``None`` if the OCR call
	        raises ``RuntimeError``, which is how the original code detected the
	        2-second timeout being hit.
	    """
	    # If you don't have tesseract executable in your PATH, point this at it:
	    # pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'
	    # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
	    pytesseract.pytesseract.tesseract_cmd = 'tesseract'

	    # Only the OCR call sits inside the try, so a RuntimeError can only come
	    # from Tesseract and not from the text clean-up below.
	    try:
	        ocr_output = pytesseract.image_to_string(image_file, timeout=2)  # Timeout after 2 seconds
	    except RuntimeError:
	        # Tesseract processing was terminated; keep the historical
	        # behaviour of returning None, but do so explicitly.
	        return None

	    # Filtering happens in one pass, so no second "remove the Nones" sweep
	    # is needed after the keyword check.
	    cleaned_lines = (_clean_ocr_line(line) for line in ocr_output.split('\n'))
	    return [line for line in cleaned_lines if line is not None]

	if __name__ == '__main__':
	    # Command-line entry point: <script> <function_name> <argument>
	    # e.g. <script> process receipt.png
	    # The named module-level function is looked up in globals() and called
	    # with the single argument. Extra arguments are ignored.
	    if len(sys.argv) < 3:
	        # Previously this surfaced as a bare IndexError traceback.
	        sys.exit('usage: {} <function_name> <argument>'.format(sys.argv[0]))

	    entry_point = globals().get(sys.argv[1])
	    if not callable(entry_point):
	        # Previously a KeyError (unknown name) or TypeError (not callable).
	        sys.exit('unknown function: {}'.format(sys.argv[1]))

	    # NOTE: the return value is discarded here, as it always was.
	    entry_point(sys.argv[2])