reallymemorable · December 26, 2022 22:37 · Dec 26, 2022
diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,71 @@
+from PIL import Image
+import pytesseract
+import sys
+
+
+def process(image_file):
+    # If you don't have tesseract executable in your PATH, include the following:
+    # pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'
+    pytesseract.pytesseract.tesseract_cmd = 'tesseract'
+    # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
+
+    # Simple image to string
+    # print(pytesseract.image_to_string(Image.open(image_file)))
+
+    # Timeout/terminate the tesseract job after a period of time
+    try:
+        ocr_output = pytesseract.image_to_string(image_file, timeout=2) # Timeout after 2 seconds
+        ocr_lines = ocr_output.split('\n')
+
+        for i in range(len(ocr_lines)):
+            # for each in ocr_lines, remove numbers from the strings
+            ocr_lines[i] = ''.join([j for j in ocr_lines[i] if not j.isdigit()])
+            # for each in ocr_lines, remove special characters from the strings
+            ocr_lines[i] = ''.join([j for j in ocr_lines[i] if j.isalnum() or j == ' '])
+
+            # remove any words with less than 2 characters
+            ocr_lines[i] = ' '.join([word for word in ocr_lines[i].split() if len(word) > 1])
+
+            # for each in ocr_lines, remove leading and trailing spaces from the strings
+            ocr_lines[i] = ocr_lines[i].strip()
+            # replace "HUHHUS" with "HUMMUS" if it occurs inside the string
+            if "HUHHUS" in ocr_lines[i]:
+                ocr_lines[i] = ocr_lines[i].replace("HUHHUS", "HUMMUS")
+            # replace "EDAMAHE" with "EDAMAME" if it occurs inside the string
+            if "EDAMAHE" in ocr_lines[i]:
+                ocr_lines[i] = ocr_lines[i].replace("EDAMAHE", "EDAMAME")
+            # replace "BAHAHAS" with "BANANAS" if it occurs inside the string
+            if "BAHAHAS" in ocr_lines[i]:
+                ocr_lines[i] = ocr_lines[i].replace("BAHAHAS", "BANANAS")
+            # for each in ocr_lines, remove empty strings
+            if ocr_lines[i] == '':
+                ocr_lines[i] = None
+            # if not None, apply normal capitalization to the strings
+            if ocr_lines[i] != None:
+                ocr_lines[i] = ocr_lines[i].title()
+                # remove any list items that only have two characters
+                if len(ocr_lines[i]) == 2:
+                    ocr_lines[i] = None
+
+
+        # remove empty strings from ocr_lines
+        ocr_lines = list(filter(None, ocr_lines))
+
+        # remove any list items that have the words "Tax", "Total", "Subtotal", or "Visa" in them
+        for i in range(len(ocr_lines)):
+            if "Tax" in ocr_lines[i] or "Total" in ocr_lines[i] or "Subtotal" in ocr_lines[i] or "Visa" in ocr_lines[i] or "Manager" in ocr_lines[i] or "Walmart" in ocr_lines[i]:
+                ocr_lines[i] = None
+
+        # TODO: not sure why but I need to run this again after the above function
+        # remove empty strings from ocr_lines 
+        ocr_lines = list(filter(None, ocr_lines))
+
+        return ocr_lines
+        # print(ocr_lines)
+        # print(pytesseract.image_to_string(image_file, timeout=0.5)) # Timeout after half a second
+    except RuntimeError as timeout_error:
+        # Tesseract processing is terminated
+        pass
+
+if __name__ == '__main__':  # CLI use: <script> NAME ARG, e.g. NAME = process
+    globals()[sys.argv[1]](sys.argv[2])  # calls global NAME(ARG); result is not printed