Created
          December 26, 2022 22:37 
        
      - 
      
- 
        Save reallymemorable/18b4788f84841a3e990e841fa70e60bc to your computer and use it in GitHub Desktop. 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | from PIL import Image | |
| import pytesseract | |
| import sys | |
# OCR misreads mapped to the intended word. Applied to the text before it is
# title-cased, so the keys are matched in upper case only.
_OCR_CORRECTIONS = (
    ("HUHHUS", "HUMMUS"),
    ("EDAMAHE", "EDAMAME"),
    ("BAHAHAS", "BANANAS"),
)

# Substrings (matched case-sensitively against the title-cased line) that
# cause a line to be dropped from the result.
_EXCLUDED_MARKERS = ("Tax", "Total", "Subtotal", "Visa", "Manager", "Walmart")


def _clean_line(raw_line):
    """Normalize one raw OCR line; return '' when nothing usable remains."""
    # Drop digits and every character that is neither alphanumeric nor a space.
    letters_only = ''.join(
        ch for ch in raw_line
        if not ch.isdigit() and (ch.isalnum() or ch == ' ')
    )
    # Drop one-character words; joining on a single space also collapses
    # repeated spaces and removes leading/trailing whitespace.
    text = ' '.join(word for word in letters_only.split() if len(word) > 1)
    for misread, intended in _OCR_CORRECTIONS:
        text = text.replace(misread, intended)
    if not text:
        return ''
    titled = text.title()
    # A line that is only two characters long is discarded.
    if len(titled) == 2:
        return ''
    return titled


def process(image_file):
    """Run OCR on an image and return the cleaned-up text lines.

    Each OCR line is stripped of digits and special characters, has its
    one-character words removed, gets the known misreads corrected and is
    title-cased. Empty lines, two-character lines and lines containing any
    of "Tax", "Total", "Subtotal", "Visa", "Manager" or "Walmart" are
    dropped.

    Args:
        image_file: Passed unchanged to ``pytesseract.image_to_string``.

    Returns:
        A list of cleaned, title-cased strings, or ``None`` when the OCR
        call raises ``RuntimeError`` (the job was terminated, e.g. by the
        2 second timeout).
    """
    # If the tesseract executable is not on your PATH, point this at it, e.g.
    #   r'/opt/homebrew/bin/tesseract'
    #   r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
    pytesseract.pytesseract.tesseract_cmd = 'tesseract'

    # Only the OCR call is guarded so that a failure in the clean-up code is
    # never mistaken for a terminated tesseract job.
    try:
        ocr_output = pytesseract.image_to_string(image_file, timeout=2)
    except RuntimeError:
        # Tesseract processing was terminated; callers receive None.
        # NOTE(review): other tesseract failures may also surface as
        # RuntimeError and would be silenced here too -- confirm.
        return None

    cleaned_lines = (_clean_line(raw) for raw in ocr_output.split('\n'))
    return [
        line for line in cleaned_lines
        if line and not any(marker in line for marker in _EXCLUDED_MARKERS)
    ]
if __name__ == '__main__':
    # Command-line dispatch: argv[1] names a module-level callable and
    # argv[2] is passed to it as its only argument, e.g.
    #     python <script> process receipt.png
    if len(sys.argv) < 3:
        sys.exit('usage: {} <function> <argument>'.format(sys.argv[0]))
    handler = globals().get(sys.argv[1])
    if not callable(handler):
        sys.exit('unknown function: {!r}'.format(sys.argv[1]))
    # NOTE(review): the return value is discarded, so nothing is printed
    # when this is run from the command line -- confirm that is intended.
    handler(sys.argv[2])
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment