gotev · October 16, 2023 12:39 · Oct 16, 2023 · Oct 16, 2023
diff --git a/convert_pdf_to_text.py b/convert_pdf_to_text.py
@@ -1,5 +1,5 @@
 # Requirements:
-# pip3 install pdf2image pytesseract Pillow
+# pip install pdf2image pytesseract Pillow
 #
 import os
 import glob
@@ -26,14 +26,12 @@ def pdf_to_images(pdf_path, folder_name):
     if not os.path.exists(folder_name):
         os.mkdir(folder_name)
 
-    saved_images = []
     total_images=len(images)
     current_image=1
 
     for image in images:
         print(f'Processing {current_image} of {total_images}')
         image_file_name = f'{folder_name}/{current_image}.png'
-        saved_images.append(image_file_name)
         image.save(image_file_name)
         current_image += 1
 

diff --git a/convert_pdf_to_text.py b/convert_pdf_to_text.py
@@ -0,0 +1,69 @@
+# Requirements:
+# pip3 install pdf2image pytesseract Pillow
+#
+import os
+import glob
+import sys
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
+
+
+# Both positional arguments (the PDF file and the output folder) are required.
+if len(sys.argv) < 3:
+    print("Usage: python3 convert_pdf_to_text.py pdf_file output_folder")
+    sys.exit(1)  # non-zero exit status reports the usage error
+
+# Only the first two arguments are read; anything after them is ignored.
+pdf_path, folder_name = sys.argv[1:3]
+
+
+def pdf_to_images(pdf_path, folder_name):
+    """Convert the PDF at pdf_path to images and save them as folder_name/1.png, 2.png, ..."""
+    images = convert_from_path(pdf_path)
+
+    # Create the output folder if it is missing (os.mkdir does not create parent folders)
+    if not os.path.exists(folder_name):
+        os.mkdir(folder_name)
+
+    saved_images = []
+    total_images=len(images)
+    current_image=1
+
+    for image in images:
+        print(f'Processing {current_image} of {total_images}')
+        image_file_name = f'{folder_name}/{current_image}.png'
+        saved_images.append(image_file_name)
+        image.save(image_file_name)
+        current_image += 1
+
+
+def images_to_text(folder_path):
+    """OCR every *.png in folder_path and write the text to '<image path>.txt'."""
+    import shutil  # function-scope import; nothing else in the file uses it
+
+    # Use the tesseract binary found on PATH when there is one; otherwise keep
+    # the Homebrew path '/usr/local/bin/tesseract' that was hard-coded before.
+    tesseract = shutil.which('tesseract') or '/usr/local/bin/tesseract'
+    pytesseract.pytesseract.tesseract_cmd = tesseract
+
+    # glob returns files in arbitrary order; sort for a repeatable run order
+    png_files = sorted(glob.glob(os.path.join(folder_path, "*.png")))
+    total_images = len(png_files)
+
+    for current_image, png_file in enumerate(png_files, start=1):
+        print(f'Processing {current_image} of {total_images}')
+        with Image.open(png_file) as image:  # close the image file after OCR
+            text = pytesseract.image_to_string(image)
+        # Explicit UTF-8: OCR output may hold characters the locale encoding lacks
+        with open(f'{png_file}.txt', "w", encoding="utf-8") as file:
+            file.write(text)
+
+
+print("Converting PDF to images")
+pdf_to_images(pdf_path, folder_name)  # stage 1: save the PDF as PNG files in folder_name
+print("Extracting text from images")
+images_to_text(folder_name)  # stage 2: OCR the PNG files found in folder_name
+
+
+print(f'Text extraction completed. Check output in {folder_name}')  # final status line
No results found