Skip to content

Instantly share code, notes, and snippets.

@gotev
Last active October 16, 2023 12:39
Show Gist options
  • Select an option

  • Save gotev/e936c65a903650f00e27d698b06d092c to your computer and use it in GitHub Desktop.

Select an option

Save gotev/e936c65a903650f00e27d698b06d092c to your computer and use it in GitHub Desktop.

Revisions

  1. gotev revised this gist Oct 16, 2023. 1 changed file with 1 addition and 3 deletions.
    4 changes: 1 addition & 3 deletions convert_pdf_to_text.py
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,5 @@
    # Requirements:
    # pip3 install pdf2image pytesseract Pillow
    # pip install pdf2image pytesseract Pillow
    #
    import os
    import glob
    @@ -26,14 +26,12 @@ def pdf_to_images(pdf_path, folder_name):
    if not os.path.exists(folder_name):
    os.mkdir(folder_name)

    saved_images = []
    total_images=len(images)
    current_image=1

    for image in images:
    print(f'Processing {current_image} of {total_images}')
    image_file_name = f'{folder_name}/{current_image}.png'
    saved_images.append(image_file_name)
    image.save(image_file_name)
    current_image += 1

  2. gotev created this gist Oct 16, 2023.
    69 changes: 69 additions & 0 deletions convert_pdf_to_text.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,69 @@
    # Requirements:
    # pip3 install pdf2image pytesseract Pillow
    #
    import os
    import glob
    import sys
    import pytesseract
    from pdf2image import convert_from_path
    from PIL import Image


    # At least two positional arguments must follow the script name; when
    # they are missing, show the usage line and stop with a failure status.
    arg_count = len(sys.argv)
    if arg_count < 3:
        print("Usage: python3 convert_pdf_to_text.py pdf_file output_folder")
        sys.exit(1)


    # First argument: the PDF to read. Second argument: the output folder.
    pdf_path, folder_name = sys.argv[1], sys.argv[2]


    def pdf_to_images(pdf_path, folder_name):
        """Render a PDF to PNG files named ``1.png``, ``2.png``, ... in a folder.

        Progress is reported on stdout as each image is written.

        Args:
            pdf_path: Path of the PDF file handed to ``convert_from_path``.
            folder_name: Directory that receives the PNG files. It is created
                (including any missing parent directories) when absent.

        Raises:
            OSError: If ``folder_name`` cannot be created, e.g. because the
                path already exists as a regular file.
        """
        # Convert PDF to images
        images = convert_from_path(pdf_path)

        # exist_ok=True replaces the racy "check, then mkdir" sequence, and
        # makedirs also accepts nested output paths that os.mkdir rejects.
        os.makedirs(folder_name, exist_ok=True)

        total_images = len(images)

        # Numbering starts at 1 so file names line up with the progress output.
        for current_image, image in enumerate(images, start=1):
            print(f'Processing {current_image} of {total_images}')
            image.save(os.path.join(folder_name, f'{current_image}.png'))


    def images_to_text(folder_path, tesseract_cmd=None):
        """OCR every ``*.png`` in a folder and write the text beside each image.

        For each ``<name>.png`` found directly inside ``folder_path`` the
        recognised text is written to ``<name>.png.txt`` (UTF-8 encoded).
        Progress is reported on stdout.

        Args:
            folder_path: Directory searched (non-recursively) for PNG files.
            tesseract_cmd: Optional path of the Tesseract executable. When
                omitted, ``/usr/local/bin/tesseract`` is used if that file
                exists; otherwise the first ``tesseract`` found on PATH.
        """
        import shutil  # local import: only needed to locate the executable

        if tesseract_cmd is None:
            # '/usr/local/bin/tesseract' is the path this script has always
            # used (the Homebrew location on macOS). Keep it when it exists so
            # working setups behave as before; otherwise look the binary up on
            # PATH instead of failing on installs that live elsewhere.
            tesseract_cmd = '/usr/local/bin/tesseract'
            if not os.path.isfile(tesseract_cmd):
                tesseract_cmd = shutil.which('tesseract') or tesseract_cmd
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        # glob returns matches in arbitrary order; sort for reproducible runs.
        png_files = sorted(glob.glob(os.path.join(folder_path, "*.png")))

        total_images = len(png_files)

        for current_image, png_file in enumerate(png_files, start=1):
            print(f'Processing {current_image} of {total_images}')

            # The context manager releases the image's file handle right after
            # OCR instead of leaving it open until garbage collection.
            with Image.open(png_file) as image:
                text = pytesseract.image_to_string(image)

            # OCR output can contain non-ASCII characters, so pin the encoding
            # rather than depend on the platform default.
            with open(f'{png_file}.txt', "w", encoding="utf-8") as file:
                file.write(text)


    # Run the two stages in order, announcing each one before it starts:
    # first render the PDF pages to PNG files, then OCR those files.
    for banner, stage, stage_args in (
        ("Converting PDF to images", pdf_to_images, (pdf_path, folder_name)),
        ("Extracting text from images", images_to_text, (folder_name,)),
    ):
        print(banner)
        stage(*stage_args)


    # Tell the user where the generated files were written.
    print(f'Text extraction completed. Check output in {folder_name}')