Skip to content

Instantly share code, notes, and snippets.

@martenc
Created March 20, 2025 14:28
Show Gist options
  • Save martenc/5753112074c56e5989250c4d6de716af to your computer and use it in GitHub Desktop.
Save martenc/5753112074c56e5989250c4d6de716af to your computer and use it in GitHub Desktop.

Revisions

  1. martenc created this gist Mar 20, 2025.
    31 changes: 31 additions & 0 deletions extract-from-paper
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,31 @@
    from pathlib import Path
    from langchain.docstore.document import Document

    from unstructured.partition.pdf import partition_pdf
    from unstructured.documents.elements import Text, Image, Table, CompositeElement

    class ExtractionPipeline:
        """Convert the PDFs found in one folder into LangChain ``Document`` lists.

        Each PDF is partitioned with ``partition_pdf`` using ``by_title``
        chunking and table-structure inference. The resulting elements are
        sorted into two groups: plain text chunks and tables (as HTML).
        """

        def __init__(self, folder_path):
            """Remember *folder_path* and list the PDFs it contains right away.

            Args:
                folder_path: Directory that is searched (non-recursively) for
                    ``*.pdf`` files.
            """
            self.folder_path = folder_path
            self.pdf_filenames = self.get_pdf_filenames_from_folder()

        def get_pdf_filenames_from_folder(self):
            """Return the ``*.pdf`` paths directly inside ``self.folder_path``.

            Returns:
                A list of ``pathlib.Path`` objects, sorted so the processing
                order does not depend on the file system's listing order.
            """
            return sorted(Path(self.folder_path).glob("*.pdf"))

        def load_and_split_documents(self, folder_path=None):
            """Partition every discovered PDF and collect text and table Documents.

            Args:
                folder_path: Unused. The PDFs listed at construction time
                    (``self.pdf_filenames``) are always the ones processed; the
                    parameter is kept, now optional, so existing callers that
                    pass it keep working.

            Returns:
                A ``(text_documents, table_documents)`` tuple of lists covering
                *all* PDFs in the folder. Both lists are empty when the folder
                holds no PDFs.
            """
            text_documents = []
            table_documents = []

            for pdf_filename in self.pdf_filenames:
                print("processing document: ", pdf_filename)
                raw_pdf_elements = partition_pdf(
                    filename=str(pdf_filename),
                    chunking_strategy="by_title",
                    max_characters=2000,
                    new_after_n_chars=1800,
                    combine_text_under_n_chars=1000,
                    infer_table_structure=True,
                )

                # The original condition `type(e) == Text or CompositeElement`
                # was always truthy, so tables leaked into the text list.
                # NOTE(review): exact-type matching is kept on purpose; Table
                # is believed to derive from Text in unstructured, so an
                # isinstance() check would likely re-admit tables -- confirm
                # against the installed version.
                pdf_text_documents = [
                    Document(
                        page_content=element.text.strip(),
                        metadata={
                            "filename": element.metadata.filename,
                            "source_type": "text",
                        },
                    )
                    for element in raw_pdf_elements
                    if type(element) in (Text, CompositeElement)
                ]
                print("Number of Detected Text elements: ", len(pdf_text_documents))

                # Tables are stored through their HTML rendering.
                pdf_table_documents = [
                    Document(
                        page_content=element.metadata.text_as_html,
                        metadata={
                            "filename": element.metadata.filename,
                            "source_type": "table_html",
                        },
                    )
                    for element in raw_pdf_elements
                    if type(element) is Table
                ]
                print("Number of Detected HTML Tables: ", len(pdf_table_documents))

                # Accumulate instead of overwriting so no PDF's output is lost.
                text_documents.extend(pdf_text_documents)
                table_documents.extend(pdf_table_documents)

            return text_documents, table_documents
    24 changes: 24 additions & 0 deletions hi-res.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,24 @@
    class VisualExtractionPipeline:
        """Extract image blocks from the PDFs in one folder as ``Document`` objects.

        Each PDF is partitioned with ``partition_pdf`` using the ``hi_res``
        strategy and the ``yolox`` model name. ``Image`` and ``Table`` block
        types are requested for image extraction, but only ``Image`` elements
        are turned into Documents; each Document's ``page_content`` is the
        element's ``metadata.image_path``.

        NOTE(review): the hi-res.py listing shows no import lines; ``Path``,
        ``partition_pdf``, ``Document`` and ``Image`` are assumed to already be
        in scope where this class is defined -- confirm.
        """

        def __init__(self, folder_path, image_output_dir="/dli/task/03-Lab/figures/"):
            """Remember the folders involved and list the PDFs right away.

            Args:
                folder_path: Directory that is searched (non-recursively) for
                    ``*.pdf`` files.
                image_output_dir: Value passed to ``partition_pdf`` as
                    ``extract_image_block_output_dir``. Defaults to the path
                    that used to be hard-coded.
            """
            self.folder_path = folder_path
            self.image_output_dir = image_output_dir
            self.pdf_filenames = self.get_pdf_filenames_from_folder()

        def get_pdf_filenames_from_folder(self):
            """Return the ``*.pdf`` paths directly inside ``self.folder_path``.

            Returns:
                A list of ``pathlib.Path`` objects, sorted so the processing
                order does not depend on the file system's listing order.
            """
            return sorted(Path(self.folder_path).glob("*.pdf"))

        def load_and_split_documents(self, folder_path=None):
            """Partition every discovered PDF and collect its image Documents.

            Args:
                folder_path: Unused. The PDFs listed at construction time
                    (``self.pdf_filenames``) are always the ones processed; the
                    parameter is kept, now optional, so existing callers that
                    pass it keep working.

            Returns:
                A list of Documents with ``source_type`` ``"image"`` covering
                *all* PDFs in the folder; empty when the folder holds no PDFs.
            """
            image_documents = []

            for pdf_filename in self.pdf_filenames:
                print("processing document: ", pdf_filename)
                image_text_elements = partition_pdf(
                    filename=str(pdf_filename),
                    strategy="hi_res",
                    hi_res_model_name="yolox",
                    extract_images_in_pdf=True,
                    extract_image_block_types=["Image", "Table"],
                    extract_image_block_to_payload=False,
                    extract_image_block_output_dir=self.image_output_dir,
                )

                # Accumulate across PDFs so earlier files are not discarded.
                # The exact-type match on Image is carried over unchanged.
                image_documents.extend(
                    Document(
                        page_content=element.metadata.image_path,
                        metadata={
                            "filename": element.metadata.filename,
                            "source_type": "image",
                        },
                    )
                    for element in image_text_elements
                    if type(element) is Image
                )

            return image_documents