Created
March 20, 2025 14:28
-
-
Save martenc/5753112074c56e5989250c4d6de716af to your computer and use it in GitHub Desktop.
Revisions
-
martenc created this gist
Mar 20, 2025. There are no files selected for viewing
from pathlib import Path

from langchain.docstore.document import Document
from unstructured.documents.elements import CompositeElement, Image, Table, Text
from unstructured.partition.pdf import partition_pdf


class ExtractionPipeline:
    """Extract text and table chunks from every PDF in a folder.

    Each PDF is partitioned with ``partition_pdf`` using the ``by_title``
    chunking strategy, and the resulting elements are wrapped in LangChain
    ``Document`` objects tagged with their source file and element kind.
    """

    def __init__(self, folder_path):
        """Store *folder_path* and collect the PDFs located directly in it."""
        self.folder_path = folder_path
        self.pdf_filenames = self.get_pdf_filenames_from_folder()

    def get_pdf_filenames_from_folder(self):
        """Return the ``*.pdf`` paths found directly in ``self.folder_path``.

        The search is not recursive. Paths are sorted so documents are
        processed in a reproducible order regardless of the filesystem.
        """
        return sorted(Path(self.folder_path).glob("*.pdf"))

    def load_and_split_documents(self, folder_path=None):
        """Partition every discovered PDF into text and table documents.

        Args:
            folder_path: Ignored; retained (and now optional) so existing
                callers keep working. The PDFs processed are the ones
                collected from ``self.folder_path`` at construction time.

        Returns:
            A ``(text_documents, table_documents)`` tuple of lists that
            covers all PDFs. Text documents hold the stripped element text
            with ``source_type="text"``; table documents hold the element's
            ``text_as_html`` with ``source_type="table_html"``. Both lists
            are empty when the folder contains no PDFs.
        """
        text_documents = []
        table_documents = []

        for pdf_filename in self.pdf_filenames:
            print("processing document: ", pdf_filename)
            raw_pdf_elements = partition_pdf(
                filename=str(pdf_filename),
                chunking_strategy="by_title",
                max_characters=2000,
                new_after_n_chars=1800,
                combine_text_under_n_chars=1000,
                infer_table_structure=True,
            )

            # Exact type match on purpose: unstructured's element classes
            # form a hierarchy (Table derives from Text), so isinstance()
            # would also pull tables into the text list. The previous
            # `type(e) == Text or CompositeElement` was always truthy and
            # therefore accepted every element.
            pdf_text_documents = [
                Document(
                    page_content=e.text.strip(),
                    metadata={
                        "filename": e.metadata.filename,
                        "source_type": "text",
                    },
                )
                for e in raw_pdf_elements
                if type(e) in (Text, CompositeElement)
            ]
            print("Number of Detected Text elements: ", len(pdf_text_documents))

            # Tables
            pdf_table_documents = [
                Document(
                    page_content=e.metadata.text_as_html,
                    metadata={
                        "filename": e.metadata.filename,
                        "source_type": "table_html",
                    },
                )
                for e in raw_pdf_elements
                if type(e) is Table
            ]
            print("Number of Detected HTML Tables: ", len(pdf_table_documents))

            # Accumulate instead of overwriting so no PDF's results are lost.
            text_documents.extend(pdf_text_documents)
            table_documents.extend(pdf_table_documents)

        return text_documents, table_documents
class VisualExtractionPipeline:
    """Extract image crops from every PDF in a folder.

    Each PDF is partitioned with ``partition_pdf`` using the ``hi_res``
    strategy; image and table blocks are written to an output directory and
    every detected image is wrapped in a LangChain ``Document`` whose content
    is the path of the saved image file.
    """

    # Default location for extracted image files (the original hard-coded path).
    DEFAULT_IMAGE_OUTPUT_DIR = "/dli/task/03-Lab/figures/"

    def __init__(self, folder_path, image_output_dir=DEFAULT_IMAGE_OUTPUT_DIR):
        """Store the settings and collect the PDFs located in *folder_path*.

        Args:
            folder_path: Directory scanned (non-recursively) for ``*.pdf``.
            image_output_dir: Directory handed to ``partition_pdf`` as
                ``extract_image_block_output_dir``.
        """
        self.folder_path = folder_path
        self.image_output_dir = image_output_dir
        self.pdf_filenames = self.get_pdf_filenames_from_folder()

    def get_pdf_filenames_from_folder(self):
        """Return the ``*.pdf`` paths found directly in ``self.folder_path``.

        Paths are sorted so documents are processed in a reproducible order.
        """
        return sorted(Path(self.folder_path).glob("*.pdf"))

    def load_and_split_documents(self, folder_path=None):
        """Partition every discovered PDF and return its image documents.

        Args:
            folder_path: Ignored; retained (and now optional) so existing
                callers keep working. The PDFs processed are the ones
                collected from ``self.folder_path`` at construction time.

        Returns:
            A list of ``Document`` objects, one per detected image across all
            PDFs, with ``page_content`` set to the element's ``image_path``
            and ``source_type="image"``. Empty when the folder has no PDFs.
        """
        image_documents = []

        for pdf_filename in self.pdf_filenames:
            print("processing document: ", pdf_filename)
            image_text_elements = partition_pdf(
                filename=str(pdf_filename),
                strategy="hi_res",
                hi_res_model_name="yolox",
                extract_images_in_pdf=True,
                extract_image_block_types=["Image", "Table"],
                extract_image_block_to_payload=False,
                extract_image_block_output_dir=self.image_output_dir,
            )

            # NOTE: "Table" blocks are requested from partition_pdf as well,
            # but only Image elements are turned into documents here.
            # Accumulate so that every PDF contributes, not just one of them.
            image_documents.extend(
                Document(
                    page_content=e.metadata.image_path,
                    metadata={
                        "filename": e.metadata.filename,
                        "source_type": "image",
                    },
                )
                for e in image_text_elements
                if type(e) is Image
            )

        return image_documents