Last active
October 10, 2023 13:25
-
-
Save josemarcosrf/5e40936fa4fc6c1490c41cc36e4dd263 to your computer and use it in GitHub Desktop.
Revisions
-
josemarcosrf revised this gist
Oct 10, 2023. 1 changed file with 18 additions and 10 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -56,16 +56,24 @@ def textract(pdf_path: str): bb = [] w = "" for tbox in tbox_list: try: coords, c = tbox if c != " ": w += c bb.append(coords) else: if len(bb): words.append(w) if len(bb) > 1: bb_s, bb_f = bb[0], bb[-1] wboxes.append((bb_s[0], bb_s[1], bb_f[2], bb_f[3])) else: wboxes.append((bb[0])) w = "" bb = [] except Exception as e: rprint(f"[red] Error: {e}[/red]") rprint(f"[red] w: {w} | bb: {bb}[/red]") tboxes = list(zip(words, wboxes)) rprint(f"PAGE {p_idx} - LINE: {i}") -
josemarcosrf revised this gist
Oct 10, 2023. No changes. There are no files selected for viewing.
-
josemarcosrf revised this gist
Oct 10, 2023. 1 changed file with 32 additions and 29 deletions. There are no files selected for viewing.
def _group_words(tbox_list):
    """Split one line of ``(bbox, char)`` pairs into ``(word, bbox)`` pairs.

    A word is a maximal run of non-space characters. Its box is built from
    the first character's ``(x0, y0)`` and the last character's ``(x1, y1)``.

    Spaces that arrive with no pending characters (leading or consecutive
    spaces) are ignored instead of indexing into an empty list, and the word
    still pending when the line ends is emitted rather than dropped.
    """
    words = []
    chars = []
    boxes = []

    def flush():
        # Emit the pending word, if any, and reset the accumulators.
        if boxes:
            first, last = boxes[0], boxes[-1]
            words.append(
                ("".join(chars), (first[0], first[1], last[2], last[3]))
            )
        chars.clear()
        boxes.clear()

    for coords, char in tbox_list:
        if char == " ":
            flush()
        else:
            chars.append(char)
            boxes.append(coords)

    # Lines rarely end with a space, so the last word must be flushed here.
    flush()
    return words


def textract(pdf_path: str):
    """Print, for every page and line of the PDF, its words and their boxes.

    Characters come from ``get_char_bboxes``; those sharing the same ``y0``
    are treated as one line, and each line is split into words on spaces.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    per_page_bboxes = get_char_bboxes(pdf_path)

    for p_idx, page_bboxes in per_page_bboxes.items():
        # 1. Group by vertical position
        groups = defaultdict(list)
        for tbox in page_bboxes:
            coords, _ = tbox
            _, y0, _, _ = coords
            groups[y0].append(tbox)

        # 2. Split by space and get the enclosing box
        for i, tbox_list in enumerate(groups.values()):
            tboxes = _group_words(tbox_list)
            rprint(f"PAGE {p_idx} - LINE: {i}")
            rprint(tboxes)


if __name__ == "__main__":
    # Requires:
    # fire==0.5.0
    # pdfminer.six==20191110
    # rich==13.6.0
    fire.Fire({"pdf": textract})
josemarcosrf created this gist
Oct 10, 2023. There are no files selected for viewing.
from collections import defaultdict

import fire
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.layout import LTChar
from pdfminer.layout import LTTextBox
from pdfminer.layout import LTTextLine
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from rich import print as rprint


def get_char_bboxes(pdf_path: str):
    """Collect the bounding box and text of every character in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``defaultdict(list)`` mapping the zero-based page index to a list
        of ``(bbox, text)`` tuples, one per ``LTChar`` found inside an
        ``LTTextBox``. ``bbox`` is the character's ``(x0, y0, x1, y1)``.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    per_page_tboxes = defaultdict(list)
    # Pages are consumed inside the ``with`` block so the file stays open
    # while they are read and is closed afterwards.
    with open(pdf_path, "rb") as fp:
        for i, page in enumerate(PDFPage.get_pages(fp)):
            print("Processing next page...")
            interpreter.process_page(page)
            layout = device.get_result()
            for lobj in layout:
                if not isinstance(lobj, LTTextBox):
                    continue
                for tline in lobj:
                    for tchar in tline:
                        if isinstance(tchar, LTChar):
                            per_page_tboxes[i].append(
                                (tchar.bbox, tchar.get_text())
                            )

    return per_page_tboxes


def _group_words(tbox_list):
    """Split one line of ``(bbox, char)`` pairs into ``(word, bbox)`` pairs.

    A word is a maximal run of non-space characters. Its box is built from
    the first character's ``(x0, y0)`` and the last character's ``(x1, y1)``.

    Spaces that arrive with no pending characters (leading or consecutive
    spaces) are ignored instead of indexing into an empty list, and the word
    still pending when the line ends is emitted rather than dropped.
    """
    words = []
    chars = []
    boxes = []

    def flush():
        # Emit the pending word, if any, and reset the accumulators.
        if boxes:
            first, last = boxes[0], boxes[-1]
            words.append(
                ("".join(chars), (first[0], first[1], last[2], last[3]))
            )
        chars.clear()
        boxes.clear()

    for coords, char in tbox_list:
        if char == " ":
            flush()
        else:
            chars.append(char)
            boxes.append(coords)

    # Lines rarely end with a space, so the last word must be flushed here.
    flush()
    return words


def textract(pdf_path: str):
    """Print the words and word boxes of each line on the PDF's first page.

    Characters come from ``get_char_bboxes``; those sharing the same ``y0``
    are treated as one line, and each line is split into words on spaces.
    Only page index 0 is processed.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    per_page_bboxes = get_char_bboxes(pdf_path)
    page_bboxes = per_page_bboxes[0]

    # 1. Group by vertical position
    groups = defaultdict(list)
    for tbox in page_bboxes:
        coords, _ = tbox
        _, y0, _, _ = coords
        groups[y0].append(tbox)

    # 2. Split by space and get the enclosing box
    for i, tbox_list in enumerate(groups.values()):
        tboxes = _group_words(tbox_list)
        rprint(f"LINE: {i}")
        rprint(tboxes)


if __name__ == "__main__":
    fire.Fire({"pdf": textract})