Skip to content

Instantly share code, notes, and snippets.

@josemarcosrf
Last active October 10, 2023 13:25
Show Gist options
  • Save josemarcosrf/5e40936fa4fc6c1490c41cc36e4dd263 to your computer and use it in GitHub Desktop.
Save josemarcosrf/5e40936fa4fc6c1490c41cc36e4dd263 to your computer and use it in GitHub Desktop.

Revisions

  1. josemarcosrf revised this gist Oct 10, 2023. 1 changed file with 18 additions and 10 deletions.
    28 changes: 18 additions & 10 deletions pdf_mine_with_boxes.py
    Original file line number Diff line number Diff line change
    @@ -56,16 +56,24 @@ def textract(pdf_path: str):
    bb = []
    w = ""
    for tbox in tbox_list:
    coords, c = tbox
    if c != " ":
    w += c
    bb.append(coords)
    else:
    words.append(w)
    bb_s, bb_f = bb[0], bb[-1]
    wboxes.append((bb_s[0], bb_s[1], bb_f[2], bb_f[3]))
    w = ""
    bb = []
    try:
    coords, c = tbox
    if c != " ":
    w += c
    bb.append(coords)
    else:
    if len(bb):
    words.append(w)
    if len(bb) > 1:
    bb_s, bb_f = bb[0], bb[-1]
    wboxes.append((bb_s[0], bb_s[1], bb_f[2], bb_f[3]))
    else:
    wboxes.append((bb[0]))
    w = ""
    bb = []
    except Exception as e:
    rprint(f"[red] Error: {e}[/red]")
    rprint(f"[red] w: {w} | bb: {bb}[/red]")

    tboxes = list(zip(words, wboxes))
    rprint(f"PAGE {p_idx} - LINE: {i}")
  2. josemarcosrf revised this gist Oct 10, 2023. No changes.
  3. josemarcosrf revised this gist Oct 10, 2023. 1 changed file with 32 additions and 29 deletions.
    61 changes: 32 additions & 29 deletions pdf_mine_with_boxes.py
    Original file line number Diff line number Diff line change
    @@ -40,38 +40,41 @@ def get_char_bboxes(pdf_path: str):
    def textract(pdf_path: str):
    per_page_bboxes = get_char_bboxes(pdf_path)

    page_bboxes = per_page_bboxes[0]
    for p_idx, page_bboxes in per_page_bboxes.items():
    # 1. Group by vertical position
    groups = defaultdict(list)
    for tbox in page_bboxes:
    coords, _ = tbox
    _, y0, _, _ = coords
    groups[y0].append(tbox)

    # 1. Group by vertical position
    groups = defaultdict(list)
    for tbox in page_bboxes:
    coords, _ = tbox
    _, y0, _, _ = coords
    groups[y0].append(tbox)
    # 2. Split by space and get the enclosing box
    for i, (y0, tbox_list) in enumerate(groups.items()):
    # Group in words
    words = []
    wboxes = []
    bb = []
    w = ""
    for tbox in tbox_list:
    coords, c = tbox
    if c != " ":
    w += c
    bb.append(coords)
    else:
    words.append(w)
    bb_s, bb_f = bb[0], bb[-1]
    wboxes.append((bb_s[0], bb_s[1], bb_f[2], bb_f[3]))
    w = ""
    bb = []

    # 2. Split by space and get the enclosing box
    for i, (y0, tbox_list) in enumerate(groups.items()):
    # Group in words
    words = []
    wboxes = []
    bb = []
    w = ""
    for tbox in tbox_list:
    coords, c = tbox
    if c != " ":
    w += c
    bb.append(coords)
    else:
    words.append(w)
    bb_s, bb_f = bb[0], bb[-1]
    wboxes.append((bb_s[0], bb_s[1], bb_f[2], bb_f[3]))
    w = ""
    bb = []

    tboxes = list(zip(words, wboxes))
    rprint(f"LINE: {i}")
    rprint(tboxes)
    tboxes = list(zip(words, wboxes))
    rprint(f"PAGE {p_idx} - LINE: {i}")
    rprint(tboxes)


    if __name__ == "__main__":
    # Requires:
    # fire==0.5.0
    # pdfminer.six==20191110
    # rich==13.6.0
    fire.Fire({"pdf": textract})
  4. josemarcosrf created this gist Oct 10, 2023.
    77 changes: 77 additions & 0 deletions pdf_mine_with_boxes.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,77 @@
    from collections import defaultdict

    import fire
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams
    from pdfminer.layout import LTChar
    from pdfminer.layout import LTTextBox
    from pdfminer.layout import LTTextLine
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfpage import PDFPage
    from rich import print as rprint


    def get_char_bboxes(pdf_path: str):
        """Collect the bounding box and text of every character in a PDF, per page.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A ``defaultdict(list)`` keyed by the 0-based page index (as produced
            by ``enumerate``). Each value is a list of ``(bbox, text)`` tuples
            built from ``LTChar.bbox`` and ``LTChar.get_text()``, in the order
            the layout objects are iterated.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        per_page_tboxes = defaultdict(list)
        # Keep the file open for the whole loop: pages are pulled from the
        # handle while iterating, and the handle is closed even on error.
        with open(pdf_path, "rb") as fp:
            for i, page in enumerate(PDFPage.get_pages(fp)):
                print("Processing next page...")
                interpreter.process_page(page)
                layout = device.get_result()
                for lobj in layout:
                    # Only text boxes are inspected; other layout objects are skipped.
                    if not isinstance(lobj, LTTextBox):
                        continue
                    for tline in lobj:
                        for tchar in tline:
                            # Items in a line that are not LTChar are ignored.
                            if isinstance(tchar, LTChar):
                                per_page_tboxes[i].append(
                                    (tchar.bbox, tchar.get_text())
                                )

        return per_page_tboxes


    def _words_with_boxes(tbox_list):
        """Split one line of ``(bbox, char)`` pairs into ``(word, bbox)`` pairs.

        A word is a maximal run of non-space characters. Its box is built from
        the first character's ``(x0, y0)`` and the last character's ``(x1, y1)``.
        """
        result = []
        chars = []
        boxes = []

        def flush():
            # Leading or consecutive spaces leave nothing buffered; emitting
            # here would index an empty list, so just skip.
            if not boxes:
                return
            first, last = boxes[0], boxes[-1]
            result.append(("".join(chars), (first[0], first[1], last[2], last[3])))
            chars.clear()
            boxes.clear()

        for coords, c in tbox_list:
            if c == " ":
                flush()
            else:
                chars.append(c)
                boxes.append(coords)

        # A line that does not end with a space still has its last word buffered.
        flush()
        return result


    def textract(pdf_path: str):
        """Print the words of a PDF, line by line, with their bounding boxes.

        Characters of every page are grouped into lines by their exact ``y0``
        coordinate, each line is split into words on space characters, and the
        resulting ``(word, bbox)`` list is printed with ``rprint`` under a
        ``PAGE <page> - LINE: <n>`` header.

        Args:
            pdf_path: Path of the PDF file to process.
        """
        per_page_bboxes = get_char_bboxes(pdf_path)

        for p_idx, page_bboxes in per_page_bboxes.items():
            # 1. Group by vertical position
            groups = defaultdict(list)
            for tbox in page_bboxes:
                coords, _ = tbox
                groups[coords[1]].append(tbox)

            # 2. Split by space and get the enclosing box
            for i, tbox_list in enumerate(groups.values()):
                tboxes = _words_with_boxes(tbox_list)
                rprint(f"PAGE {p_idx} - LINE: {i}")
                rprint(tboxes)


    if __name__ == "__main__":
        # Hand the command mapping to fire: `pdf` is bound to `textract`.
        commands = {"pdf": textract}
        fire.Fire(commands)