Skip to content

Instantly share code, notes, and snippets.

@dungsaga
Last active December 11, 2024 05:30
Show Gist options
  • Select an option

  • Save dungsaga/ea703c17c57f249c7d7d4346245f24f2 to your computer and use it in GitHub Desktop.

Select an option

Save dungsaga/ea703c17c57f249c7d7d4346245f24f2 to your computer and use it in GitHub Desktop.

Revisions

  1. dungsaga revised this gist Dec 11, 2024. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions salvage-pdf.py
    Original file line number Diff line number Diff line change
    @@ -41,6 +41,7 @@ def extract_charmap(streams):

    # https://stackoverflow.com/questions/29467539/encoding-of-pdf-text-string
    def extract_text(data, charmap):
    if not re.search(rb'BT[ \n]', data): return b''
    # extract lines ending with Tj or TJ
    lines = [line for line in data.split(b"\n") if (line[-2:]==b'Tj' or line[-2:]==b'TJ')]
    text = [extract_line(line, charmap) for line in lines]
  2. dungsaga revised this gist Dec 11, 2024. 1 changed file with 58 additions and 13 deletions.
    71 changes: 58 additions & 13 deletions salvage-pdf.py
    Original file line number Diff line number Diff line change
    @@ -3,29 +3,74 @@

    def main(argv):
        """Salvage text from a PDF named on the command line or piped via stdin.

        argv[1], when present, is the input PDF path; otherwise the PDF is read
        from standard input. argv[2], when present, is the output path;
        otherwise the salvaged bytes are written to standard output.
        """
        # The banner and usage go to stderr so they can never end up mixed into
        # the salvaged text when the result is written to stdout.
        print("Salvage text from a broken pdf file", file=sys.stderr)
        print("Usage: salvage-pdf.py [<input_file.pdf> [<output_file.txt>]]", file=sys.stderr)

        source = argv[1] if len(argv) > 1 else sys.stdin.fileno()
        # closefd must be True for a path, but False for the process's own
        # stdin/stdout descriptors so leaving the `with` does not close them.
        with open(source, "rb", closefd=not isinstance(source, int)) as infile:
            raw = infile.read()

        output = salvage_pdf(raw)

        target = argv[2] if len(argv) > 2 else sys.stdout.fileno()
        with open(target, "wb", closefd=not isinstance(target, int)) as outfile:
            outfile.write(output)

    def salvage_pdf(input):
        """Return the text found in the Flate-compressed streams of a PDF.

        `input` is the raw bytes of the PDF. Every span between `stream` and
        `endstream` that follows a `/FlateDecode` marker is inflated; spans
        that are not complete, valid zlib data are skipped. A character map is
        built from all inflated streams, each stream is converted to text, and
        the non-empty results are joined with blank lines.
        """
        stream = re.compile(rb'/FlateDecode.*?stream(.*?)endstream', re.S)
        streams = []
        for raw in stream.findall(input):
            # A decompress object stops at the end of the zlib data and ignores
            # what follows, so the end-of-line bytes before `endstream` need no
            # stripping. Stripping the tail could remove a checksum byte that
            # happens to be CR or LF and make a valid stream look corrupt.
            inflater = zlib.decompressobj()
            try:
                data = inflater.decompress(raw.lstrip(b'\r\n'))
            except zlib.error:
                continue  # not zlib data: ignore this stream
            if inflater.eof:  # keep only streams that inflated completely
                streams.append(data)

        charmap = extract_charmap(streams)

        text = [extract_text(data, charmap) for data in streams]
        # Streams that produced no text are left out of the result.
        return b"\n\n".join(chunk for chunk in text if chunk)

    def extract_text(data):
    # https://stackoverflow.com/questions/40036588/in-pdf-if-encoding-and-tounicode-are-both-present-in-pdf-how-to-map-the-text-e
    def extract_charmap(streams):
        """Collect `<code> <value>` pairs from beginbfchar sections.

        `streams` is an iterable of bytes objects (inflated PDF streams).
        Every `beginbfchar ... endbfchar` section in every stream is scanned
        line by line, and each line holding two angle-bracketed values adds an
        entry mapping the first value to the second (both as bytes, without
        the brackets). Lines that do not contain such a pair are ignored.
        Returns the resulting dict; later entries overwrite earlier ones.
        """
        charmap = {}
        # TODO: add support for beginbfrange\n(.+)\nendbfrange
        # \r?\n accepts both LF and CRLF line endings around the section body.
        section = re.compile(rb"beginbfchar\r?\n(.+?)\r?\nendbfchar", re.DOTALL)
        entry = re.compile(rb"<(.+?)> *<(.+?)>")
        for data in streams:
            # finditer: a single stream may hold more than one bfchar section.
            for found in section.finditer(data):
                for line in found.group(1).split(b"\n"):
                    pair = entry.search(line)
                    if pair:  # skip blank or malformed lines instead of crashing
                        charmap[pair.group(1)] = pair.group(2)
        return charmap

    # https://stackoverflow.com/questions/29467539/encoding-of-pdf-text-string
    def extract_text(data, charmap):
        """Return the text of every line in `data` that ends with Tj or TJ.

        `data` is the bytes of one inflated stream and `charmap` is the
        mapping passed through to extract_line. Each qualifying line is
        converted by extract_line and the results are joined with newlines.
        Returns b"" when no line ends with Tj or TJ.
        """
        # Drop a trailing CR so CRLF-terminated streams are recognised too.
        lines = (line.rstrip(b"\r") for line in data.split(b"\n"))
        # extract lines ending with Tj or TJ
        shown = [line for line in lines if line.endswith((b"Tj", b"TJ"))]
        return b"\n".join(extract_line(line, charmap) for line in shown)

    def extract_line(line, charmap):
        """Return the text of one Tj/TJ line as UTF-8 encoded bytes.

        The line is first reduced to its text by decode_line. Every remaining
        backslash followed by three digits is then read as an octal byte
        value, interpreted as cp1252 and re-encoded as UTF-8.
        """
        decoded = decode_line(line, charmap)

        def octal_to_utf8(match):
            # e.g. \222 is byte 0x92, which cp1252 maps to a right single quote
            raw = num2char(match.group(1), 8, 'latin-1')
            # Python's cp1252 codec leaves 0x81, 0x8D, 0x8F, 0x90 and 0x9D
            # undefined; substitute U+FFFD for those instead of raising
            # UnicodeDecodeError and aborting the whole run.
            return raw.decode('cp1252', errors='replace').encode('utf-8')

        return re.sub(rb'\\([0-9]{3})', octal_to_utf8, decoded)

    def decode_line(line, charmap):
        """Strip the operators from one Tj/TJ line and return its text bytes.

        Two forms are handled:

        * hex strings, <xxxx>Tj or [<xxxx>...<yyyy>]TJ: the hex digits are
          taken four at a time, each group is looked up in `charmap`, and the
          mapped value is converted four hex digits at a time by num2char.
          Groups that have no entry in `charmap` are dropped.
        * literal strings, (xxx)Tj or [(xxx)...(yyy)]TJ: everything outside
          the parentheses is removed and backslash-escaped parentheses are
          unescaped.
        """
        if re.search(rb'<.+>Tj|\[<.+>\]TJ', line):
            # remove everything except the hex digits inside <xxx>Tj or [<xxx>...<yyy>]TJ
            codes = re.sub(rb'^.*?<|>Tj|>\]TJ|>.*?<', b'', line)
            # The replacement must be bytes: returning a str here makes re.sub
            # raise TypeError as soon as one code is missing from charmap.
            mapped = re.sub(rb'.{4}', lambda m: charmap.get(m.group()) or b'', codes)
            return re.sub(rb'.{4}', lambda m: num2char(m.group()), mapped)
        # remove everything except plain text inside (xxx)Tj or [(xxx)...(yyy)]TJ
        text = re.sub(rb'^.*?\(|\)Tj|\)\]TJ|\)[^)]*?\(', b'', line)
        # remove escape for brackets
        return re.sub(rb'\\([\(\)])', rb'\1', text)

    def num2char(hex, base=16, encoding='utf-8'):
        """Convert a textual code point into the encoded bytes of its character.

        `hex` holds the digits of the code point (str or bytes), `base` is the
        radix used to parse them and `encoding` is the codec used for the
        resulting character. If the digits cannot be parsed, the value is not
        a valid code point, or the character cannot be encoded, the problem is
        reported on stderr and b'?' is returned instead.
        """
        try:
            return chr(int(hex, base)).encode(encoding)
        except (ValueError, OverflowError) as e:
            # ValueError also covers UnicodeEncodeError. Diagnostics go to
            # stderr so they never mix with text being written to stdout.
            print(hex, file=sys.stderr)
            print(e, file=sys.stderr)
            return b'?'

    # Run the command-line entry point only when executed as a script.
    if __name__ == "__main__":
        main(sys.argv)
  3. dungsaga revised this gist Jun 19, 2024. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions salvage-pdf.py
    Original file line number Diff line number Diff line change
    @@ -15,10 +15,10 @@ def salvage_pdf(pdf):
    s = s.strip(b'\r\n')
    try:
    print(f"stream-{len(output):03}:")
    data = zlib.decompress(s)
    output.append(extract_text(data))
    data = extract_text(zlib.decompress(s))
    if data: output.append(data)
    except:
    pass
    pass # ignore decompression failures
    return b"\n\n".join(output)

    def extract_text(data):
  4. dungsaga created this gist Jun 18, 2024.
    31 changes: 31 additions & 0 deletions salvage-pdf.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,31 @@
    #!/usr/bin/env python3
    import re, zlib, sys

    def main(argv):
        """Read the PDF at argv[1] and write its salvaged text to argv[2].

        The banner and usage text are always printed. When fewer than two
        file arguments are given, the process exits with status 2 right after
        the usage text.
        """
        print("Salvage text from a broken pdf file")
        print("Usage: salvage-pdf.py <input_file.pdf> <output_file.txt>")
        if len(argv) < 3:
            # Both paths are required; stop cleanly instead of failing with an
            # IndexError traceback on argv[1] / argv[2].
            sys.exit(2)
        # Context managers close both files even if reading or salvaging fails.
        with open(argv[1], "rb") as infile:
            pdf = infile.read()
        output = salvage_pdf(pdf)
        with open(argv[2], "wb") as outfile:
            outfile.write(output)

    def salvage_pdf(pdf):
        """Return the text found in the Flate-compressed streams of a PDF.

        `pdf` is the raw bytes of the file. Every span between `stream` and
        `endstream` that follows a `/FlateDecode` marker is inflated and passed
        to extract_text; spans that are not complete, valid zlib data are
        skipped. A `stream-NNN:` label is printed for every candidate span.
        The per-stream results are joined with blank lines.
        """
        output = []
        stream = re.compile(rb'/FlateDecode.*?stream(.*?)endstream', re.S)
        for raw in stream.findall(pdf):
            print(f"stream-{len(output):03}:")
            # A decompress object stops at the end of the zlib data and ignores
            # what follows, so the end-of-line bytes before `endstream` need no
            # stripping. Stripping the tail could remove a checksum byte that
            # happens to be CR or LF and make a valid stream look corrupt.
            inflater = zlib.decompressobj()
            try:
                data = inflater.decompress(raw.lstrip(b'\r\n'))
            except zlib.error:
                continue  # not zlib data: ignore this stream
            if not inflater.eof:
                continue  # truncated zlib data: ignore this stream
            # Called outside the try so a bug in extract_text is not silenced.
            output.append(extract_text(data))
        return b"\n\n".join(output)

    def extract_text(data):
        """Return the literal-string text of every line ending with Tj or TJ.

        `data` is the bytes of one inflated stream. For each line that ends
        with Tj or TJ, everything outside the parenthesised strings is
        removed, so (xxx)Tj yields xxx and [(xxx)...(yyy)]TJ yields xxxyyy.
        The per-line results are joined with newlines; b"" is returned when
        no line qualifies.
        """
        # [^)]*? (rather than .+?) also joins strings that sit directly next
        # to each other, as in [(a)(b)]TJ.
        outside_strings = re.compile(rb'^.*?\(|\)Tj|\)\]TJ|\)[^)]*?\(')
        # Drop a trailing CR so CRLF-terminated streams are recognised too.
        lines = (line.rstrip(b"\r") for line in data.split(b"\n"))
        # extract lines ending with Tj or TJ
        shown = [line for line in lines if line.endswith((b"Tj", b"TJ"))]
        # remove everything except plain text inside (xxx)Tj or [(xxx)...(yyy)]TJ
        return b"\n".join(outside_strings.sub(b'', line) for line in shown)

    # Run the command-line entry point only when executed as a script.
    if __name__ == "__main__":
        main(sys.argv)