Last active
December 11, 2024 05:30
-
-
Save dungsaga/ea703c17c57f249c7d7d4346245f24f2 to your computer and use it in GitHub Desktop.
Revisions
-
dungsaga revised this gist
Dec 11, 2024. 1 changed file with 1 addition and 0 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -41,6 +41,7 @@ def extract_charmap(streams): # https://stackoverflow.com/questions/29467539/encoding-of-pdf-text-string def extract_text(data, charmap): if not re.search(rb'BT[ \n]', data): return b'' # extract lines ending with Tj or TJ lines = [line for line in data.split(b"\n") if (line[-2:]==b'Tj' or line[-2:]==b'TJ')] text = [extract_line(line, charmap) for line in lines] -
dungsaga revised this gist
Dec 11, 2024. 1 changed file with 58 additions and 13 deletions. There are no files selected for viewing
def main(argv):
    """Command-line entry point: salvage text from a (possibly broken) PDF.

    argv[1] is the input PDF path; standard input is read when it is omitted.
    argv[2] is the output text path; standard output is written when it is
    omitted.
    """
    # The banner goes to stderr so it can never mix with the salvaged text
    # when the result itself is written to stdout.
    print("Salvage text from a broken pdf file", file=sys.stderr)
    print("Usage: salvage-pdf.py [<input_file.pdf> [<output_file.txt>]]",
          file=sys.stderr)

    if len(argv) > 1:
        with open(argv[1], "rb") as source:
            pdf = source.read()
    else:
        pdf = sys.stdin.buffer.read()

    output = salvage_pdf(pdf)

    if len(argv) > 2:
        with open(argv[2], "wb") as file:
            file.write(output)
    else:
        # Use the existing binary stream: re-opening file descriptor 1 with
        # open() would close stdout when the new file object is closed.
        sys.stdout.buffer.write(output)
        sys.stdout.buffer.flush()


def _inflate(raw):
    """Decompress the zlib data found between ``stream`` and ``endstream``.

    Only the end-of-line bytes that follow the ``stream`` keyword are removed.
    The end-of-line before ``endstream`` is left in place and ignored by the
    decompressor, so compressed data that happens to end in a CR or LF byte is
    not truncated.  Raises zlib.error when the data is not valid zlib data.
    """
    inflater = zlib.decompressobj()
    return inflater.decompress(raw.lstrip(b"\r\n")) + inflater.flush()


def salvage_pdf(input):
    """Return the text found in all Flate-compressed streams of a PDF.

    input: the raw bytes of the PDF file.
    Returns the text of every stream that yielded any, as bytes, with the
    streams separated by a blank line.
    """
    stream = re.compile(rb'/FlateDecode.*?stream(.*?)endstream', re.S)
    streams = []
    for raw in stream.findall(input):
        try:
            streams.append(_inflate(raw))
        except zlib.error:
            continue  # ignore decompression failures
    charmap = extract_charmap(streams)
    text = [extract_text(data, charmap) for data in streams]
    return b"\n\n".join(chunk for chunk in text if chunk)  # drop empty chunks


# https://stackoverflow.com/questions/40036588/in-pdf-if-encoding-and-tounicode-are-both-present-in-pdf-how-to-map-the-text-e
def extract_charmap(streams):
    """Collect the ``<code> <unicode>`` pairs of all bfchar sections.

    streams: iterable of decompressed stream contents (bytes).
    Returns a dict mapping the hex digits of a character code to the hex
    digits of its replacement, both as bytes (e.g. ``{b'0003': b'0048'}``).
    """
    charmap = {}
    # TODO: add support for beginbfrange\n(.+)\nendbfrange
    section = re.compile(rb"beginbfchar\r?\n(.+?)\r?\nendbfchar", re.DOTALL)
    pair = re.compile(rb"<(.+?)> *<(.+?)>")
    for data in streams:
        # A stream may hold several bfchar sections; read every one of them.
        for block in section.finditer(data):
            for line in block.group(1).splitlines():
                found = pair.search(line)
                if found:  # skip blank or otherwise unrelated lines
                    charmap[found.group(1)] = found.group(2)
    return charmap


# https://stackoverflow.com/questions/29467539/encoding-of-pdf-text-string
def extract_text(data, charmap):
    """Return the text shown by the Tj/TJ lines of one decompressed stream.

    data: decompressed stream content (bytes).
    charmap: mapping produced by extract_charmap().
    Returns the decoded lines joined with newlines (bytes); empty when the
    stream has no line ending with Tj or TJ.
    """
    # extract lines ending with Tj or TJ (a trailing CR from CRLF is ignored)
    candidates = (line.rstrip(b"\r") for line in data.split(b"\n"))
    shown = [line for line in candidates if line.endswith((b"Tj", b"TJ"))]
    return b"\n".join(extract_line(line, charmap) for line in shown)


def extract_line(line, charmap):
    """Decode one Tj/TJ line and expand its ``\\ddd`` octal escapes to UTF-8."""
    decoded = decode_line(line, charmap)

    def octal_to_utf8(match):
        # The escape names a single byte that is interpreted as cp1252
        # (so \222 becomes a right single quote and \225 a bullet).
        # Byte values that cp1252 leaves undefined become U+FFFD instead of
        # aborting the whole run.
        byte = num2char(match.group(1), 8, 'latin-1')
        return byte.decode('cp1252', errors='replace').encode('utf-8')

    return re.sub(rb'\\([0-9]{3})', octal_to_utf8, decoded)


def decode_line(line, charmap):
    """Strip the operators from a Tj/TJ line and return the string content.

    Hex strings (``<xxxx>Tj`` / ``[<xxxx>...<yyyy>]TJ``) are translated in
    groups of four hex digits through charmap and then converted to UTF-8;
    codes missing from charmap are dropped.  Literal strings (``(xxx)Tj`` /
    ``[(xxx)...(yyy)]TJ``) are returned with escaped brackets unescaped.
    Whitespace between the string and the operator is accepted.
    """
    if re.search(rb'<.+>\s*Tj|\[<.+>\]\s*TJ', line):
        # remove everything except plain text inside brackets:
        # <xxx>Tj or [<xxx>...<yyy>]TJ
        digits = re.sub(rb'^.*?<|>\s*Tj|>\]\s*TJ|>.*?<', b'', line)
        # The fallback must be bytes: a str replacement makes re.sub raise
        # TypeError for a bytes pattern.
        mapped = re.sub(rb'.{4}', lambda m: charmap.get(m.group(), b''), digits)
        return re.sub(rb'.{4}', lambda m: num2char(m.group()), mapped)
    # remove everything except plain text inside brackets:
    # (xxx)Tj or [(xxx)...(yyy)]TJ
    text = re.sub(rb'^.*?\(|\)\s*Tj|\)\]\s*TJ|\)[^)]*?\(', b'', line)
    # remove escape for brackets
    return re.sub(rb'\\([()])', rb'\1', text)


def num2char(hex, base=16, encoding='utf-8'):
    """Convert the digits of a character number to that character, encoded.

    hex: the digits (bytes or str) of the code point, written in ``base``.
    Returns the encoded character, or b'?' when the digits are invalid or the
    character cannot be encoded; the offending value and the error are
    reported on stderr.
    """
    try:
        return chr(int(hex, base)).encode(encoding)
    except (ValueError, OverflowError) as e:
        # stderr keeps diagnostics out of text that is written to stdout
        print(hex, file=sys.stderr)
        print(e, file=sys.stderr)
        return b'?'


if __name__ == "__main__":
    main(sys.argv)
dungsaga revised this gist
Jun 19, 2024. 1 changed file with 3 additions and 3 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -15,10 +15,10 @@ def salvage_pdf(pdf): s = s.strip(b'\r\n') try: print(f"stream-{len(output):03}:") data = extract_text(zlib.decompress(s)) if data: output.append(data) except: pass # ignore decompression failures return b"\n\n".join(output) def extract_text(data): -
dungsaga created this gist
Jun 18, 2024. There are no files selected for viewing
#!/usr/bin/env python3
"""Salvage text from a broken PDF file by scanning its Flate-compressed streams."""
import re
import sys
import zlib


def main(argv):
    """Command-line entry point.

    argv[1] is the input PDF path and argv[2] the output text path.  When
    either is missing the process exits with status 2 after the usage text.
    """
    print("Salvage text from a broken pdf file")
    print("Usage: salvage-pdf.py <input_file.pdf> <output_file.txt>")
    if len(argv) < 3:
        # Both paths are required; stop here instead of failing with an
        # IndexError traceback.
        sys.exit(2)
    with open(argv[1], "rb") as source:
        pdf = source.read()
    output = salvage_pdf(pdf)
    with open(argv[2], "wb") as file:
        file.write(output)


def _inflate(raw):
    """Decompress the zlib data found between ``stream`` and ``endstream``.

    Only the end-of-line bytes that follow the ``stream`` keyword are removed.
    The end-of-line before ``endstream`` is left in place and ignored by the
    decompressor, so compressed data that happens to end in a CR or LF byte is
    not truncated.  Raises zlib.error when the data is not valid zlib data.
    """
    inflater = zlib.decompressobj()
    return inflater.decompress(raw.lstrip(b"\r\n")) + inflater.flush()


def salvage_pdf(pdf):
    """Return the text found in all Flate-compressed streams of a PDF.

    pdf: the raw bytes of the PDF file.
    Returns the text of every stream that yielded any, as bytes, with the
    streams separated by a blank line.  A progress line is printed for each
    stream that is examined.
    """
    output = []
    stream = re.compile(rb'/FlateDecode.*?stream(.*?)endstream', re.S)
    for raw in stream.findall(pdf):
        print(f"stream-{len(output):03}:")
        try:
            data = _inflate(raw)
        except zlib.error:
            continue  # ignore decompression failures
        text = extract_text(data)
        if text:  # streams without any text would only add blank lines
            output.append(text)
    return b"\n\n".join(output)


def extract_text(data):
    """Return the literal-string text shown by the Tj/TJ lines of a stream.

    data: decompressed stream content (bytes).
    Returns the extracted lines joined with newlines (bytes); empty when the
    stream has no line ending with Tj or TJ.
    """
    # extract lines ending with Tj or TJ (a trailing CR from CRLF is ignored)
    candidates = (line.rstrip(b"\r") for line in data.split(b"\n"))
    lines = [line for line in candidates if line.endswith((b"Tj", b"TJ"))]
    # remove everything except plain text inside (xxx)Tj or [(xxx)...(yyy)]TJ;
    # whitespace before the operator is accepted, and the gap between two
    # strings may be empty but never spans a closing bracket.
    strip_operators = re.compile(rb'^.*?\(|\)\s*Tj|\)\]\s*TJ|\)[^)]*?\(')
    return b"\n".join(strip_operators.sub(b'', line) for line in lines)


if __name__ == "__main__":
    main(sys.argv)