#!/usr/bin/env python3
"""Salvage text from a broken PDF file.

The script does not parse the PDF structure.  It searches the raw bytes for
Flate-compressed streams, inflates the ones that decompress cleanly, and
pulls the operands of the ``Tj`` / ``TJ`` text-showing operators out of them.
Hex strings (``<...>``) are translated through the ``beginbfchar`` tables
found in the same file; literal strings (``(...)``) are emitted after
resolving escaped parentheses and three-digit octal escapes.
"""
import re
import sys
import zlib

# "/FlateDecode", then the nearest "stream" keyword, then everything up to the
# nearest "endstream".  re.S lets "." span line breaks and binary data.
_FLATE_STREAM = re.compile(rb'/FlateDecode.*?stream(.*?)endstream', re.S)

# One "beginbfchar ... endbfchar" section and one "<src> <dst>" entry in it.
_BFCHAR_SECTION = re.compile(rb'beginbfchar\n(.+?)\nendbfchar', re.DOTALL)
_BFCHAR_PAIR = re.compile(rb'<(.+?)> *<(.+?)>')

# A content stream is only considered if it contains "BT" followed by a space
# or a newline.
_BEGIN_TEXT = re.compile(rb'BT[ \n]')

# Hex-string operands: <xxx>Tj or [<xxx>...<yyy>]TJ
_HEX_SHOW = re.compile(rb'<.+>Tj|\[<.+>\]TJ')
_HEX_NOISE = re.compile(rb'^.*?<|>Tj|>\]TJ|>.*?<')
_HEX_CODE = re.compile(rb'.{4}')

# Literal-string operands: (xxx)Tj or [(xxx)...(yyy)]TJ
_LITERAL_NOISE = re.compile(rb'^.*?\(|\)Tj|\)\]TJ|\)[^)]*?\(')
_ESCAPED_PAREN = re.compile(rb'\\([\(\)])')
_OCTAL_ESCAPE = re.compile(rb'\\([0-9]{3})')


def main(argv):
    """Run the salvager as a command-line tool.

    ``argv[1]`` names the PDF to read; standard input is read when it is
    absent.  ``argv[2]`` names the file to write; standard output is used
    when it is absent.
    """
    # The banner goes to stderr so it cannot end up inside the salvaged text
    # when the result is written to stdout.
    # NOTE(review): the usage string looks like it lost its placeholder names
    # (presumably an input and an output path) - confirm and restore.
    print("Salvage text from a broken pdf file", file=sys.stderr)
    print("Usage: salvage-pdf.py [ []]", file=sys.stderr)

    if len(argv) > 1:
        with open(argv[1], "rb") as source:
            data = source.read()
    else:
        data = sys.stdin.buffer.read()

    output = salvage_pdf(data)

    if len(argv) > 2:
        with open(argv[2], "wb") as sink:
            sink.write(output)
    else:
        # Write through sys.stdout's own binary layer instead of opening (and
        # then closing) file descriptor 1 a second time.
        sys.stdout.buffer.write(output)
        sys.stdout.buffer.flush()


def _inflate(raw):
    """Return the decompressed bytes of one raw stream body, or None.

    Only the end-of-line bytes that follow the ``stream`` keyword are removed.
    The bytes before ``endstream`` are left alone because the last byte of the
    compressed data may itself be 0x0A or 0x0D; whatever follows the end of
    the zlib stream is ignored by the decompressor object.
    """
    inflater = zlib.decompressobj()
    try:
        data = inflater.decompress(raw.lstrip(b'\r\n')) + inflater.flush()
    except zlib.error:
        return None  # not zlib data, or corrupted
    # A stream that never reached its end marker is treated as a failure.
    return data if inflater.eof else None


def salvage_pdf(input):
    """Extract the text of every readable Flate stream in a PDF.

    Args:
        input: the complete PDF file as bytes.

    Returns:
        UTF-8 encoded bytes.  The text of each stream that yielded any text
        is separated from the next by a blank line.  Streams that fail to
        decompress are ignored.
    """
    streams = []
    for raw in _FLATE_STREAM.findall(input):
        data = _inflate(raw)
        if data is not None:
            streams.append(data)

    charmap = extract_charmap(streams)
    texts = (extract_text(data, charmap) for data in streams)
    return b"\n\n".join(text for text in texts if text)  # drop empty results


# https://stackoverflow.com/questions/40036588/in-pdf-if-encoding-and-tounicode-are-both-present-in-pdf-how-to-map-the-text-e
def extract_charmap(streams):
    """Collect the ``beginbfchar`` mappings found in the given streams.

    Args:
        streams: iterable of decompressed stream contents (bytes).

    Returns:
        A dict mapping the text between the first pair of angle brackets of
        each entry to the text between the second pair, both as bytes.  All
        streams share one dict; a later entry overwrites an earlier one with
        the same key.
    """
    charmap = {}
    # TODO: add support for beginbfrange\n(.+)\nendbfrange
    for data in streams:
        for section in _BFCHAR_SECTION.finditer(data):
            for entry in section.group(1).split(b"\n"):
                pair = _BFCHAR_PAIR.search(entry)
                if pair:  # skip lines that are not "<src> <dst>" entries
                    charmap[pair.group(1)] = pair.group(2)
    return charmap


# https://stackoverflow.com/questions/29467539/encoding-of-pdf-text-string
def extract_text(data, charmap):
    """Return the text shown by one decompressed stream.

    Args:
        data: decompressed stream contents (bytes).
        charmap: mapping produced by ``extract_charmap``.

    Returns:
        One decoded line per input line that ends with ``Tj`` or ``TJ``,
        joined with newlines; ``b''`` when the stream contains no ``BT``
        followed by a space or newline.
    """
    if not _BEGIN_TEXT.search(data):
        return b''
    shown = [
        extract_line(line, charmap)
        for line in data.split(b"\n")
        if line.endswith((b'Tj', b'TJ'))
    ]
    return b"\n".join(shown)


def _octal_to_utf8(match):
    """Turn one matched three-digit octal escape into UTF-8 bytes."""
    raw = num2char(match.group(1), 8, 'latin-1')
    # The byte is read as cp1252.  A few byte values have no cp1252 meaning;
    # they become U+FFFD instead of aborting the whole run.
    return raw.decode('cp1252', errors='replace').encode('utf-8')


def extract_line(line, charmap):
    """Decode one text-showing line and resolve its octal escapes.

    Args:
        line: one content-stream line ending in ``Tj`` or ``TJ`` (bytes).
        charmap: mapping produced by ``extract_charmap``.

    Returns:
        The text of the line as UTF-8 encoded bytes.
    """
    return _OCTAL_ESCAPE.sub(_octal_to_utf8, decode_line(line, charmap))


def decode_line(line, charmap):
    """Strip the operator syntax from one text-showing line.

    Hex-string operands are cut into 4-character codes, each code is replaced
    by its ``charmap`` entry (codes without an entry are dropped), and every
    4-character group of the result is converted with ``num2char``.
    Literal-string operands are returned with ``\\(`` and ``\\)`` unescaped;
    other escapes are left in place.

    Args:
        line: one content-stream line (bytes).
        charmap: mapping produced by ``extract_charmap``.

    Returns:
        The operand text as bytes.
    """
    if _HEX_SHOW.search(line):
        # remove everything except the hex digits inside angle brackets
        codes = _HEX_NOISE.sub(b'', line)
        # The replacement must be bytes: a str here makes re.sub raise.
        mapped = _HEX_CODE.sub(
            lambda m: charmap.get(m.group()) or b'', codes)
        return _HEX_CODE.sub(lambda m: num2char(m.group()), mapped)

    # remove everything except plain text inside parentheses
    text = _LITERAL_NOISE.sub(b'', line)
    # remove the escape in front of parentheses
    return _ESCAPED_PAREN.sub(rb'\1', text)


def num2char(hex, base=16, encoding='utf-8'):
    """Convert the textual number ``hex`` to one encoded character.

    Args:
        hex: digits accepted by ``int(hex, base)``.
        base: numeric base of ``hex``.
        encoding: codec used to encode the resulting character.

    Returns:
        The encoded character, or ``b'?'`` when the digits cannot be parsed,
        the value is not a valid code point, or the character cannot be
        encoded.  The offending value and the error are reported on stderr.
    """
    try:
        return chr(int(hex, base)).encode(encoding)
    except (ValueError, OverflowError, TypeError) as e:
        # Diagnostics go to stderr so they never mix with salvaged text.
        print(hex, file=sys.stderr)
        print(e, file=sys.stderr)
        return b'?'


if __name__ == "__main__":
    main(sys.argv)