# Rename PDF resumes after the first email address found in their text.
#
# Every '*.pdf' in ./to-process is parsed with pdfminer; when an email address
# is found the file is copied to ./processed as '<email>.pdf' and the original
# is removed.

# Import global broad stuff
import os
import os.path
import shutil
import re
from optparse import OptionParser

# Import PDF Miner specific stuff to use as library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# Global Variables
src_dir = os.path.join(os.curdir, 'to-process')
dst_dir = os.path.join(os.curdir, 'processed')

# Regex to find emails. Also accepts the obfuscated "name at host dot tld"
# spelling. Raw strings keep the pattern text identical while avoiding
# invalid-escape warnings; IGNORECASE lets addresses containing capital
# letters match as a whole instead of as a lowercase fragment.
regex = re.compile(
    (r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
     r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
     r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"),
    re.IGNORECASE,
)


# Helper Functions
def copy_rename(old_file_name, new_file_name):
    """Copy a file from ``src_dir`` into ``dst_dir`` under a new name.

    Args:
        old_file_name: Name of the existing file inside ``src_dir``.
        new_file_name: Name the copy gets inside ``dst_dir``.

    Raises:
        OSError: If the source cannot be read or the destination cannot be
            written.
    """
    src_file = os.path.join(src_dir, old_file_name)
    new_dst_file_name = os.path.join(dst_dir, new_file_name)
    # Create the destination directory on first use; without it the copy
    # would produce a plain file named like the directory.
    os.makedirs(dst_dir, exist_ok=True)
    # Copy straight to the final name so no intermediate file carrying the
    # old name can clobber an unrelated file already present in dst_dir.
    shutil.copy(src_file, new_dst_file_name)


def get_emails(s):
    """Return an iterator of matched emails found in string s."""
    # Skip matches that start with '//' because the regular expression
    # mistakenly matches patterns like 'http://foo@bar.com' as '//foo@bar.com'.
    return (
        match.group(1)
        for match in regex.finditer(s)
        if not match.group(1).startswith('//')
    )


def convert_pdf_to_txt(path, pages=None):
    """Extract the text of a PDF file using pdfminer.

    Args:
        path: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. When omitted or empty an empty set is
            passed (presumably pdfminer then yields every page -- confirm
            against the pdfminer documentation).

    Returns:
        The extracted text as a single string.
    """
    pagenums = set(pages) if pages else set()
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(path, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            # Close the converter even when parsing fails, and before the
            # buffer is read, matching the original call order.
            converter.close()
        return output.getvalue()


# Iterate through files in source directory/to-process
# Parse them using pdf miner
# Copy to other
def main():
    """Rename every PDF in ``src_dir`` after the first email found in it.

    A file is removed from ``src_dir`` only after its copy in ``dst_dir``
    succeeded; files without an email address, or whose copy failed, are left
    untouched. A summary "<renamed> / <seen>" is printed at the end.
    """
    processed = 0  # PDFs successfully copied and removed from src_dir
    total = 0      # PDFs encountered in src_dir
    for filename in os.listdir(src_dir):
        if not filename.endswith('.pdf'):
            continue
        total += 1
        src_path = os.path.join(src_dir, filename)
        parsed = convert_pdf_to_txt(src_path)
        first_email = next(get_emails(parsed), None)
        if first_email is None:
            print('No email addresses found', '\t', 'skipped...')
            continue
        print(src_path, '\t', first_email)
        print("Copying...")
        # The pattern allows '/' in the local part; neutralise it so the
        # destination always stays a plain file name inside dst_dir.
        new_name = first_email.replace('/', '_') + '.pdf'
        print(new_name)
        try:
            copy_rename(filename, new_name)
        except OSError as err:
            print("OS error: {0}".format(err))
            # Keep the source: deleting it after a failed copy loses the file.
            continue
        os.remove(src_path)
        print("Done.")
        processed += 1
    print("All set!", '\t', str(processed), '/', str(total),
          ' resumes processed')


if __name__ == '__main__':
    main()