Last active
September 25, 2018 01:41
-
-
Save pjatx/eeaf842259b618403a9cd649b11014c1 to your computer and use it in GitHub Desktop.
Revisions
-
pjatx renamed this gist
Sep 25, 2018 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
pjatx revised this gist
Sep 25, 2018 . 1 changed file with 0 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -22,8 +22,6 @@ "\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)")) # Helper Functions def copy_rename(old_file_name, new_file_name): src_file = os.path.join(src_dir, old_file_name) shutil.copy(src_file, dst_dir) @@ -107,6 +105,5 @@ def main(): print("All set!", '\t', str(i), '/', str(d), ' resumes processed') if __name__ == '__main__': main() -
pjatx renamed this gist
Sep 25, 2018 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
pjatx created this gist
Sep 25, 2018 .There are no files selected for viewing
"""Rename PDF resumes after the first e-mail address found in their text.

Every PDF in ``./to-process`` is converted to text with pdfminer. When an
e-mail address is found, the file is copied to ``./processed`` as
``<address>.pdf`` and, only if that copy succeeded, the original is deleted.
"""

# Import global broad stuff
import os
import os.path
import shutil
import re
from optparse import OptionParser

# Import PDF Miner specific stuff to use as library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# Global Variables
src_dir = os.path.join(os.curdir, 'to-process')
dst_dir = os.path.join(os.curdir, 'processed')

# Regex to find emails.
# Raw strings keep the pattern text identical to the previous non-raw
# literals while avoiding invalid-escape warnings for `\/`, `\.` and `\s`.
# NOTE: the character classes are lowercase-only and no IGNORECASE flag is
# set, so addresses containing capital letters are not matched in full.
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
    r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
    r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)")


# Helper Functions
def copy_rename(old_file_name, new_file_name):
    """Copy ``src_dir/old_file_name`` to ``dst_dir/new_file_name``.

    The destination directory is created when missing; without that,
    ``shutil.copy`` would create a plain *file* called like ``dst_dir``.
    The copy is written straight to its final name, so no intermediate
    ``dst_dir/old_file_name`` is created or clobbered. An existing file at
    the destination path is overwritten.

    Args:
        old_file_name: Base name of the file inside ``src_dir``.
        new_file_name: Base name the copy gets inside ``dst_dir``.

    Raises:
        OSError: If the source cannot be read or the copy cannot be written.
    """
    os.makedirs(dst_dir, exist_ok=True)
    src_file = os.path.join(src_dir, old_file_name)
    new_dst_file_name = os.path.join(dst_dir, new_file_name)
    shutil.copy(src_file, new_dst_file_name)


def get_emails(s):
    """Returns an iterator of matched emails found in string s."""
    # Removing lines that start with '//' because the regular expression
    # mistakenly matches patterns like 'http://user@host' as '//user@host'.
    # findall() yields one tuple per match; group 0 is the whole address.
    return (email[0] for email in regex.findall(s)
            if not email[0].startswith('//'))


def convert_pdf_to_txt(path, pages=None):
    """Extract the text of a PDF file with pdfminer.

    Args:
        path: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; a falsy value passes an empty set.

    Returns:
        The extracted text as a single string.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        # The context manager closes the PDF even when parsing raises.
        with open(path, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        return output.getvalue()
    finally:
        converter.close()
        output.close()


def _email_to_file_name(email):
    """Build the destination file name ``<email>.pdf`` for an address.

    The regex accepts '/' inside an address and its ' at ' / ' dot ' forms
    accept any whitespace (including newlines). Path separators are replaced
    so text from a PDF can never steer the copy outside ``dst_dir``, and
    whitespace runs are collapsed to single spaces.
    """
    flattened = ' '.join(email.split())
    return re.sub(r'[\\/]', '_', flattened) + '.pdf'


# Iterate through files in source directory/to-process
# Parse them using pdf miner
# Copy to other
def main():
    """Process every PDF in ``src_dir`` and print a summary.

    A source file is deleted only after it has been copied successfully;
    files without an address, or whose copy failed, are left in place.
    """
    i = 0  # resumes copied and removed from src_dir
    d = 0  # PDF files seen

    for filename in os.listdir(src_dir):
        if not filename.lower().endswith('.pdf'):
            continue
        d += 1

        src_path = os.path.join(src_dir, filename)
        parsed = convert_pdf_to_txt(src_path)
        first_email = next(get_emails(parsed), None)

        if first_email is None:
            print('No email addresses found', '\t', 'skipped...')
            continue

        print(src_path, '\t', first_email)
        print("Copying...")
        new_name = _email_to_file_name(first_email)
        print(new_name)

        try:
            copy_rename(filename, new_name)
        except OSError as err:
            print("OS error: {0}".format(err))
            continue  # keep the original: it was not copied
        except ValueError as err:
            print("Invalid file name: {0}".format(err))
            continue  # keep the original: it was not copied
        except Exception as err:
            print("Unexpected error:", type(err))
            raise

        os.remove(src_path)
        print("Done.")
        i += 1

    print("All set!", '\t', str(i), '/', str(d), ' resumes processed')


if __name__ == '__main__':
    main()