Skip to content

Instantly share code, notes, and snippets.

@pjatx
Last active September 25, 2018 01:41
Show Gist options
  • Save pjatx/eeaf842259b618403a9cd649b11014c1 to your computer and use it in GitHub Desktop.
Save pjatx/eeaf842259b618403a9cd649b11014c1 to your computer and use it in GitHub Desktop.

Revisions

  1. pjatx renamed this gist Sep 25, 2018. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. pjatx revised this gist Sep 25, 2018. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -22,8 +22,6 @@
    "\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"))

    # Helper Functions


    def copy_rename(old_file_name, new_file_name):
    src_file = os.path.join(src_dir, old_file_name)
    shutil.copy(src_file, dst_dir)
    @@ -107,6 +105,5 @@ def main():

    print("All set!", '\t', str(i), '/', str(d), ' resumes processed')


    if __name__ == '__main__':
    main()
  3. pjatx renamed this gist Sep 25, 2018. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  4. pjatx created this gist Sep 25, 2018.
    112 changes: 112 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,112 @@
    # Import global broad stuff
    import os
    import os.path
    import shutil
    import re
    from optparse import OptionParser

    # Import PDF Miner specific stuff to use as library
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import StringIO

    # Working directories, both resolved relative to the current directory:
    # src_dir is the 'to-process' folder, dst_dir is the 'processed' folder.
    src_dir, dst_dir = (
        os.path.join(os.curdir, folder) for folder in ('to-process', 'processed')
    )

    # Regex to find emails. Also accepts the spelled-out " at " / " dot " forms.
    # Group 1 is the whole address; groups 2 and 3 are the "@" and "." separators.
    # Raw strings keep the backslash escapes intact without invalid-escape
    # warnings, and IGNORECASE lets the a-z classes match mixed-case addresses
    # (the text handed to this pattern is not lowercased first).
    regex = re.compile(
        r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
        r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
        r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
        re.IGNORECASE,
    )

    # Helper Functions


    def copy_rename(old_file_name, new_file_name):
        """Copy a file from ``src_dir`` into ``dst_dir`` under a new name.

        The file is copied straight to its final path instead of being copied
        under its old name and renamed afterwards. That way a file already
        sitting in ``dst_dir`` under the old name is never overwritten, and no
        stray copy is left behind if the second step fails.

        Args:
            old_file_name: Name of the file inside ``src_dir``.
            new_file_name: Name the copy gets inside ``dst_dir``.

        Raises:
            OSError: If the destination directory cannot be created or the
                file cannot be copied.
        """
        src_file = os.path.join(src_dir, old_file_name)
        new_dst_file_name = os.path.join(dst_dir, new_file_name)

        # Without an existing directory, shutil.copy would create a plain file
        # named like dst_dir instead of copying into it.
        os.makedirs(dst_dir, exist_ok=True)
        shutil.copy(src_file, new_dst_file_name)


    def get_emails(s):
        """Return a generator over the email addresses matched in string s.

        Each item is the first group (the full address) of a match of the
        module-level ``regex``.
        """
        found = (groups[0] for groups in re.findall(regex, s))
        # Drop matches that begin with '//': the pattern wrongly picks up
        # URLs such as 'http://user@host.com' as '//user@host.com'.
        return (address for address in found if not address.startswith('//'))


    def convert_pdf_to_txt(path, pages=None):
        """Extract the text of a PDF file with pdfminer.

        Args:
            path: Path of the PDF file to read.
            pages: Optional iterable of page numbers to restrict extraction
                to. When it is None or empty, an empty set is handed to
                ``PDFPage.get_pages`` (presumably meaning "all pages" --
                confirm against the pdfminer documentation).

        Returns:
            Everything the ``TextConverter`` wrote, as a single string.
        """
        pagenums = set(pages) if pages else set()

        # Context managers / finally make sure the buffer, the converter and
        # the input file are released even when pdfminer raises mid-parse.
        with StringIO() as output:
            manager = PDFResourceManager()
            converter = TextConverter(manager, output, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(manager, converter)
                with open(path, 'rb') as infile:
                    for page in PDFPage.get_pages(infile, pagenums):
                        interpreter.process_page(page)
            finally:
                converter.close()
            # Read the text only after the converter is closed, and before the
            # buffer itself is closed on leaving the with-block.
            return output.getvalue()

    # Iterate through the PDF files in src_dir ('to-process'),
    # parse them using pdf miner,
    # and copy each one to dst_dir named after the first email found in it.
    def main():
        """Rename every PDF in ``src_dir`` after the first email it contains.

        For each ``.pdf`` file in ``src_dir`` the text is extracted and
        searched for email addresses. When one is found, the file is copied
        to ``dst_dir`` as ``<email>.pdf`` and the original is removed. Files
        without an email address are skipped and left in place.

        The original is removed only after the copy succeeded; if the copy
        raises ``OSError`` the error is printed and the source file is kept.
        Any other exception propagates to the caller.
        """
        processed = 0  # PDFs successfully copied and removed from src_dir
        total = 0      # PDFs examined

        for filename in os.listdir(src_dir):
            if not filename.endswith('.pdf'):
                continue
            total += 1

            src_path = os.path.join(src_dir, filename)
            emails = get_emails(convert_pdf_to_txt(src_path))
            first_email = next(emails, None)

            if first_email is None:
                print('No email addresses found', '\t', 'skipped...')
                continue

            print(src_path, '\t', first_email)
            print("Copying...")

            new_name = first_email + '.pdf'
            print(new_name)

            try:
                copy_rename(filename, new_name)
            except OSError as err:
                print("OS error: {0}".format(err))
                # Keep the original: deleting it after a failed copy would
                # lose the file altogether.
                continue

            os.remove(src_path)

            print("Done.")
            processed += 1

        print("All set!", '\t', str(processed), '/', str(total), ' resumes processed')


    if __name__ == '__main__':
        main()