# Import global broad stuff
import os
import os.path
import shutil
import re
from optparse import OptionParser

# Import PDF Miner specific stuff to use as library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# Global Variables
# Directory scanned for incoming PDFs and directory that receives the
# renamed copies; both are resolved relative to the current directory.
src_dir = os.path.join(os.curdir, 'to-process')
dst_dir = os.path.join(os.curdir, 'processed')

# Regex to find emails.
# Group 1 is the whole address, group 2 is the separator between local part
# and domain ('@' or a whitespace-delimited 'at'), group 3 is the last domain
# label separator ('.' or a whitespace-delimited 'dot').
# Raw strings are used so that the regex escapes (\/ \. \s) are not parsed as
# (invalid) Python string escapes; the resulting pattern text is unchanged.
# Only lowercase letters are matched.
regex = re.compile(
    r"([a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*"
    r"(@|\sat\s)"
    r"(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|\sdot\s))+"
    r"[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)"
)

# Helper Functions
def copy_rename(old_file_name, new_file_name):
  """Copy a file from src_dir into dst_dir under a new name.

  Args:
    old_file_name: Name of the file inside src_dir.
    new_file_name: Name the copy gets inside dst_dir.

  Raises:
    OSError: If the source cannot be read or the destination cannot be
      written.

  An existing file at the destination path is overwritten.
  """
  src_file = os.path.join(src_dir, old_file_name)
  new_dst_file_name = os.path.join(dst_dir, new_file_name)

  # Without the directory, shutil.copy would treat dst_dir itself as the
  # target file name and create a stray file instead of failing.
  os.makedirs(dst_dir, exist_ok=True)

  # Copy straight to the final path: no intermediate file under the old name,
  # so an unrelated file already called old_file_name in dst_dir is untouched.
  shutil.copy(src_file, new_dst_file_name)


def get_emails(s):
  """Return a lazy iterator over the email addresses matched in string s."""
  # Group 1 of the module-level pattern holds the complete address.
  candidates = (match.group(1) for match in regex.finditer(s))
  # The pattern picks up 'http://foo@bar.com' as '//foo@bar.com'; such
  # URL fragments are dropped by rejecting anything that begins with '//'.
  return (address for address in candidates if not address.startswith('//'))


def convert_pdf_to_txt(path, pages=None):
  """Extract the text of a PDF file using pdfminer.

  Args:
    path: Path of the PDF file to read.
    pages: Optional iterable of page numbers handed to PDFPage.get_pages.
      A falsy value results in an empty set being passed (presumably meaning
      "all pages" -- confirm against the pdfminer documentation).

  Returns:
    The text written by the TextConverter, as a single string.
  """
  pagenums = set(pages) if pages else set()

  output = StringIO()
  try:
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    try:
      interpreter = PDFPageInterpreter(manager, converter)
      # 'with' guarantees the file handle is released even when pdfminer
      # raises on a malformed document.
      with open(path, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
          interpreter.process_page(page)
    finally:
      converter.close()
    # Read the buffer only after the converter is closed, as before.
    return output.getvalue()
  finally:
    output.close()

# Iterate through files in source directory/to-process
# Parse them using pdf miner
# Copy to other
def main():
  """Rename every PDF in src_dir after the first email address it contains.

  For each '*.pdf' file in src_dir the text is extracted, the first email
  address is looked up, and the file is copied to dst_dir as
  '<email>.pdf'. The original is deleted only after the copy succeeded.
  Files without an email address, or whose copy failed, are left in place.
  A summary of processed / seen files is printed at the end.
  """
  i = 0  # files copied and removed from src_dir
  d = 0  # PDF files seen

  for filename in os.listdir(src_dir):
    if not filename.endswith('.pdf'):
      continue
    d += 1

    path = os.path.join(src_dir, filename)
    parsed = convert_pdf_to_txt(path)
    first_email = next(get_emails(parsed), None)

    if first_email is None:
      print('No email addresses found', '\t', 'skipped...')
      continue

    print(path, '\t', first_email)
    print("Copying...")

    # The email pattern allows '/' in the local part; replace it so text
    # found inside a PDF cannot redirect the destination path.
    new_name = first_email.replace('/', '_') + '.pdf'
    print(new_name)

    try:
      copy_rename(filename, new_name)
    except OSError as err:
      print("OS error: {0}".format(err))
      # Keep the source file: deleting it after a failed copy loses it.
      continue
    except ValueError as err:
      print("Invalid file name: {0}".format(err))
      continue

    os.remove(path)

    print("Done.")
    i += 1

  print("All set!", '\t', str(i), '/', str(d), ' resumes processed')

if __name__ == '__main__':
  main()