Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save DrOctogon/ccbcacaeb47c03cb8cc2 to your computer and use it in GitHub Desktop.
Save DrOctogon/ccbcacaeb47c03cb8cc2 to your computer and use it in GitHub Desktop.

Revisions

  1. @leonardreidy leonardreidy revised this gist Jul 5, 2013. No changes.
  2. @leonardreidy leonardreidy revised this gist Jul 5, 2013. 1 changed file with 58 additions and 38 deletions.
    96 changes: 58 additions & 38 deletions prep-contacts-for-ponymailer
    Original file line number Diff line number Diff line change
    @@ -1,46 +1,66 @@
    # This represents a major refinement of the scripts to extract names and emails
    # and as soon as I'm sure it is complete, then I will delete the corresponding
    # numbers from my GistBox - 07/05/2013
    # A simple python script to extract names, and emails from
    # a certain online directory

    import os
    import json
    import os, json
    from bs4 import BeautifulSoup

    # choose file and assign to infile variable
    # NOTE(review): both assignments below have nothing on the right-hand side
    # (the path is only a trailing comment), so a real path must be filled in
    # before these lines are valid Python.
    infile = # path/to/infile
    outfile =# path/to/outfile

    # open the input file and hand it to BeautifulSoup for parsing
    file = open(infile, 'r')
    soup = BeautifulSoup(file)
    # select every <strong> element, and every anchor whose href starts with "mailto"
    strongs = soup.select('strong')
    mailtos = soup.select('a[href^=mailto]')
    # empty containers for the steps that follow
    prenames = []
    names = []
    emails = []
    jsondump = []

    # Extract names
    # collect the .string of every child of every <strong> element
    # NOTE(review): indentation is not visible in this listing; the three lines
    # below are presumably nested one inside the other — confirm.
    for i in strongs:
    for j in i:
    prenames.append(j.string)
    #get a list of the files in the current directory
    inputfiles = os.listdir(os.getcwd())

    def postproc(inputfiles):
        """Run preproc on every entry of ``inputfiles``.

        The entry at position N is written to ``out<N>.txt``.

        Args:
            inputfiles: iterable of input file paths.
        """
        # enumerate() supplies each entry's own position. list.index() rescanned
        # the list on every iteration and returned the first match, so repeated
        # entries would all have been sent to the same output file.
        for position, infile in enumerate(inputfiles):
            # call the preproc function on said file and generate the appropriate outfile
            preproc(infile, "out" + str(position) + ".txt")

    def preproc(infile, outfile):
    # Parse the file at `infile`, collect names from <strong> elements and
    # addresses from mailto links, pair them up, and write the pairs to
    # `outfile` as JSON.
    #
    # NOTE(review): indentation is not visible in this listing, and several
    # statements below appear twice — presumably the removed and added sides
    # of a diff shown together. Confirm against the raw file before running.

    # open the infile for reading
    # (this handle is not explicitly closed anywhere in the function)
    file = open(infile, 'r')

    # convert the infile to soup object
    soup = BeautifulSoup(file)

    # find all <strong></strong> elements
    strongs = soup.select('strong')

    # find all mailto (email) elements: anchors whose href starts with "mailto"
    mailtos = soup.select('a[href^=mailto]')

    # prep variables for subsequent stages in the process
    prenames = []
    names = []
    emails = []
    contactzip = []
    jsondump = []

    # Extract names
    # collect the .string of every child of every <strong> element
    for i in strongs:
    for j in i:
    prenames.append(j.string)

    # keep the entries whose index is odd, skipping None strings and bare newlines
    # NOTE: list.index() returns the position of the first equal item, so a
    # repeated value is always judged by its first occurrence.
    for i in prenames:
    if prenames.index(i)%2 != 0:
    if i.string != None:
    if i != '\n':
    names.append(i.string.encode('utf-8').strip())
    for i in prenames:
    if prenames.index(i)%2 != 0:
    if i.string != None:
    if i != '\n':
    names.append(i.string.encode('utf-8').strip())

    # Extract emails
    for i in mailtos:
    if i.string != None:
    emails.append(i.string.encode('utf-8').strip())
    # Extract emails
    # take the text of each mailto link, UTF-8 encoded and stripped
    for i in mailtos:
    if i.string != None:
    emails.append(i.string.encode('utf-8').strip())

    # zip together names,emails into a list of lists
    jsondump = zip(names,emails)
    # zip together emails,names into a list of pairs (email first)
    # NOTE(review): on Python 3 zip() returns an iterator that json.dumps
    # cannot serialise — this presumably targets Python 2; confirm.
    contactzip = zip(emails, names)

    # convert list of lists to json for processing by ponymailer
    jsondump = json.dumps(jsondump)
    # convert list of lists to json for processing by ponymailer
    jsondump = json.dumps(contactzip)

    # write to file
    with open(outfile, 'w') as file:
    file.write(jsondump)

    # write to file
    # (rebinds the name `file`, which earlier held the input handle)
    with open(outfile, 'w') as file:
    file.write(jsondump)
    # run the script: call postproc on the inputfiles list
    postproc(inputfiles)
  3. @leonardreidy leonardreidy created this gist Jul 5, 2013.
    46 changes: 46 additions & 0 deletions prep-contacts-for-ponymailer
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,46 @@
    # This represents a major refinement of the scripts to extract names and emails
    # and as soon as I'm sure it is complete, then I will delete the corresponding
    # numbers from my GistBox - 07/05/2013

    import os
    import json
    from bs4 import BeautifulSoup

    # Choose the input HTML file and the JSON output file.
    # The assignments originally had no value at all (a syntax error), so
    # replace these placeholder strings with real paths before running.
    infile = "path/to/infile"
    outfile = "path/to/outfile"

    # Parse the input document. The context manager closes the handle as soon
    # as parsing is done instead of leaving it open for the rest of the script.
    with open(infile, 'r') as source:
        soup = BeautifulSoup(source)

    # <strong> elements are the source of the names; anchors whose href starts
    # with "mailto" are the source of the addresses.
    strongs = soup.select('strong')
    mailtos = soup.select('a[href^=mailto]')

    # Extract names: collect the .string of every child of every <strong>.
    prenames = [child.string for strong in strongs for child in strong]

    # Keep the entries at odd positions, skipping missing strings and bare
    # newlines. enumerate() supplies the real position; list.index() returned
    # the first occurrence, so repeated values were judged by the wrong index
    # and a None entry crashed on attribute access.
    names = [
        text.strip()
        for position, text in enumerate(prenames)
        if position % 2 != 0 and text is not None and text != '\n'
    ]

    # Extract emails: the text of each mailto link that has one.
    emails = [link.string.strip() for link in mailtos if link.string is not None]

    # Pair each name with an email. list() materialises the pairs because on
    # Python 3 zip() is a lazy iterator that json.dumps cannot serialise.
    contacts = list(zip(names, emails))

    # Convert the pairs to JSON for processing by ponymailer. Text is kept as
    # str rather than UTF-8 bytes, which json.dumps rejects on Python 3.
    jsondump = json.dumps(contacts)

    # write to file
    with open(outfile, 'w') as out:
        out.write(jsondump)