Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Last active August 29, 2015 13:59
Show Gist options
  • Save alexstorer/10993204 to your computer and use it in GitHub Desktop.
Save alexstorer/10993204 to your computer and use it in GitHub Desktop.

Revisions

  1. alexstorer revised this gist Apr 17, 2014. 1 changed file with 266 additions and 0 deletions.
    266 changes: 266 additions & 0 deletions gsb_nodes.csv
    266 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
  2. alexstorer created this gist Apr 17, 2014.
    71 changes: 71 additions & 0 deletions gsb_faculty.py
    Original file line number | Diff line number | Diff line change
    @@ -0,0 +1,71 @@
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.keys import Keys
    import time, csv

    def isReady(browser):
        """Return True when the page currently loaded in *browser* is complete.

        Runs ``return document.readyState`` in the page through
        ``browser.execute_script`` and compares the result with ``"complete"``.
        """
        return browser.execute_script("return document.readyState") == "complete"

    def _wait_until_loaded(browser):
        """Poll once per second until isReady() reports the page as loaded."""
        while not isReady(browser):
            time.sleep(1)


    # Scrape the faculty listing, then visit every profile page to collect its
    # summary blurb.  The result is `allfaculty`: a list of dicts with the keys
    # 'Label', 'Rank', 'Link', 'Blurb' and 'ID'.
    browser = webdriver.Firefox()  # Get local session of firefox
    try:
        browser.get("http://www.gsb.stanford.edu/facultyprofiles")  # Load page
        _wait_until_loaded(browser)

        print(browser.title)

        # Name links and rank labels are located separately and paired by position.
        faculty = browser.find_elements_by_xpath('//span[@class="views-field views-field-field-person-last-name-1"]//a')
        ranks = browser.find_elements_by_xpath('//span[@class="views-field views-field-field-official-rank"]')

        # NOTE(review): zip() stops at the shorter list, so this presumably relies
        # on every listed person having exactly one rank span -- verify.
        allfaculty = [{'Label': f.text, 'Rank': r.text, 'Link': f.get_attribute('href')} for (f, r) in zip(faculty, ranks)]

        updatedfaculty = []  # never read anywhere in this script; kept as-is

        for (i, d) in enumerate(allfaculty):
            browser.get(d['Link'])  # Load page
            _wait_until_loaded(browser)
            try:
                d["Blurb"] = browser.find_elements_by_xpath('//div[@id="profile-summary-callout"]')[0].text
            except Exception:
                # Best effort: a profile without a readable summary callout gets
                # an empty blurb.  `Exception` (not a bare except) so that Ctrl-C
                # still interrupts the crawl.
                d["Blurb"] = ""
            # `d` is the very dict stored in allfaculty, so it is updated in place.
            d["ID"] = i
    finally:
        # Always close the Firefox session, even when scraping fails midway.
        browser.quit()

    # Write the node list: one CSV row per faculty member.
    #
    # Text values are UTF-8 encoded *in place* inside each dict, so the dicts in
    # `allfaculty` hold byte strings afterwards; the tokenising step further
    # down calls the two-argument str.translate on d["Blurb"], which needs that.
    # Python 2's csv module expects the file to be opened in binary mode.
    with open('gsb_nodes.csv', 'wb') as nodes_file:
        dw = csv.DictWriter(nodes_file, fieldnames=['ID', 'Label', 'Rank', 'Link', 'Blurb'])
        dw.writeheader()
        for d in allfaculty:
            for k in d:
                try:
                    d[k] = d[k].encode('UTF-8')
                except (AttributeError, UnicodeError):
                    # Values without a usable .encode (e.g. the integer ID) are
                    # written unchanged.
                    pass
            dw.writerow(d)

    from nltk.corpus import stopwords
    import nltk, string
    # Attach to every faculty dict a "blurbset": the set of Porter stems of the
    # distinct, non-stop-word tokens found in its blurb.
    porter = nltk.PorterStemmer()
    # A set makes the membership test in the token loop O(1) instead of a list scan.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords.add('research')
    stopwords.add('interest')

    # Identity table for the two-argument (Python 2) str.translate, which then
    # only deletes the punctuation characters; built once instead of per blurb.
    identity_table = string.maketrans("", "")

    for i, d in enumerate(allfaculty):
        b = d["Blurb"]
        # The blurb is lower-cased before tokenising, so the tokens need no
        # further case folding.
        tokens = nltk.word_tokenize(b.translate(identity_table, string.punctuation).lower())
        # NOTE(review): stop words are filtered *before* stemming, so a token such
        # as "interests" is not matched by the extra 'interest' entry -- confirm
        # whether that is intended.
        allfaculty[i]["blurbset"] = {porter.stem(t) for t in set(tokens) if t not in stopwords}

    # Write the edge list: one row for every pair of faculty members whose
    # blurb stem sets overlap, weighted by the number of shared stems.
    # Python 2's csv module expects the file to be opened in binary mode.
    with open('gsb_edges.csv', 'wb') as edges_file:
        dw = csv.DictWriter(edges_file, fieldnames=['Source', 'Target', 'Weight'])
        dw.writeheader()

        for i, b in enumerate(allfaculty):
            # Only look at entries after position i so each unordered pair is
            # visited once; starting the count at i + 1 makes j the partner's
            # real position in allfaculty rather than an offset into the slice.
            for j, bb in enumerate(allfaculty[i + 1:], i + 1):
                intersection = b["blurbset"] & bb["blurbset"]
                if intersection:
                    print("%s %s %s" % (i, j, len(intersection)))
                    dw.writerow({'Source': b["ID"], 'Target': bb["ID"], 'Weight': len(intersection)})
            # NOTE(review): the original indentation was lost; this is assumed to
            # be a per-source progress line rather than a single final print.
            print(i)