Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Last active August 29, 2015 13:59
Show Gist options
  • Save alexstorer/10993204 to your computer and use it in GitHub Desktop.
Save alexstorer/10993204 to your computer and use it in GitHub Desktop.
Download the GSB profiles and then look at the intersection of their research terms to get a basic idea of whether they are connected. The scraping is in the first half of the document, and the processing is in the second half.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
import time, csv
def isReady(browser):
    """Return True once the page loaded in *browser* has finished loading.

    Runs ``return document.readyState`` in the page through
    ``browser.execute_script`` and compares the result with ``"complete"``.

    browser -- any object exposing ``execute_script(script)``; in this
               script it is the selenium Firefox driver.
    """
    return browser.execute_script("return document.readyState") == "complete"
# --- Scraping half -----------------------------------------------------------
# Collect one dict per faculty member (Label, Rank, Link, Blurb, ID) from the
# GSB faculty listing, then dump them to gsb_nodes.csv.
# NOTE: this script targets Python 2 (byte-oriented csv module).


def _load_page(browser, url):
    """Navigate *browser* to *url* and poll once a second until isReady()."""
    browser.get(url)  # Load page
    while not isReady(browser):
        time.sleep(1)


browser = webdriver.Firefox()  # Get local session of firefox
try:
    _load_page(browser, "http://www.gsb.stanford.edu/facultyprofiles")
    print(browser.title)
    faculty = browser.find_elements_by_xpath('//span[@class="views-field views-field-field-person-last-name-1"]//a')
    ranks = browser.find_elements_by_xpath('//span[@class="views-field views-field-field-official-rank"]')
    # Text and href are copied out of the elements here, before the browser
    # navigates away from the listing page in the loop below.
    # NOTE(review): zip() pairs names and ranks purely by position -- confirm
    # the listing always yields one rank element per name element.
    allfaculty = [{'Label': f.text, 'Rank': r.text, 'Link': f.get_attribute('href')}
                  for (f, r) in zip(faculty, ranks)]
    for (i, d) in enumerate(allfaculty):
        _load_page(browser, d['Link'])
        try:
            d["Blurb"] = browser.find_elements_by_xpath('//div[@id="profile-summary-callout"]')[0].text
        except Exception:
            # Best effort: a profile without the summary callout (IndexError)
            # or any driver error gives an empty blurb. Unlike a bare
            # ``except:`` this does not swallow KeyboardInterrupt/SystemExit.
            d["Blurb"] = ""
        d["ID"] = i
finally:
    # Always release the Firefox session, even if scraping fails midway.
    browser.quit()

# Python 2's csv module expects the file to be opened in binary mode; the
# ``with`` block guarantees the file is flushed and closed.
with open('gsb_nodes.csv', 'wb') as nodes_file:
    dw = csv.DictWriter(nodes_file, fieldnames=['ID', 'Label', 'Rank', 'Link', 'Blurb'])
    dw.writeheader()
    for d in allfaculty:
        # Encode text values to UTF-8 in place, so the dicts kept in
        # allfaculty hold encoded strings from here on.
        for k in d:
            try:
                d[k] = d[k].encode('UTF-8')
            except (AttributeError, UnicodeError):
                # Values without .encode (e.g. the integer ID) or that cannot
                # be encoded are written unchanged.
                pass
        dw.writerow(d)
from nltk.corpus import stopwords
import nltk, string
# --- Processing half ---------------------------------------------------------
# Attach to every faculty dict a "blurbset": the set of Porter stems of the
# non-stopword tokens in its blurb.
porter = nltk.PorterStemmer()
# A set makes the membership tests in the loop below constant time.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update(['research', 'interest'])
# Identity table for the Python 2 ``str.translate(table, deletechars)`` form;
# built once instead of once per blurb.
identity_table = string.maketrans("", "")
for d in allfaculty:
    b = d["Blurb"]
    # Drop punctuation, lowercase, then tokenize.
    tokens = nltk.word_tokenize(b.translate(identity_table, string.punctuation).lower())
    s = set()
    for t in set(tokens):
        if t in stopwords:
            continue
        stem = porter.stem(t)
        # Also check the stem: otherwise "interests" or "researcher" would
        # slip past the custom stopwords and be added as "interest"/"research".
        if stem not in stopwords:
            s.add(stem)
    d["blurbset"] = s
# Write one edge per unordered pair of faculty whose blurbsets share at least
# one stem; the edge weight is the number of shared stems.
# Python 2's csv module expects binary mode; ``with`` closes the file.
with open('gsb_edges.csv', 'wb') as edges_file:
    dw = csv.DictWriter(edges_file, fieldnames=['Source', 'Target', 'Weight'])
    dw.writeheader()
    for i, b in enumerate(allfaculty):
        # Only look at entries after i so each pair is visited once.
        for bb in allfaculty[i + 1:]:
            weight = len(b["blurbset"] & bb["blurbset"])
            if weight > 0:
                # Report the real node IDs (not an offset into the slice).
                print("%s %s %d" % (b["ID"], bb["ID"], weight))
                dw.writerow({'Source': b["ID"], 'Target': bb["ID"], 'Weight': weight})
        print(i)  # progress: index of the source node just finished
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment