from selenium import webdriver from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.common.keys import Keys import time, csv def isReady(browser): return browser.execute_script("return document.readyState")=="complete" browser = webdriver.Firefox() # Get local session of firefox browser.get("http://www.gsb.stanford.edu/facultyprofiles") # Load page while not isReady(browser): time.sleep(1) print browser.title faculty = browser.find_elements_by_xpath('//span[@class="views-field views-field-field-person-last-name-1"]//a') ranks = browser.find_elements_by_xpath('//span[@class="views-field views-field-field-official-rank"]') allfaculty = [{'Label': f.text, 'Rank': r.text, 'Link': f.get_attribute('href')} for (f,r) in zip(faculty, ranks)] updatedfaculty = [] for (i,d) in enumerate(allfaculty): browser.get(d['Link']) # Load page while not isReady(browser): time.sleep(1) try: d["Blurb"] = browser.find_elements_by_xpath('//div[@id="profile-summary-callout"]')[0].text except: d["Blurb"] = "" d["ID"] = i allfaculty[i] = d dw = csv.DictWriter(open('gsb_nodes.csv','w'),fieldnames=['ID','Label','Rank','Link','Blurb']) dw.writeheader() for d in allfaculty: for k in d: try: d[k] = d[k].encode('UTF-8') except: k dw.writerow(d) from nltk.corpus import stopwords import nltk, string porter = nltk.PorterStemmer() stopwords = nltk.corpus.stopwords.words('english') stopwords.append('research') stopwords.append('interest') for i,d in enumerate(allfaculty): b = d["Blurb"] tokens = nltk.word_tokenize(b.translate(string.maketrans("",""), string.punctuation).lower()) s = set() for t in set(tokens): if t not in stopwords: s.add(porter.stem(t.lower())) allfaculty[i]["blurbset"] = s dw = csv.DictWriter(open('gsb_edges.csv','w'),fieldnames=['Source','Target','Weight']) dw.writeheader() for i,b in enumerate(allfaculty): for j,bb in enumerate(allfaculty[i+1:]): intersection = (b["blurbset"].intersection(bb["blurbset"])) if len(intersection)>0: print i,j,len(intersection) dw.writerow({'Source': b["ID"], 'Target': bb["ID"], 'Weight': len(intersection)}) print i