''' Created on 2012-3-7 @author: GeZiyang ''' from urllib import urlopen import urllib2 import cookielib import re import time import thread import threading NAMELIST=[] #get search by shortName result ids def getLinks(shortName): searchInterface = "http://ieeexplore.ieee.org/xpl/conferences.jsp?queryText=" try: resultPage = urlopen(searchInterface + shortName) except: return False resultText = resultPage.read() matchGroup = re.findall('/xpl/conhome\.jsp\?punumber=(\d+)',resultText,re.IGNORECASE) return matchGroup #list #get search by shortName result full name list def getConfName(shortName): global NAMELIST list = getLinks(shortName) if(list == False): return False if(list == None): print("No matched conference!") return [] else: namelist = [] for id in list: try: confPage = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + id) except: return False confText = confPage.read() matchGroup = re.findall('

(.*?)

',confText,re.IGNORECASE) namelist.append((matchGroup[0],id)) NAMELIST=namelist return namelist #get all about the conference in history def getHisLinks(result_id): try: confPage = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + str(result_id)) except: return False confText = confPage.read() conf_ids = re.findall('mostRecentIssue\.jsp\?punumber=(\d+)',confText,re.IGNORECASE) return conf_ids # def getPaperLinks(conf_id): #first request to get element oqs firstreq = "http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=" + str(conf_id) try: firstPage = urlopen(firstreq) except: return False firstText = firstPage.read() oqs = re.findall('id=\"oqs\"\s*value=\"(.*?)\"',firstText,re.IGNORECASE) secondreq = "http://ieeexplore.ieee.org/xpl/tocresult.jsp?" + oqs[0] + "&rowsPerPage=1000&pageNumber=1" #suppose the number of paper is less than 1000 try: paperListPage = urlopen(secondreq) except: return False paperText = paperListPage.read() paperlist = re.findall('/xpl/articleDetails\.jsp\?tp=&arnumber=(\d+)',paperText,re.IGNORECASE) paperset = set(paperlist) paperlist = list(paperset) #delete the same return paperlist #list #place to catch exception def getAbstract(paper_id): requrl = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=" + str(paper_id) try: paperPage = urlopen(requrl) paperText = paperPage.read() except Exception: paperText = "" title = re.findall('

\s(.+?)\s

',paperText,re.IGNORECASE) abs = re.findall('

Abstract

\s*\s*

(.+?)

',paperText,re.IGNORECASE) print abs #debug if title == [] or abs == []:#network exception return "empty###empty" abs[0] = title[0] + "###" + abs[0] return abs[0] def getNameById(conf_home): try: confPage = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + str(conf_home)) except: return False confText = confPage.read() matchGroup = re.findall('

(.*?)

',confText,re.IGNORECASE) return matchGroup[0] def buildNewDb(conf_home): absDb = {} #dic conf_name = getNameById(conf_home) count = 1 while conf_name == False: conf_name = getNameById(conf_home) count += 1 if count > 5: # try to get full name 5 times return False conf_all = getHisLinks(conf_home) count = 1 while conf_all == False: conf_all = getHisLinks(conf_home) count += 1 if count > 5: # try to getHisLinks 5 times return False paper_all = [] for conf_id in conf_all: paper_ids = getPaperLinks(conf_id) count = 1 while paper_ids == False: paper_ids = getPaperLinks(conf_id) count += 1 if count > 5: # try to getPaperLinks 5 times return False paper_all += paper_ids for paper_id in paper_all: abs = getAbstract(paper_id) if(abs == "empty###empty"):#mostly caused by network error print "None Abstract :%d ,retrying..." % int(paper_id) count = 1 while abs == "empty###empty": abs = getAbstract(paper_id) count += 1 if count > 5: print "cann't get the abstract of %d" % int(paper_id) #mostly caused by no abstract break absDb[paper_id] = abs print "%d:%s" % (int(paper_id),abs) #debug print "start writing file..." #debug file = open("confs/"+ str(conf_home),'w') for conf_id in conf_all: file.write(conf_id + " ") file.write("\n") for paper_id in paper_all: if(absDb[paper_id] == "empty###empty"): continue file.write(paper_id + "\n") file.write(absDb[paper_id] + "\n") file.close() indexfile = open("confs/confindex",'a') indexfile.write(str(conf_home) + '\n') indexfile.write(str(conf_name) + '\n') indexfile.close() # add write to index print "finished." return True def getFileDb(conf_home): absDb = {} try: file = open("confs/" + str(conf_home)) file.readline() while True: line = file.readline() if not line:break absDb[line.split()[0]] = file.readline() file.close() return absDb except Exception: print "No database file" return False def checkUpdate(conf_home): conf_all = getHisLinks(conf_home) try: file = open("confs/" + str(conf_home)) conflist = file.readline().split() if(set(conf_all) == set(conflist)): return True #can use else: return False # need for update except: print "no database yet" return False def searchPaper(conf_id,keylist): print keylist time.sleep(5) resultdic = {} absDb = getFileDb(conf_id) if keylist == []: return absDb for paper_id,title_abs in absDb.items(): match = True try: for key in keylist: if title_abs.find(key) >= 0: continue else: match = False break if match: resultdic[paper_id] = title_abs print paper_id print resultdic[paper_id] except: continue return resultdic #check for auth, if success,save to cookiefile def authCheck(cookiefile): cookies = cookielib.MozillaCookieJar(cookiefile) cookiehand = urllib2.HTTPCookieProcessor(cookies) opener = urllib2.build_opener(cookiehand) opener.addheaders = [("User-Agent","Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"), ("Host","ieeexplore.ieee.org")] try: opener.open("http://ieeexplore.ieee.org/xpl/conferences.jsp") except: print "network error" return False for item in cookies: if item.name == "xploreCookies": print "auth success!" cookies.save(cookiefile, ignore_discard=True, ignore_expires=True) return True return False def downWithCookies(cookiefile,paper_id): abs = getAbstract(paper_id) if(abs == "empty###empty"):#mostly caused by network error print "None Abstract :%d ,retrying..." % int(paper_id) count = 1 while abs == "empty###empty": abs = getAbstract(paper_id) count += 1 if count > 5: print "cann't get the abstract of %d" % int(paper_id) #mostly caused by no abstract break cookies = cookielib.MozillaCookieJar(cookiefile) cookies.load(cookiefile, ignore_discard=True, ignore_expires=True) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies)) page = opener.open("http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=" + str(paper_id)) pagetext = page.read() matchGroup = re.findall('

(.*?)

\s*(.+?)\s*

Abstract

(.*?)

\s(.+?)\s