@@ -0,0 +1,349 @@
'''
Created on 2012-3-7

@author: GeZiyang
'''
from urllib import urlopen
import urllib2
import cookielib
import re
import time
import thread
import threading

NAMELIST = []
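# Overall flow (a summary of the functions below): search a conference by its
# short name (getLinks / getConfName), enumerate its past editions
# (getHisLinks) and their papers (getPaperLinks), crawl titles and abstracts
# into a local database file (buildNewDb), query that file offline
# (searchPaper), and download PDFs with saved cookies (authCheck /
# downWithCookies / timingDown / threadDown).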
# Search IEEE Xplore by a conference short name and return the matching
# conference ids (punumber values), or False on a network error.
def getLinks(shortName):
    searchInterface = "http://ieeexplore.ieee.org/xpl/conferences.jsp?queryText="
    try:
        resultPage = urlopen(searchInterface + shortName)
    except:
        return False
    resultText = resultPage.read()
    matchGroup = re.findall('/xpl/conhome\.jsp\?punumber=(\d+)', resultText, re.IGNORECASE)
    return matchGroup  # list of conference ids

# Search by short name and return a list of (full conference name, id) tuples,
# or False on a network error.
def getConfName(shortName):
    global NAMELIST
    ids = getLinks(shortName)
    if ids == False:
        return False
    if ids == []:
        print("No matched conference!")
        return []
    namelist = []
    for conf_id in ids:
        try:
            confPage = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + conf_id)
        except:
            return False
        confText = confPage.read()
        matchGroup = re.findall('<h1>(.*?)</h1>', confText, re.IGNORECASE)
        namelist.append((matchGroup[0], conf_id))
    NAMELIST = namelist
    return namelist
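# Hedged usage sketch for the search step (the short name, id and title below
# are illustrative placeholders, not real IEEE Xplore data):
#
#   >>> getConfName("someconf")
#   [('2011 Some International Conference on Something', '1234567'), ...]
#
# getLinks returns only the punumber ids; getConfName resolves each id to the
# <h1> title of its home page and caches the result in NAMELIST.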
# Get the ids of every past edition of the conference from its home page,
# or False on a network error.
def getHisLinks(result_id):
    try:
        confPage = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + str(result_id))
    except:
        return False
    confText = confPage.read()
    conf_ids = re.findall('mostRecentIssue\.jsp\?punumber=(\d+)', confText, re.IGNORECASE)
    return conf_ids

# Get the article numbers of every paper in one conference edition,
# or False on a network error.
def getPaperLinks(conf_id):
    # First request: fetch the hidden "oqs" form value needed by the
    # table-of-contents query.
    firstreq = "http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=" + str(conf_id)
    try:
        firstPage = urlopen(firstreq)
    except:
        return False
    firstText = firstPage.read()
    oqs = re.findall('id=\"oqs\"\s*value=\"(.*?)\"', firstText, re.IGNORECASE)
    # Second request: assume the edition has fewer than 1000 papers, so a
    # single page holds the whole list.
    secondreq = "http://ieeexplore.ieee.org/xpl/tocresult.jsp?" + oqs[0] + "&rowsPerPage=1000&pageNumber=1"
    try:
        paperListPage = urlopen(secondreq)
    except:
        return False
    paperText = paperListPage.read()
    paperlist = re.findall('/xpl/articleDetails\.jsp\?tp=&arnumber=(\d+)', paperText, re.IGNORECASE)
    paperlist = list(set(paperlist))  # drop duplicate ids
    return paperlist  # list of article numbers
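# Hedged usage sketch: chaining the history and paper lookups (ids are
# placeholders). Both calls return False on a network error, so callers are
# expected to retry, as buildNewDb does below.
#
#   editions = getHisLinks("1234567")     # e.g. ['1111111', '2222222']
#   papers = getPaperLinks(editions[0])   # article numbers of one edition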
# Fetch one paper's title and abstract; exceptions are caught here, so callers
# only need to retry when the "empty###empty" sentinel comes back.
def getAbstract(paper_id):
    requrl = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=" + str(paper_id)
    try:
        paperPage = urlopen(requrl)
        paperText = paperPage.read()
    except Exception:
        paperText = ""
    title = re.findall('<h1>\s*(.+?)\s*</h1>', paperText, re.IGNORECASE)
    abstract = re.findall('<h2>Abstract</h2>\s*</a>\s*<p>(.+?)</p>', paperText, re.IGNORECASE)
    print abstract  # debug
    if title == [] or abstract == []:  # network exception or missing abstract
        return "empty###empty"
    return title[0] + "###" + abstract[0]
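# Note on the return format: the title and abstract are packed into a single
# "title###abstract" string, and "empty###empty" is the sentinel for "nothing
# usable was fetched" (network error or a paper without an abstract). A hedged
# sketch of splitting it back apart:
#
#   title, abstract = getAbstract("1234567").split("###", 1)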
# Get the full conference name from its home page, or False on a network error.
def getNameById(conf_home):
    try:
        confPage = urlopen("http://ieeexplore.ieee.org/xpl/conhome.jsp?punumber=" + str(conf_home))
    except:
        return False
    confText = confPage.read()
    matchGroup = re.findall('<h1>(.*?)</h1>', confText, re.IGNORECASE)
    return matchGroup[0]

# Crawl every edition of a conference, collect each paper's title and
# abstract, and write them to "confs/<conf_home>", then record the conference
# in "confs/confindex".
def buildNewDb(conf_home):
    absDb = {}  # paper_id -> "title###abstract"

    conf_name = getNameById(conf_home)
    count = 1
    while conf_name == False:  # retry getting the full name, up to 5 times
        conf_name = getNameById(conf_home)
        count += 1
        if count > 5:
            return False

    conf_all = getHisLinks(conf_home)
    count = 1
    while conf_all == False:  # retry getHisLinks, up to 5 times
        conf_all = getHisLinks(conf_home)
        count += 1
        if count > 5:
            return False

    paper_all = []
    for conf_id in conf_all:
        paper_ids = getPaperLinks(conf_id)
        count = 1
        while paper_ids == False:  # retry getPaperLinks, up to 5 times
            paper_ids = getPaperLinks(conf_id)
            count += 1
            if count > 5:
                return False
        paper_all += paper_ids

    for paper_id in paper_all:
        abstract = getAbstract(paper_id)
        if abstract == "empty###empty":  # mostly caused by a network error
            print "No abstract for %d, retrying..." % int(paper_id)
            count = 1
            while abstract == "empty###empty":
                abstract = getAbstract(paper_id)
                count += 1
                if count > 5:
                    print "can't get the abstract of %d" % int(paper_id)  # mostly a paper without an abstract
                    break
        absDb[paper_id] = abstract
        print "%d:%s" % (int(paper_id), abstract)  # debug

    print "start writing file..."  # debug
    dbfile = open("confs/" + str(conf_home), 'w')
    for conf_id in conf_all:
        dbfile.write(conf_id + " ")
    dbfile.write("\n")
    for paper_id in paper_all:
        if absDb[paper_id] == "empty###empty":
            continue
        dbfile.write(paper_id + "\n")
        dbfile.write(absDb[paper_id] + "\n")
    dbfile.close()

    # Append this conference's id and full name to the index of local databases.
    indexfile = open("confs/confindex", 'a')
    indexfile.write(str(conf_home) + '\n')
    indexfile.write(str(conf_name) + '\n')
    indexfile.close()
    print "finished."
    return True
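# The database file "confs/<conf_home>" written above has a simple line
# format, reconstructed from the writes in buildNewDb: the first line lists
# the edition ids separated by spaces, then each kept paper takes two lines,
# its id and its "title###abstract" string. getFileDb below reads exactly this
# layout. Illustrative content:
#
#   1111111 2222222
#   1234567
#   Some Paper Title###Some abstract text ...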
# Load a conference database written by buildNewDb into a dict of
# paper_id -> "title###abstract".
def getFileDb(conf_home):
    absDb = {}
    try:
        dbfile = open("confs/" + str(conf_home))
        dbfile.readline()  # skip the first line (the list of edition ids)
        while True:
            line = dbfile.readline()
            if not line:
                break
            absDb[line.split()[0]] = dbfile.readline()
        dbfile.close()
        return absDb
    except Exception:
        print "No database file"
        return False

# Compare the edition ids currently on IEEE Xplore with the ids recorded in
# the local database file: True means the local file is still usable, False
# means it needs rebuilding (or does not exist yet).
def checkUpdate(conf_home):
    conf_all = getHisLinks(conf_home)
    if conf_all == False:  # network error, cannot tell, so force a rebuild
        return False
    try:
        dbfile = open("confs/" + str(conf_home))
        conflist = dbfile.readline().split()
        if set(conf_all) == set(conflist):
            return True   # local database is up to date
        else:
            return False  # needs an update
    except:
        print "no database yet"
        return False

# Search a locally stored conference database for papers whose title or
# abstract contains every keyword in keylist.
def searchPaper(conf_id, keylist):
    print keylist
    time.sleep(5)
    resultdic = {}
    absDb = getFileDb(conf_id)
    if absDb == False:  # no local database for this conference
        return {}
    if keylist == []:
        return absDb
    for paper_id, title_abs in absDb.items():
        match = True
        try:
            for key in keylist:
                if title_abs.find(key) >= 0:
                    continue
                else:
                    match = False
                    break
            if match:
                resultdic[paper_id] = title_abs
                print paper_id
                print resultdic[paper_id]
        except:
            continue
    return resultdic
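# Hedged usage sketch: a paper matches only if every keyword appears in its
# "title###abstract" string, i.e. a simple AND query over the local database
# (conference id and keywords are placeholders):
#
#   hits = searchPaper("1234567", ["cloud", "scheduling"])
#   for pid, title_abs in hits.items():
#       print pid, title_abs.split("###", 1)[0]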
# Check for authorization; if successful, save the cookies to cookiefile so
# they can be reused for PDF downloads.
def authCheck(cookiefile):
    cookies = cookielib.MozillaCookieJar(cookiefile)
    cookiehand = urllib2.HTTPCookieProcessor(cookies)
    opener = urllib2.build_opener(cookiehand)
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"),
                         ("Host", "ieeexplore.ieee.org")]
    try:
        opener.open("http://ieeexplore.ieee.org/xpl/conferences.jsp")
    except:
        print "network error"
        return False
    for item in cookies:
        if item.name == "xploreCookies":
            print "auth success!"
            cookies.save(cookiefile, ignore_discard=True, ignore_expires=True)
            return True
    return False
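# Note: success is decided by the presence of an "xploreCookies" cookie after
# visiting the conferences page, which presumably only appears when the
# current network (e.g. a subscribed campus IP range) is authorized; the
# cookie jar is saved so downWithCookies can reuse it later.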
# Download the PDF of one paper using the saved cookies, record its id and
# abstract in "papers/paperindex", and return True on success.
def downWithCookies(cookiefile, paper_id):
    abstract = getAbstract(paper_id)
    if abstract == "empty###empty":  # mostly caused by a network error
        print "No abstract for %d, retrying..." % int(paper_id)
        count = 1
        while abstract == "empty###empty":
            abstract = getAbstract(paper_id)
            count += 1
            if count > 5:
                print "can't get the abstract of %d" % int(paper_id)  # mostly a paper without an abstract
                break
    cookies = cookielib.MozillaCookieJar(cookiefile)
    cookies.load(cookiefile, ignore_discard=True, ignore_expires=True)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))
    page = opener.open("http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=" + str(paper_id))
    pagetext = page.read()
    matchGroup = re.findall('<frame src="(http.*?)"', pagetext, re.IGNORECASE)
    try:
        pdflink = matchGroup[0]
    except IndexError:
        print "cookies outdated"
        return False
    pdfpage = opener.open(pdflink)
    pdftext = pdfpage.read()
    f = open("papers/" + str(paper_id) + ".pdf", "wb")
    f.write(pdftext)
    f.close()
    index = open("papers/paperindex", "a")
    index.write(str(paper_id) + "\n")
    index.write(str(abstract) + "\n")
    index.close()
    return True
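# Hedged usage sketch of the full download path (placeholder id; assumes a
# valid cookies.txt previously produced by authCheck):
#
#   if authCheck("cookies.txt"):
#       downWithCookies("cookies.txt", "1234567")
#
# The PDF is saved as "papers/<paper_id>.pdf" and the id plus abstract are
# appended to "papers/paperindex", which getPaperIndex reads back in.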
# Read "confs/confindex" into a dict of conference id -> full conference name.
def getConfIndex():
    confIndex = {}
    try:
        indexfile = open("confs/confindex")
        while True:
            line = indexfile.readline()
            if not line:
                break
            confIndex[line.split()[0]] = indexfile.readline()
        indexfile.close()
        return confIndex
    except Exception:
        print "No conference index file"
        return False

# Read "papers/paperindex" into a dict of paper id -> "title###abstract".
def getPaperIndex():
    paperIndex = {}
    try:
        indexfile = open("papers/paperindex")
        while True:
            line = indexfile.readline()
            if not line:
                break
            paperIndex[line.split()[0]] = indexfile.readline()
        indexfile.close()
        return paperIndex
    except Exception:
        print "No paper index file"
        return False

# Wait the given number of seconds, then keep retrying authCheck every 5
# seconds until the cookies are valid, and finally download the paper.
def timingDown(seconds, paper_id):
    time.sleep(seconds)
    print "Starting download..."
    count = 1
    while True:
        print "trying %d" % count
        if authCheck("cookies.txt") == True:
            print "auth success!"
            break
        count += 1
        time.sleep(5)
    if downWithCookies("cookies.txt", paper_id):
        print "download success!"
        return True
    return False

# Run timingDown in a background thread so the caller does not block.
def threadDown(seconds, paper_id):
    thread.start_new_thread(timingDown, (seconds, paper_id))
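# Hedged usage sketch: schedule a delayed download in a background thread so
# the caller is not blocked (placeholder id; the delay is in seconds):
#
#   threadDown(3600, "1234567")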
def test():
    buildNewDb(1001307)
    #threadDown(100,5279450)
    #downTest()

    #print u"test"
    #print "test".encode('utf-8')
    '''
    paper_id = raw_input("Input the id of the paper:")
    count = 1
    while True:
        print "trying %d" % count
        if authCheck("cookies.txt") == True:
            print "auth success!"
            break
        count += 1
        time.sleep(5)
    if downWithCookies("cookies.txt",paper_id):
        print "download success!"
    buildNewDb(1001545)
    time.sleep(300)
    '''
    pass

if __name__ == '__main__':
    test()