# -*- coding: utf-8 -*-
__author__ = 'richie'

import urllib, urllib2, cookielib, re, os, codecs, time, subprocess, sys, json
#import simplejson as json
import dateutil
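'''
Note: `dateutil` above is a local helper module of the author's, not the PyPI
python-dateutil package, and it is not included in this file. Judging from the
call sites in getSiteInfoByDate() and getKeyHistory(), getDays() and getDays2()
both yield the date strings between two dates, apparently with a weekday suffix
('星期六' Saturday / '星期天' Sunday) that the callers strip off. A minimal
sketch of what getDays2() presumably does, returning plain 'YYYY-MM-DD'
strings under those assumptions:

import datetime

def getDays2(startdate, enddate):
    # Return ['YYYY-MM-DD', ...] from startdate to enddate, inclusive.
    start = datetime.datetime.strptime(startdate, '%Y-%m-%d').date()
    end = datetime.datetime.strptime(enddate, '%Y-%m-%d').date()
    days = []
    while start <= end:
        days.append(start.strftime('%Y-%m-%d'))
        start += datetime.timedelta(days=1)
    return days
'''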
class cnzz(object):
    def __init__(self, username='', password='', othername=''):
        self.__username = username
        self.__othername = othername
        self.__password = password
        self.__opener = ''
        self.__sitelist = []

    def login(self):
        # Build an opener that keeps the CNZZ session cookie between requests.
        myCookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.__opener = urllib2.build_opener(myCookie)
        post_data = {
            'username': self.__username,
            'password': self.__password
        }
        req = urllib2.Request('http://new.cnzz.com/user/login.php', urllib.urlencode(post_data))
        loginhtml = self.__opener.open(req).read()
        # A successful login page embeds `var _username = "..."`.
        if loginhtml.find('_username') > 1:
            return True
        else:
            return False

    def getSiteListPageCount(self):
        url = "http://new.cnzz.com/v1/main.php?s=site_list&sort=0&everypage=30&setpage"
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        # The pager reads e.g. "第1/3页" ("page 1 of 3").
        # Returns None if the pager is not found on the page.
        match = re.compile(r'第1/(?P<pagecount>\d+?)页').search(html)
        if match:
            return int(match.group('pagecount'))

    def getSiteList(self):
        pagecount = self.getSiteListPageCount()
        print "page count: " + str(pagecount)
        for i in range(1, pagecount + 1):
            url = "http://new.cnzz.com/v1/main.php?s=site_list&sort=0&page=" + str(i) + "&everypage=30"
            req = urllib2.Request(url)
            html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
            # NOTE: the two original patterns contained literal HTML markup that
            # was lost when this file was extracted; the patterns below are
            # reconstructions of their apparent intent (site ids and site names).
            match2 = re.compile(r'siteid=(\d+)').findall(html)
            match3 = re.compile(r"<a [^>]*siteid=\d+[^>]*>(.+?)</a>").findall(html)
            if match2 and match3:
                for j in range(0, len(match3)):
                    # [0:-1] drops a trailing character that the original
                    # name pattern evidently captured.
                    print match3[j][0:-1] + "@@@" + match2[j]
                    self.__sitelist.append(match3[j][0:-1] + "@@@" + match2[j])
            else:
                print 'getSiteList Error'
                sys.exit()
        print 'getSiteList OK'
        return self.__sitelist

    def yesterdayinfo(self, siteid=''):
        if siteid != '':
            url = "http://new.cnzz.com/v1/data/site_list_data.php?siteid=" + siteid
            req = urllib2.Request(url)
            try:
                html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
            except:
                # Retry on any failure. Note this recurses without a limit, so
                # a persistent error (e.g. an expired session) never terminates.
                return self.yesterdayinfo(siteid)
            #data=eval(html)[1]
            data = json.loads(html)
            return data
        else:
            print 'no siteid'

    def getSiteInfoByDate(self, siteid='', startdate='', enddate=''):
        url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=timeflux&st=" + startdate + "&et=" + enddate
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        siteinfo = {}
        for i in reversed(dateutil.getDays(startdate, enddate)):
            # Reconstruction: the original pattern interleaved literal <td>
            # markup (lost in extraction) between the date and the three
            # captured columns.
            pn = re.compile(
                i + r'\s+(?P<pv>.+)\s+(?P<uv>.+)\s+(?P<ip>.+)',
                re.I)
            mn = pn.search(html)
            if mn:
                # Keys are plain dates; the date strings carry weekday names
                # ('星期六' Saturday / '星期天' Sunday) that we strip here.
                siteinfo[i.replace('星期六', '').replace('星期天', '')] = [mn.group('pv'), mn.group('uv'), mn.group('ip')]
        return siteinfo

    def getHistoryKeyByDate(self, siteid='', startdate='', enddate='', page=1):
        url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate
        print url
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        match = re.compile(r'第1/(?P<pagecount>\d+?)页').search(html)
        if match:
            pagecount = int(match.group('pagecount'))
        # Hard-coded in the original: only the first 3 pages are fetched,
        # overriding the detected page count.
        pagecount = 3
        keyinfos = []
        for i in range(1, pagecount + 1):
            print 'fetching keyword list page ' + str(i)
            url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate + "&page=" + str(i)
            req = urllib2.Request(url)
            html = self.__opener.open(req).read().decode('gbk', 'ignore').encode('utf-8')
            # NOTE: reconstruction -- the HTML portions of the original pattern
            # were lost in extraction. It captured four groups per keyword row,
            # of which the loop below keeps groups 0, 2 and 3.
            mt = re.compile(r"<a href='[^']*url=(?P<key>.+?)'>(?P<keyword>.+?)</a>\s+(?P<c1>.+)\s+(?P<c2>.+)")
            mn = mt.findall(html)
            for j in range(0, len(mn)):
                keyinfos.append([mn[j][0], mn[j][2], mn[j][3]])
        '''
        Sample row as rendered by CNZZ:
        www.jxeea.cn 38024 29859 28771 24385
        '''
        print 'fetched ' + str(len(keyinfos)) + ' keywords in total'
        return keyinfos
    def getKeyHistory(self, siteid='', startdate='', enddate='', key=''):
        # For one keyword, find the highest and lowest daily search volume
        # within the date range.
        #http://new.cnzz.com/v1/main.php?siteid=2918556&s=history_key&st=2013-04-08&et=2013-05-07&t=30&url=%C9%EE%DB%DA%BB%E1%BC%C6%B4%D3%D2%B5%D7%CA%B8%F1%BF%BC%CA%D4%B1%A8%C3%FB%CF%B5%CD%B3
        url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=history_key&st=" + startdate + "&et=" + enddate + "&url=" + urllib2.quote(key.decode('utf-8').encode('gbk'))
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        datelist = reversed(dateutil.getDays2(startdate, enddate))
        uvlist = {}
        for i in datelist:
            # Reconstruction: the original pattern wrapped the date and the two
            # captured columns (search count, UV) in markup lost in extraction.
            pn = re.compile(
                i + r'\s+(?P<snum>.+)\s+(?P<uv>.+)',
                re.I)
            mn = pn.search(html)
            if mn:
                #print i, '\t', mn.group('snum'), 'uv:', mn.group('uv')
                uvlist[i.replace('星期六', '').replace('星期天', '')] = int(mn.group('uv'))
        # Sorted ascending by UV: last item is the peak day, first the lowest.
        return [self.sort_by_value(uvlist)[-1], self.sort_by_value(uvlist)[0]]

    def sort_by_value(self, d):
        # Return (date, value) pairs sorted ascending by value.
        return sorted(d.items(), key=lambda item: item[1])

'''
if __name__ == '__main__':
    CnzzTool = cnzz('username', 'password', '')
    if CnzzTool.login():
        print "LoginOk"
    else:
        print "LoginError"
'''
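'''
A fuller usage sketch, assuming valid CNZZ credentials and that getSiteList()
entries come back as 'name@@@siteid' (the format built above). These
new.cnzz.com endpoints date from 2013 and may no longer respond:

if __name__ == '__main__':
    tool = cnzz('username', 'password')
    if not tool.login():
        print "LoginError"
        sys.exit()
    for entry in tool.getSiteList():
        name, siteid = entry.split('@@@')
        # yesterday's stats for each site, as parsed JSON
        print name, tool.yesterdayinfo(siteid)
'''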