# -*- coding: utf-8 -*-
"""Scraping client for the CNZZ web-statistics pages (Python 2 only).

The module logs in to new.cnzz.com with a cookie-aware ``urllib2`` opener and
reads site lists and traffic figures from tongji.cnzz.com / new.cnzz.com.
"""
__author__ = 'richie'

import urllib
import urllib2
import cookielib
import re
import os
import codecs
import time
import subprocess
import sys
import json
import math
#import simplejson as json

# NOTE(review): getKeyHistory() calls dateutil.getDays2(), so this is
# presumably a project-local helper module rather than python-dateutil --
# confirm.
import dateutil


class cnzz(object):
    """Client for a CNZZ account.

    Call login() before any other method: every request is sent through the
    cookie-carrying opener that login() creates.
    """

    # Common prefix of the site-list AJAX endpoint used by several methods.
    _SITE_LIST_URL = ("http://tongji.cnzz.com/main.php?c=site&a=show"
                      "&ajax=module=gettotallist|module=list|module=isOpenTongji"
                      "&sort=1")

    # Page size requested by getSiteList(); getSiteListPageCount() divides the
    # total by the same number so both methods agree on the page count.
    _SITES_PER_PAGE = 90

    def __init__(self, username='', password='', othername=''):
        """Store the credentials; no network access happens here."""
        self.__username = username
        self.__othername = othername
        self.__password = password
        self.__opener = None      # set by login()
        self.__sitelist = []      # filled (and extended) by getSiteList()

    def _site_list_url(self, search, page, page_type, stamp):
        """Build a site-list URL for the given search term, page and page size."""
        return (self._SITE_LIST_URL
                + "&search=" + search
                + "&currentPage=" + str(page)
                + "&pageType=" + str(page_type)
                + "&_=" + str(stamp))

    def _fetch(self, url, errors='strict'):
        """GET ``url`` with the session opener and return the body as UTF-8 bytes.

        The response is decoded as GBK (``errors`` is passed to ``decode``)
        and re-encoded as UTF-8. Raises AttributeError if login() was never
        called, because no opener exists yet.
        """
        response = self.__opener.open(urllib2.Request(url))
        try:
            raw = response.read()
        finally:
            response.close()
        return raw.decode('gbk', errors).encode('utf-8')

    def login(self):
        """Log in and keep the session cookies for later requests.

        Returns True when the response contains one of the two markers the
        original code looked for, False otherwise.
        """
        cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.__opener = urllib2.build_opener(cookie_handler)
        form = {
            'username': self.__username,
            'password': self.__password,
        }
        # NOTE(review): the credentials are posted over plain http.
        req = urllib2.Request('http://new.cnzz.com/user/login.php',
                              urllib.urlencode(form))
        response = self.__opener.open(req)
        try:
            loginhtml = response.read()
        finally:
            response.close()
        # NOTE(review): the body is compared undecoded; if the login page is
        # GBK-encoded the UTF-8 Chinese marker can never match -- confirm.
        return '_username' in loginhtml or '登陆进入旧版站长' in loginhtml

    def getSiteListPageCount(self):
        """Return how many pages getSiteList() has to request.

        The value is ``math.ceil(totalsite / 90.0)`` and is therefore a float
        on Python 2, exactly as before.
        """
        url = self._site_list_url('', 1, 30, int(time.time()))
        payload = json.loads(self._fetch(url))
        totalsite = int(payload['data']['gettotallist']['totalsite'])
        return math.ceil(totalsite / float(self._SITES_PER_PAGE))

    def getuserdetail(self, url):
        """Fetch ``url``, parse it as JSON, print the raw body and return the parsed data."""
        html = self._fetch(url)
        parsed = json.loads(html)
        print(html)
        return parsed

    def getSiteList(self):
        """Collect every site of the account as ``"<name>@@@<siteid>"`` strings.

        Entries are appended to an instance-level list that is never cleared,
        so calling this method twice returns each site twice.
        """
        pagecount = int(self.getSiteListPageCount())
        print("count page :" + str(pagecount))
        for page in range(1, pagecount + 1):
            url = self._site_list_url('', page, self._SITES_PER_PAGE,
                                      '1385011097947')
            items = json.loads(self._fetch(url))['data']['list']['items']
            for site in items:
                self.__sitelist.append(site['name'] + "@@@" + site['siteid'])
        print('getSiteList OK')
        return self.__sitelist

    def yesterdayinfo(self, siteid='', max_attempts=3):
        """Return yesterday's figures for one site.

        The result is ``[(), [y_pv, y_uv, y_ip]]`` when the search for
        ``siteid`` yields exactly one item and ``[(), ()]`` otherwise. With an
        empty ``siteid`` the method prints ``no siteid`` and returns None.

        The download is tried up to ``max_attempts`` times (at least once);
        the last error is re-raised. The previous implementation retried by
        unbounded recursion behind a bare ``except``.
        """
        if siteid == '':
            print('no siteid')
            return None

        url = self._site_list_url(siteid, 1, self._SITES_PER_PAGE,
                                  '1385012521584')
        attempts = max(1, max_attempts)
        for attempt in range(attempts):
            try:
                html = self._fetch(url)
                break
            except Exception:
                # Deliberately broad: the original retried on every failure.
                # Unlike a bare except, KeyboardInterrupt still propagates.
                if attempt == attempts - 1:
                    raise

        items = json.loads(html)['data']['list']['items']
        siteyinfo = [(), ()]
        if len(items) == 1:
            only = items[0]
            siteyinfo[1] = [only['y_pv'], only['y_uv'], only['y_ip']]
        return siteyinfo

    def getSiteInfoByDate(self, siteid='', startdate='', enddate=''):
        """Return ``{key: [pv, uv, ip]}`` for each row of the traffic-trend list.

        Each row key is printed while the result is built.
        """
        url = ("http://tongji.cnzz.com/main.php?c=flow&a=trend"
               "&ajax=module%3Dsummary%7Cmodule%3DfluxList_currentPage%3D1_pageType%3D90"
               "&siteid=" + siteid + "&st=" + startdate + "&et=" + enddate
               + "&_=1385013202955")
        rows = json.loads(self._fetch(url))['data']['fluxList']['items']
        siteinfo = {}
        for row in rows:
            print(row['key'])
            siteinfo[row['key']] = [row['pv'], row['uv'], row['ip']]
        return siteinfo

    def getHistoryKeyByDate(self, siteid='', startdate='', enddate='', page=1):
        """Scrape the keyword list pages and return ``[[col0, col2, col3], ...]``.

        ``page`` is accepted for compatibility but is not used.
        """
        base_url = ("http://new.cnzz.com/v1/main.php?siteid=" + siteid
                    + "&s=key&st=" + startdate + "&et=" + enddate)
        print(base_url)
        html = self._fetch(base_url)

        # The group name was missing from the pattern although it is read
        # back by name below; restored so the pattern compiles.
        match = re.compile(r'第1/(?P<pagecount>\d+?)页 ').search(html)
        if match:
            pagecount = int(match.group('pagecount'))
        # NOTE(review): the detected page count is unconditionally replaced
        # by 3, as in the original -- looks like a leftover debug limit.
        pagecount = 3

        # NOTE(review): this pattern appears to have lost its HTML markup and
        # group names and does not compile as written (unbalanced
        # parenthesis). It cannot be reconstructed from this file and is kept
        # verbatim; restore it from the original source before relying on
        # this method.
        row_pattern = re.compile(r'.+) \'>(?P.+)\s+(?P.+)\s+(?P.+)')

        keyinfos = []
        for page_no in range(1, pagecount + 1):
            print('正在抓取关键词列表第' + str(page_no) + '页')
            html = self._fetch(base_url + "&page=" + str(page_no), 'ignore')
            for row in row_pattern.findall(html):
                # Sample row data left in the original source:
                #   www.jxeea.cn 38024 29859 28771 24385
                keyinfos.append([row[0], row[2], row[3]])
        print('共抓取关键词' + str(len(keyinfos)) + '个')
        return keyinfos

    def getKeyHistory(self, siteid='', startdate='', enddate='', key=''):
        """Return the highest and lowest daily value found for ``key``.

        Queries the keyword history between ``startdate`` and ``enddate`` and
        returns ``[(day, max_uv), (day, min_uv)]``. Raises IndexError when no
        day could be matched in the page.

        Example request:
        http://new.cnzz.com/v1/main.php?siteid=2918556&s=history_key&st=2013-04-08&et=2013-05-07&t=30&url=%C9%EE%DB%DA...
        """
        url = ("http://new.cnzz.com/v1/main.php?siteid=" + siteid
               + "&s=history_key&st=" + startdate + "&et=" + enddate
               + "&url=" + urllib2.quote(key.decode('utf-8').encode('gbk')))
        html = self._fetch(url)

        uvlist = {}
        for day in reversed(dateutil.getDays2(startdate, enddate)):
            # NOTE(review): the group names were missing; ``snum`` and ``uv``
            # are restored from the names this method referred to, and their
            # order is an assumption. The surrounding HTML tags also seem to
            # be missing from the pattern -- confirm against the real page.
            day_pattern = re.compile(
                r' ' + day + r' \s+(?P<snum>.+)\s+(?P<uv>.+)',
                re.I)
            found = day_pattern.search(html)
            if found:
                label = day.replace('星期六', '').replace('星期天', '')
                uvlist[label] = int(found.group('uv'))

        ranked = self.sort_by_value(uvlist)
        return [ranked[-1], ranked[0]]

    def sort_by_value(self, d):
        """Return the ``(key, value)`` pairs of ``d`` sorted by ascending value."""
        return sorted(d.items(), key=lambda item: item[1])


# Example usage (disabled in the original file):
#
# if __name__ == '__main__':
#     CnzzTool = cnzz('cnzzusername', 'password', 'mygod')
#     if CnzzTool.login():
#         print "LoginOk"
#         #CnzzTool.getSiteListPageCount()
#         #CnzzTool.getSiteList()
#         #print CnzzTool.yesterdayinfo('2918848')
#         #CnzzTool.getSiteInfoByDate('2918848','2012-05-20','2012-05-24')
#     else:
#         print "LoginError"