__author__ = 'richie'
# -*- coding: utf-8 -*-
import urllib, urllib2, cookielib, re, os, codecs, time, subprocess,sys,json,math
#import simplejson as json
import dateutil
class cnzz(object):
    """Small scraping client for the CNZZ web-statistics site.

    Call :meth:`login` first; it builds the cookie-aware opener that every
    other method uses for its HTTP requests.  Responses are decoded from
    GBK and re-encoded as UTF-8 before being parsed.
    """

    # Page size requested by getSiteList (``pageType=90`` in its URL); the
    # page count is derived from the same number.
    _SITES_PER_PAGE = 90
    # Upper bound on the number of keyword-report pages fetched by
    # getHistoryKeyByDate (the original code always fetched exactly 3).
    _MAX_KEY_PAGES = 3
    # "page 1 of N" marker in the keyword report.
    _PAGE_COUNT_RE = re.compile(r'第1/(?P<pagecount>\d+?)页 ')
    # One keyword row of the keyword report.  Sample markup:
    #   <td title='www.jxeea.cn '>www.jxeea.cn</td>
    #   <td class='all_right'>38024</td>
    #   <td class='all_right'>29859</td>
    _KEY_ROW_RE = re.compile(r'<td title=\'(?P<key>.+) \'>(?P<key1>.+)</td>\s+<td class=\'all_right\'>(?P<snum>.+)</td>\s+<td class=\'all_right\'>(?P<uv>.+)</td>')

    def __init__(self, username = '', password = '',othername=''):
        """Store the credentials; no network access happens here.

        ``othername`` is stored but not used by any method of this class.
        """
        self.__username = username
        self.__othername = othername
        self.__password = password
        # Set by login(); every fetching method requires it.
        self.__opener = None
        self.__sitelist = []

    def _fetch(self, url, errors='strict'):
        """GET ``url`` through the logged-in opener and return UTF-8 text.

        The body is decoded as GBK (``errors`` is passed to ``decode``) and
        re-encoded as UTF-8.  ``OpenerDirector.open`` accepts a plain URL
        string, so no explicit ``Request`` object is needed.
        """
        raw = self.__opener.open(url).read()
        return raw.decode('gbk', errors).encode('utf-8')

    def login(self):
        """Log in with the stored credentials.

        Builds a fresh cookie-aware opener, POSTs the credentials and
        returns True when the response contains one of the two markers
        the original author associated with a successful login, else
        False.
        """
        cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.__opener = urllib2.build_opener(cookie_handler)
        post_data = {
            'username': self.__username,
            'password': self.__password
        }
        loginhtml = self.__opener.open(
            'http://new.cnzz.com/user/login.php',
            urllib.urlencode(post_data)).read()
        # Containment test instead of ``find(...) > 1``, which wrongly
        # treated a marker at offset 0 or 1 as "not found".
        # NOTE(review): the body is compared undecoded, so the second
        # marker only matches if the page is served as UTF-8 -- confirm.
        return '_username' in loginhtml or '登陆进入旧版站长' in loginhtml

    def getSiteListPageCount(self):
        """Return the number of site-list pages (90 sites per page) as an int."""
        url="http://tongji.cnzz.com/main.php?c=site&a=show&ajax=module=gettotallist|module=list|module=isOpenTongji&sort=1&search=&currentPage=1&pageType=30&_="+str(int(time.time()))
        html = self._fetch(url)
        totalsite = int(json.loads(html)['data']['gettotallist']['totalsite'])
        # Float division so a partial last page still counts as a page.
        return int(math.ceil(totalsite / float(self._SITES_PER_PAGE)))

    def getuserdetail(self,url):
        """Fetch ``url``, print the raw JSON text and return the parsed object."""
        html = self._fetch(url)
        detail = json.loads(html)
        print(html)
        return detail

    def getSiteList(self):
        """Return every site of the account as ``"<name>@@@<siteid>"`` strings.

        The list is rebuilt on every call (it used to accumulate, so a
        second call returned duplicated entries) and is also kept on the
        instance.
        """
        pagecount = int(self.getSiteListPageCount())
        print("count page :"+str(pagecount))
        sites = []
        for pageno in range(1, pagecount + 1):
            url="http://tongji.cnzz.com/main.php?c=site&a=show&ajax=module=gettotallist|module=list|module=isOpenTongji&sort=1&search=&currentPage="+str(pageno)+"&pageType=90&_=1385011097947"
            items = json.loads(self._fetch(url))['data']['list']['items']
            for item in items:
                sites.append(item['name']+"@@@"+item['siteid'])
        self.__sitelist = sites
        print('getSiteList OK')
        return self.__sitelist

    def yesterdayinfo(self,siteid='',retries=3):
        """Return yesterday's traffic for one site.

        The site list is searched for ``siteid``.  The result is a
        two-element list ``[(), [y_pv, y_uv, y_ip]]``; the second element
        stays ``()`` unless the search yields exactly one site.

        A failed request is retried up to ``retries`` more times, after
        which the last error is re-raised.  (The original retried by
        unbounded recursion behind a bare ``except``.)

        Prints ``no siteid`` and returns None when ``siteid`` is empty.
        """
        if siteid == '':
            print('no siteid')
            return None
        url="http://tongji.cnzz.com/main.php?c=site&a=show&ajax=module=gettotallist|module=list|module=isOpenTongji&sort=1&search="+siteid+"&currentPage=1&pageType=90&_=1385012521584"
        attempts = max(retries, 0) + 1
        for attempt in range(1, attempts + 1):
            try:
                html = self._fetch(url)
                break
            except Exception:
                # Give up after the last attempt instead of looping forever.
                if attempt == attempts:
                    raise
        siteinfo = json.loads(html)['data']['list']['items']
        siteyinfo = [(), ()]
        if len(siteinfo) == 1:
            only = siteinfo[0]
            siteyinfo[1] = [only['y_pv'], only['y_uv'], only['y_ip']]
        return siteyinfo

    def getSiteInfoByDate(self,siteid='',startdate='',enddate=''):
        """Return ``{key: [pv, uv, ip]}`` for each row of the trend report.

        ``key`` is whatever the service puts in each item's ``key`` field;
        every key is printed as it is processed.
        """
        url ="http://tongji.cnzz.com/main.php?c=flow&a=trend&ajax=module%3Dsummary%7Cmodule%3DfluxList_currentPage%3D1_pageType%3D90&siteid="+siteid+"&st="+ startdate +"&et="+ enddate+"&_=1385013202955"
        items = json.loads(self._fetch(url))['data']['fluxList']['items']
        siteinfo = {}
        for item in items:
            print(item['key'])
            siteinfo[item['key']] = [item['pv'], item['uv'], item['ip']]
        return siteinfo

    def getHistoryKeyByDate(self,siteid='',startdate='',enddate='',page=1):
        """Scrape the keyword report and return ``[key, snum, uv]`` rows.

        At most ``_MAX_KEY_PAGES`` pages are fetched; fewer when the
        report itself announces fewer pages.  (The parsed page count used
        to be overwritten by a hard-coded 3.)  ``page`` is accepted for
        compatibility but is not used.
        """
        base_url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate
        print(base_url)
        # Undecodable bytes are ignored here as well, matching the
        # per-page requests below.
        html = self._fetch(base_url, errors='ignore')
        pagecount = self._MAX_KEY_PAGES
        match = self._PAGE_COUNT_RE.search(html)
        if match:
            pagecount = min(int(match.group('pagecount')), self._MAX_KEY_PAGES)
        keyinfos = []
        for pageno in range(1, pagecount + 1):
            print('正在抓取关键词列表第'+str(pageno)+'页')
            html = self._fetch(base_url + "&page=" + str(pageno), errors='ignore')
            for row in self._KEY_ROW_RE.findall(html):
                # row is (key, key1, snum, uv); key1 repeats key.
                keyinfos.append([row[0], row[2], row[3]])
        print('共抓取关键词'+str(len(keyinfos))+'个')
        return keyinfos

    def getKeyHistory(self,siteid='',startdate='',enddate='',key=''):
        """Return the highest and lowest daily UV of ``key`` in a period.

        ``key`` is a UTF-8 byte string; it is sent GBK-encoded.  The result
        is ``[(day, uv_max), (day, uv_min)]``.  Raises IndexError when no
        day of the period is found in the report.

        NOTE(review): the day labels come from ``dateutil.getDays2``, a
        helper not visible here; they are assumed to be strings.
        """
        url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=history_key&st=" + startdate + "&et=" + enddate+"&url="+urllib2.quote(key.decode('utf-8').encode('gbk'))
        html = self._fetch(url)
        uvlist = {}
        for day in reversed(dateutil.getDays2(startdate, enddate)):
            # The day label is matched literally, hence re.escape.
            pattern = re.compile(
                r'<td> ' + re.escape(day) + r' </td>\s+<td class="num1">(?P<snum>.+)</td>\s+<td class="num1">(?P<uv>.+)</td>',
                re.I)
            found = pattern.search(html)
            if found:
                # Strip the Saturday/Sunday suffixes from the label.
                label = day.replace('星期六','').replace('星期天','')
                uvlist[label] = int(found.group('uv'))
        ranked = self.sort_by_value(uvlist)
        if not ranked:
            raise IndexError('no daily data found for the requested key and period')
        return [ranked[-1], ranked[0]]

    def sort_by_value(self,d):
        """Return the ``(key, value)`` pairs of ``d`` sorted by ascending value."""
        return sorted(d.items(), key=lambda pair: pair[1])
# Disabled usage example.  It is kept inside a bare module-level string, so it
# is never executed; it shows a login followed by the available report calls.
'''
if __name__ == '__main__':
    CnzzTool=cnzz('cnzzusername','password','mygod')
    if(CnzzTool.login()):
        print "LoginOk"
        #CnzzTool.getSiteListPageCount()
        #CnzzTool.getSiteList()
        #print CnzzTool.yesterdayinfo('2918848')
        #CnzzTool.getSiteInfoByDate('2918848','2012-05-20','2012-05-24')
    else:
        print "LoginError"
'''