# -*- coding: utf-8 -*-
__author__ = 'richie'

import urllib, urllib2, cookielib, re, os, codecs, time, subprocess, sys, json
#import simplejson as json
import dateutil
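'''
Note: `dateutil` above is a local helper module of the author's, not the PyPI
python-dateutil package, and it is not included in this file. Judging from the
call sites in getSiteInfoByDate() and getKeyHistory(), getDays() and getDays2()
both yield the date strings between two dates, apparently with a weekday suffix
('星期六' Saturday / '星期天' Sunday) that the callers strip off. A minimal
sketch of what getDays2() presumably does, returning plain 'YYYY-MM-DD'
strings under those assumptions:

import datetime

def getDays2(startdate, enddate):
    # Return ['YYYY-MM-DD', ...] from startdate to enddate, inclusive.
    start = datetime.datetime.strptime(startdate, '%Y-%m-%d').date()
    end = datetime.datetime.strptime(enddate, '%Y-%m-%d').date()
    days = []
    while start <= end:
        days.append(start.strftime('%Y-%m-%d'))
        start += datetime.timedelta(days=1)
    return days
'''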
class cnzz(object):
    def __init__(self, username='', password='', othername=''):
        self.__username = username
        self.__othername = othername
        self.__password = password
        self.__opener = ''
        self.__sitelist = []

    def login(self):
        # Build an opener that keeps the CNZZ session cookie between requests.
        myCookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.__opener = urllib2.build_opener(myCookie)
        post_data = {
            'username': self.__username,
            'password': self.__password
        }
        req = urllib2.Request('http://new.cnzz.com/user/login.php', urllib.urlencode(post_data))
        loginhtml = self.__opener.open(req).read()
        # A successful login page embeds `var _username = "..."`.
        if loginhtml.find('_username') > 1:
            return True
        else:
            return False

    def getSiteListPageCount(self):
        url = "http://new.cnzz.com/v1/main.php?s=site_list&sort=0&everypage=30&setpage"
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        # The pager reads e.g. "第1/3页" ("page 1 of 3").
        # Returns None if the pager is not found on the page.
        match = re.compile(r'第1/(?P<pagecount>\d+?)页').search(html)
        if match:
            return int(match.group('pagecount'))

    def getSiteList(self):
        pagecount = self.getSiteListPageCount()
        print "page count: " + str(pagecount)
        for i in range(1, pagecount + 1):
            url = "http://new.cnzz.com/v1/main.php?s=site_list&sort=0&page=" + str(i) + "&everypage=30"
            req = urllib2.Request(url)
            html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
            # NOTE: the two original patterns contained literal HTML markup that
            # was lost when this file was extracted; the patterns below are
            # reconstructions of their apparent intent (site ids and site names).
            match2 = re.compile(r'siteid=(\d+)').findall(html)
            match3 = re.compile(r"<a [^>]*siteid=\d+[^>]*>(.+?)</a>").findall(html)
            if match2 and match3:
                for j in range(0, len(match3)):
                    # [0:-1] drops a trailing character that the original
                    # name pattern evidently captured.
                    print match3[j][0:-1] + "@@@" + match2[j]
                    self.__sitelist.append(match3[j][0:-1] + "@@@" + match2[j])
            else:
                print 'getSiteList Error'
                sys.exit()
        print 'getSiteList OK'
        return self.__sitelist

    def yesterdayinfo(self, siteid=''):
        if siteid != '':
            url = "http://new.cnzz.com/v1/data/site_list_data.php?siteid=" + siteid
            req = urllib2.Request(url)
            try:
                html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
            except:
                # Retry on any failure. Note this recurses without a limit, so
                # a persistent error (e.g. an expired session) never terminates.
                return self.yesterdayinfo(siteid)
            #data=eval(html)[1]
            data = json.loads(html)
            return data
        else:
            print 'no siteid'

    def getSiteInfoByDate(self, siteid='', startdate='', enddate=''):
        url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=timeflux&st=" + startdate + "&et=" + enddate
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        siteinfo = {}
        for i in reversed(dateutil.getDays(startdate, enddate)):
            # Reconstruction: the original pattern interleaved literal <td>
            # markup (lost in extraction) between the date and the three
            # captured columns.
            pn = re.compile(
                i + r'\s+(?P<pv>.+)\s+(?P<uv>.+)\s+(?P<ip>.+)',
                re.I)
            mn = pn.search(html)
            if mn:
                # Keys are plain dates; the date strings carry weekday names
                # ('星期六' Saturday / '星期天' Sunday) that we strip here.
                siteinfo[i.replace('星期六', '').replace('星期天', '')] = [mn.group('pv'), mn.group('uv'), mn.group('ip')]
        return siteinfo

    def getHistoryKeyByDate(self, siteid='', startdate='', enddate='', page=1):
        url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate
        print url
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        match = re.compile(r'第1/(?P<pagecount>\d+?)页').search(html)
        if match:
            pagecount = int(match.group('pagecount'))
        # Hard-coded in the original: only the first 3 pages are fetched,
        # overriding the detected page count.
        pagecount = 3
        keyinfos = []
        for i in range(1, pagecount + 1):
            print 'fetching keyword list page ' + str(i)
            url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=key&st=" + startdate + "&et=" + enddate + "&page=" + str(i)
            req = urllib2.Request(url)
            html = self.__opener.open(req).read().decode('gbk', 'ignore').encode('utf-8')
            # NOTE: reconstruction -- the HTML portions of the original pattern
            # were lost in extraction. It captured four groups per keyword row,
            # of which the loop below keeps groups 0, 2 and 3.
            mt = re.compile(r"<a href='[^']*url=(?P<key>.+?)'>(?P<keyword>.+?)</a>\s+(?P<c1>.+)\s+(?P<c2>.+)")
            mn = mt.findall(html)
            for j in range(0, len(mn)):
                keyinfos.append([mn[j][0], mn[j][2], mn[j][3]])
        '''
        Sample row as rendered by CNZZ:
        www.jxeea.cn 38024 29859 28771 24385
        '''
        print 'fetched ' + str(len(keyinfos)) + ' keywords in total'
        return keyinfos
    def getKeyHistory(self, siteid='', startdate='', enddate='', key=''):
        # For one keyword, find the highest and lowest daily search volume
        # within the date range.
        #http://new.cnzz.com/v1/main.php?siteid=2918556&s=history_key&st=2013-04-08&et=2013-05-07&t=30&url=%C9%EE%DB%DA%BB%E1%BC%C6%B4%D3%D2%B5%D7%CA%B8%F1%BF%BC%CA%D4%B1%A8%C3%FB%CF%B5%CD%B3
        url = "http://new.cnzz.com/v1/main.php?siteid=" + siteid + "&s=history_key&st=" + startdate + "&et=" + enddate + "&url=" + urllib2.quote(key.decode('utf-8').encode('gbk'))
        req = urllib2.Request(url)
        html = self.__opener.open(req).read().decode('gbk').encode('utf-8')
        datelist = reversed(dateutil.getDays2(startdate, enddate))
        uvlist = {}
        for i in datelist:
            # Reconstruction: the original pattern wrapped the date and the two
            # captured columns (search count, UV) in markup lost in extraction.
            pn = re.compile(
                i + r'\s+(?P<snum>.+)\s+(?P<uv>.+)',
                re.I)
            mn = pn.search(html)
            if mn:
                #print i, '\t', mn.group('snum'), 'uv:', mn.group('uv')
                uvlist[i.replace('星期六', '').replace('星期天', '')] = int(mn.group('uv'))
        # Sorted ascending by UV: last item is the peak day, first the lowest.
        return [self.sort_by_value(uvlist)[-1], self.sort_by_value(uvlist)[0]]

    def sort_by_value(self, d):
        # Return (date, value) pairs sorted ascending by value.
        return sorted(d.items(), key=lambda item: item[1])

'''
if __name__ == '__main__':
    CnzzTool = cnzz('username', 'password', '')
    if CnzzTool.login():
        print "LoginOk"
    else:
        print "LoginError"
'''
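'''
A fuller usage sketch, assuming valid CNZZ credentials and that getSiteList()
entries come back as 'name@@@siteid' (the format built above). These
new.cnzz.com endpoints date from 2013 and may no longer respond:

if __name__ == '__main__':
    tool = cnzz('username', 'password')
    if not tool.login():
        print "LoginError"
        sys.exit()
    for entry in tool.getSiteList():
        name, siteid = entry.split('@@@')
        # yesterday's stats for each site, as parsed JSON
        print name, tool.yesterdayinfo(siteid)
'''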