Last active: August 31, 2018 04:26
Revisions
ForthXu revised this gist
Jun 29, 2016: 1 changed file with 123 additions and 0 deletions.
```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# https://github.com/lining0806/WechatSearchProjects
import sys
import re
import urllib, urllib2
import requests
import pymongo
import datetime
from bs4 import BeautifulSoup
import multiprocessing as mp

class MongoDBIO:
    # Declare the connection attributes
    def __init__(self, host, port, name, password, database, collection):
        self.host = host
        self.port = port
        self.name = name
        self.password = password
        self.database = database
        self.collection = collection
    # Connect to the database; db and posts are handles for the database and collection
    def Connection(self):
        # connection = pymongo.Connection() # connect to the local database
        connection = pymongo.Connection(host=self.host, port=self.port)
        # db = connection.datas
        db = connection[self.database]
        if self.name or self.password:
            db.authenticate(name=self.name, password=self.password) # verify username and password
        # print "Database:", db.name
        # posts = db.cn_live_news
        posts = db[self.collection]
        # print "Collection:", posts.name
        return posts

# # Save operation (batch version)
# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):
#     posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
#     for save_content in save_contents:
#         posts.save(save_content)

# Save operation
def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):
    posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
    posts.save(save_content)

def GetTitleUrl(url, data):
    content = requests.get(url=url, params=data).content # send the GET request
    soup = BeautifulSoup(content)
    tags = soup.findAll("h4")
    titleurl = []
    for tag in tags:
        item = {"title": tag.text.strip(), "link": tag.find("a").get("href"), "content": ""}
        titleurl.append(item)
    return titleurl

def GetContent(url):
    soup = BeautifulSoup(requests.get(url=url).content)
    tag = soup.find("div", attrs={"class": "rich_media_content", "id": "js_content"}) # extract the first matching tag
    content_list = [tag_i.text for tag_i in tag.findAll("p")]
    content = "".join(content_list)
    return content

def ContentSave(item):
    # Save configuration
    save_host = "localhost"
    save_port = 27017
    save_name = ""
    save_password = ""
    save_database = "testwechat"
    save_collection = "result"

    save_content = {
        "title": item["title"],
        "link": item["link"],
        "content": item["content"]
    }
    ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content)

def func(tuple):
    querystring, type, page = tuple[0], tuple[1], tuple[2]
    url = "http://weixin.sogou.com/weixin"
    # GET parameters
    data = {
        "query": querystring,
        "type": type,
        "page": page
    }
    titleurl = GetTitleUrl(url, data)
    for item in titleurl:
        url = item["link"]
        print "url:", url
        content = GetContent(url)
        item["content"] = content
        ContentSave(item)

if __name__ == '__main__':
    start = datetime.datetime.now()

    querystring = u"清华"  # search keyword ("Tsinghua")
    type = 2  # 2 = articles, 1 = WeChat official accounts
    # Multi-process crawling
    p = mp.Pool()
    p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)])
    p.close()
    p.join()

    # # Single-process crawling
    # for page in range(1, 50, 1):
    #     tuple = (querystring, type, page)
    #     func(tuple)

    end = datetime.datetime.now()
    print "last time: ", end - start
```
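A quick way to verify the crawl is to read the saved documents back out of MongoDB. The sketch below is not part of the gist: it assumes a local mongod on the default port and the `testwechat` database / `result` collection that `ContentSave` writes to, and it uses pymongo's `MongoClient` rather than the older `Connection` class used above.

```python
# Minimal sketch (not part of the gist): read back what the crawler saved.
# Assumes a local mongod on 27017 and a pymongo version that provides MongoClient.
import pymongo

client = pymongo.MongoClient("localhost", 27017)
posts = client["testwechat"]["result"]

for doc in posts.find().limit(5):
    # Each document carries the keys written by ContentSave: title, link, content
    print doc["title"], doc["link"]
```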
ForthXu revised this gist
Jun 28, 2016: 1 changed file with 45 additions and 0 deletions.
```javascript
// [root@vps3 work]# wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
// [root@vps3 work]# tar jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2
// [root@vps3 work]# vim screenshots.js
var page = require('webpage').create();
var args = require('system').args;
var url = args[1];
var filename = args[2];
page.open(url, function(status) {
    console.log("Status: " + status);
    if (status === "success") {
        // Run JavaScript inside the page
        var title = page.evaluate(function() {
            // Scroll down to trigger lazily loaded images
            window.scrollTo(0, 10000);
            // Return the page title
            return document.title;
        });
        // Debug output
        console.log('Page title is ' + title);
        // Delay so images can finish loading and page scripts can run
        window.setTimeout(function () {
            // Render the screenshot
            page.render(filename);
            // Exit
            phantom.exit();
        }, 5000);
    } else {
        phantom.exit();
    }
});

// Install the Microsoft YaHei font
// [root@vps3 work]# yum -y install bitmap-fonts bitmap-fonts-cjk mkfontscale fontconfig
// [root@vps3 work]# mkdir /usr/share/fonts/win/
// [root@vps3 work]# wget https://nipao.googlecode.com/files/msyh.ttf -O /usr/share/fonts/win/msyh.ttf
// [root@vps3 work]# mkfontscale
// [root@vps3 work]# mkfontdir
// [root@vps3 work]# fc-cache

// Take the screenshot
// [root@vps3 work]# rm -rf /home/wwwroot/default/joke.png && phantomjs-2.1.1-linux-x86_64/bin/phantomjs screenshots.js http://joke.4399pk.com /home/wwwroot/default/joke.png
```
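Since the other files in this gist are Python, the screenshot command at the bottom can also be driven from a Python script. This is a minimal sketch, not part of the gist; the phantomjs binary path, screenshots.js, the URL, and the output path are all taken from the shell commands in the comments above, so adjust them to your own layout.

```python
# Minimal sketch (not part of the gist): run the PhantomJS screenshot from Python.
# Paths and URL mirror the shell command in the comments above.
import subprocess

cmd = [
    "phantomjs-2.1.1-linux-x86_64/bin/phantomjs",  # binary unpacked by the wget/tar steps
    "screenshots.js",                              # the script above
    "http://joke.4399pk.com",                      # url  -> args[1]
    "/home/wwwroot/default/joke.png",              # file -> args[2]
]
ret = subprocess.call(cmd)
print "phantomjs exit code:", ret
```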
ForthXu created this gist
Jun 26, 2016.
```python
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys, os
import pymysql

def get_cur_file_dir():
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)

def down_content(content_url, path_url):
    xhtml = open_url(content_url)
    if False == xhtml:
        return False

    soup = BeautifulSoup(xhtml, "html5lib")
    titleH2 = soup.find("h2", id="activity-name")
    if None == titleH2:
        return False
    title = titleH2.string.encode('utf-8')
    string_time = soup.find("em", id="post-date").string.encode('utf-8')
    num_time = int(time.mktime(time.strptime(string_time, '%Y-%m-%d')))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print("  " + "no content")
        return False

    html = """
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>""" + title + """</title>
<meta name="keywords" content=\"""" + keywords + """\">
<meta name="description" content=\"""" + description + """\">
</head>
<body>
<div id="body">
<h1>""" + title + """</h1>
<div id="string_time">""" + string_time + """ </div><div id="num_time">""" + str(num_time) + """</div>
<div id="content">
""" + str(content[0]) + """
</div>
</div>
</body>
<script type="text/javascript" src="js/reimg.js"></script>
</html>
"""
    f = file(path_url, "w+")
    f.write(html)
    f.close()

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)", (title.strip(), description.strip(), num_time, num_time))
    # print cur.description
    # print "ID of last record is ", int(cur.lastrowid)       # primary key of the last inserted row
    # print "ID of inserted record is ", int(conn.insert_id()) # conn.insert_id() must be called before conn.commit(), otherwise it returns 0
    lastid = int(cur.lastrowid)
    cur.execute("INSERT INTO archive_article (archive,content) VALUE (%s,%s)", (lastid, str(content[0])))
    cur.connection.commit()

    return True

def insert_content(path_url):
    f = open(path_url, 'rb')
    xhtml = f.read()
    f.close()

    soup = BeautifulSoup(xhtml, "html5lib")
    titleH1 = soup.find("h1")
    if None == titleH1:
        return False
    title = titleH1.string.encode('utf-8')
    num_time = int(soup.find("div", id="num_time").string.encode('utf-8'))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print("  " + "no content")
        return False

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)", (title.strip(), description.strip(), num_time, num_time))
    lastid = int(cur.lastrowid)
    cur.execute("INSERT INTO archive_article (archive,content) VALUE (%s,%s)", (lastid, str(content[0])))
    cur.connection.commit()

    return True

def open_url(url):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    for i in range(0, 3):
        try:
            xhtml = urllib2.urlopen(req)
            return xhtml
        except urllib2.HTTPError, e:  # HTTPError must be caught before URLError
            print "The server couldn't fulfill the request"
            print "Error code:", e.code
            if e.code != 503:
                return False
            time.sleep(5)
            print("try again")
        except urllib2.URLError, e:
            print "Failed to reach the server"
            print "The reason:", e.reason
            if e.code != 503:
                return False
            time.sleep(5)
            print("try again")
    return False

def down_list(list_url):
    xhtml = open_url(list_url)
    if False == xhtml:
        return False

    soup = BeautifulSoup(xhtml, "html5lib")
    title = soup.title.string.encode('utf-8')
    li_a = soup.find_all("a", class_="question_link")
    next_list = soup.find_all("a", text="下一页")  # "next page" link

    writer = csv.writer(file(datapath + 'list.csv', 'a+b'))
    x = 0
    for i in range(0, len(li_a)):
        content_id = li_a[i]['href'].encode('utf-8')[3:]
        content_title = li_a[i].string.encode('utf-8')
        content_url = "http://chuansong.me" + li_a[i]['href'].encode('utf-8')
        path_url = datapath + content_id + ".html"
        if not os.path.exists(path_url):
            if False == down_content(content_url, path_url):
                print("  " + str(x) + content_url + " down fail")
                continue
                return False  # note: unreachable, the continue above ends the iteration first
            print("  " + str(x) + content_url + " down end")
            writer.writerow([content_id, content_title, content_url])
            x = x + 1
            if x % 2 == 1:
                time.sleep(3)
            time.sleep(1)
        else:
            # insert_content(path_url)
            print("  " + content_url + " exist")
            return False

    print(list_url + " end")
    if len(next_list) < 1:
        return False
    print("next " + next_list[0]['href'].encode('utf-8') + "\n")
    return True

def get_list():
    start = 0
    while True:
        if start == 0:
            url = 'http://chuansong.me/account/xingdongpai77'
        else:
            url = 'http://chuansong.me/account/xingdongpai77?start=' + str(start)
        if False == down_list(url) or start > 2000:
            break
        start += 12
        time.sleep(1)
    print("get_list end")

if __name__ == "__main__":
    datapath = get_cur_file_dir() + '/data/'
    if not os.path.exists(datapath):
        os.makedirs(datapath)

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='mysql')
    cur = conn.cursor()
    cur.execute("SET NAMES utf8")
    cur.execute("USE x")

    get_list()

    cur.close()
    conn.close()

    # xtime = time.strftime("%Y-%m-%d %H:%M:%S")
    # xday = time.strftime("%Y-%m-%d")
    # f = file(datapath + xtime + ".html", "w+")
    # f.write(body)
    # f.close()
```
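The gist never shows the MySQL schema behind the two `INSERT` statements. The sketch below is an assumed minimal schema for the `archive` and `archive_article` tables in the `x` database that would satisfy those statements; every column type is a guess, and only the column names and the auto-increment id (read back through `cur.lastrowid`) are implied by the code above.

```python
# Minimal schema sketch (assumed, not from the gist) that the INSERTs above can run against.
# Column types are guesses; only the column names and the auto-increment id are implied by the code.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='mysql')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS x DEFAULT CHARACTER SET utf8")
cur.execute("USE x")
cur.execute("""
    CREATE TABLE IF NOT EXISTS archive (
        id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,  -- read back via cur.lastrowid
        category INT NOT NULL,
        category_parents VARCHAR(255) NOT NULL,
        title VARCHAR(255) NOT NULL,
        summary TEXT,
        addtime INT NOT NULL,
        uptime INT NOT NULL
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
cur.execute("""
    CREATE TABLE IF NOT EXISTS archive_article (
        archive INT UNSIGNED NOT NULL,  -- references archive.id
        content MEDIUMTEXT              -- the saved rich_media_content div
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()
```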