
@forthxu
Last active August 31, 2018 04:26

Revisions

  1. ForthXu revised this gist Jun 29, 2016. 1 changed file with 123 additions and 0 deletions.
    123 changes: 123 additions & 0 deletions wechatSearchTest.py
    @@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# https://github.com/lining0806/WechatSearchProjects

import sys
import re
import urllib, urllib2
import requests
import pymongo
import datetime
from bs4 import BeautifulSoup
import multiprocessing as mp


class MongoDBIO:
    # Declare the connection attributes
    def __init__(self, host, port, name, password, database, collection):
        self.host = host
        self.port = port
        self.name = name
        self.password = password
        self.database = database
        self.collection = collection

    # Connect to the database; db and posts are the database and collection handles
    def Connection(self):
        # connection = pymongo.Connection() # connect to the local database
        connection = pymongo.Connection(host=self.host, port=self.port)
        # db = connection.datas
        db = connection[self.database]
        if self.name or self.password:
            db.authenticate(name=self.name, password=self.password) # authenticate with username and password
        # print "Database:", db.name
        # posts = db.cn_live_news
        posts = db[self.collection]
        # print "Collection:", posts.name
        return posts

# # Save operation
# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):
#     posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
#
#     for save_content in save_contents:
#         posts.save(save_content)
# Save operation
def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):
    posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
    posts.save(save_content)


def GetTitleUrl(url, data):
    content = requests.get(url=url, params=data).content # send the GET request
    soup = BeautifulSoup(content)
    tags = soup.findAll("h4")
    titleurl = []
    for tag in tags:
        item = {"title":tag.text.strip(), "link":tag.find("a").get("href"), "content":""}
        titleurl.append(item)
    return titleurl

def GetContent(url):
    soup = BeautifulSoup(requests.get(url=url).content)
    tag = soup.find("div", attrs={"class":"rich_media_content", "id":"js_content"}) # extract the first matching tag
    content_list = [tag_i.text for tag_i in tag.findAll("p")]
    content = "".join(content_list)
    return content

def ContentSave(item):
    # Storage settings
    save_host = "localhost"
    save_port = 27017
    save_name = ""
    save_password = ""
    save_database = "testwechat"
    save_collection = "result"

    save_content = {
        "title":item["title"],
        "link":item["link"],
        "content":item["content"]
    }

    ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content)

def func(tuple):
    querystring, type, page = tuple[0], tuple[1], tuple[2]
    url = "http://weixin.sogou.com/weixin"
    # GET parameters
    data = {
        "query":querystring,
        "type":type,
        "page":page
    }

    titleurl = GetTitleUrl(url, data)

    for item in titleurl:
        url = item["link"]
        print "url:", url
        content = GetContent(url)
        item["content"] = content
        ContentSave(item)


if __name__ == '__main__':
    start = datetime.datetime.now()

    querystring = u"清华" # search keyword ("Tsinghua")
    type = 2 # 2 = articles, 1 = official accounts

    # Multi-process crawl
    p = mp.Pool()
    p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)])
    p.close()
    p.join()

    # # Single-process crawl
    # for page in range(1, 50, 1):
    #     tuple = (querystring, type, page)
    #     func(tuple)

    end = datetime.datetime.now()
    print "elapsed time: ", end-start
  2. ForthXu revised this gist Jun 28, 2016. 1 changed file with 45 additions and 0 deletions.
    45 changes: 45 additions & 0 deletions screenshots.js
    @@ -0,0 +1,45 @@
//[root@vps3 work]# wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
//[root@vps3 work]# tar jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2
//[root@vps3 work]# vim screenshots.js

var page = require('webpage').create();
var args = require('system').args;

var url = args[1];
var filename = args[2];

page.open(url, function(status) {
    console.log("Status: " + status);
    if(status === "success") {
        // run JavaScript in the page context
        var title = page.evaluate(function(){
            // scroll down to trigger lazy-loaded images
            window.scrollTo(0,10000);
            // return the page title
            return document.title;
        });
        // debug output
        console.log('Page title is ' + title);

        // delay so images can load and page scripts can run
        window.setTimeout(function ()
        {
            // render the screenshot
            page.render(filename);
            // exit
            phantom.exit();
        }, 5000);
    }else{
        phantom.exit();
    }
});

//Install the Microsoft YaHei font
//[root@vps3 work]#yum -y install bitmap-fonts bitmap-fonts-cjk mkfontscale fontconfig
//[root@vps3 work]#mkdir /usr/share/fonts/win/
//[root@vps3 work]#wget https://nipao.googlecode.com/files/msyh.ttf -O /usr/share/fonts/win/msyh.ttf
//[root@vps3 work]#mkfontscale
//[root@vps3 work]#mkfontdir
//[root@vps3 work]#fc-cache
//Take the screenshot
//[root@vps3 work]#rm -rf /home/wwwroot/default/joke.png && phantomjs-2.1.1-linux-x86_64/bin/phantomjs screenshots.js http://joke.4399pk.com /home/wwwroot/default/joke.png
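If the screenshot needs to be triggered from code rather than the shell, a minimal sketch of driving screenshots.js from Python with subprocess follows; the binary and output paths are only the ones assumed in the shell comments above.

# Sketch only: invoke screenshots.js from Python, mirroring the shell command above.
import subprocess

def take_screenshot(url, output_png,
                    phantomjs="phantomjs-2.1.1-linux-x86_64/bin/phantomjs",
                    script="screenshots.js"):
    # screenshots.js expects the target URL and the output filename as arguments
    return subprocess.call([phantomjs, script, url, output_png])

# Example mirroring the command in the comments:
# take_screenshot("http://joke.4399pk.com", "/home/wwwroot/default/joke.png")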
  3. ForthXu created this gist Jun 26, 2016.
    209 changes: 209 additions & 0 deletions scrapting_wechat.python
    @@ -0,0 +1,209 @@
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys,os
import pymysql

def get_cur_file_dir():
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)



def down_content(content_url,path_url):
    xhtml=open_url(content_url)
    if False == xhtml :
        return False

    soup = BeautifulSoup(xhtml, "html5lib")
    titleH2 = soup.find("h2", id="activity-name")
    if None == titleH2:
        return False
    title = titleH2.string.encode('utf-8')
    string_time = soup.find("em", id="post-date").string.encode('utf-8')
    num_time = int(time.mktime(time.strptime(string_time,'%Y-%m-%d')))
    keywords = str(soup.find(attrs={"name":"keywords"})['content'].encode('utf8','ignore'))
    description = str(soup.find(attrs={"name":"description"})['content'].encode('utf8','ignore'))
    content = soup.find_all("div", class_="rich_media_content")

    if len(content) < 1 :
        print(" "+"no content")
        return False

    html = """
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>"""+title+"""</title>
<meta name="keywords" content=\""""+keywords+"""\">
<meta name="description" content=\""""+description+"""\">
</head>
<body>
<div id="body">
<h1>"""+title+"""</h1>
<div id="string_time">"""+string_time+""" </div><div id="num_time">"""+str(num_time)+"""</div>
<div id="content">
"""+str(content[0])+"""
</div>
</div>
</body>
<script type="text/javascript" src="js/reimg.js"></script>
</html>
"""

    f=file(path_url,"w+")
    f.write(html)
    f.close()

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)",(title.strip(),description.strip(),num_time,num_time))
    #print cur.description
    #print "ID of last record is ", int(cur.lastrowid) # primary key of the last inserted row
    #print "ID of inserted record is ", int(conn.insert_id()) # primary key of the newly inserted row; conn.insert_id() must be called before conn.commit(), otherwise it returns 0
    lastid = int(cur.lastrowid)

    cur.execute("INSERT INTO archive_article (archive,content) VALUE (%s,%s)",(lastid, str(content[0])))

    cur.connection.commit()

    return True

def insert_content(path_url):
    f = open(path_url,'rb')
    xhtml = f.read()
    f.close()

    soup = BeautifulSoup(xhtml, "html5lib")
    titleH1 = soup.find("h1")
    if None == titleH1:
        return False
    title = titleH1.string.encode('utf-8')
    num_time = int(soup.find("div", id="num_time").string.encode('utf-8'))
    keywords = str(soup.find(attrs={"name":"keywords"})['content'].encode('utf8','ignore'))
    description = str(soup.find(attrs={"name":"description"})['content'].encode('utf8','ignore'))
    content = soup.find_all("div", class_="rich_media_content")

    if len(content) < 1 :
        print(" "+"no content")
        return False

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)",(title.strip(),description.strip(),num_time,num_time))
    #print cur.description
    #print "ID of last record is ", int(cur.lastrowid) # primary key of the last inserted row
    #print "ID of inserted record is ", int(conn.insert_id()) # primary key of the newly inserted row; conn.insert_id() must be called before conn.commit(), otherwise it returns 0
    lastid = int(cur.lastrowid)

    cur.execute("INSERT INTO archive_article (archive,content) VALUE (%s,%s)",(lastid, str(content[0])))

    cur.connection.commit()

    return True

def open_url(url):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    for i in range(0, 3):
        try:
            xhtml = urllib2.urlopen(req)
            return xhtml
        except urllib2.HTTPError,e: # HTTPError must be caught before URLError
            print "The server couldn't fulfill the request"
            print "Error code:",e.code
            if e.code!=503:
                return False
            time.sleep(5)
            print("try again")
        except urllib2.URLError,e:
            print "Failed to reach the server"
            print "The reason:",e.reason
            # URLError carries no HTTP status code, so just wait and retry
            time.sleep(5)
            print("try again")

    return False

def down_list(list_url):
    xhtml=open_url(list_url)
    if False == xhtml :
        return False

    soup = BeautifulSoup(xhtml, "html5lib")
    title = soup.title.string.encode('utf-8')
    li_a = soup.find_all("a", class_="question_link")
    next_list = soup.find_all("a", text="下一页") # "next page" links

    writer = csv.writer(file(datapath+'list.csv', 'a+b'))
    x = 0
    for i in range(0, len(li_a)):
        content_id = li_a[i]['href'].encode('utf-8')[3:]
        content_title = li_a[i].string.encode('utf-8')
        content_url = "http://chuansong.me"+li_a[i]['href'].encode('utf-8')
        path_url = datapath+content_id+".html"

        if not os.path.exists(path_url):
            if False == down_content(content_url,path_url) :
                print(" "+str(x)+content_url+" down fail")
                continue

            print(" "+str(x)+content_url+" down end")
            writer.writerow([content_id, content_title, content_url])
            x=x+1
            if x%2 == 1 :
                time.sleep(3)
            time.sleep(1)
        else:
            #insert_content(path_url)
            print(" "+content_url+" exist")
            return False

    print(list_url+" end")
    if len(next_list) < 1 :
        return False

    print("next "+next_list[0]['href'].encode('utf-8')+"\n")
    return True

def get_list():
    start=0
    while True:
        if start==0:
            url = 'http://chuansong.me/account/xingdongpai77'
        else:
            url = 'http://chuansong.me/account/xingdongpai77?start='+str(start)

        if False == down_list(url) or start>2000:
            break

        start+=12
        time.sleep(1)

    print("get_list end")

if __name__ == "__main__":
    datapath = get_cur_file_dir()+'/data/'
    if not os.path.exists(datapath):
        os.makedirs(datapath)

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='mysql')
    cur = conn.cursor()
    cur.execute("SET NAMES utf8")
    cur.execute("USE x")

    get_list()

    cur.close()
    conn.close()

    # xtime = time.strftime("%Y-%m-%d %H:%M:%S")
    # xday = time.strftime("%Y-%m-%d")
    # f=file(datapath+xtime+".html","w+")
    # f.write(body)
    # f.close()
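The INSERT statements above assume an archive / archive_article pair of tables in database "x" that the gist never defines. The following is only a guessed minimal layout that would satisfy those queries, using the same pymysql connection settings as the script; the real schema may differ.

# Sketch only: hypothetical minimal schema for the INSERTs above.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd="123456", db='x', charset='utf8')
cur = conn.cursor()
# archive: one row per scraped article (columns taken from the INSERT above)
cur.execute("""
CREATE TABLE IF NOT EXISTS archive (
    id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
    category INT NOT NULL,
    category_parents VARCHAR(64) NOT NULL,
    title VARCHAR(255) NOT NULL,
    summary TEXT,
    addtime INT NOT NULL,
    uptime INT NOT NULL
)""")
# archive_article: article body keyed by the archive row id
cur.execute("""
CREATE TABLE IF NOT EXISTS archive_article (
    archive INT UNSIGNED NOT NULL,
    content MEDIUMTEXT
)""")
conn.commit()
cur.close()
conn.close()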