Last active: August 31, 2018 04:26
Revisions
ForthXu revised this gist
Jun 29, 2016: 1 changed file with 123 additions and 0 deletions.
```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# https://github.com/lining0806/WechatSearchProjects
import sys
import re
import urllib, urllib2
import requests
import pymongo
import datetime
from bs4 import BeautifulSoup
import multiprocessing as mp

class MongoDBIO:
    # Declare the connection attributes
    def __init__(self, host, port, name, password, database, collection):
        self.host = host
        self.port = port
        self.name = name
        self.password = password
        self.database = database
        self.collection = collection
    # Connect to the database; db and posts are handles for the database and collection
    def Connection(self):
        # connection = pymongo.Connection() # connect to the local database
        connection = pymongo.Connection(host=self.host, port=self.port)
        # db = connection.datas
        db = connection[self.database]
        if self.name or self.password:
            db.authenticate(name=self.name, password=self.password) # verify username and password
        # print "Database:", db.name
        # posts = db.cn_live_news
        posts = db[self.collection]
        # print "Collection:", posts.name
        return posts

# # Save operation (batch version)
# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):
#     posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
#     for save_content in save_contents:
#         posts.save(save_content)

# Save operation
def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):
    posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()
    posts.save(save_content)

def GetTitleUrl(url, data):
    content = requests.get(url=url, params=data).content # send the GET request
    soup = BeautifulSoup(content)
    tags = soup.findAll("h4")
    titleurl = []
    for tag in tags:
        item = {"title": tag.text.strip(), "link": tag.find("a").get("href"), "content": ""}
        titleurl.append(item)
    return titleurl

def GetContent(url):
    soup = BeautifulSoup(requests.get(url=url).content)
    tag = soup.find("div", attrs={"class": "rich_media_content", "id": "js_content"}) # extract the first matching tag
    content_list = [tag_i.text for tag_i in tag.findAll("p")]
    content = "".join(content_list)
    return content

def ContentSave(item):
    # Save configuration
    save_host = "localhost"
    save_port = 27017
    save_name = ""
    save_password = ""
    save_database = "testwechat"
    save_collection = "result"

    save_content = {
        "title": item["title"],
        "link": item["link"],
        "content": item["content"]
    }
    ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content)

def func(tuple):
    querystring, type, page = tuple[0], tuple[1], tuple[2]
    url = "http://weixin.sogou.com/weixin"
    # GET parameters
    data = {
        "query": querystring,
        "type": type,
        "page": page
    }
    titleurl = GetTitleUrl(url, data)
    for item in titleurl:
        url = item["link"]
        print "url:", url
        content = GetContent(url)
        item["content"] = content
        ContentSave(item)

if __name__ == '__main__':
    start = datetime.datetime.now()

    querystring = u"清华"  # search keyword ("Tsinghua")
    type = 2  # 2 = articles, 1 = WeChat official accounts
    # Multi-process crawling
    p = mp.Pool()
    p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)])
    p.close()
    p.join()

    # # Single-process crawling
    # for page in range(1, 50, 1):
    #     tuple = (querystring, type, page)
    #     func(tuple)

    end = datetime.datetime.now()
    print "last time: ", end - start
```
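A quick way to verify the crawl is to read the saved documents back out of MongoDB. The sketch below is not part of the gist: it assumes a local mongod on the default port and the `testwechat` database / `result` collection that `ContentSave` writes to, and it uses pymongo's `MongoClient` rather than the older `Connection` class used above.

```python
# Minimal sketch (not part of the gist): read back what the crawler saved.
# Assumes a local mongod on 27017 and a pymongo version that provides MongoClient.
import pymongo

client = pymongo.MongoClient("localhost", 27017)
posts = client["testwechat"]["result"]

for doc in posts.find().limit(5):
    # Each document carries the keys written by ContentSave: title, link, content
    print doc["title"], doc["link"]
```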
ForthXu revised this gist
Jun 28, 2016: 1 changed file with 45 additions and 0 deletions.
```javascript
// [root@vps3 work]# wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
// [root@vps3 work]# tar jxvf phantomjs-2.1.1-linux-x86_64.tar.bz2
// [root@vps3 work]# vim screenshots.js
var page = require('webpage').create();
var args = require('system').args;
var url = args[1];
var filename = args[2];
page.open(url, function(status) {
    console.log("Status: " + status);
    if (status === "success") {
        // Run JavaScript inside the page
        var title = page.evaluate(function() {
            // Scroll down to trigger lazily loaded images
            window.scrollTo(0, 10000);
            // Return the page title
            return document.title;
        });
        // Debug output
        console.log('Page title is ' + title);
        // Delay so images can finish loading and page scripts can run
        window.setTimeout(function () {
            // Render the screenshot
            page.render(filename);
            // Exit
            phantom.exit();
        }, 5000);
    } else {
        phantom.exit();
    }
});

// Install the Microsoft YaHei font
// [root@vps3 work]# yum -y install bitmap-fonts bitmap-fonts-cjk mkfontscale fontconfig
// [root@vps3 work]# mkdir /usr/share/fonts/win/
// [root@vps3 work]# wget https://nipao.googlecode.com/files/msyh.ttf -O /usr/share/fonts/win/msyh.ttf
// [root@vps3 work]# mkfontscale
// [root@vps3 work]# mkfontdir
// [root@vps3 work]# fc-cache

// Take the screenshot
// [root@vps3 work]# rm -rf /home/wwwroot/default/joke.png && phantomjs-2.1.1-linux-x86_64/bin/phantomjs screenshots.js http://joke.4399pk.com /home/wwwroot/default/joke.png
```
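Since the other files in this gist are Python, the screenshot command at the bottom can also be driven from a Python script. This is a minimal sketch, not part of the gist; the phantomjs binary path, screenshots.js, the URL, and the output path are all taken from the shell commands in the comments above, so adjust them to your own layout.

```python
# Minimal sketch (not part of the gist): run the PhantomJS screenshot from Python.
# Paths and URL mirror the shell command in the comments above.
import subprocess

cmd = [
    "phantomjs-2.1.1-linux-x86_64/bin/phantomjs",  # binary unpacked by the wget/tar steps
    "screenshots.js",                              # the script above
    "http://joke.4399pk.com",                      # url  -> args[1]
    "/home/wwwroot/default/joke.png",              # file -> args[2]
]
ret = subprocess.call(cmd)
print "phantomjs exit code:", ret
```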
ForthXu created this gist
Jun 26, 2016.
```python
#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys, os
import pymysql

def get_cur_file_dir():
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)

def down_content(content_url, path_url):
    xhtml = open_url(content_url)
    if False == xhtml:
        return False

    soup = BeautifulSoup(xhtml, "html5lib")
    titleH2 = soup.find("h2", id="activity-name")
    if None == titleH2:
        return False
    title = titleH2.string.encode('utf-8')
    string_time = soup.find("em", id="post-date").string.encode('utf-8')
    num_time = int(time.mktime(time.strptime(string_time, '%Y-%m-%d')))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print("  " + "no content")
        return False

    html = """
<!doctype html>
<html>
<head>
<meta charset="utf-8">
<title>""" + title + """</title>
<meta name="keywords" content=\"""" + keywords + """\">
<meta name="description" content=\"""" + description + """\">
</head>
<body>
<div id="body">
<h1>""" + title + """</h1>
<div id="string_time">""" + string_time + """ </div><div id="num_time">""" + str(num_time) + """</div>
<div id="content">
""" + str(content[0]) + """
</div>
</div>
</body>
<script type="text/javascript" src="js/reimg.js"></script>
</html>
"""
    f = file(path_url, "w+")
    f.write(html)
    f.close()

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)", (title.strip(), description.strip(), num_time, num_time))
    # print cur.description
    # print "ID of last record is ", int(cur.lastrowid)       # primary key of the last inserted row
    # print "ID of inserted record is ", int(conn.insert_id()) # conn.insert_id() must be called before conn.commit(), otherwise it returns 0
    lastid = int(cur.lastrowid)
    cur.execute("INSERT INTO archive_article (archive,content) VALUE (%s,%s)", (lastid, str(content[0])))
    cur.connection.commit()

    return True

def insert_content(path_url):
    f = open(path_url, 'rb')
    xhtml = f.read()
    f.close()

    soup = BeautifulSoup(xhtml, "html5lib")
    titleH1 = soup.find("h1")
    if None == titleH1:
        return False
    title = titleH1.string.encode('utf-8')
    num_time = int(soup.find("div", id="num_time").string.encode('utf-8'))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print("  " + "no content")
        return False

    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)", (title.strip(), description.strip(), num_time, num_time))
    lastid = int(cur.lastrowid)
    cur.execute("INSERT INTO archive_article (archive,content) VALUE (%s,%s)", (lastid, str(content[0])))
    cur.connection.commit()

    return True

def open_url(url):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    for i in range(0, 3):
        try:
            xhtml = urllib2.urlopen(req)
            return xhtml
        except urllib2.HTTPError, e:  # HTTPError must be caught before URLError
            print "The server couldn't fulfill the request"
            print "Error code:", e.code
            if e.code != 503:
                return False
            time.sleep(5)
            print("try again")
        except urllib2.URLError, e:
            print "Failed to reach the server"
            print "The reason:", e.reason
            if e.code != 503:
                return False
            time.sleep(5)
            print("try again")
    return False

def down_list(list_url):
    xhtml = open_url(list_url)
    if False == xhtml:
        return False

    soup = BeautifulSoup(xhtml, "html5lib")
    title = soup.title.string.encode('utf-8')
    li_a = soup.find_all("a", class_="question_link")
    next_list = soup.find_all("a", text="下一页")  # "next page" link

    writer = csv.writer(file(datapath + 'list.csv', 'a+b'))
    x = 0
    for i in range(0, len(li_a)):
        content_id = li_a[i]['href'].encode('utf-8')[3:]
        content_title = li_a[i].string.encode('utf-8')
        content_url = "http://chuansong.me" + li_a[i]['href'].encode('utf-8')
        path_url = datapath + content_id + ".html"
        if not os.path.exists(path_url):
            if False == down_content(content_url, path_url):
                print("  " + str(x) + content_url + " down fail")
                continue
                return False  # note: unreachable, the continue above ends the iteration first
            print("  " + str(x) + content_url + " down end")
            writer.writerow([content_id, content_title, content_url])
            x = x + 1
            if x % 2 == 1:
                time.sleep(3)
            time.sleep(1)
        else:
            # insert_content(path_url)
            print("  " + content_url + " exist")
            return False

    print(list_url + " end")
    if len(next_list) < 1:
        return False
    print("next " + next_list[0]['href'].encode('utf-8') + "\n")
    return True

def get_list():
    start = 0
    while True:
        if start == 0:
            url = 'http://chuansong.me/account/xingdongpai77'
        else:
            url = 'http://chuansong.me/account/xingdongpai77?start=' + str(start)
        if False == down_list(url) or start > 2000:
            break
        start += 12
        time.sleep(1)
    print("get_list end")

if __name__ == "__main__":
    datapath = get_cur_file_dir() + '/data/'
    if not os.path.exists(datapath):
        os.makedirs(datapath)

    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='mysql')
    cur = conn.cursor()
    cur.execute("SET NAMES utf8")
    cur.execute("USE x")

    get_list()

    cur.close()
    conn.close()

    # xtime = time.strftime("%Y-%m-%d %H:%M:%S")
    # xday = time.strftime("%Y-%m-%d")
    # f = file(datapath + xtime + ".html", "w+")
    # f.write(body)
    # f.close()
```
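The gist never shows the MySQL schema behind the two `INSERT` statements. The sketch below is an assumed minimal schema for the `archive` and `archive_article` tables in the `x` database that would satisfy those statements; every column type is a guess, and only the column names and the auto-increment id (read back through `cur.lastrowid`) are implied by the code above.

```python
# Minimal schema sketch (assumed, not from the gist) that the INSERTs above can run against.
# Column types are guesses; only the column names and the auto-increment id are implied by the code.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='mysql')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS x DEFAULT CHARACTER SET utf8")
cur.execute("USE x")
cur.execute("""
    CREATE TABLE IF NOT EXISTS archive (
        id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,  -- read back via cur.lastrowid
        category INT NOT NULL,
        category_parents VARCHAR(255) NOT NULL,
        title VARCHAR(255) NOT NULL,
        summary TEXT,
        addtime INT NOT NULL,
        uptime INT NOT NULL
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
cur.execute("""
    CREATE TABLE IF NOT EXISTS archive_article (
        archive INT UNSIGNED NOT NULL,  -- references archive.id
        content MEDIUMTEXT              -- the saved rich_media_content div
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()
```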