#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import time
import csv
import sys,os
import pymysql
def get_cur_file_dir():
    # Directory containing this script; sys.path[0] is the script's directory
    # when run directly, or the script file itself in some embedded setups.
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)
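
# A common equivalent (an aside, assuming __file__ is defined, which holds
# whenever this file is run as a script):
#   os.path.dirname(os.path.abspath(__file__))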
def down_content(content_url, path_url):
    # Fetch one post, save it as a local HTML snapshot, and insert it into MySQL.
    xhtml = open_url(content_url)
    if False == xhtml:
        return False
    soup = BeautifulSoup(xhtml, "html5lib")
    titleH2 = soup.find("h2", id="activity-name")
    if None == titleH2:
        return False
    title = titleH2.string.encode('utf-8')
    string_time = soup.find("em", id="post-date").string.encode('utf-8')
    num_time = int(time.mktime(time.strptime(string_time, '%Y-%m-%d')))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print(" " + "no content")
        return False
html = """
"""+title+"""
"""+title+"""
"""+string_time+"""
"""+str(num_time)+"""
"""+str(content[0])+"""
"""
f=file(path_url,"w+")
f.write(html)
f.close()
cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,\"0,12,27,\",%s,%s,%s,%s)",(title.strip(),description.strip(),num_time,num_time))
#print cur.description
#print "ID of last record is ", int(cur.lastrowid) #最后插入行的主键ID
#print "ID of inserted record is ", int(conn.insert_id()) #最新插入行的主键ID,conn.insert_id()一定要在conn.commit()之前,否则会返回0
lastid = int(cur.lastrowid)
cur.execute("INSERT INTO archive_article (archive,content) VALUE (%s,%s)",(lastid, str(content[0])))
cur.connection.commit()
return True
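
# A minimal sketch (not used by the flow above) of doing the same two INSERTs
# as one atomic unit: if the second INSERT fails, roll back so no archive row
# is left without its archive_article body. Column names and the fixed
# category values mirror the statements above; the function name is ours.
def insert_archive_row(title, summary, num_time, content_html):
    try:
        cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,'0,12,27,',%s,%s,%s,%s)", (title, summary, num_time, num_time))
        cur.execute("INSERT INTO archive_article (archive,content) VALUES (%s,%s)", (int(cur.lastrowid), content_html))
        cur.connection.commit()
        return True
    except pymysql.MySQLError:
        cur.connection.rollback()
        return False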
def insert_content(path_url):
    # Re-import a previously saved snapshot into the database.
    f = open(path_url, 'rb')
    xhtml = f.read()
    f.close()
    soup = BeautifulSoup(xhtml, "html5lib")
    titleH1 = soup.find("h1")
    if None == titleH1:
        return False
    title = titleH1.string.encode('utf-8')
    num_time = int(soup.find("div", id="num_time").string.encode('utf-8'))
    keywords = str(soup.find(attrs={"name": "keywords"})['content'].encode('utf8', 'ignore'))
    description = str(soup.find(attrs={"name": "description"})['content'].encode('utf8', 'ignore'))
    content = soup.find_all("div", class_="rich_media_content")
    if len(content) < 1:
        print(" " + "no content")
        return False
    cur.execute("INSERT INTO archive (category,category_parents,title,summary,addtime,uptime) VALUES (27,'0,12,27,',%s,%s,%s,%s)", (title.strip(), description.strip(), num_time, num_time))
    #print cur.description
    #print "ID of last record is ", int(cur.lastrowid)  # primary key of the last inserted row
    #print "ID of inserted record is ", int(conn.insert_id())  # must be read before conn.commit(), otherwise it returns 0
    lastid = int(cur.lastrowid)
    cur.execute("INSERT INTO archive_article (archive,content) VALUES (%s,%s)", (lastid, str(content[0])))
    cur.connection.commit()
    return True
def open_url(url):
    # Fetch a URL with up to three attempts; return the response object,
    # or False on a non-retryable error or after all attempts fail.
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    for i in range(0, 3):
        try:
            xhtml = urllib2.urlopen(req)
            return xhtml
        except urllib2.HTTPError as e:  # HTTPError must be caught before URLError
            print "The server couldn't fulfill the request"
            print "Error code:", e.code
            if e.code != 503:
                return False
            time.sleep(5)
            print("try again")
        except urllib2.URLError as e:
            print "Failed to reach the server"
            print "The reason:", e.reason
            # URLError carries no HTTP status code, so just wait and retry.
            time.sleep(5)
            print("try again")
    return False
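
# A sketch of the same retry loop with exponential backoff (5s, 10s, 20s)
# instead of a fixed 5-second pause; open_url_backoff is our name, not part
# of the original script.
def open_url_backoff(url, retries=3):
    req = urllib2.Request(url)
    req.add_header('User-agent', 'Mozilla 5.10')
    for i in range(retries):
        try:
            return urllib2.urlopen(req)
        except urllib2.URLError:  # HTTPError is a subclass of URLError
            time.sleep(5 * (2 ** i))
    return False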
def down_list(list_url):
    # Download every post linked from one list page; return True if paging
    # should continue, False to stop.
    xhtml = open_url(list_url)
    if False == xhtml:
        return False
    soup = BeautifulSoup(xhtml, "html5lib")
    title = soup.title.string.encode('utf-8')
    li_a = soup.find_all("a", class_="question_link")
    next_list = soup.find_all("a", text="下一页")  # "下一页" is the site's "next page" link text
    writer = csv.writer(open(datapath + 'list.csv', 'a+b'))
    x = 0
    for i in range(0, len(li_a)):
        # Drop the first three characters of the href ("/n/") to get the post id.
        content_id = li_a[i]['href'].encode('utf-8')[3:]
        content_title = li_a[i].string.encode('utf-8')
        content_url = "http://chuansong.me" + li_a[i]['href'].encode('utf-8')
        path_url = datapath + content_id + ".html"
        if not os.path.exists(path_url):
            if False == down_content(content_url, path_url):
                print(" " + str(x) + content_url + " down fail")
                continue
            print(" " + str(x) + content_url + " down end")
            writer.writerow([content_id, content_title, content_url])
            x = x + 1
            # Throttle requests: a short pause every time, a longer one on odd counts.
            if x % 2 == 1:
                time.sleep(3)
            time.sleep(1)
        else:
            #insert_content(path_url)
            # An already-downloaded post means we have caught up; stop paging.
            print(" " + content_url + " exist")
            return False
    print(list_url + " end")
    if len(next_list) < 1:
        return False
    print("next " + next_list[0]['href'].encode('utf-8') + "\n")
    return True
def get_list():
    # Walk the account's list pages until down_list() says stop or we pass
    # the 2000-post safety cap.
    start = 0
    while True:
        if start == 0:
            url = 'http://chuansong.me/account/xingdongpai77'
        else:
            url = 'http://chuansong.me/account/xingdongpai77?start=' + str(start)
        if False == down_list(url) or start > 2000:
            break
        start += 12  # the list shows 12 posts per page
        time.sleep(1)
    print("get_list end")
if __name__ == "__main__":
    datapath = get_cur_file_dir() + '/data/'
    if not os.path.exists(datapath):
        os.makedirs(datapath)
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd="123456", db='mysql')
    cur = conn.cursor()
    cur.execute("SET NAMES utf8")
    cur.execute("USE x")  # switch from the default db to the content database
    get_list()
    cur.close()
    conn.close()
# xtime = time.strftime("%Y-%m-%d %H:%M:%S")
# xday = time.strftime("%Y-%m-%d")
# f=file(datapath+xtime+".html","w+")
# f.write(body)
# f.close()
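
# Assumed invocation (script name is illustrative): run under Python 2.7
# against a local MySQL server that already has the archive and
# archive_article tables:
#   python2.7 spider.py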