Skip to content

Instantly share code, notes, and snippets.

@jseanj
Created September 11, 2013 07:41
Show Gist options
  • Save jseanj/6520420 to your computer and use it in GitHub Desktop.
Save jseanj/6520420 to your computer and use it in GitHub Desktop.
#-*- encoding: gb2312 -*-
import urllib2
from BeautifulSoup import BeautifulSoup
import threading
# Browser-like User-Agent string and the request-header dict built from it.
# NOTE(review): getHtml builds its urllib2.Request without these headers, so
# they are not actually sent with any request -- confirm whether that is
# intentional.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent' : user_agent}
# Alternative Firefox-style User-Agent header, left disabled:
#headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}
def getHtml(url):
    """Fetch *url* and return the raw response body.

    The request is a plain GET with a 10 second timeout.

    Returns:
        The body as returned by ``response.read()``, or None when anything
        goes wrong (the exception is printed, not raised).
    """
    try:
        # NOTE: the module-level `headers` dict is deliberately not sent, as
        # in the original code. To send it, pass it by keyword
        # (urllib2.Request(url, headers=headers)); the second positional
        # argument of Request is the POST body, not the headers.
        response = urllib2.urlopen(urllib2.Request(url), None, 10)
        try:
            return response.read()
        finally:
            # Always release the underlying socket, even if read() fails.
            response.close()
    except Exception as e:
        # Best-effort crawler: report the failure and let the caller skip
        # this URL.
        print(e)
        return None
def getRssNew(url):
    """Return the feed URL advertised by the page at *url*, or None.

    The page is fetched with getHtml and parsed with BeautifulSoup. The
    ``href`` of the first ``<link rel="alternate">`` element is resolved
    against *url*, so root-relative ("/feed"), relative ("feed.xml") and
    protocol-relative ("//host/feed") references become absolute URLs, and
    an already absolute href takes precedence over *url*.

    Returns:
        The resolved feed URL, or None when the page cannot be fetched or
        parsed, has no such link, or the link has no (or an empty) ``href``.
    """
    # Function-scope import keeps this block self-contained; the module was
    # renamed in Python 3.
    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    print("url: " + url)
    html = getHtml(url)
    if not html:
        return None
    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Unparseable markup: treat the page as having no feed. Catching
        # Exception (not a bare except) lets Ctrl-C still stop the crawl.
        return None
    link = soup.find('link', rel="alternate")
    if link is None:
        return None
    # .get() avoids a KeyError for a <link rel="alternate"> without href.
    href = link.get('href')
    if not href:
        return None
    # urljoin avoids the "http://host//feed" style results that plain
    # string concatenation produced when url ended with a slash.
    return urljoin(url, href)
def getUrlKey(url):
    """Return the host part of *url* (the text between "//" and the next "/").

    Examples:
        "http://example.com/links"  -> "example.com"
        "http://example.com"        -> "example.com"   (no path)
        "example.com/links"         -> "example.com"   (no scheme)

    The last two forms used to raise ValueError because str.index() was used
    to locate separators that are not always present.
    """
    _scheme, separator, remainder = url.partition('//')
    if not separator:
        # No "//" at all: treat the whole string as "host[/path]".
        remainder = url
    # Everything up to the first "/" (or the whole remainder if none).
    return remainder.split('/', 1)[0]
def getFriendsList(url):
    """Collect external links found in the ``<li>`` items of the page at *url*.

    For every ``<li>`` element, the ``href`` of its first ``<a>`` child is
    kept when it starts with "http" and does not contain the host of *url*
    (as computed by getUrlKey). Each kept link is printed.

    Returns:
        A list of the kept hrefs, in document order. The list is empty when
        the page cannot be fetched or parsed; a parse failure used to return
        None instead, and callers that test the result for truthiness are
        unaffected by that change.
    """
    print("the main url: " + url)
    friendsUrlList = []
    key = getUrlKey(url)
    html = getHtml(url)
    if not html:
        return friendsUrlList
    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Unparseable markup: nothing to collect. Not a bare except, so that
        # KeyboardInterrupt still aborts the crawl.
        return friendsUrlList
    for item in soup.findAll('li'):
        anchor = item.a
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is None:
            continue
        # Keep only absolute links that point away from the crawled site.
        if key not in href and href.startswith('http'):
            print(href)
            friendsUrlList.append(href)
    return friendsUrlList
def dump(filename, rsslist):
    """Write every item of *rsslist* to *filename*, one per line.

    The file is created or truncated. Progress markers are printed before
    and after writing.

    Args:
        filename: path of the output file.
        rsslist: iterable of strings; each is written followed by a newline.
    """
    print("begin dump")
    # open() replaces the Python-2-only file() builtin, and the with-block
    # closes the handle even if a write fails.
    with open(filename, 'w') as f:
        for item in rsslist:
            f.write(item)
            f.write('\n')
    print("end dump")
def getUrlListFromFriends(urlList):
    """Concatenate the getFriendsList results for every URL in *urlList*.

    URLs whose getFriendsList result is falsy (None or empty) contribute
    nothing. Returns one flat list in input order.
    """
    collected = []
    for source in urlList:
        found = getFriendsList(source)
        if not found:
            continue
        collected.extend(found)
    return collected
def getRssList(urlList):
    """Return the feed URLs found by getRssNew for the pages in *urlList*.

    Pages for which getRssNew yields nothing are skipped; every feed URL
    found is printed with an "rss: " prefix and appended in input order.
    """
    feeds = []
    for page in urlList:
        feed = getRssNew(page)
        if not feed:
            continue
        print("rss: " + feed)
        feeds.append(feed)
    return feeds
def getUrlListNew(file):
    """Read *file* and return its lines as a list of strings.

    Leading/trailing newline characters are stripped from each line; nothing
    else is altered, so blank lines come back as empty strings.

    Args:
        file: path of the text file to read. (The name shadows a Python 2
            builtin but is kept because it is part of the signature.)
    """
    # The with-block closes the handle, which was previously left open.
    with open(file) as fh:
        return [line.strip('\n') for line in fh]
class Fetch(threading.Thread):
    """Worker thread that looks up feed URLs for one slice of ``urllist``.

    Relies on module-level names defined by the driver script: ``urllist``
    (the pages to visit), ``rssset`` (shared result set) and ``mutex`` (the
    lock guarding ``rssset``), plus the getRssNew function.
    """

    def __init__(self, num, begin, end):
        """Store the worker number and the half-open index range to process.

        Args:
            num: worker number (stored as ``_run_num``; not otherwise used
                in this class).
            begin: first index into ``urllist`` (inclusive).
            end: last index into ``urllist`` (exclusive).
        """
        threading.Thread.__init__(self)
        self._run_num = num
        self.begin = begin
        self.end = end

    def run(self):
        """Resolve the feed URL of each assigned page and record it."""
        print(threading.current_thread().name)
        for x in range(int(self.begin), int(self.end)):
            # Do the slow network fetch WITHOUT holding the lock; holding it
            # here made every worker wait for every other worker's request.
            rssurl = getRssNew(urllist[x])
            if not rssurl:
                continue
            # The with-block releases the lock even if something raises,
            # so one failing worker can no longer deadlock the rest.
            with mutex:
                print('rss: ' + rssurl)
                rssset.add(rssurl)
# --- Driver: split the URL list across worker threads and save the feeds ---

# Shared state read by Fetch.run().
urllist = getUrlListNew('/home/jseanj/mygit/v2ex_crawl/all.txt')
rssset = set()
threads = []

# Number of workers and the size of each worker's slice.
num = 10
length = len(urllist)
n = length // num
mutex = threading.Lock()

for x in range(num):
    begin = n * x
    # The last worker also takes whatever is left over after even division.
    end = length if x == num - 1 else begin + n
    threads.append(Fetch(x, begin, end))

for t in threads:
    t.start()
for t in threads:
    t.join()

# All workers are done; write the de-duplicated feed URLs.
dump('v2ex_rss', list(rssset))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment