Created
September 11, 2013 07:41
-
-
Save jseanj/6520420 to your computer and use it in GitHub Desktop.
Revisions
-
jseanj created this gist
Sep 11, 2013 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,135 @@ #-*- encoding: gb2312 -*- import urllib2 from BeautifulSoup import BeautifulSoup import threading user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = {'User-Agent' : user_agent} #headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'} def getHtml(url): try: #req = urllib2.Request(url, headers) req = urllib2.Request(url) response = urllib2.urlopen(req, None, 10) html = response.read() return html except Exception, e: print e return None def getRssNew(url): print "url: " + url html = getHtml(url) if html: try: soup = BeautifulSoup(html) except: return None link = soup.find('link', rel="alternate") if link: href = link['href'] if href and href[0] == '/': return url + href else: return href def getUrlKey(url): pos = url.index('//') url = url[pos+2:] return url[:url.index('/')] def getFriendsList(url): print "the main url: " + url friendsUrlList = [] key = getUrlKey(url) html = getHtml(url) if html: try: soup = BeautifulSoup(html) except: return None for i in soup.findAll('li'): a = i.a if a and a.get('href') != None: href = a['href'] if key not in href: if href.startswith('http'): print href friendsUrlList.append(href) return friendsUrlList def dump(filename, rsslist): print "begin dump" f = file(filename, 'w') for item in rsslist: f.write(item) f.write('\n') f.close() print "end dump" def getUrlListFromFriends(urlList): friendsList = [] for url in urlList: list = getFriendsList(url) if list: friendsList.extend(list) return friendsList def getRssList(urlList): rssList = [] for url in urlList: rss_url = getRssNew(url) if rss_url: print "rss: " + rss_url rssList.append(rss_url) return rssList def getUrlListNew(file): urlList = [] fh = open(file) for line in fh.readlines(): line=line.strip('\n') urlList.append(line) return urlList class Fetch(threading.Thread): def __init__(self, num, begin, end): threading.Thread.__init__(self) self._run_num = num self.begin = begin self.end = end def run(self): threadname = threading.currentThread().getName() print threadname for x in xrange(int(self.begin), int(self.end)): mutex.acquire() rssurl = getRssNew(urllist[x]) if rssurl: print 'rss: ' + rssurl rssset.add(rssurl) mutex.release() urllist = getUrlListNew('/home/jseanj/mygit/v2ex_crawl/all.txt') rssset = set() threads = [] num = 10 length = len(urllist) n = int(length/num) mutex = threading.Lock() for x in xrange(0, num): begin = x*n end = begin + n if x == num-1: end = length threads.append(Fetch(x, begin, end)) for t in threads: t.start() for t in threads: t.join() dump('v2ex_rss', list(rssset))