Created
September 11, 2013 07:41
-
-
Save jseanj/6520420 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #-*- encoding: gb2312 -*- | |
| import urllib2 | |
| from BeautifulSoup import BeautifulSoup | |
| import threading | |
# Browser-like identification string for outgoing requests.
# An alternative (Firefox 3.5.6 on Windows NT 6.1) was tried previously:
#   'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

# Header mapping built from the User-Agent above.
headers = {
    'User-Agent': user_agent,
}
def getHtml(url):
    """Fetch *url* and return the raw response body, or None on failure.

    The request is opened with a 10-second timeout.  Any exception raised
    while opening or reading the URL is printed and swallowed, so a single
    bad URL does not abort the caller.
    """
    # NOTE(review): the module-level ``headers`` dict is not sent with the
    # request.  Passing it as the second positional argument of
    # urllib2.Request would make it the request *data*; use
    # ``urllib2.Request(url, headers=headers)`` if the User-Agent is wanted.
    try:
        response = urllib2.urlopen(urllib2.Request(url), None, 10)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails part-way.
            response.close()
    except Exception as e:
        print(e)
        return None
def getRssNew(url):
    """Return the feed URL advertised by the page at *url*, or None.

    The page is fetched with getHtml() and parsed with BeautifulSoup; the
    href of the first ``<link rel="alternate">`` element is resolved
    against *url* and returned.

    Returns None when the page cannot be fetched or parsed, when it has no
    such link, or when the link carries no (or an empty) href.
    """
    # Function-scope import: only this function needs URL joining.
    from urlparse import urljoin

    print("url: " + url)
    html = getHtml(url)
    if not html:
        return None

    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Unparseable markup is treated the same as "no feed found".
        return None

    link = soup.find('link', rel="alternate")
    if link is None:
        return None

    # .get() avoids a KeyError when the element has no href attribute.
    href = link.get('href')
    if not href:
        return None

    # urljoin resolves root-relative and relative hrefs correctly (no
    # doubled slash, no leftover page path) and leaves absolute hrefs as-is.
    return urljoin(url, href)
def getUrlKey(url):
    """Return the host part of *url* (the text between '//' and the next '/').

    Unlike a plain ``str.index`` lookup, this does not raise when the URL
    has no path component (``http://example.com``) or no ``//`` separator
    (``example.com/feed``); in both cases the host-like prefix is returned.
    """
    # Drop everything up to and including the first '//' (if present) ...
    after_scheme = url.split('//', 1)[-1]
    # ... then keep only what precedes the first '/' of the remainder.
    return after_scheme.split('/', 1)[0]
def getFriendsList(url):
    """Collect external links found in the ``<li>`` elements of a page.

    For every ``<li>`` whose first ``<a>`` has an href, the href is kept
    when it starts with ``http`` and does not contain the host of *url*
    (as computed by getUrlKey).  Each kept href is printed.

    Returns the list of kept hrefs (empty when the page could not be
    fetched), or None when the page could not be parsed.
    """
    print("the main url: " + url)
    friendsUrlList = []
    key = getUrlKey(url)

    html = getHtml(url)
    if not html:
        return friendsUrlList

    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        return None

    for item in soup.findAll('li'):
        anchor = item.a
        if anchor is None:
            continue
        href = anchor.get('href')
        if href is None:
            continue
        # Keep only absolute links that point away from the source site.
        if key not in href and href.startswith('http'):
            print(href)
            friendsUrlList.append(href)
    return friendsUrlList
def dump(filename, rsslist):
    """Write every item of *rsslist* to *filename*, one per line.

    The file is opened in write mode, so existing content is replaced.
    Progress markers are printed before and after the write.
    """
    print("begin dump")
    # The with-block guarantees the handle is closed even if a write fails.
    with open(filename, 'w') as f:
        for item in rsslist:
            f.write(item)
            f.write('\n')
    print("end dump")
def getUrlListFromFriends(urlList):
    """Concatenate the getFriendsList() results for every URL in *urlList*.

    URLs for which getFriendsList() returns nothing (None or an empty
    list) contribute no entries.
    """
    friendsList = []
    for url in urlList:
        found = getFriendsList(url)
        if not found:
            continue
        friendsList += found
    return friendsList
def getRssList(urlList):
    """Return the feed URLs that getRssNew() finds for the pages in *urlList*.

    Pages without a detectable feed are skipped; each feed found is
    printed as it is collected.
    """
    feeds = []
    for page_url in urlList:
        feed = getRssNew(page_url)
        if not feed:
            continue
        print("rss: " + feed)
        feeds.append(feed)
    return feeds
def getUrlListNew(file):
    """Read the text file at path *file* and return its lines as a list.

    Newline characters are stripped from each line; blank lines are kept
    as empty strings, so the result has one entry per line of the file.
    """
    # The with-block closes the handle, which the previous version leaked.
    with open(file) as fh:
        return [line.strip('\n') for line in fh]
class Fetch(threading.Thread):
    """Worker thread that looks up feed URLs for one slice of ``urllist``.

    The thread processes indices ``begin`` (inclusive) to ``end``
    (exclusive) of the module-level ``urllist`` and adds every feed URL
    found to the module-level ``rssset``, guarded by the module-level
    ``mutex``.
    """

    def __init__(self, num, begin, end):
        """Store the worker number and the [begin, end) index range."""
        threading.Thread.__init__(self)
        self._run_num = num
        self.begin = begin
        self.end = end

    def run(self):
        """Resolve the feed URL of each page in this worker's slice."""
        print(threading.currentThread().getName())
        for x in xrange(int(self.begin), int(self.end)):
            # The network fetch happens OUTSIDE the lock; holding the lock
            # across it would serialise all workers.
            try:
                rssurl = getRssNew(urllist[x])
            except Exception as e:
                # One bad page must not kill the remaining slice.
                print(e)
                continue
            if not rssurl:
                continue
            # Only the shared-set update (and its log line) is guarded; the
            # with-block releases the lock even if an exception occurs.
            with mutex:
                print('rss: ' + rssurl)
                rssset.add(rssurl)
# ---- Script body: split the URL list across worker threads ----------------

# Shared state read by Fetch.run(): the input URLs, the result set and the
# lock guarding that set.
urllist = getUrlListNew('/home/jseanj/mygit/v2ex_crawl/all.txt')
rssset = set()
mutex = threading.Lock()

# Number of workers and the size of each worker's slice.
num = 10
length = len(urllist)
n = length // num

threads = []
for x in xrange(num):
    begin = x * n
    # The last worker also takes the remainder left by the integer division.
    end = length if x == num - 1 else begin + n
    threads.append(Fetch(x, begin, end))

# Start every worker, then wait for all of them to finish.
for t in threads:
    t.start()
for t in threads:
    t.join()

# Persist the de-duplicated feed URLs.
dump('v2ex_rss', list(rssset))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment