jseanj · September 11, 2013 07:41
diff --git a/fetchV2EXrss.py b/fetchV2EXrss.py
 #-*- encoding: gb2312 -*-
 import urllib2
 from BeautifulSoup import BeautifulSoup
 import threading

 # Default HTTP request headers (browser-like User-Agent). In the code
 # visible here getHtml() builds its Request without them.
 user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
 headers = {}
 headers['User-Agent'] = user_agent
 # Alternative Firefox User-Agent, kept for reference:
 #headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}
 def getHtml(url):
    """Fetch *url* and return the raw response body.

    Best-effort: any failure (malformed URL, network error, timeout) is
    printed and reported to the caller as ``None``.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``response.read()`` returns, or ``None`` if the request
        failed.
    """
    try:
        # NOTE: to send custom headers, pass them by keyword
        # (``urllib2.Request(url, headers=...)``); the second positional
        # argument of Request is the POST body, not the header dict.
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 10)  # 10-second timeout
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except Exception as e:
        print(e)
        return None

 def getRssNew(url):
    """Return the feed URL advertised by the page at *url*.

    Downloads the page, takes the first ``<link rel="alternate">`` tag and
    resolves its ``href`` against *url*.

    Args:
        url: Address of the page to inspect.

    Returns:
        The resolved feed URL, or ``None`` when the page cannot be fetched
        or parsed, or carries no usable alternate link.
    """
    try:
        from urlparse import urljoin  # Python 2
    except ImportError:
        from urllib.parse import urljoin

    print("url: " + url)
    html = getHtml(url)
    if not html:
        return None
    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Markup the parser rejects is treated as "no feed".
        return None
    link = soup.find('link', rel="alternate")
    if not link:
        return None
    # .get() rather than [] so a <link> without href does not raise KeyError.
    href = link.get('href')
    if not href:
        return None
    # urljoin resolves relative and root-relative hrefs against the page
    # URL (no doubled slashes) and returns absolute hrefs unchanged.
    return urljoin(url, href)

 def getUrlKey(url):
    """Return the host part of *url*.

    The host is the text between the first ``//`` and the next ``/``.
    URLs without a path (``http://host``) or without a scheme separator
    are accepted instead of raising ``ValueError``.

    Args:
        url: URL string such as ``http://www.example.com/path``.

    Returns:
        The host (including any ``:port``) as a string.
    """
    _, separator, remainder = url.partition('//')
    if not separator:
        # No scheme separator: treat the whole string as "host/path".
        remainder = url
    return remainder.partition('/')[0]

 def getFriendsList(url):
    """Collect external links found in ``<li>`` elements of the page at *url*.

    A link is kept when it is the first ``<a>`` of an ``<li>``, its href
    starts with ``http`` and does not contain the host of *url*.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of href strings; empty when the page cannot be fetched or
        parsed or has no matching links.
    """
    print("the main url: " + url)
    friendsUrlList = []
    key = getUrlKey(url)
    html = getHtml(url)
    if not html:
        return friendsUrlList
    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Always hand back a list so callers get one consistent type.
        return friendsUrlList
    for item in soup.findAll('li'):
        anchor = item.a
        if not anchor:
            continue
        href = anchor.get('href')
        if href is None:
            continue
        if key not in href and href.startswith('http'):
            print(href)
            friendsUrlList.append(href)
    return friendsUrlList

 def dump(filename, rsslist):
    """Write every entry of *rsslist* to *filename*, one per line.

    The file is created or truncated. Progress markers are printed before
    and after writing.

    Args:
        filename: Path of the output file.
        rsslist: Iterable of strings to write.
    """
    print("begin dump")
    # 'with' closes the file even if a write fails part-way through.
    with open(filename, 'w') as f:
        for item in rsslist:
            f.write(item)
            f.write('\n')
    print("end dump")

 def getUrlListFromFriends(urlList):
    """Gather the friend links of every page in *urlList* into one list.

    Args:
        urlList: Iterable of page URLs passed to ``getFriendsList``.

    Returns:
        A flat list of all links found, in page order. Pages that yield
        nothing are skipped.
    """
    collected = []
    for page_url in urlList:
        found = getFriendsList(page_url)
        if not found:
            continue
        collected.extend(found)
    return collected

 def getRssList(urlList):
    """Resolve the feed URL of every page in *urlList*.

    Args:
        urlList: Iterable of page URLs passed to ``getRssNew``.

    Returns:
        A list of the feed URLs that were found, in input order. Each one
        is also printed.
    """
    feeds = []
    for page_url in urlList:
        feed = getRssNew(page_url)
        if not feed:
            continue
        print("rss: " + feed)
        feeds.append(feed)
    return feeds

 def getUrlListNew(file):
    """Read a list of URLs from a text file, one URL per line.

    Surrounding whitespace (including a stray ``\\r``) is removed and blank
    lines are skipped, so callers never receive empty entries.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of non-empty URL strings in file order.
    """
    # 'with' guarantees the handle is closed once the lines are read.
    with open(file) as fh:
        stripped = (line.strip() for line in fh)
        return [entry for entry in stripped if entry]

 class Fetch(threading.Thread):
    """Worker thread that resolves feed URLs for one slice of ``urllist``.

    Reads the module-level ``urllist`` and stores results in the
    module-level ``rssset``, guarded by the module-level ``mutex``.
    """

    def __init__(self, num, begin, end):
        """Remember the worker number and the slice bounds.

        Args:
            num: Sequence number of this worker.
            begin: Index of the first URL to process.
            end: Index one past the last URL to process.
        """
        threading.Thread.__init__(self)
        self._run_num = num
        self.begin = begin
        self.end = end

    def run(self):
        """Resolve the feed of each URL in this worker's slice."""
        threadname = threading.currentThread().getName()
        print(threadname)
        for url in urllist[int(self.begin):int(self.end)]:
            # Fetch outside the lock so workers download concurrently;
            # holding the lock during the request serializes every thread.
            try:
                rssurl = getRssNew(url)
            except Exception as e:
                # One bad page must not abort the rest of the slice.
                print(e)
                continue
            if rssurl:
                # 'with' releases the lock even if printing fails.
                with mutex:
                    print('rss: ' + rssurl)
                    rssset.add(rssurl)


 # Script driver: load the page URLs, resolve their feeds with `num` worker
 # threads, then write the distinct feed URLs to 'v2ex_rss'.
 urllist = getUrlListNew('/home/jseanj/mygit/v2ex_crawl/all.txt')
 rssset = set()
 threads = []
 num = 10
 length = len(urllist)
 mutex = threading.Lock()

 for x in range(num):
    # Balanced split: chunk sizes differ by at most one, instead of the
    # last worker receiving the whole remainder.
    begin = x * length // num
    end = (x + 1) * length // num
    threads.append(Fetch(x, begin, end))

 for t in threads:
    t.start()

 for t in threads:
    t.join()

 dump('v2ex_rss', list(rssset))
	#-*- encoding: gb2312 -*-
	import urllib2
	from BeautifulSoup import BeautifulSoup
	import threading

	# Default HTTP request headers (browser-like User-Agent). In the code
	# visible here getHtml() builds its Request without them.
	user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
	headers = {}
	headers['User-Agent'] = user_agent
	# Alternative Firefox User-Agent, kept for reference:
	#headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}
	def getHtml(url):
	    """Fetch *url* and return the raw response body.

	    Best-effort: any failure (malformed URL, network error, timeout) is
	    printed and reported to the caller as ``None``.

	    Args:
	        url: Address of the page to download.

	    Returns:
	        Whatever ``response.read()`` returns, or ``None`` if the request
	        failed.
	    """
	    try:
	        # NOTE: to send custom headers, pass them by keyword
	        # (``urllib2.Request(url, headers=...)``); the second positional
	        # argument of Request is the POST body, not the header dict.
	        req = urllib2.Request(url)
	        response = urllib2.urlopen(req, None, 10)  # 10-second timeout
	        try:
	            return response.read()
	        finally:
	            # Always release the connection, even if read() fails.
	            response.close()
	    except Exception as e:
	        print(e)
	        return None

	def getRssNew(url):
	    """Return the feed URL advertised by the page at *url*.

	    Downloads the page, takes the first ``<link rel="alternate">`` tag and
	    resolves its ``href`` against *url*.

	    Args:
	        url: Address of the page to inspect.

	    Returns:
	        The resolved feed URL, or ``None`` when the page cannot be fetched
	        or parsed, or carries no usable alternate link.
	    """
	    try:
	        from urlparse import urljoin  # Python 2
	    except ImportError:
	        from urllib.parse import urljoin

	    print("url: " + url)
	    html = getHtml(url)
	    if not html:
	        return None
	    try:
	        soup = BeautifulSoup(html)
	    except Exception:
	        # Markup the parser rejects is treated as "no feed".
	        return None
	    link = soup.find('link', rel="alternate")
	    if not link:
	        return None
	    # .get() rather than [] so a <link> without href does not raise KeyError.
	    href = link.get('href')
	    if not href:
	        return None
	    # urljoin resolves relative and root-relative hrefs against the page
	    # URL (no doubled slashes) and returns absolute hrefs unchanged.
	    return urljoin(url, href)

	def getUrlKey(url):
	    """Return the host part of *url*.

	    The host is the text between the first ``//`` and the next ``/``.
	    URLs without a path (``http://host``) or without a scheme separator
	    are accepted instead of raising ``ValueError``.

	    Args:
	        url: URL string such as ``http://www.example.com/path``.

	    Returns:
	        The host (including any ``:port``) as a string.
	    """
	    _, separator, remainder = url.partition('//')
	    if not separator:
	        # No scheme separator: treat the whole string as "host/path".
	        remainder = url
	    return remainder.partition('/')[0]

	def getFriendsList(url):
	    """Collect external links found in ``<li>`` elements of the page at *url*.

	    A link is kept when it is the first ``<a>`` of an ``<li>``, its href
	    starts with ``http`` and does not contain the host of *url*.

	    Args:
	        url: Address of the page to scan.

	    Returns:
	        A list of href strings; empty when the page cannot be fetched or
	        parsed or has no matching links.
	    """
	    print("the main url: " + url)
	    friendsUrlList = []
	    key = getUrlKey(url)
	    html = getHtml(url)
	    if not html:
	        return friendsUrlList
	    try:
	        soup = BeautifulSoup(html)
	    except Exception:
	        # Always hand back a list so callers get one consistent type.
	        return friendsUrlList
	    for item in soup.findAll('li'):
	        anchor = item.a
	        if not anchor:
	            continue
	        href = anchor.get('href')
	        if href is None:
	            continue
	        if key not in href and href.startswith('http'):
	            print(href)
	            friendsUrlList.append(href)
	    return friendsUrlList

	def dump(filename, rsslist):
	    """Write every entry of *rsslist* to *filename*, one per line.

	    The file is created or truncated. Progress markers are printed before
	    and after writing.

	    Args:
	        filename: Path of the output file.
	        rsslist: Iterable of strings to write.
	    """
	    print("begin dump")
	    # 'with' closes the file even if a write fails part-way through.
	    with open(filename, 'w') as f:
	        for item in rsslist:
	            f.write(item)
	            f.write('\n')
	    print("end dump")

	def getUrlListFromFriends(urlList):
	    """Gather the friend links of every page in *urlList* into one list.

	    Args:
	        urlList: Iterable of page URLs passed to ``getFriendsList``.

	    Returns:
	        A flat list of all links found, in page order. Pages that yield
	        nothing are skipped.
	    """
	    collected = []
	    for page_url in urlList:
	        # Local name chosen so the builtin ``list`` is not shadowed.
	        found = getFriendsList(page_url)
	        if found:
	            collected.extend(found)
	    return collected

	def getRssList(urlList):
	    """Resolve the feed URL of every page in *urlList*.

	    Args:
	        urlList: Iterable of page URLs passed to ``getRssNew``.

	    Returns:
	        A list of the feed URLs that were found, in input order. Each one
	        is also printed.
	    """
	    rssList = []
	    for url in urlList:
	        rss_url = getRssNew(url)
	        if rss_url:
	            print("rss: " + rss_url)
	            rssList.append(rss_url)
	    return rssList

	def getUrlListNew(file):
	    """Read a list of URLs from a text file, one URL per line.

	    Surrounding whitespace (including a stray ``\\r``) is removed and blank
	    lines are skipped, so callers never receive empty entries.

	    Args:
	        file: Path of the text file to read.

	    Returns:
	        A list of non-empty URL strings in file order.
	    """
	    # 'with' guarantees the handle is closed once the lines are read.
	    with open(file) as fh:
	        stripped = (line.strip() for line in fh)
	        return [entry for entry in stripped if entry]

	class Fetch(threading.Thread):
	    """Worker thread that resolves feed URLs for one slice of ``urllist``.

	    Reads the module-level ``urllist`` and stores results in the
	    module-level ``rssset``, guarded by the module-level ``mutex``.
	    """

	    def __init__(self, num, begin, end):
	        """Remember the worker number and the slice bounds.

	        Args:
	            num: Sequence number of this worker.
	            begin: Index of the first URL to process.
	            end: Index one past the last URL to process.
	        """
	        threading.Thread.__init__(self)
	        self._run_num = num
	        self.begin = begin
	        self.end = end

	    def run(self):
	        """Resolve the feed of each URL in this worker's slice."""
	        threadname = threading.currentThread().getName()
	        print(threadname)
	        for url in urllist[int(self.begin):int(self.end)]:
	            # Fetch outside the lock so workers download concurrently;
	            # holding the lock during the request serializes every thread.
	            try:
	                rssurl = getRssNew(url)
	            except Exception as e:
	                # One bad page must not abort the rest of the slice.
	                print(e)
	                continue
	            if rssurl:
	                # 'with' releases the lock even if printing fails.
	                with mutex:
	                    print('rss: ' + rssurl)
	                    rssset.add(rssurl)


	# Script driver: load the page URLs, resolve their feeds with `num` worker
	# threads, then write the distinct feed URLs to 'v2ex_rss'.
	urllist = getUrlListNew('/home/jseanj/mygit/v2ex_crawl/all.txt')
	rssset = set()
	threads = []
	num = 10
	length = len(urllist)
	mutex = threading.Lock()

	for x in range(num):
	    # Balanced split: chunk sizes differ by at most one, instead of the
	    # last worker receiving the whole remainder.
	    begin = x * length // num
	    end = (x + 1) * length // num
	    threads.append(Fetch(x, begin, end))

	for t in threads:
	    t.start()

	for t in threads:
	    t.join()

	dump('v2ex_rss', list(rssset))