Skip to content

Instantly share code, notes, and snippets.

@jseanj
Created September 11, 2013 07:41
Show Gist options
  • Save jseanj/6520420 to your computer and use it in GitHub Desktop.
Save jseanj/6520420 to your computer and use it in GitHub Desktop.

Revisions

  1. jseanj created this gist Sep 11, 2013.
    135 changes: 135 additions & 0 deletions fetchV2EXrss.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,135 @@
    #-*- encoding: gb2312 -*-
    import urllib2
    from BeautifulSoup import BeautifulSoup
    import threading

    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent' : user_agent}
    #headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}
    def getHtml(url):
    try:
    #req = urllib2.Request(url, headers)
    req = urllib2.Request(url)
    response = urllib2.urlopen(req, None, 10)
    html = response.read()
    return html
    except Exception, e:
    print e
    return None

    def getRssNew(url):
    print "url: " + url
    html = getHtml(url)
    if html:
    try:
    soup = BeautifulSoup(html)
    except:
    return None
    link = soup.find('link', rel="alternate")
    if link:
    href = link['href']
    if href and href[0] == '/':
    return url + href
    else:
    return href

    def getUrlKey(url):
    pos = url.index('//')
    url = url[pos+2:]
    return url[:url.index('/')]

    def getFriendsList(url):
    print "the main url: " + url
    friendsUrlList = []
    key = getUrlKey(url)
    html = getHtml(url)
    if html:
    try:
    soup = BeautifulSoup(html)
    except:
    return None
    for i in soup.findAll('li'):
    a = i.a
    if a and a.get('href') != None:
    href = a['href']
    if key not in href:
    if href.startswith('http'):
    print href
    friendsUrlList.append(href)
    return friendsUrlList

    def dump(filename, rsslist):
    print "begin dump"
    f = file(filename, 'w')
    for item in rsslist:
    f.write(item)
    f.write('\n')
    f.close()
    print "end dump"

    def getUrlListFromFriends(urlList):
    friendsList = []
    for url in urlList:
    list = getFriendsList(url)
    if list:
    friendsList.extend(list)
    return friendsList

    def getRssList(urlList):
    rssList = []
    for url in urlList:
    rss_url = getRssNew(url)
    if rss_url:
    print "rss: " + rss_url
    rssList.append(rss_url)
    return rssList

    def getUrlListNew(file):
    urlList = []
    fh = open(file)
    for line in fh.readlines():
    line=line.strip('\n')
    urlList.append(line)
    return urlList

    class Fetch(threading.Thread):
    def __init__(self, num, begin, end):
    threading.Thread.__init__(self)
    self._run_num = num
    self.begin = begin
    self.end = end

    def run(self):
    threadname = threading.currentThread().getName()
    print threadname
    for x in xrange(int(self.begin), int(self.end)):
    mutex.acquire()
    rssurl = getRssNew(urllist[x])
    if rssurl:
    print 'rss: ' + rssurl
    rssset.add(rssurl)
    mutex.release()


    urllist = getUrlListNew('/home/jseanj/mygit/v2ex_crawl/all.txt')
    rssset = set()
    threads = []
    num = 10
    length = len(urllist)
    n = int(length/num)
    mutex = threading.Lock()

    for x in xrange(0, num):
    begin = x*n
    end = begin + n
    if x == num-1:
    end = length
    threads.append(Fetch(x, begin, end))

    for t in threads:
    t.start()

    for t in threads:
    t.join()

    dump('v2ex_rss', list(rssset))