#-*- encoding: gb2312 -*-
import urllib2
from BeautifulSoup import BeautifulSoup
import threading

# Browser-like User-Agent string and the header dict built from it
# (presumably meant for outgoing HTTP requests).
# NOTE: not currently sent -- getHtml() builds its urllib2.Request from the
# URL alone, without any headers.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent' : user_agent}
# Alternative (Firefox-style) header dict, kept disabled:
#headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}
def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    The request uses a 10-second timeout.  Any exception raised while
    building the request, connecting, or reading is printed and swallowed,
    so callers only have to test the result for truthiness.

    Returns:
        The response body as returned by ``response.read()``, or ``None``
        if anything went wrong.
    """
    try:
        # NOTE: the module-level ``headers`` dict is not sent.  If a custom
        # User-Agent is ever needed, pass it by keyword
        # (``urllib2.Request(url, headers=headers)``): the second positional
        # argument of Request is the POST body, not the headers.
        req = urllib2.Request(url)
        response = urllib2.urlopen(req, None, 10)
        try:
            return response.read()
        finally:
            # Always release the underlying socket, even if read() fails.
            response.close()
    except Exception as e:
        print(e)
        return None

def getRssNew(url):
    """Return the feed URL advertised by the page at ``url``.

    The page is downloaded with getHtml() and parsed with BeautifulSoup; the
    first ``<link rel="alternate">`` element is taken as the feed link.  Its
    ``href`` is resolved against ``url`` so that relative (``feed.xml``),
    root-relative (``/feed``) and protocol-relative (``//host/feed``) values
    all come back as absolute URLs; absolute hrefs are returned unchanged.

    Returns:
        The absolute feed URL, or ``None`` when the page cannot be fetched
        or parsed, has no such link, or the link has no usable ``href``.
    """
    # Local import so this function does not depend on the file's import block.
    from urlparse import urljoin

    print("url: " + url)
    html = getHtml(url)
    if not html:
        return None

    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Best effort: an unparsable page simply yields no feed.
        return None

    link = soup.find('link', rel="alternate")
    if not link:
        return None

    # .get() avoids a KeyError when the element has no href attribute.
    href = link.get('href')
    if not href:
        return None

    # Plain concatenation (url + href) produced "http://host/path//feed" for
    # bases with a trailing slash or a path component; urljoin resolves the
    # reference the way a browser would.
    return urljoin(url, href)

def getUrlKey(url):
    """Return the host part of ``url``.

    The host is the text after the first ``//`` up to (but not including)
    the next ``/``.  URLs without a path (``http://example.com``) return the
    whole remainder, and URLs without ``//`` are treated as starting
    directly with the host, instead of raising ``ValueError``.
    """
    # split(...)[-1] yields the full string when the separator is absent.
    remainder = url.split('//', 1)[-1]
    return remainder.split('/', 1)[0]

def getFriendsList(url):
    """Collect external links found in the ``<li>`` items of a page.

    For every ``<li>`` whose first ``<a>`` has an ``href``, the link is kept
    when it starts with ``http`` and does not contain the host of ``url``
    (as computed by getUrlKey).  Each kept link is printed as it is found.

    Returns:
        A list of the kept hrefs; empty when the page cannot be fetched or
        parsed, or contains no matching links.
    """
    print("the main url: " + url)
    friendsUrlList = []
    key = getUrlKey(url)

    html = getHtml(url)
    if not html:
        return friendsUrlList

    try:
        soup = BeautifulSoup(html)
    except Exception:
        # Best effort: treat an unparsable page like a page with no links,
        # and keep the return type consistent with the other paths.
        return friendsUrlList

    for item in soup.findAll('li'):
        anchor = item.a
        if not anchor:
            continue
        href = anchor.get('href')
        if href is None:
            continue
        # Keep only absolute links that point away from the source host.
        if key not in href and href.startswith('http'):
            print(href)
            friendsUrlList.append(href)
    return friendsUrlList

def dump(filename, rsslist):
    """Write every item of ``rsslist`` to ``filename``, one per line.

    The file is opened in ``'w'`` mode, so existing content is replaced.
    Progress markers are printed before and after writing.

    Args:
        filename: path of the output file.
        rsslist: iterable of strings to write.
    """
    print("begin dump")
    # The with-block guarantees the handle is closed even if a write fails.
    with open(filename, 'w') as f:
        for item in rsslist:
            f.write(item)
            f.write('\n')
    print("end dump")

def getUrlListFromFriends(urlList):
    """Concatenate the getFriendsList() results for every URL in ``urlList``.

    URLs whose lookup yields nothing (``None`` or an empty list) contribute
    no entries.  Results are kept in input order and are not de-duplicated.
    """
    collected = []
    for pageUrl in urlList:
        found = getFriendsList(pageUrl)
        if not found:
            continue
        collected += found
    return collected

def getRssList(urlList):
    """Return the feed URLs found for the pages in ``urlList``.

    Each page is passed to getRssNew(); every truthy result is printed and
    appended to the returned list, in input order.
    """
    feeds = []
    for pageUrl in urlList:
        feedUrl = getRssNew(pageUrl)
        if not feedUrl:
            continue
        print("rss: " + feedUrl)
        feeds.append(feedUrl)
    return feeds

def getUrlListNew(file):
    """Read ``file`` and return its lines as a list of strings.

    Only the newline characters at either end of each line are stripped;
    other whitespace and blank lines are preserved as-is.

    Args:
        file: path of the text file to read (one URL per line is assumed by
            the callers in this script).
    """
    # The with-block closes the handle, which was previously left open.
    with open(file) as fh:
        return [line.strip('\n') for line in fh]

class Fetch(threading.Thread):
    """Worker thread that looks up feed URLs for one slice of ``urllist``.

    The thread processes ``urllist[begin:end]`` (module-level list), calls
    getRssNew() on each entry and adds every feed URL found to the
    module-level ``rssset`` while holding the module-level ``mutex``.

    Args:
        num: worker number; stored as ``_run_num`` but not otherwise used
            by this class.
        begin: index of the first URL to process (inclusive).
        end: index one past the last URL to process (exclusive).
    """

    def __init__(self, num, begin, end):
        threading.Thread.__init__(self)
        self._run_num = num
        self.begin = begin
        self.end = end

    def run(self):
        """Fetch the feed URL for every entry in this worker's slice."""
        print(threading.currentThread().getName())
        for url in urllist[int(self.begin):int(self.end)]:
            # The fetch runs outside the lock: holding the mutex across the
            # network call would let only one thread work at a time.
            try:
                rssurl = getRssNew(url)
            except Exception as e:
                # One bad page must not abort the rest of this slice.
                print(e)
                continue
            if rssurl:
                # The with-block releases the lock even if the body raises,
                # so a failure here can no longer deadlock other workers.
                with mutex:
                    print('rss: ' + rssurl)
                    rssset.add(rssurl)


# ---------------------------------------------------------------------------
# Script body: load the URL list, split it into `num` contiguous slices,
# run one Fetch thread per slice, then write the collected feed URLs out.
# `urllist`, `rssset` and `mutex` are read by Fetch.run() as globals.
# ---------------------------------------------------------------------------
urllist = getUrlListNew('/home/jseanj/mygit/v2ex_crawl/all.txt')
rssset = set()
mutex = threading.Lock()

num = 10
length = len(urllist)
n = int(length/num)  # slice size; the last worker also takes the remainder

threads = []
for x in xrange(num):
    begin = x*n
    if x == num-1:
        end = length
    else:
        end = begin + n
    threads.append(Fetch(x, begin, end))

# Start every worker before waiting on any of them.
for t in threads:
    t.start()
for t in threads:
    t.join()

dump('v2ex_rss', list(rssset))