#!/usr/bin/env python

import Queue
import multiprocessing
import socket
import urllib2

import feedparser

feeds = ['http://today.reuters.com/rss/topNews',
         'http://today.reuters.com/rss/domesticNews',
         'http://today.reuters.com/rss/worldNews',
         'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
         'http://news.google.com/?output=rss',
         'http://feeds.salon.com/salon/news',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
         'http://rss.cnn.com/rss/edition.rss',
         'http://rss.cnn.com/rss/edition_world.rss',
         'http://rss.cnn.com/rss/edition_us.rss']

# timeout for a single feed fetch (in seconds)
FEED_TIMEOUT = 20


def fetch_urls(work_queue, results_queue):
    '''Worker function: pull feed URLs off the work queue, download and
    parse each feed, and push every entry link onto the results queue.'''
    while True:
        # grab a feed URL from the work queue
        try:
            feed_url = work_queue.get(block=False)
        except Queue.Empty:
            # the work queue is empty, so this worker is done
            break

        # download the feed, skipping it on network errors or timeouts
        try:
            feed = urllib2.urlopen(feed_url, timeout=FEED_TIMEOUT).read()
        except (urllib2.URLError, socket.error):
            continue

        # parse the feed and collect the entry links
        parsed_feed = feedparser.parse(feed)
        for entry in parsed_feed.entries:
            if 'link' in entry:
                results_queue.put(entry.link)


def main():
    # create and populate the work queue with all the feed URLs
    work_queue = multiprocessing.Queue()
    for feed in feeds:
        work_queue.put(feed)

    # create the results queue for the links extracted from the feeds
    results_queue = multiprocessing.Queue()

    # spawn one worker per feed and hand each the work and results queues
    workers = []
    for i in range(len(feeds)):
        worker = multiprocessing.Process(target=fetch_urls,
                                         args=(work_queue, results_queue))
        worker.start()
        workers.append(worker)

    # drain the results queue while the workers run; a worker that has put
    # items on a queue will not exit until those items are consumed, so
    # draining before join() avoids a deadlock
    links = []
    while any(worker.is_alive() for worker in workers) or not results_queue.empty():
        try:
            links.append(results_queue.get(timeout=1))
        except Queue.Empty:
            pass

    # wait for all the workers to finish
    for worker in workers:
        worker.join()

    print '%d links collected from %d feeds' % (len(links), len(feeds))


if __name__ == '__main__':
    main()