Created December 22, 2011 11:50
Revisions
- lrei revised this gist Dec 22, 2011. 1 changed file with 0 additions and 3 deletions: the unused time and pika imports and the commented-out opml import were removed, leaving Queue, multiprocessing, urllib2, feedparser, and socket.
- lrei created this gist Dec 22, 2011. The file fetches a set of news RSS feeds in parallel with multiprocessing and pushes every entry link onto a results queue:

#!/usr/bin/env python
import Queue
import multiprocessing
import urllib2
import time
import pika
#import opml
import feedparser
import socket

feeds = ['http://today.reuters.com/rss/topNews',
         'http://today.reuters.com/rss/domesticNews',
         'http://today.reuters.com/rss/worldNews',
         'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
         'http://news.google.com/?output=rss',
         'http://feeds.salon.com/salon/news',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
         'http://rss.cnn.com/rss/edition.rss',
         'http://rss.cnn.com/rss/edition_world.rss',
         'http://rss.cnn.com/rss/edition_us.rss']

# timeout for feed fetch (in seconds)
FEED_TIMEOUT = 20


def fetch_urls(work_queue, results_queue):
    '''worker function - gets feed urls from the queue and parses each feed'''
    while True:
        # grab a feed url from the queue
        try:
            feed_url = work_queue.get(block=False)
        except Queue.Empty:
            # the queue is empty - no more feeds, so end this worker
            break

        # download the feed
        try:
            feed = urllib2.urlopen(feed_url, timeout=FEED_TIMEOUT).read()
        except urllib2.URLError:
            continue  # ignore this url

        # parse the feed and push every entry link into the results queue
        parsed_feed = feedparser.parse(feed)
        for entry in parsed_feed.entries:
            if 'link' in entry:
                results_queue.put(entry['link'])


def main():
    # create and populate the work queue with all the feed urls
    work_queue = multiprocessing.Queue()
    for feed in feeds:
        work_queue.put(feed)

    # create the results queue for the links extracted from the feeds
    results_queue = multiprocessing.Queue()

    # spawn one worker per feed, passing each the work & results queues
    workers = []
    for i in range(len(feeds)):
        worker = multiprocessing.Process(target=fetch_urls,
                                         args=(work_queue, results_queue))
        worker.start()
        workers.append(worker)

    # wait for all the workers to finish
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
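One caveat the gist itself does not address: main() joins the workers but never reads from results_queue, and a process that has put items on a multiprocessing.Queue may not exit until its buffered data has been flushed to the underlying pipe, so joining before draining can deadlock once the pipe buffer fills. Below is a minimal sketch of a drain-then-join consumer under the same Python 2 setup; collect_links and the 1-second timeout are illustrative choices, not part of the original gist.

import Queue
import multiprocessing

def collect_links(results_queue, workers):
    '''drain the results queue while the workers run, then finish draining'''
    links = []
    # keep reading while any worker is alive so their queue buffers can flush
    while any(worker.is_alive() for worker in workers):
        try:
            links.append(results_queue.get(timeout=1))
        except Queue.Empty:
            pass
    # collect whatever is still buffered after the workers have exited
    while True:
        try:
            links.append(results_queue.get(block=False))
        except Queue.Empty:
            break
    return links

Calling collect_links(results_queue, workers) in main() between starting and joining the workers both consumes the links and removes the deadlock risk: by the time the join loop runs, each worker's queue buffer has already been emptied, so the joins return promptly.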