Created December 22, 2011 11:50
Revisions
- lrei revised this gist Dec 22, 2011. 1 changed file with 0 additions and 3 deletions: the unused time and pika imports and the commented-out opml import were removed, leaving Queue, multiprocessing, urllib2, feedparser, and socket.
- lrei created this gist Dec 22, 2011. The file fetches a set of news RSS feeds in parallel with multiprocessing and pushes every entry link onto a results queue:

#!/usr/bin/env python
import Queue
import multiprocessing
import urllib2
import time
import pika
#import opml
import feedparser
import socket

feeds = ['http://today.reuters.com/rss/topNews',
         'http://today.reuters.com/rss/domesticNews',
         'http://today.reuters.com/rss/worldNews',
         'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
         'http://news.google.com/?output=rss',
         'http://feeds.salon.com/salon/news',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
         'http://rss.cnn.com/rss/edition.rss',
         'http://rss.cnn.com/rss/edition_world.rss',
         'http://rss.cnn.com/rss/edition_us.rss']

# timeout for feed fetch (in seconds)
FEED_TIMEOUT = 20


def fetch_urls(work_queue, results_queue):
    '''worker function - gets feed urls from the queue and parses each feed'''
    while True:
        # grab a feed url from the queue
        try:
            feed_url = work_queue.get(block=False)
        except Queue.Empty:
            # the queue is empty - no more feeds, so end this worker
            break

        # download the feed
        try:
            feed = urllib2.urlopen(feed_url, timeout=FEED_TIMEOUT).read()
        except urllib2.URLError:
            continue  # ignore this url

        # parse the feed and push every entry link into the results queue
        parsed_feed = feedparser.parse(feed)
        for entry in parsed_feed.entries:
            if 'link' in entry:
                results_queue.put(entry['link'])


def main():
    # create and populate the work queue with all the feed urls
    work_queue = multiprocessing.Queue()
    for feed in feeds:
        work_queue.put(feed)

    # create the results queue for the links extracted from the feeds
    results_queue = multiprocessing.Queue()

    # spawn one worker per feed, passing each the work & results queues
    workers = []
    for i in range(len(feeds)):
        worker = multiprocessing.Process(target=fetch_urls,
                                         args=(work_queue, results_queue))
        worker.start()
        workers.append(worker)

    # wait for all the workers to finish
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
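One caveat the gist itself does not address: main() joins the workers but never reads from results_queue, and a process that has put items on a multiprocessing.Queue may not exit until its buffered data has been flushed to the underlying pipe, so joining before draining can deadlock once the pipe buffer fills. Below is a minimal sketch of a drain-then-join consumer under the same Python 2 setup; collect_links and the 1-second timeout are illustrative choices, not part of the original gist.

import Queue
import multiprocessing

def collect_links(results_queue, workers):
    '''drain the results queue while the workers run, then finish draining'''
    links = []
    # keep reading while any worker is alive so their queue buffers can flush
    while any(worker.is_alive() for worker in workers):
        try:
            links.append(results_queue.get(timeout=1))
        except Queue.Empty:
            pass
    # collect whatever is still buffered after the workers have exited
    while True:
        try:
            links.append(results_queue.get(block=False))
        except Queue.Empty:
            break
    return links

Calling collect_links(results_queue, workers) in main() between starting and joining the workers both consumes the links and removes the deadlock risk: by the time the join loop runs, each worker's queue buffer has already been emptied, so the joins return promptly.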