@lrei
Created December 22, 2011 11:50

Revisions
  1. lrei revised this gist Dec 22, 2011. 1 changed file with 0 additions and 3 deletions.

     3 changes: 0 additions & 3 deletions gistfile1.py

     @@ -3,9 +3,6 @@
      import Queue
      import multiprocessing
      import urllib2
     -import time
     -import pika
     -#import opml
      import feedparser
      import socket

  2. lrei created this gist Dec 22, 2011.
    79 changes: 79 additions & 0 deletions gistfile1.py
    @@ -0,0 +1,79 @@
#!/usr/bin/env python

import Queue
import multiprocessing
import urllib2
import time
import pika
#import opml
import feedparser
import socket

feeds = ['http://today.reuters.com/rss/topNews',
         'http://today.reuters.com/rss/domesticNews',
         'http://today.reuters.com/rss/worldNews',
         'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml',
         'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml',
         'http://www.nytimes.com/services/xml/rss/nyt/International.xml',
         'http://news.google.com/?output=rss',
         'http://feeds.salon.com/salon/news',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,0,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,80,00.rss',
         'http://www.foxnews.com/xmlfeed/rss/0,4313,81,00.rss',
         'http://rss.cnn.com/rss/edition.rss',
         'http://rss.cnn.com/rss/edition_world.rss',
         'http://rss.cnn.com/rss/edition_us.rss']

# timeout for a single feed fetch (in seconds)
FEED_TIMEOUT = 20

def fetch_urls(work_queue, results_queue):
    '''worker function - gets feed urls from the queue and parses each feed'''
    while True:
        # grab a feed url from the queue
        try:
            feed_url = work_queue.get(block=False)
        except Queue.Empty:
            # if the queue is empty, this ends the worker
            break

        # download the feed
        try:
            feed = urllib2.urlopen(feed_url, timeout=FEED_TIMEOUT).read()
        except urllib2.URLError:
            continue  # ignore this url

        # parse the feed
        parsed_feed = feedparser.parse(feed)

        for entry in parsed_feed.entries:
            # push each entry's link into the results queue
            if 'link' in entry:
                results_queue.put(entry.link)

def main():
    # create and populate the work queue with all the feed urls
    work_queue = multiprocessing.Queue()
    for feed in feeds:
        work_queue.put(feed)

    # create a results queue for all the links extracted from the feeds
    results_queue = multiprocessing.Queue()

    # spawn one worker per feed, passing each the work and results queues
    workers = []
    for i in range(len(feeds)):
        worker = multiprocessing.Process(target=fetch_urls,
                                         args=(work_queue, results_queue))
        worker.start()
        workers.append(worker)

    # wait for all the workers to finish
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
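
One gap worth noting: main() joins the workers but never reads results_queue back, and the multiprocessing documentation warns that joining a process that has put items on a queue can deadlock once the underlying pipe buffer fills. Below is a minimal consumer sketch in the gist's own Python 2 idiom; the collect_links helper, its one-second timeout, and the call site are assumptions for illustration, not part of the gist.

    def collect_links(results_queue, workers):
        '''Hypothetical helper (not in the gist): drain the results queue
        while the workers run, returning every extracted link.'''
        links = []
        # multiprocessing.Queue.empty() is only advisory, so also keep
        # polling as long as any worker process is still alive
        while any(w.is_alive() for w in workers) or not results_queue.empty():
            try:
                links.append(results_queue.get(timeout=1))
            except Queue.Empty:
                pass
        return links

With something like this, main() could call links = collect_links(results_queue, workers) after starting the workers and before joining them, so the queue is drained and the joins cannot block on a full pipe.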