Skip to content

Instantly share code, notes, and snippets.

@patrickdrouin
Created November 14, 2022 17:08
Show Gist options
  • Select an option

  • Save patrickdrouin/4edf9d7abd83d8fe923b65292d5bc3dd to your computer and use it in GitHub Desktop.

Select an option

Save patrickdrouin/4edf9d7abd83d8fe923b65292d5bc3dd to your computer and use it in GitHub Desktop.

Revisions

  1. patrickdrouin created this gist Nov 14, 2022.
    42 changes: 42 additions & 0 deletions crawl_news.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,42 @@
    import sys

    import newspaper
    from newspaper import Config
    from newspaper import Article

    # Crawl a news site's front page with newspaper, then download and parse
    # every distinct article URL it exposes, printing each article's body text.

    USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'

    # Shared configuration for both the site build and each article request.
    config = Config()
    config.browser_user_agent = USER_AGENT
    config.request_timeout = 10


    def _meta_text(meta_data, key):
        """Return the metadata value stored under *key* as a string.

        Returns '' when the key is absent or when the value is not a plain
        scalar, so callers always get text they can safely join or split.
        NOTE(review): non-scalar (e.g. nested dict) values are assumed
        possible here; the previous set-comprehension lookup would raise
        TypeError on any unhashable value.
        """
        value = meta_data.get(key)
        if isinstance(value, (str, int, float)):
            return str(value)
        return ''


    def _sorted_keywords(raw_keywords):
        """Return a comma-separated keyword string as 'a, b, c'.

        Keywords are lower-cased, stripped, de-blanked and sorted. Stripping
        happens before sorting so leading spaces after commas neither skew
        the order nor leak into the joined result.
        """
        cleaned = (keyword.strip() for keyword in raw_keywords.lower().split(','))
        return ', '.join(sorted(keyword for keyword in cleaned if keyword))


    #base_url = 'http://www.euronews.com'
    #base_url = 'http://www.cnn.com'
    base_url = 'http://www.foxnews.com/'
    article_urls = set()
    # The variable keeps its original name even though base_url selects the site.
    euronews = newspaper.build(base_url, config=config, memoize_articles=False, language='en')
    for sub_article in euronews.articles:
        # Process each URL only once.
        if sub_article.url in article_urls:
            continue
        article_urls.add(sub_article.url)

        article = Article(sub_article.url, config=config, memoize_articles=False, language='en')
        try:
            article.download()
            article.parse()
        except newspaper.ArticleException as err:
            # A single unreachable or unparsable article must not abort the crawl.
            print('Skipping {}: {}'.format(sub_article.url, err), file=sys.stderr)
            continue

        # The majority of the article elements are located
        # within the meta data section of the page's
        # navigational structure
        article_meta_data = article.meta_data

        article_published_date = _meta_text(article_meta_data, 'date.created')

        article_title = article.title

        article_summary = _meta_text(article_meta_data, 'description')

        article_keywords = _sorted_keywords(_meta_text(article_meta_data, 'keywords'))

        # the replace is used to remove newlines
        article_text = article.text.replace('\n', '')
        print(article_text)