Skip to content

Instantly share code, notes, and snippets.

@sandgate-dev
Created April 3, 2019 05:03
Show Gist options
  • Save sandgate-dev/07ef8221be45378e931f42f923ad5b17 to your computer and use it in GitHub Desktop.
Save sandgate-dev/07ef8221be45378e931f42f923ad5b17 to your computer and use it in GitHub Desktop.

Revisions

  1. sandgate-dev created this gist Apr 3, 2019.
    25 changes: 25 additions & 0 deletions get_news_content.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,25 @@
    # Download articles for every news source and store each result page as
    # its own JSON file, after dropping articles that have no content.
    for source in news_sources:
        # `news_sources` is a list of source ids similar to
        # ['the-verge', 'the-wall-street-journal', 'the-washington-post',
        #  'the-washington-times', 'time', 'usa-today', 'vice-news', 'wired']
        #
        # TODO: only the first two pages are fetched. The upper bound should
        # be the maximum possible page number, i.e. the total number of
        # results divided by page_size: issue one request without `page`,
        # then use `articles['totalResults'] // 100 + 1`.
        for page in range(1, 3):
            # NOTE(review): this call sits outside the try block, so a
            # failing request propagates and stops the whole run - confirm
            # that this is intended.
            articles = newsapi.get_everything(
                sources=source,
                from_param=str(date),
                sort_by='relevancy',
                page_size=100,
                page=page,
            )

            try:
                # Keep only the articles that actually carry content.
                articles['articles'] = [
                    article
                    for article in articles['articles']
                    if article['content'] is not None
                ]

                # One output file per source / date / page combination.
                date_str = date.strftime("%Y%m%d")
                filename = '_'.join([source, date_str])
                # presumably returns the path of the file to write - verify
                # against the definition of get_json_file
                json_file_name = get_json_file(filename, page, json_file_path)

                # The context manager closes the file; no explicit close().
                with open(json_file_name, 'w+') as f:
                    f.write(ujson.dumps(articles) + '\n')

            except Exception as e:
                # Best effort: report the problem and continue with the next
                # page instead of aborting the download.
                print(e)