Skip to content

Instantly share code, notes, and snippets.

@sandgate-dev
Created April 3, 2019 05:03
Show Gist options
  • Save sandgate-dev/07ef8221be45378e931f42f923ad5b17 to your computer and use it in GitHub Desktop.
Save sandgate-dev/07ef8221be45378e931f42f923ad5b17 to your computer and use it in GitHub Desktop.
News content web scraping with NewsAPI
# Fetch up to 2 pages of articles per news source from NewsAPI and persist
# each page as one JSON file. Assumes `newsapi`, `date`, `news_sources`,
# `json_file_path`, `get_json_file`, and `ujson` are defined earlier.
date_str = date.strftime("%Y%m%d")  # loop-invariant: hoisted out of the loops
for source in news_sources:
    # `source` is a single id from a list like:
    # ['the-verge', 'the-wall-street-journal', 'the-washington-post',
    #  'the-washington-times', 'time', 'usa-today', 'vice-news', 'wired']
    #
    # TODO: derive the page range from the API response instead of
    # hard-coding 2 pages:
    #   articles['totalResults'] // 100 + 1
    for page in range(1, 3):
        try:
            # Network call inside the try so one failing source/page is
            # logged and skipped instead of aborting the whole scrape.
            articles = newsapi.get_everything(
                sources=source,
                from_param=str(date),
                sort_by='relevancy',
                page_size=100,
                page=page,
            )
            # Drop articles with no 'content' — a filter comprehension is
            # safer than deleting by reversed index from the live list.
            articles['articles'] = [
                a for a in articles['articles'] if a['content'] is not None
            ]
            filename = '_'.join([source, date_str])
            json_file_name = get_json_file(filename, page, json_file_path)
            # `with` closes the file on exit; no explicit close() needed.
            with open(json_file_name, 'w+') as f:
                f.write(ujson.dumps(articles) + '\n')
        except Exception as e:
            # Best-effort scrape: report the error and continue with the
            # next page/source.
            print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment