Created
April 3, 2019 05:03
-
-
Save sandgate-dev/07ef8221be45378e931f42f923ad5b17 to your computer and use it in GitHub Desktop.
news content webscraping with newsapi
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch articles for every news source and write each result page to its own
# JSON file.
#
# news_sources is a list similar to
# ['the-verge', 'the-wall-street-journal', 'the-washington-post', 'the-washington-times', 'time', 'usa-today', 'vice-news', 'wired']
#
# TODO: the page range below is hard-coded to pages 1 and 2. It should be set
# to the maximum possible number based on the total results divided by
# page_size, e.g.:
#   articles = newsapi.get_everything(sources=source, from_param=str(date), sort_by='relevancy', page_size=100)
#   articles['totalResults'] // 100 + 1
for source in news_sources:
    # Neither value depends on the page, so compute them once per source.
    date_str = date.strftime("%Y%m%d")
    filename = '_'.join([source, date_str])

    for page in range(1, 3):
        try:
            # The request lives inside the try block so that a failure for one
            # source/page is reported and skipped instead of aborting the
            # whole run (presumably the client raises on API errors such as
            # exceeding a plan's result limit -- verify against the client).
            articles = newsapi.get_everything(
                sources=source,
                from_param=str(date),
                sort_by='relevancy',
                page_size=100,
                page=page,
            )

            # Keep only the articles that actually carry content.
            articles['articles'] = [
                article for article in articles['articles']
                if article['content'] is not None
            ]

            json_file_name = get_json_file(filename, page, json_file_path)
            # The context manager closes the file; no explicit close() needed.
            with open(json_file_name, 'w') as f:
                f.write(ujson.dumps(articles) + '\n')
        except Exception as e:
            # Best effort: report the problem and continue with the next page.
            print(e)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment