Skip to content

Instantly share code, notes, and snippets.

@sandgate-dev
Created April 3, 2019 05:03
Show Gist options
  • Save sandgate-dev/07ef8221be45378e931f42f923ad5b17 to your computer and use it in GitHub Desktop.
Save sandgate-dev/07ef8221be45378e931f42f923ad5b17 to your computer and use it in GitHub Desktop.
News content web scraping with NewsAPI
# Fetch up to 2 pages of articles per news source from NewsAPI and persist
# each page as one JSON file. Assumes `newsapi`, `date`, `news_sources`,
# `json_file_path`, `get_json_file`, and `ujson` are defined earlier.
date_str = date.strftime("%Y%m%d")  # loop-invariant: hoisted out of the loops
for source in news_sources:
    # `source` is a single id from a list like:
    # ['the-verge', 'the-wall-street-journal', 'the-washington-post',
    #  'the-washington-times', 'time', 'usa-today', 'vice-news', 'wired']
    #
    # TODO: derive the page range from the API response instead of
    # hard-coding 2 pages:
    #   articles['totalResults'] // 100 + 1
    for page in range(1, 3):
        try:
            # Network call inside the try so one failing source/page is
            # logged and skipped instead of aborting the whole scrape.
            articles = newsapi.get_everything(
                sources=source,
                from_param=str(date),
                sort_by='relevancy',
                page_size=100,
                page=page,
            )
            # Drop articles with no 'content' — a filter comprehension is
            # safer than deleting by reversed index from the live list.
            articles['articles'] = [
                a for a in articles['articles'] if a['content'] is not None
            ]
            filename = '_'.join([source, date_str])
            json_file_name = get_json_file(filename, page, json_file_path)
            # `with` closes the file on exit; no explicit close() needed.
            with open(json_file_name, 'w+') as f:
                f.write(ujson.dumps(articles) + '\n')
        except Exception as e:
            # Best-effort scrape: report the error and continue with the
            # next page/source.
            print(e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment