GitHub Gists: toolshed (sandgate-dev)
@sandgate-dev
sandgate-dev / flatten_list_of_lists.py
Created September 12, 2019 14:31
Example functions to flatten a list of lists
# https://stackoverflow.com/a/45323085/5983691
# For huge nested lists, 'list(numpy.array(a).flat)' is the fastest among all functions
#
import functools
import itertools
import numpy
import operator
import perfplot
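The preview stops at the imports; a minimal sketch of the kind of flatten functions such a perfplot benchmark compares (the function names are illustrative):

def flatten_itertools(a):
    # lazily chain the sublists, then materialize
    return list(itertools.chain.from_iterable(a))

def flatten_functools(a):
    # concatenate the sublists in place with operator.iconcat
    return functools.reduce(operator.iconcat, a, [])

def flatten_numpy(a):
    # the variant the comment above calls fastest for huge inputs;
    # note it assumes the sublists all have equal length
    return list(numpy.array(a).flat)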
@sandgate-dev
sandgate-dev / dd_count_categories.py
Created April 7, 2019 14:09
Count all categories
categories = ddf.category.value_counts().compute()
categories
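The preview doesn't show how ddf is built; a minimal setup sketch, assuming a Dask DataFrame read from CSV (the path is illustrative):

import dask.dataframe as dd

ddf = dd.read_csv('data/news_*.csv')  # illustrative path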
@sandgate-dev
sandgate-dev / set_csv_field_limit.py
Created April 7, 2019 11:35
Increase field limit to address _csv.Error: field larger than field limit (131072)
# _csv.Error: field larger than field limit (131072)
# https://stackoverflow.com/a/15063941/5983691
import csv
import sys

def csv_field_limit():
    maxInt = sys.maxsize
    decrement = True
    while decrement:
        # decrease the maxInt value by a factor of 10
        # as long as the OverflowError occurs.
        decrement = False
        try:
            csv.field_size_limit(maxInt)
        except OverflowError:
            maxInt = int(maxInt / 10)
            decrement = True
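A sketch of typical usage, raising the limit before parsing (the filename is illustrative):

csv_field_limit()
with open('big_dump.csv', newline='') as f:
    for row in csv.reader(f):
        pass  # process each row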
@sandgate-dev
sandgate-dev / create_pyLDAvis.py
Created April 7, 2019 07:43
Create and display a pyLDAvis visualization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# df is assumed to hold articles with 'category' and 'news_content' columns
news_content = df[df.category == 'reliable'].news_content.tolist()
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=10)
# TfidfVectorizer subclasses CountVectorizer, so it accepts the same params
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
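The preview ends before the LDA step; a minimal sketch of the likely continuation, assuming the pyLDAvis release that ships the pyLDAvis.sklearn module (n_components is a guess):

import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation

# fit LDA on the term counts, then hand everything to pyLDAvis
dtm_tf = tf_vectorizer.fit_transform(news_content)
lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tf.fit(dtm_tf)

panel = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(panel)  # renders inline in a notebook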
@sandgate-dev
sandgate-dev / get_news_content.py
Created April 3, 2019 05:03
Fetch news content with the newsapi client
for source in news_sources:
    # news_sources is a list of source ids like
    # ['the-verge', 'the-wall-street-journal', 'the-washington-post', 'the-washington-times', 'time', 'usa-today', 'vice-news', 'wired']
    #
    # ideally the page range would be set to the maximum possible number,
    # based on the total results divided by page_size:
    #   articles = newsapi.get_everything(sources=source, from_param=str(date), sort_by='relevancy', page_size=100)
    #   articles['totalResults'] // 100 + 1
    for page in range(1, 3):
        articles = newsapi.get_everything(sources=source, from_param=str(date), sort_by='relevancy', page_size=100, page=page)
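The preview omits the setup; a minimal sketch, assuming the newsapi-python client (the API key, source list, and date are placeholders):

import datetime
from newsapi import NewsApiClient

newsapi = NewsApiClient(api_key='YOUR_API_KEY')  # placeholder key
news_sources = ['the-verge', 'wired']            # illustrative subset
date = datetime.date(2019, 4, 3)                 # lower bound for from_param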
@sandgate-dev
sandgate-dev / search_nyt_archive.py
Last active April 3, 2019 16:08
Search NYTimes Archive
import json
import requests
from tqdm import tqdm

api_key = 'YOUR_API_KEY'  # placeholder; the real key is not in the preview
url = 'https://api.nytimes.com/svc/archive/v1/%s/%s.json?api-key=%s'
begin_year = 2017
end_year = 2018
with open('nytimes_data/archivesearch.json', 'w') as f:
    with tqdm() as progress:
        for year in range(begin_year, end_year + 1):
            for month in range(1, 13):
                try:  # assumed body: the preview cuts off inside the try
                    json.dump(requests.get(url % (year, month, api_key)).json(), f)
                    progress.update(1)
                except requests.RequestException:
                    continue  # assumed: skip months that fail
@sandgate-dev
sandgate-dev / get_valid_url.py
Last active April 3, 2019 05:09
Getting valid url
# How can I determine if anything at the given url exists?
# https://stackoverflow.com/a/1966141/5983691
#
# 4xx status codes describe client errors (like "404 Not Found")
# 5xx status codes describe server errors (like "500 Internal Server Error")
from urllib.request import urlopen  # 'from urllib2 import urlopen' in the original Python 2 code

for url in df['url']:
    try:
        code = urlopen('http://' + url).getcode()
    except IOError:
        # drop rows whose URL cannot be opened
        df = df[df.url != url]
        code = None

import tldextract

# the original bound this to 'list', shadowing the builtin; renamed to 'ext'
ext = tldextract.extract('http://blog.ashiknesin.com/about')
domain_name = ext.domain + '.' + ext.suffix
# ashiknesin.com

code = urlopen('http://example.com/').getcode()
if code // 100 >= 4:
    pass  # the preview cuts off here; presumably the URL is treated as invalid
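A minimal sketch wrapping the idea above into a reusable check (the function name is illustrative):

from urllib.request import urlopen
from urllib.error import URLError

def url_exists(url):
    # urlopen raises HTTPError (a URLError subclass) for 4xx/5xx responses,
    # so reaching getcode() already implies a non-error status
    try:
        return urlopen(url).getcode() // 100 < 4
    except URLError:
        return False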
# (a separate snippet; its gist header is cut off in the listing)
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, Flatten
from keras.models import Sequential

# assumed: vgg_model is a frozen pretrained convolutional base, e.g.
vgg_model = VGG16(include_top=False, input_shape=(224, 224, 3))
vgg_model.trainable = False

model = Sequential([
    vgg_model,
    Flatten(),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])
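The listing ends here; a plausible next step, assuming one-hot labels for the 5-way softmax head (the optimizer choice is a guess):

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])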