GitHub Gists: toolshed (sandgate-dev)
@sandgate-dev
sandgate-dev / flatten_list_of_lists.py
Created September 12, 2019 14:31
Example functions to flatten a list of lists
# https://stackoverflow.com/a/45323085/5983691
# For huge nested lists, 'list(numpy.array(a).flat)' is the fastest among all functions
#
import functools
import itertools
import numpy
import operator
import perfplot
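The preview stops at the imports; a minimal sketch of the kind of flatten functions such a perfplot benchmark compares (the function names are illustrative):

def flatten_itertools(a):
    # lazily chain the sublists, then materialize
    return list(itertools.chain.from_iterable(a))

def flatten_functools(a):
    # concatenate the sublists in place with operator.iconcat
    return functools.reduce(operator.iconcat, a, [])

def flatten_numpy(a):
    # the variant the comment above calls fastest for huge inputs;
    # note it assumes the sublists all have equal length
    return list(numpy.array(a).flat)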
@sandgate-dev
sandgate-dev / dd_count_categories.py
Created April 7, 2019 14:09
Count all categories
categories = ddf.category.value_counts().compute()
categories
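The preview doesn't show how ddf is built; a minimal setup sketch, assuming a Dask DataFrame read from CSV (the path is illustrative):

import dask.dataframe as dd

ddf = dd.read_csv('data/news_*.csv')  # illustrative path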
@sandgate-dev
sandgate-dev / set_csv_field_limit.py
Created April 7, 2019 11:35
Increase field limit to address _csv.Error: field larger than field limit (131072)
# _csv.Error: field larger than field limit (131072)
# https://stackoverflow.com/a/15063941/5983691
import csv
import sys

def csv_field_limit():
    maxInt = sys.maxsize
    decrement = True
    while decrement:
        # decrease the maxInt value by a factor of 10
        # as long as the OverflowError occurs.
        decrement = False
        try:
            csv.field_size_limit(maxInt)
        except OverflowError:
            maxInt = int(maxInt / 10)
            decrement = True
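A sketch of typical usage, raising the limit before parsing (the filename is illustrative):

csv_field_limit()
with open('big_dump.csv', newline='') as f:
    for row in csv.reader(f):
        pass  # process each row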
@sandgate-dev
sandgate-dev / create_pyLDAvis.py
Created April 7, 2019 07:43
Create and display a pyLDAvis visualization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# df is assumed to hold articles with 'category' and 'news_content' columns
news_content = df[df.category == 'reliable'].news_content.tolist()
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=10)
# TfidfVectorizer subclasses CountVectorizer, so it accepts the same params
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
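The preview ends before the LDA step; a minimal sketch of the likely continuation, assuming the pyLDAvis release that ships the pyLDAvis.sklearn module (n_components is a guess):

import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation

# fit LDA on the term counts, then hand everything to pyLDAvis
dtm_tf = tf_vectorizer.fit_transform(news_content)
lda_tf = LatentDirichletAllocation(n_components=10, random_state=0)
lda_tf.fit(dtm_tf)

panel = pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
pyLDAvis.display(panel)  # renders inline in a notebook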
@sandgate-dev
sandgate-dev / get_news_content.py
Created April 3, 2019 05:03
Fetch news content with the newsapi client
for source in news_sources:
    # news_sources is a list of source ids like
    # ['the-verge', 'the-wall-street-journal', 'the-washington-post', 'the-washington-times', 'time', 'usa-today', 'vice-news', 'wired']
    #
    # ideally the page range would be set to the maximum possible number,
    # based on the total results divided by page_size:
    #   articles = newsapi.get_everything(sources=source, from_param=str(date), sort_by='relevancy', page_size=100)
    #   articles['totalResults'] // 100 + 1
    for page in range(1, 3):
        articles = newsapi.get_everything(sources=source, from_param=str(date), sort_by='relevancy', page_size=100, page=page)
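The preview omits the setup; a minimal sketch, assuming the newsapi-python client (the API key, source list, and date are placeholders):

import datetime
from newsapi import NewsApiClient

newsapi = NewsApiClient(api_key='YOUR_API_KEY')  # placeholder key
news_sources = ['the-verge', 'wired']            # illustrative subset
date = datetime.date(2019, 4, 3)                 # lower bound for from_param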
@sandgate-dev
sandgate-dev / search_nyt_archive.py
Last active April 3, 2019 16:08
Search NYTimes Archive
import json
import requests
from tqdm import tqdm

api_key = 'YOUR_API_KEY'  # placeholder; the real key is not in the preview
url = 'https://api.nytimes.com/svc/archive/v1/%s/%s.json?api-key=%s'
begin_year = 2017
end_year = 2018
with open('nytimes_data/archivesearch.json', 'w') as f:
    with tqdm() as progress:
        for year in range(begin_year, end_year + 1):
            for month in range(1, 13):
                try:  # assumed body: the preview cuts off inside the try
                    json.dump(requests.get(url % (year, month, api_key)).json(), f)
                    progress.update(1)
                except requests.RequestException:
                    continue  # assumed: skip months that fail
@sandgate-dev
sandgate-dev / get_valid_url.py
Last active April 3, 2019 05:09
Getting valid url
# How can I determine if anything at the given url exists?
# https://stackoverflow.com/a/1966141/5983691
#
# 4xx status codes describe client errors (like "404 Not Found")
# 5xx status codes describe server errors (like "500 Internal Server Error")
from urllib.request import urlopen  # 'from urllib2 import urlopen' in the original Python 2 code

for url in df['url']:
    try:
        code = urlopen('http://' + url).getcode()
    except IOError:
        # drop rows whose URL cannot be opened
        df = df[df.url != url]
        code = None

import tldextract

# the original bound this to 'list', shadowing the builtin; renamed to 'ext'
ext = tldextract.extract('http://blog.ashiknesin.com/about')
domain_name = ext.domain + '.' + ext.suffix
# ashiknesin.com

code = urlopen('http://example.com/').getcode()
if code // 100 >= 4:
    pass  # the preview cuts off here; presumably the URL is treated as invalid
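A minimal sketch wrapping the idea above into a reusable check (the function name is illustrative):

from urllib.request import urlopen
from urllib.error import URLError

def url_exists(url):
    # urlopen raises HTTPError (a URLError subclass) for 4xx/5xx responses,
    # so reaching getcode() already implies a non-error status
    try:
        return urlopen(url).getcode() // 100 < 4
    except URLError:
        return False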
# (a separate snippet; its gist header is cut off in the listing)
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, Flatten
from keras.models import Sequential

# assumed: vgg_model is a frozen pretrained convolutional base, e.g.
vgg_model = VGG16(include_top=False, input_shape=(224, 224, 3))
vgg_model.trainable = False

model = Sequential([
    vgg_model,
    Flatten(),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])
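The listing ends here; a plausible next step, assuming one-hot labels for the 5-way softmax head (the optimizer choice is a guess):

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])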