## This is a simple script to download comments from /r/utahjazz,
## look for bigrams that start with "G" then "H" (or vice versa),
## filter out likely non-names, and write the unique results to a file.

# Imports
import requests
import json
import time
import datetime
import pandas as pd
import nltk

# Create a date list covering the last 367 days, at 12-hour intervals
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x, hours=y)
             for x in range(367) for y in [0, 12]]

# For each time window, download all comments from pushshift.io
all_comments = []
for ii in range(1, len(date_list)):
    t1 = int(date_list[ii - 1].timestamp())
    t2 = int(date_list[ii].timestamp())

    # Build the API request
    pushshift = 'https://api.pushshift.io/reddit/comment/search/?'
    pushshift += 'subreddit=utahjazz&sort=desc&sort_type=created_utc'
    pushshift += '&after={}&before={}&size=1000'.format(t2, t1)

    # Get the data, add it to the list, then sleep for 2 seconds
    r = requests.get(pushshift)
    all_comments += json.loads(r.text)['data']
    time.sleep(2)

# Turn the comments into a DataFrame
df = pd.DataFrame(all_comments)


## Filter comment text
def find_gh_phrases(text):
    '''Pulls out all bigrams of the form ('g___', 'h___') or
    ('h___', 'g___'). If either word in a bigram appears in the
    stop_words list, the bigram is dropped.'''
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    candidates = [(x, y) for (x, y) in nltk.bigrams(tokens)
                  if (x[0] == 'g' and y[0] == 'h')
                  or (x[0] == 'h' and y[0] == 'g')]
    stop_words = ['get', 'him', 'glad', 'he', 'gobert', 'hope', 'hmm', 'his',
                  'game', 'good', 'going', 'hurt', 'has', 'grown', 'holder',
                  'hit', 'honestly', 'have', 'got', 'gonna', 'how', 'great',
                  'hey', 'guys', 'gets', 'had', 'given', 'hear', 'go', 'hard',
                  'gm', 'guy', 'hispanic', 'himself', 'harden', 'hill',
                  'happy', 'games', 'herpes', 'give', 'grayson', 'hidden',
                  'hes', 'girl', 'host', 'getting', 'geen', 'gems', 'giving',
                  'gained', 'having', 'gh', 'hot', 'historical', 'hate',
                  'gave', 'hold', 'huge', 'heart', 'heard', 'gap', 'goes',
                  'gotta', 'healthy', 'hood', 'hasn', 'high', 'guard', 'hams',
                  'goalie', 'gotten', 'gold', 'god', 'here', 'help', 'half',
                  'greater', 'glory', 'humanity', 'grow', 'hug', 'hurts',
                  'hail', 'gone', 'giannis', 'homeless', 'highlights',
                  'human', 'hornacek', 'gut', 'girlfriend', 'hole', 'hottest',
                  'gags', 'girls', 'google', 'g', 'grail', 'generally',
                  'homer', 'h', 'haha', 'higher', 'gl', 'george', 'humility',
                  'her', 'hi', 'gail', 'hangs', 'greatly', 'gives',
                  'harrassing', 'https']
    stop_filt = [(x, y) for (x, y) in candidates
                 if x not in stop_words and y not in stop_words]
    if len(stop_filt) > 0:
        return stop_filt
    else:
        return None

matches = df.body.apply(find_gh_phrases).dropna()

# Write all unique names to disk
all_names = []
for bigrams_found in matches:
    for bigram in bigrams_found:
        all_names.append(' '.join(s.capitalize() for s in bigram))

with open('gh-names.txt', 'w') as f:
    f.write('\n'.join(set(all_names)))
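
# Hypothetical sanity check of find_gh_phrases (the example text below is
# made up, not drawn from the downloaded comments). The ('got', 'hurt')
# bigram is a G/H candidate but both words are in stop_words, so only the
# likely name should survive the filter.
example = "Gordon Hayward was great, but Gobert got hurt"
assert find_gh_phrases(example) == [('gordon', 'hayward')]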