## This is a simple script to download comments from /r/utahjazz,
## look for bigrams that start with "G" then "H" (or vice versa),
## filter out likely non-names, and write the unique results to a file.

# Imports
import requests
import json
import time
import datetime
import pandas as pd
import nltk

# Create a date list covering the last 367 days, at 12-hour intervals
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x, hours=y)
             for x in range(367) for y in [0, 12]]

# For each time window, download all comments from pushshift.io
all_comments = []
for ii in range(1, len(date_list)):
    t1 = int(date_list[ii - 1].timestamp())
    t2 = int(date_list[ii].timestamp())

    # Build the API request
    pushshift = 'https://api.pushshift.io/reddit/comment/search/?'
    pushshift += 'subreddit=utahjazz&sort=desc&sort_type=created_utc'
    pushshift += '&after={}&before={}&size=1000'.format(t2, t1)

    # Get the data, add it to the list, then sleep for 2 seconds
    r = requests.get(pushshift)
    all_comments += json.loads(r.text)['data']
    time.sleep(2)

# Turn the comments into a DataFrame
df = pd.DataFrame(all_comments)


## Filter comment text
def find_gh_phrases(text):
    '''Pulls out all bigrams of the form ('g___', 'h___') or
    ('h___', 'g___'). If either word in a bigram appears in the
    stop_words list, the bigram is dropped.'''
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    tokens = tokenizer.tokenize(text.lower())
    candidates = [(x, y) for (x, y) in nltk.bigrams(tokens)
                  if (x[0] == 'g' and y[0] == 'h')
                  or (x[0] == 'h' and y[0] == 'g')]
    stop_words = ['get', 'him', 'glad', 'he', 'gobert', 'hope', 'hmm', 'his',
                  'game', 'good', 'going', 'hurt', 'has', 'grown', 'holder',
                  'hit', 'honestly', 'have', 'got', 'gonna', 'how', 'great',
                  'hey', 'guys', 'gets', 'had', 'given', 'hear', 'go', 'hard',
                  'gm', 'guy', 'hispanic', 'himself', 'harden', 'hill',
                  'happy', 'games', 'herpes', 'give', 'grayson', 'hidden',
                  'hes', 'girl', 'host', 'getting', 'geen', 'gems', 'giving',
                  'gained', 'having', 'gh', 'hot', 'historical', 'hate',
                  'gave', 'hold', 'huge', 'heart', 'heard', 'gap', 'goes',
                  'gotta', 'healthy', 'hood', 'hasn', 'high', 'guard', 'hams',
                  'goalie', 'gotten', 'gold', 'god', 'here', 'help', 'half',
                  'greater', 'glory', 'humanity', 'grow', 'hug', 'hurts',
                  'hail', 'gone', 'giannis', 'homeless', 'highlights',
                  'human', 'hornacek', 'gut', 'girlfriend', 'hole', 'hottest',
                  'gags', 'girls', 'google', 'g', 'grail', 'generally',
                  'homer', 'h', 'haha', 'higher', 'gl', 'george', 'humility',
                  'her', 'hi', 'gail', 'hangs', 'greatly', 'gives',
                  'harrassing', 'https']
    stop_filt = [(x, y) for (x, y) in candidates
                 if x not in stop_words and y not in stop_words]
    if len(stop_filt) > 0:
        return stop_filt
    else:
        return None

matches = df.body.apply(find_gh_phrases).dropna()

# Write all unique names to disk
all_names = []
for bigrams_found in matches:
    for bigram in bigrams_found:
        all_names.append(' '.join(s.capitalize() for s in bigram))

with open('gh-names.txt', 'w') as f:
    f.write('\n'.join(set(all_names)))
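
# Hypothetical sanity check of find_gh_phrases (the example text below is
# made up, not drawn from the downloaded comments). The ('got', 'hurt')
# bigram is a G/H candidate but both words are in stop_words, so only the
# likely name should survive the filter.
example = "Gordon Hayward was great, but Gobert got hurt"
assert find_gh_phrases(example) == [('gordon', 'hayward')]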