# Houston R Users Group - Web Scraping Script
# =========================================================================================================================================

# Loading Necessary Libraries
library(rvest)
library(tidyverse)
library(reshape2)

# Other useful libraries
#library(XML)       # Masks the xml() function from rvest
#library(RSelenium) # Helpful in dealing with dynamic webpages

# I always use Chrome (might work in other browsers, but I'm not familiar with them)
# Article PDF on the basics that I used is here: http://stanford.edu/~wpmarble/webscraping_tutorial/webscraping_tutorial.pdf
# Disclaimer: Always review the website's terms-of-service agreement before scraping

# ============================================= NBA Data from Sports Reference ========================================================

# NBA Player-Season Data
# https://www.basketball-reference.com/
url <- paste0(
  "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36",
  "&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99",
  "&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=ws")

# Including XPath
nba <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="stats"]') %>%
  html_table() %>%
  .[[1]]

# But XPath is not actually needed here
nba <- url %>%
  read_html() %>%
  html_table() %>%
  .[[1]]

# Cleaning up the scraped table: the first row holds the real column names,
# and the header row repeats throughout the table body
names(nba) <- nba[1, ]
nba <- nba[-1, ]
nba <- nba[nba$Player != "Player", ]

# html_table() returns character columns, so convert before plotting
nba$Age <- as.numeric(nba$Age)
nba$WS  <- as.numeric(nba$WS)

# Checking out the data
table(nba$Player) %>% data.frame
ggplot(nba, aes(x = Age, y = WS, group = Player, color = Player)) +
  geom_line()

#============================================== Opiates - CDC Data Scrape Script =======================================================

# Data on opiate prescriptions at the County-Year level
# https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html

# Scraping a single page
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html")
cdc16 <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>% # Try commenting out this line
  html_table()
cdc16 <- cdc16[[1]]
head(cdc16)
names(cdc16) <- make.names(names(cdc16))
#rm(cdc16)

# Scraping 2010 to 2016 into a list and making it into a data frame
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty20", 10:16, ".html")
dfList <- lapply(url, function(i) {
  webpage <- read_html(i)
  draft_table <- html_nodes(webpage, xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table')
  draft <- html_table(draft_table)[[1]]
})
str(dfList)
dfList[[1]] %>% head

cdc <- dfList %>% reduce(left_join, by = c("County", "State", "FIPS County Code"))
rm(dfList)

# Fixing format and names
names(cdc) <- make.names(names(cdc))
cdc <- melt(cdc, measure.vars = paste0("X20", 10:16, ".Prescribing.Rate"),
            id.vars = c("County", "State", "FIPS.County.Code"))
cdc$variable <- gsub("X", "", cdc$variable)
names(cdc)[4:5] <- c("Year", "Prescribing.Rate")
cdc$Year <- gsub(".Prescribing.Rate", "", cdc$Year)
cdc$Prescribing.Rate <- as.numeric(cdc$Prescribing.Rate)

# Keeping TX only
cdc <- cdc[cdc$State %in% "TX", ]

# Checking out the data
str(cdc)
table(cdc$Year)
gplots::plotmeans(cdc$Prescribing.Rate ~ cdc$Year) # Needs the gplots package installed
ggplot(cdc, aes(x = Year, y = Prescribing.Rate, group = County, color = County)) +
  geom_line() +
  theme(legend.position = "none")
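# --- Alternative sketch: tidyr::pivot_longer() instead of reshape2::melt() ---
# A minimal sketch, not a drop-in replacement: it assumes it runs on the wide table
# produced by reduce(left_join) above (i.e., before the melt), here called cdc_wide
# (a hypothetical name), with the same X20NN.Prescribing.Rate column names.
cdc_tidy <- cdc_wide %>%
  pivot_longer(cols = ends_with("Prescribing.Rate"),   # the seven rate columns
               names_to = "Year",
               names_pattern = "X(\\d{4})",            # pull the year out of the name
               values_to = "Prescribing.Rate") %>%
  mutate(Prescribing.Rate = as.numeric(Prescribing.Rate))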
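# --- Alternative sketch: scraping the seven CDC pages straight to long format ---
# Instead of joining the tables wide and then melting, each page's table can be
# tagged with its year and stacked with purrr::map2_dfr(). A sketch under two
# assumptions: every year's page uses the same four-column layout
# (County, State, FIPS code, rate), and a short Sys.sleep() is enough politeness
# between requests.
years <- 2010:2016
urls  <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty", years, ".html")
cdc_long <- map2_dfr(urls, years, function(u, yr) {
  Sys.sleep(2)  # pause between requests
  tbl <- u %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>%
    html_table() %>%
    .[[1]]
  names(tbl) <- c("County", "State", "FIPS.County.Code", "Prescribing.Rate")  # assumed column order
  tbl$Prescribing.Rate <- as.numeric(tbl$Prescribing.Rate)
  tbl$Year <- yr
  tbl
})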
# =============================================== Scraping State of the Union ==========================================================

# Transcribed State of the Union Speeches
# https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses

# Scraping a single State of the Union speech
url <- "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-2"

# XPath to the speech
speech <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="block-system-main"]/div/div/div[1]/div[3]') %>%
  html_text()
speech

# Another way to do it that gives a slightly different format:
# selecting the <p> (paragraph) elements returns the speech paragraph by paragraph
speechbypara <- url %>%
  read_html() %>%
  html_nodes("p") %>%
  html_text()
speechbypara
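# --- Sketch: scraping every speech linked from the index page ---
# A sketch, not verified against the live page: the ".field-title a" selector and
# the site-relative hrefs are assumptions about the index page's markup, and
# Sys.sleep() is a politeness pause. paste(collapse) stitches the paragraphs of
# each speech back into a single string.
index_url <- "https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses"
links <- index_url %>%
  read_html() %>%
  html_nodes(".field-title a") %>%  # assumed selector for the speech links
  html_attr("href") %>%
  paste0("https://www.presidency.ucsb.edu", .)
speeches <- lapply(links, function(l) {
  Sys.sleep(2)                      # pause between requests
  l %>%
    read_html() %>%
    html_nodes("p") %>%
    html_text() %>%
    paste(collapse = "\n\n")
})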