# Houston R Users Group - Web Scraping Script
# =========================================================================================================================================

# Loading Necessary Libraries
library(rvest)
library(tidyverse)
library(reshape2)

# Other useful libraries
#library(XML)       # Masks the xml() function from rvest
#library(RSelenium) # Helpful in dealing with dynamic webpages

# I always use Chrome (might work in other browsers, but I'm not familiar with them)
# Article PDF on the basics that I used is here: http://stanford.edu/~wpmarble/webscraping_tutorial/webscraping_tutorial.pdf
# Disclaimer: Always review the website's terms-of-service agreement before scraping

# ============================================= NBA Data from Sports Reference ========================================================

# NBA Player-Season Data
# https://www.basketball-reference.com/
url <- paste0(
  "https://www.basketball-reference.com/play-index/psl_finder.cgi?request=1&match=single&type=totals&per_minute_base=36",
  "&per_poss_base=100&season_start=1&season_end=-1&lg_id=NBA&age_min=0&age_max=99&is_playoffs=N&height_min=0&height_max=99",
  "&birth_country_is=Y&as_comp=gt&as_val=0&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=ws")

# Including XPath
nba <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="stats"]') %>%
  html_table() %>%
  .[[1]]

# But XPath is not actually needed here
nba <- url %>%
  read_html() %>%
  html_table() %>%
  .[[1]]

# Cleaning up the scraped table: the first row holds the real column names,
# and the header row repeats throughout the table body
names(nba) <- nba[1, ]
nba <- nba[-1, ]
nba <- nba[nba$Player != "Player", ]

# html_table() returns character columns, so convert before plotting
nba$Age <- as.numeric(nba$Age)
nba$WS  <- as.numeric(nba$WS)

# Checking out the data
table(nba$Player) %>% data.frame
ggplot(nba, aes(x = Age, y = WS, group = Player, color = Player)) +
  geom_line()

#============================================== Opiates - CDC Data Scrape Script =======================================================

# Data on opiate prescriptions at the County-Year level
# https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html

# Scraping a single page
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html")
cdc16 <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>% # Try commenting out this line
  html_table()
cdc16 <- cdc16[[1]]
head(cdc16)
names(cdc16) <- make.names(names(cdc16))
#rm(cdc16)

# Scraping 2010 to 2016 into a list and making it into a data frame
url <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty20", 10:16, ".html")
dfList <- lapply(url, function(i) {
  webpage <- read_html(i)
  draft_table <- html_nodes(webpage, xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table')
  draft <- html_table(draft_table)[[1]]
})
str(dfList)
dfList[[1]] %>% head

cdc <- dfList %>% reduce(left_join, by = c("County", "State", "FIPS County Code"))
rm(dfList)

# Fixing format and names
names(cdc) <- make.names(names(cdc))
cdc <- melt(cdc, measure.vars = paste0("X20", 10:16, ".Prescribing.Rate"),
            id.vars = c("County", "State", "FIPS.County.Code"))
cdc$variable <- gsub("X", "", cdc$variable)
names(cdc)[4:5] <- c("Year", "Prescribing.Rate")
cdc$Year <- gsub(".Prescribing.Rate", "", cdc$Year)
cdc$Prescribing.Rate <- as.numeric(cdc$Prescribing.Rate)

# Keeping TX only
cdc <- cdc[cdc$State %in% "TX", ]

# Checking out the data
str(cdc)
table(cdc$Year)
gplots::plotmeans(cdc$Prescribing.Rate ~ cdc$Year) # Needs the gplots package installed
ggplot(cdc, aes(x = Year, y = Prescribing.Rate, group = County, color = County)) +
  geom_line() +
  theme(legend.position = "none")
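# --- Alternative sketch: tidyr::pivot_longer() instead of reshape2::melt() ---
# A minimal sketch, not a drop-in replacement: it assumes it runs on the wide table
# produced by reduce(left_join) above (i.e., before the melt), here called cdc_wide
# (a hypothetical name), with the same X20NN.Prescribing.Rate column names.
cdc_tidy <- cdc_wide %>%
  pivot_longer(cols = ends_with("Prescribing.Rate"),   # the seven rate columns
               names_to = "Year",
               names_pattern = "X(\\d{4})",            # pull the year out of the name
               values_to = "Prescribing.Rate") %>%
  mutate(Prescribing.Rate = as.numeric(Prescribing.Rate))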
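# --- Alternative sketch: scraping the seven CDC pages straight to long format ---
# Instead of joining the tables wide and then melting, each page's table can be
# tagged with its year and stacked with purrr::map2_dfr(). A sketch under two
# assumptions: every year's page uses the same four-column layout
# (County, State, FIPS code, rate), and a short Sys.sleep() is enough politeness
# between requests.
years <- 2010:2016
urls  <- paste0("https://www.cdc.gov/drugoverdose/maps/rxcounty", years, ".html")
cdc_long <- map2_dfr(urls, years, function(u, yr) {
  Sys.sleep(2)  # pause between requests
  tbl <- u %>%
    read_html() %>%
    html_nodes(xpath = '//*[@id="contentArea"]/div[1]/div[4]/div/div/div/table') %>%
    html_table() %>%
    .[[1]]
  names(tbl) <- c("County", "State", "FIPS.County.Code", "Prescribing.Rate")  # assumed column order
  tbl$Prescribing.Rate <- as.numeric(tbl$Prescribing.Rate)
  tbl$Year <- yr
  tbl
})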
# =============================================== Scraping State of the Union ==========================================================

# Transcribed State of the Union Speeches
# https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses

# Scraping a single State of the Union speech
url <- "https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-2"

# XPath to the speech
speech <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="block-system-main"]/div/div/div[1]/div[3]') %>%
  html_text()
speech

# Another way to do it that gives a slightly different format:
# selecting the <p> (paragraph) elements returns the speech paragraph by paragraph
speechbypara <- url %>%
  read_html() %>%
  html_nodes("p") %>%
  html_text()
speechbypara
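# --- Sketch: scraping every speech linked from the index page ---
# A sketch, not verified against the live page: the ".field-title a" selector and
# the site-relative hrefs are assumptions about the index page's markup, and
# Sys.sleep() is a politeness pause. paste(collapse) stitches the paragraphs of
# each speech back into a single string.
index_url <- "https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/state-the-union-addresses"
links <- index_url %>%
  read_html() %>%
  html_nodes(".field-title a") %>%  # assumed selector for the speech links
  html_attr("href") %>%
  paste0("https://www.presidency.ucsb.edu", .)
speeches <- lapply(links, function(l) {
  Sys.sleep(2)                      # pause between requests
  l %>%
    read_html() %>%
    html_nodes("p") %>%
    html_text() %>%
    paste(collapse = "\n\n")
})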