Skip to content

Instantly share code, notes, and snippets.

@jkaupp
Last active February 14, 2019 15:24
Show Gist options
  • Save jkaupp/93487d2f1bbe496c09b4f9f1e670061a to your computer and use it in GitHub Desktop.
Save jkaupp/93487d2f1bbe496c09b4f9f1e670061a to your computer and use it in GitHub Desktop.

Revisions

  1. jkaupp revised this gist Feb 14, 2019. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions scraping_nfl.R
    Original file line number Diff line number Diff line change
    @@ -13,8 +13,9 @@ scrape_nfl_table <- function(url, page) {
    html_table() %>%
    flatten_df() %>%
    mutate_at(c(1, 5:8, 10:15, 17:19), as.numeric) %>%
    mutate_at(9, as.character) %>%
    clean_names()
    mutate_at(9, as.character) %>%
    clean_names() %>%
    mutate_at("yds", parse_number)

    }

  2. jkaupp revised this gist Feb 14, 2019. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion scraping_nfl.R
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,7 @@ library(tidyverse)
    library(lubridate)
    library(glue)
    library(rvest)
    # janitor supplies clean_names(), used by scrape_nfl_table()
    library(janitor)

    scrape_nfl_table <- function(url, page) {

    @@ -13,7 +14,6 @@ scrape_nfl_table <- function(url, page) {
    flatten_df() %>%
    mutate_at(c(1, 5:8, 10:15, 17:19), as.numeric) %>%
    mutate_at(9, as.character) %>%
    mutate(statistic = statisticCategory) %>%
    clean_names()

    }
  3. jkaupp created this gist Feb 9, 2019.
    51 changes: 51 additions & 0 deletions scraping_nfl.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,51 @@
    library(tidyverse)
    library(lubridate)
    library(glue)
    library(rvest)

    #' Scrape one page of an NFL.com category-stats table.
    #'
    #' Appends the pager query parameter `d-447263-p` to `url`, downloads the
    #' page and converts the element(s) matching `#result` into a data frame.
    #'
    #' @param url Base statistics URL. It must already carry a query string,
    #'   because the page parameter is appended with `&`.
    #' @param page Page number to request (a number or a numeric string).
    #' @return A data frame in which columns 1, 5:8, 10:15 and 17:19 are coerced
    #'   to numeric and column 9 to character, with an added `statistic` column
    #'   holding the `statisticCategory` value found in `url` (`NA` when the
    #'   URL has no such parameter) and names normalised by
    #'   `janitor::clean_names()`.
    scrape_nfl_table <- function(url, page) {

      # `statisticCategory` is an argument of the caller, not of this function,
      # so it is not visible here; recover the label from the URL instead.
      stat_label <- str_match(url, "[?&]statisticCategory=([^&]*)")[, 2]

      glue("{url}&d-447263-p={page}") %>%
        read_html() %>%
        html_nodes("#result") %>%
        html_table() %>%
        flatten_df() %>%
        # NOTE(review): positional indices assume a fixed column layout of the
        # scraped table -- confirm for categories other than the one tested.
        mutate_at(c(1, 5:8, 10:15, 17:19), as.numeric) %>%
        mutate_at(9, as.character) %>%
        mutate(statistic = stat_label) %>%
        # Namespace-qualified so the function works without library(janitor).
        janitor::clean_names()

    }

    #' Download every page of one NFL.com category-stats query.
    #'
    #' Builds the query URL, reads the pager on the first response to learn
    #' which page numbers exist, then scrapes each page with
    #' `scrape_nfl_table()` and row-binds the results.
    #'
    #' @param archive Ignored: the value is always replaced by `"true"` when
    #'   `season` is not 2018 and by `"false"` otherwise. The argument is kept
    #'   so callers that pass it (e.g. `pmap()` over a data frame with an
    #'   `archive` column) keep working.
    #' @param conference,statisticCategory,season,seasonType Values that are
    #'   interpolated verbatim into the query string of the request URL.
    #' @return The row-bound output of `scrape_nfl_table()` for every page
    #'   number found in the pager text; an empty data frame when no page
    #'   number is found.
    pull_nfl_statistics <- function(archive, conference, statisticCategory,
                                    season, seasonType) {

      # Fixed pause before each query (runs once per call, not once per page).
      Sys.sleep(5)

      # The site flag is derived from the season; 2018 is the only season
      # requested with archive = "false".
      archive <- if (season != 2018) "true" else "false"

      url <- glue("http://www.nfl.com/stats/categorystats?archive={archive}&conference={conference}&statisticCategory={statisticCategory}&season={season}&seasonType={seasonType}&experience=&tabSeq=0&qualified=true&Submit=Go")

      # Take whole numbers ("\\d+") from the pager text: matching single
      # digits would split page "10" into "1" and "0". Flatten to a plain
      # vector and drop repeats so that no page is fetched twice.
      pages <- read_html(url) %>%
        html_nodes("#main-content > div.c > div.grid > div.col.span-12 > form > span:nth-child(4)") %>%
        html_text() %>%
        str_extract_all("\\d+") %>%
        unlist() %>%
        unique()

      map_dfr(pages, ~scrape_nfl_table(url, .x))

    }



    # One row per query to run. Column names match the arguments of
    # pull_nfl_statistics() so that pmap() can supply them by name.
    scaffold <- tribble(
      ~archive, ~conference, ~statisticCategory, ~season, ~seasonType,
      "true",   "null",      "PASSING",          2018,    "REG"
    )

    # Run the scraper once per scaffold row; the result is a list with one
    # element per row.
    output <- scaffold %>%
      pmap(pull_nfl_statistics)