Skip to content

Instantly share code, notes, and snippets.

@cwickham
Created July 30, 2018 13:17
Show Gist options
  • Save cwickham/d66f8fc1b59a84284ce90adbfcea9b83 to your computer and use it in GitHub Desktop.
Save cwickham/d66f8fc1b59a84284ce90adbfcea9b83 to your computer and use it in GitHub Desktop.

Revisions

  1. cwickham created this gist Jul 30, 2018.
    76 changes: 76 additions & 0 deletions hurricanes.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,76 @@
    library(tidyverse)
    library(xml2)

    # Page to download; the import step below reads the text of its first <pre>
    # element. NOTE(review): presumably the NE Pacific HURDAT2 listing — confirm.
    url <- "http://www.aoml.noaa.gov/hrd/hurdat/hurdat2-nepac.html"

    # Column names ------------------------------------------------------------

    # Names for the comma-separated fields of each data line, following
    # http://www.aoml.noaa.gov/hrd/hurdat/newhurdat-format.pdf

    # Twelve wind columns named winds_{x}_{y}: for each x (34, 50, 64) there is
    # one column per y (NE, SE, SW, NW). expand.grid() varies its first argument
    # fastest, so listing y first yields winds_34_NE, winds_34_SE, ...,
    # winds_64_NW -- the same order purrr::cross_df() gave. Base R is used
    # because cross_df() is deprecated as of purrr 1.0.0, and paste0() returns a
    # plain character vector rather than a glue object.
    wind_vars <- with(
      expand.grid(
        y = c("NE", "SE", "SW", "NW"),
        x = c("34", "50", "64"),
        stringsAsFactors = FALSE
      ),
      paste0("winds_", x, "_", y)
    )

    # Full set of column names handed to read_csv().
    # NOTE(review): "empty" presumably absorbs the field created by a trailing
    # comma on each line -- confirm against the raw file.
    col_names <- c(
      "date", "time",
      "record_id", "status", "lat", "lon",
      "max_wind", "min_pressure", wind_vars, "empty"
    )

    # Import ------------------------------------------------------------------

    # Download the page, take the text of its first <pre> element and parse
    # that text as CSV using the column names defined above. record_id is
    # forced to character; all other column types are left for readr to guess.
    hurricanes <- local({
      page <- read_html(url)
      raw_text <- xml_text(xml_find_first(page, ".//pre"))
      read_csv(
        raw_text,
        col_names = col_names,
        col_types = cols(record_id = col_character())
      )
    })

    # The parsing warnings come from the header lines; list them for inspection
    problems(hurricanes)

    # Pull apart header and data rows -----------------------------------------

    # A row is flagged as a header when its first column (date) contains two
    # consecutive capital letters, i.e. the basin code that starts a header
    # line. The running count of headers then labels every row with the id of
    # the storm it belongs to.
    hurricanes <- mutate(
      hurricanes,
      header = str_detect(date, "[A-Z]{2}"),
      id = cumsum(header)
    )

    # Copy each storm's header fields onto its data rows: within every id group
    # the first row is the header, whose date, time and record_id columns are
    # reused as cyclone_id, name and n_records. The header row itself (row 1 of
    # the group) is then dropped and the grouping removed.
    hurricanes_tidy <- hurricanes %>%
      group_by(id) %>%
      mutate(
        cyclone_id = first(date),
        name       = first(time),
        n_records  = first(record_id)
      ) %>%
      filter(row_number() > 1) %>%
      ungroup()

    # and fix up a few data types
    hurricanes_clean <-
      hurricanes_tidy %>%
      mutate(
        # Build the combined date-time first: the next two assignments
        # overwrite the original character date and time columns.
        datetime = lubridate::ymd_hm(paste(date, time, sep = "T")),
        date = lubridate::date(datetime),
        # hms::as_hms() replaces the deprecated hms::as.hms()
        time = hms::as_hms(datetime),
        # NOTE(review): parse_number() keeps only the numeric part, so any
        # hemisphere suffix on lat/lon is dropped and the result is unsigned --
        # confirm this is intended before using the coordinates on a map.
        lat = parse_number(lat),
        lon = parse_number(lon),
        # n_records was copied from the character column record_id; store it
        # as the integer count it represents.
        n_records = as.integer(n_records)
      )

    # some quick checks -------------------------------------------------------

    # number of records matches that reported
    # n_records originates from the character column record_id, so it is
    # converted explicitly here instead of relying on implicit integer-to-
    # character coercion inside the comparison. The final summarise() yields a
    # single TRUE when every cyclone has as many rows as its header reported.
    hurricanes_clean %>%
      group_by(cyclone_id) %>%
      summarise(
        n = n(),
        n_records = as.integer(first(n_records)),
        match = n == n_records
      ) %>%
      summarise(all(match))

    # Quick messy plot: one path per cyclone_id through its lon/lat points,
    # drawn in row order
    ggplot(hurricanes_clean, aes(x = lon, y = lat)) +
      geom_path(aes(group = cyclone_id))