Skip to content

Instantly share code, notes, and snippets.

@cwickham
Created July 30, 2018 13:17
Show Gist options
  • Save cwickham/d66f8fc1b59a84284ce90adbfcea9b83 to your computer and use it in GitHub Desktop.
Save cwickham/d66f8fc1b59a84284ce90adbfcea9b83 to your computer and use it in GitHub Desktop.
Import messy hurricane data
library(tidyverse)
library(xml2)
url <- "http://www.aoml.noaa.gov/hrd/hurdat/hurdat2-nepac.html"
# Column names ------------------------------------------------------------
# From: http://www.aoml.noaa.gov/hrd/hurdat/newhurdat-format.pdf
# for the data lines
# Wind columns: one per wind value (34, 50, 64) and quadrant, with the
# quadrant varying fastest, i.e. winds_34_NE, winds_34_SE, winds_34_SW,
# winds_34_NW, winds_50_NE, ..., winds_64_NW.
# Built with base R instead of purrr::cross_df() (deprecated as of
# purrr 1.0.0); the resulting names and their order are unchanged.
wind_vars <- paste0(
  "winds_",
  rep(c("34", "50", "64"), each = 4),
  "_",
  rep(c("NE", "SE", "SW", "NW"), times = 3)
)
# The final "empty" column absorbs the field after the last comma on each
# data line.
col_names <- c(
  "date", "time",
  "record_id", "status", "lat", "lon",
  "max_wind", "min_pressure", wind_vars, "empty"
)
# Import ------------------------------------------------------------------
# Download the page, take the text of its first <pre> element, and parse
# that text as literal CSV using the column names defined above.
hurdat_text <- xml_text(xml_find_first(read_html(url), ".//pre"))
hurricanes <- read_csv(
  hurdat_text,
  col_names = col_names,
  col_types = cols(record_id = col_character())
)
# The parsing problems listed here correspond to the header lines
problems(hurricanes)
# Pull apart header and data rows -----------------------------------------
# A row is flagged as a header when its first column (date) contains two
# consecutive capital letters, i.e. the letters specifying the basin.
hurricanes <- mutate(hurricanes, header = str_detect(date, "[A-Z]{2}"))
# The running count of header rows gives each row the id of the block
# (header + following data rows) it belongs to.
hurricanes <- mutate(hurricanes, id = cumsum(header))
# Now for each hurricane add header info as columns to data
# Within each id group the first row is the header line: its date, time and
# record_id fields are copied onto every row of the group as cyclone_id,
# name and n_records.
hurricanes_tidy <- group_by(hurricanes, id)
hurricanes_tidy <- mutate(
  hurricanes_tidy,
  cyclone_id = first(date),
  name = first(time),
  n_records = first(record_id)
)
# Drop the first (header) row of every group and remove the grouping.
hurricanes_tidy <- ungroup(slice(hurricanes_tidy, -1))
# and fix up a few data types
# Adds a parsed `datetime`, replaces `date` and `time` with typed values
# derived from it, and converts `lat` / `lon` to signed numbers.
hurricanes_clean <-
  hurricanes_tidy %>%
  mutate(
    datetime = lubridate::ymd_hm(paste(date, time, sep = "T")),
    date = lubridate::date(datetime),
    # NOTE(review): hms::as.hms() appears to be deprecated in newer hms
    # releases in favour of hms::as_hms() -- confirm the installed version
    # before switching.
    time = hms::as.hms(datetime),
    # Coordinates carry a hemisphere suffix (e.g. "28.0N", "94.8W").
    # parse_number() keeps only the magnitude, so apply the sign explicitly:
    # south and west become negative. The suffix has to be read in the same
    # expression, before the column is overwritten with the numeric value.
    lat = parse_number(lat) * if_else(str_detect(lat, "S$"), -1, 1),
    lon = parse_number(lon) * if_else(str_detect(lon, "W$"), -1, 1)
  )
# some quick checks -------------------------------------------------------
# Per cyclone, compare the number of data rows found (n) with the record
# count taken from its header line (n_records).
record_check <- summarise(
  group_by(hurricanes_clean, cyclone_id),
  n = n(),
  n_records = first(n_records),
  match = n == n_records
)
# Single value: TRUE only when every cyclone's row count matches.
summarise(record_check, all(match))
# Quick messy plot
# One path per cyclone_id, drawn through its records in row order.
ggplot(hurricanes_clean, aes(lon, lat)) +
  geom_path(aes(group = cyclone_id))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment