cwickham · July 30, 2018 13:17 · Jul 30, 2018
diff --git a/hurricanes.R b/hurricanes.R
@@ -0,0 +1,76 @@
+library(tidyverse)
+library(xml2)
+# Page fetched below; the text of its first <pre> element is parsed as CSV
+url <- "http://www.aoml.noaa.gov/hrd/hurdat/hurdat2-nepac.html"
+
+# Column names ------------------------------------------------------------
+
+# Data-line layout: http://www.aoml.noaa.gov/hrd/hurdat/newhurdat-format.pdf
+# Wind columns are named winds_<speed>_<quadrant>, quadrant varying fastest.
+# Built with base R because purrr::cross_df() is deprecated (purrr >= 1.0.0).
+wind_speeds <- c("34", "50", "64")
+wind_quadrants <- c("NE", "SE", "SW", "NW")
+wind_vars <- paste0(
+  "winds_", rep(wind_speeds, each = length(wind_quadrants)),
+  "_", rep(wind_quadrants, times = length(wind_speeds)))
+
+col_names <- c("date", "time", "record_id", "status", "lat", "lon",
+  "max_wind", "min_pressure", wind_vars, "empty")
+
+# Import ------------------------------------------------------------------
+
+# Text of the first <pre> is parsed as CSV; HURDAT2 missing-value sentinels
+# (-999, and -99 for max wind) are read as NA instead of real measurements.
+hurricanes <- read_html(url) %>%
+  xml_find_first(".//pre") %>%
+  xml_text() %>%
+  read_csv(col_names = col_names, na = c("", "NA", "-999", "-99"),
+    col_types = cols(record_id = col_character()))
+problems(hurricanes)  # warnings correspond to header lines
+
+# Pull apart header and data rows -----------------------------------------
+
+# Header rows start with a two-letter basin code in the first column (date);
+# the match is anchored so only those leading characters are tested.
+hurricanes <- hurricanes %>%
+  mutate(
+    header = str_detect(date, "^[A-Z]{2}"),
+    id = cumsum(header)  # running count of headers = one id per cyclone
+  )
+
+# Copy each cyclone's header fields onto its data rows, then drop the header
+hurricanes_tidy <- hurricanes %>%
+  group_by(id) %>%
+  mutate(
+    cyclone_id = date[1],      # the header is the first row of each id group
+    name = time[1],
+    n_records = record_id[1]
+  ) %>%
+  filter(row_number() > 1) %>%
+  ungroup()
+
+# Fix up data types. parse_number() drops the hemisphere letter, so restore
+# the sign afterwards: S latitudes and W longitudes are negative.
+hurricanes_clean <- hurricanes_tidy %>%
+  mutate(
+    datetime = lubridate::ymd_hm(paste(date, time, sep = "T")),
+    date = lubridate::date(datetime),
+    time = hms::as_hms(datetime),  # as.hms() is deprecated
+    lat = parse_number(lat) * if_else(str_detect(lat, "S$"), -1, 1),
+    lon = parse_number(lon) * if_else(str_detect(lon, "W$"), -1, 1),
+    n_records = parse_integer(n_records))  # header count was read as text
+
+# some quick checks -------------------------------------------------------
+
+# Each cyclone's row count should equal the count reported in its header
+hurricanes_clean %>%
+  group_by(cyclone_id) %>%
+  summarise(
+    n = n(), n_records = n_records[1], match = n == n_records
+  ) %>%
+  summarise(all(match))
+
+# Quick messy plot: one path per cyclone
+ggplot(data = hurricanes_clean,
+  mapping = aes(x = lon, y = lat)) +
+  geom_path(mapping = aes(group = cyclone_id))