cwickham · July 30, 2018 13:17
diff --git a/hurricanes.R b/hurricanes.R
 library(tidyverse)
 library(xml2)

 # Page to scrape: the import step below reads the first <pre> node of this
 # page and parses its text as CSV.
 url <- "http://www.aoml.noaa.gov/hrd/hurdat/hurdat2-nepac.html"

 # Column names ------------------------------------------------------------

 # From: http://www.aoml.noaa.gov/hrd/hurdat/newhurdat-format.pdf
 # for the data lines
 #
 # wind_vars holds one name per combination of x (34, 50, 64) and
 # y (NE, SE, SW, NW). expand.grid() varies its first argument fastest, so y
 # cycles within each x: winds_34_NE, winds_34_SE, ..., winds_64_NW.
 # Base R is used because purrr::cross_df() is deprecated (purrr 1.0.0).
 wind_vars <- with(
   expand.grid(
     y = c("NE", "SE", "SW", "NW"),
     x = c("34", "50", "64"),
     stringsAsFactors = FALSE
   ),
   paste0("winds_", x, "_", y)
 )
 # Column names for the parsed lines, in file order.
 # NOTE(review): "empty" presumably absorbs the blank field left by a trailing
 # comma on each line -- confirm against the format PDF.
 col_names <- c(
   "date", "time",
   "record_id", "status", "lat", "lon",
   "max_wind", "min_pressure", wind_vars, "empty"
 )

 # Import ------------------------------------------------------------------

 # Fetch the page, take the text of its first <pre> node, and parse that text
 # as CSV using the column names defined above. record_id is forced to
 # character instead of being left to type guessing.
 hurricanes <- read_csv(
   file = xml_text(xml_find_first(read_html(url), xpath = ".//pre")),
   col_names = col_names,
   col_types = cols(record_id = col_character())
 )
 # Warnings correspond to header lines
 problems(hurricanes)

 # Pull apart header and data rows -----------------------------------------

 # Flag header rows: their first column (date) contains two capital letters
 # (the basin code). The running total of that flag then tags every row with
 # the index of the most recent header at or above it.
 hurricanes <- mutate(
   hurricanes,
   header = str_detect(date, "[A-Z]{2}"),
   id = cumsum(header)
 )

 # Within each id group the header is the first row. Copy its first three
 # fields onto every row of the group under descriptive names, then drop the
 # header row itself so only data rows remain.
 hurricanes_tidy <- hurricanes %>%
   group_by(id) %>%
   mutate(
     cyclone_id = date[1],
     name = time[1],
     n_records = record_id[1]
   ) %>%
   filter(row_number() != 1) %>%
   ungroup()

 # Fix up a few data types on the data rows:
 #   * datetime: date and time strings joined with "T" and parsed by ymd_hm()
 #   * date, time: re-derived from datetime as date and hms values
 #   * lat, lon: numeric, signed by hemisphere so that stripping the letter
 #     with parse_number() does not lose it (S and W become negative).
 #     Assumes the hemisphere is a trailing N/S/E/W letter, as in the HURDAT2
 #     format; values without a suffix are left positive.
 hurricanes_clean <-
   hurricanes_tidy %>%
   mutate(
     datetime = lubridate::ymd_hm(paste(date, time, sep = "T")),
     date = lubridate::date(datetime),
     # as_hms() is the replacement for the deprecated hms::as.hms()
     time = hms::as_hms(datetime),
     # The right-hand sides still see the original character columns
     lat = parse_number(lat) * if_else(str_detect(lat, "S$"), -1, 1),
     lon = parse_number(lon) * if_else(str_detect(lon, "W$"), -1, 1)
   )

 # some quick checks -------------------------------------------------------

 # Sanity check: each cyclone should have as many data rows as its header
 # reported. Produces a single value that is TRUE only if every cyclone matches.
 hurricanes_clean %>%
   group_by(cyclone_id) %>%
   summarise(match = n() == first(n_records)) %>%
   summarise(all(match))

 # Quick messy plot: lon against lat, one path per cyclone
 ggplot(hurricanes_clean, aes(x = lon, y = lat)) +
   geom_path(aes(group = cyclone_id))
	library(tidyverse)
	library(xml2)

	# Page to scrape: the import step below reads the first <pre> node of this
	# page and parses its text as CSV.
	url <- "http://www.aoml.noaa.gov/hrd/hurdat/hurdat2-nepac.html"

	# Column names ------------------------------------------------------------

	# From: http://www.aoml.noaa.gov/hrd/hurdat/newhurdat-format.pdf
	# for the data lines
	#
	# wind_vars holds one name per combination of x (34, 50, 64) and
	# y (NE, SE, SW, NW). expand.grid() varies its first argument fastest, so y
	# cycles within each x: winds_34_NE, winds_34_SE, ..., winds_64_NW.
	# Base R is used because purrr::cross_df() is deprecated (purrr 1.0.0).
	wind_vars <- with(
	  expand.grid(
	    y = c("NE", "SE", "SW", "NW"),
	    x = c("34", "50", "64"),
	    stringsAsFactors = FALSE
	  ),
	  paste0("winds_", x, "_", y)
	)
	# Column names for the parsed lines, in file order.
	# NOTE(review): "empty" presumably absorbs the blank field left by a trailing
	# comma on each line -- confirm against the format PDF.
	col_names <- c(
	  "date", "time",
	  "record_id", "status", "lat", "lon",
	  "max_wind", "min_pressure", wind_vars, "empty"
	)

	# Import ------------------------------------------------------------------

	# Fetch the page, take the text of its first <pre> node, and parse that text
	# as CSV using the column names defined above. record_id is forced to
	# character instead of being left to type guessing.
	hurricanes <- read_csv(
	  file = xml_text(xml_find_first(read_html(url), xpath = ".//pre")),
	  col_names = col_names,
	  col_types = cols(record_id = col_character())
	)
	# Warnings correspond to header lines
	problems(hurricanes)

	# Pull apart header and data rows -----------------------------------------

	# Flag header rows: their first column (date) contains two capital letters
	# (the basin code). The running total of that flag then tags every row with
	# the index of the most recent header at or above it.
	hurricanes <- mutate(
	  hurricanes,
	  header = str_detect(date, "[A-Z]{2}"),
	  id = cumsum(header)
	)

	# Within each id group the header is the first row. Copy its first three
	# fields onto every row of the group under descriptive names, then drop the
	# header row itself so only data rows remain.
	hurricanes_tidy <- hurricanes %>%
	  group_by(id) %>%
	  mutate(
	    cyclone_id = date[1],
	    name = time[1],
	    n_records = record_id[1]
	  ) %>%
	  filter(row_number() != 1) %>%
	  ungroup()

	# Fix up a few data types on the data rows:
	#   * datetime: date and time strings joined with "T" and parsed by ymd_hm()
	#   * date, time: re-derived from datetime as date and hms values
	#   * lat, lon: numeric, signed by hemisphere so that stripping the letter
	#     with parse_number() does not lose it (S and W become negative).
	#     Assumes the hemisphere is a trailing N/S/E/W letter, as in the HURDAT2
	#     format; values without a suffix are left positive.
	hurricanes_clean <-
	  hurricanes_tidy %>%
	  mutate(
	    datetime = lubridate::ymd_hm(paste(date, time, sep = "T")),
	    date = lubridate::date(datetime),
	    # as_hms() is the replacement for the deprecated hms::as.hms()
	    time = hms::as_hms(datetime),
	    # The right-hand sides still see the original character columns
	    lat = parse_number(lat) * if_else(str_detect(lat, "S$"), -1, 1),
	    lon = parse_number(lon) * if_else(str_detect(lon, "W$"), -1, 1)
	  )

	# some quick checks -------------------------------------------------------

	# Sanity check: each cyclone should have as many data rows as its header
	# reported. Produces a single value that is TRUE only if every cyclone matches.
	hurricanes_clean %>%
	  group_by(cyclone_id) %>%
	  summarise(match = n() == first(n_records)) %>%
	  summarise(all(match))

	# Quick messy plot: lon against lat, one path per cyclone
	ggplot(hurricanes_clean, aes(x = lon, y = lat)) +
	  geom_path(aes(group = cyclone_id))