Skip to content

Instantly share code, notes, and snippets.

@cattleguard
Last active November 17, 2017 23:30
Show Gist options
  • Save cattleguard/2274938cd22f05c364f7eaddfec83f7c to your computer and use it in GitHub Desktop.
Save cattleguard/2274938cd22f05c364f7eaddfec83f7c to your computer and use it in GitHub Desktop.
library(gsubfn)
library(magrittr)
# Load the raw record book. Text columns are kept as character vectors
# (not factors).
big.texan.recordbook <- read.csv(
  file = "/bigtexan.csv",
  stringsAsFactors = FALSE
)
# Fixing dates that aren't.
# > grep("[0-9]{1,2}/[0-9]{1,2}/[0-9]{1,2}", big.texan.recordbook$date, value = TRUE, invert = TRUE)
# [1] "*1965" "*1966" "*1968" "*1972" "*1972" "1/??/98" "8/??/2004"
# [8] "4/?/10" "101 4/5" " " "7/210/15" " "
# Each rule below is c(pattern, replacement). The rules are applied to the
# date column one after another, in the order listed; an NA replacement turns
# every entry that matches the pattern into NA.
big.texan.recordbook$date <- Reduce(
  f = function(dates, rule) gsub(rule[1], rule[2], dates),
  x = list(
    # A single whitespace character is a blank entry
    c("^\\s$", NA),
    # Star years are pinned to January 1, so January gets 5+ obs by default
    c("\\*", "01/01/"),
    # An unknown day (?? or ?) is set to the first of the month
    c("\\?{1,2}", "01"),
    # Take a whack at the triple digit day
    c("7/210/15", "07/10/15"),
    # Really messed up observation: give up on it
    c("101 4/5", NA)
  ),
  init = big.texan.recordbook$date
)
# Now let's try to fix up the time...
# Every recognised spelling of a duration is rewritten as a timestamp on the
# dummy date 1900-01-01 so that it can be parsed into a POSIXct value.
# Each rule is c(pattern, replacement); rules run in the order listed and an
# NA replacement turns matching entries into NA.
big.texan.recordbook$time <- Reduce(
  f = function(times, rule) gsub(rule[1], rule[2], times),
  x = list(
    # "Under 1 HR" is recorded as 59 minutes
    c("Under 1\\s?H[Rr][.]?", "1900-01-01 00:59:00"),
    # "45 mins" / "45 Mins."
    c("^(\\d{1,2})\\s+[mM]ins[.]?$", "1900-01-01 00:\\1:00"),
    # "45m" / "45M"
    c("^(\\d{1,2})[mM]$", "1900-01-01 00:\\1:00"),
    # "45m 30s"
    c("^(\\d{1,2})m\\s(\\d{1,2})s$", "1900-01-01 00:\\1:\\2"),
    # "1 H" / "1hR": whole hours
    c("^(\\d)\\s?[Hh]R?$", "1900-01-01 \\1:00:00"),
    # "45:30" is read as minutes:seconds
    c("^(\\d{1,2}):(\\d{1,2})$", "1900-01-01 00:\\1:\\2"),
    # "45m 30s 5ms": the millisecond count goes after the decimal point, so
    # it is zero-padded to three digits first ("5ms" is .005 s, not .5 s)
    c("^(\\d{1,2})m\\s(\\d{1,2})s\\s(\\d)ms?$", "1900-01-01 00:\\1:\\2.00\\3"),
    c("^(\\d{1,2})m\\s(\\d{1,2})s\\s(\\d{2})ms?$", "1900-01-01 00:\\1:\\2.0\\3"),
    c("^(\\d{1,2})m\\s(\\d{1,2})s\\s(\\d{3})ms?$", "1900-01-01 00:\\1:\\2.\\3"),
    # A bare number is read as minutes
    c("^(\\d{1,2})$", "1900-01-01 00:\\1:00"),
    # One-off typo: 53.10 minutes is 53 minutes 6 seconds
    c("^53\\.10 mind$", "1900-01-01 00:53:06"),
    # "45:30:12" is read as minutes:seconds plus a two-digit fraction
    c("^(\\d{2}):(\\d{2}):(\\d{2})$", "1900-01-01 00:\\1:\\2.\\3"),
    # "45:30 mins"
    c("^(\\d{1,2}):(\\d{1,2})\\smins$", "1900-01-01 00:\\1:\\2"),
    # "45m 30s 123": three-digit fraction without a unit
    c("^(\\d{1,2})m\\s(\\d{1,2})s\\s(\\d{3})", "1900-01-01 00:\\1:\\2.\\3"),
    # Missing values
    c("^N/A$", NA),
    c("^\\s$", NA),
    # 60 minutes is not a valid minute field; roll it over to one hour
    c("1900-01-01 00:60:00", "1900-01-01 01:00:00")
  ),
  init = big.texan.recordbook$time
)
# Special case where they put Mr. Smith's weight in with his time
big.texan.recordbook$time[big.texan.recordbook$name %in% "Matthew Smith"] <- "1900-01-01 00:59:35.400"
# Anything that did not match one of the rules above fails to parse here and
# ends up as NA in posixtime.
big.texan.recordbook$posixtime <- as.POSIXct(strptime(big.texan.recordbook$time, "%Y-%m-%d %H:%M:%OS"))
# In case we want to do a "This Day in Big Texan History" sort of thing.
# Most entries carry a two-digit year, but the starred years ("*1965") and a
# few others ("8/??/2004") carry four digits. %y only consumes two digits, so
# "01/01/1965" would be read as year "19" (2019); those entries are parsed
# with %Y instead.
# NOTE(review): %y maps 00-68 to 20xx, so a two-digit year such as "65" would
# still land in 2065 -- confirm no such entries exist in the data.
big.texan.recordbook$date <- local({
  raw <- big.texan.recordbook$date
  # grepl() returns FALSE for NA, so missing dates stay on the %y path (-> NA)
  four.digit.year <- grepl("/[0-9]{4}\\s*$", raw)
  parsed <- as.Date(raw, format = "%m/%d/%y")
  parsed[four.digit.year] <- as.Date(raw[four.digit.year], format = "%m/%d/%Y")
  parsed
})
# I suppose we should move on to fixing up weights
# The column is cleaned as text here: unusable entries become NA, kilograms
# are converted to pounds and the "+" notations are resolved.
big.texan.recordbook$weight <- local({
  weight <- big.texan.recordbook$weight
  weight[big.texan.recordbook$name == "Matthew Smith"] <- "80 KG"

  # Entries that carry no usable weight become NA. The first pattern is an
  # anchored alternation, so only an entry that is exactly one whitespace
  # character, "-", "N/A" or an em dash is blanked.
  unusable <- c(
    "^(\\s|-|N/A|—)$",
    "^\\?{1,2}$",
    "^a lot$",
    "^6'$",
    "^14st 7lbs 40$",
    "^Big$"
  )
  for (pattern in unusable) {
    weight <- gsub(pattern, NA, weight)
  }

  # gsubfn() is only ever handed the non-missing entries. gsub() never adds
  # or removes NAs for the patterns below, so the mask stays valid.
  present <- !is.na(weight)

  # kg to lbs conversion 1:2.204623
  weight[present] <- gsubfn(
    "^(\\d{1,3})\\s?[kK][gG]?",
    ~ as.character(as.numeric(x) * 2.204623),
    weight[present]
  )
  # 400lbs+ just gets converted to 401
  weight <- gsub("^400\\+$", "401", weight)
  # fix the +4.5 group: "<n>+4.5" becomes the sum
  weight[present] <- gsubfn(
    "^(\\d{1,3})\\+4\\.5",
    ~ as.character(as.numeric(x) + 4.5),
    weight[present]
  )
  weight
})
# Eric Chow
big.texan.recordbook$weight[big.texan.recordbook$name == "Eric Chow"] <- "330.69345"
big.texan.recordbook$age[big.texan.recordbook$name == "Eric Chow"] <- "41"
# Age clean-up. Each rule is c(pattern, replacement), applied in the order
# listed; an NA replacement turns matching entries into NA.
big.texan.recordbook$age <- Reduce(
  f = function(ages, rule) gsub(rule[1], rule[2], ages),
  x = list(
    # Missing markers: exactly "N/A", or nothing but question marks. This is
    # an anchored alternation, so only whole-entry matches are blanked.
    c("^(N/A|\\?+)$", NA),
    c("^\\s$", NA),
    # Again, who knows what to do with 40+? Bump "N+" to N + 1.
    c("^40\\+$", "41"),
    c("^21\\+$", "22"),
    # What?
    c("^very$", NA),
    c("^young$", NA)
  ),
  init = big.texan.recordbook$age
)
# Renee Collazo
big.texan.recordbook$weight[big.texan.recordbook$name == "Renee Collazo"] <- "280"
big.texan.recordbook$age[big.texan.recordbook$name == "Renee Collazo"] <- "36"
# Bringing it home!
# The raw time text is dropped; we have posixtime now.
big.texan.recordbook[["time"]] <- NULL
# Weight and age become numeric; anything that is still not a number turns
# into NA during the conversion.
big.texan.recordbook[c("weight", "age")] <- lapply(
  big.texan.recordbook[c("weight", "age")],
  as.numeric
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment