ClaytonJY · October 31, 2019 02:59 · Oct 31, 2019 · Oct 31, 2019 · Oct 30, 2019 · Oct 30, 2019
diff --git a/group-and-summarize.md b/group-and-summarize.md
@@ -21,16 +21,16 @@ input_tbl
 #> # A tibble: 27 x 7
 #>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
 #>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
-#>  1 2016-01-01          1 Coal         1 North         4.47       22.0
-#>  2 2016-01-02          1 Coal         1 North        -2.44       64.8
-#>  3 2016-01-03          1 Coal         1 North        26.9        62.4
-#>  4 2016-01-01          2 Coal         1 North        47.6       104. 
-#>  5 2016-01-02          2 Coal         1 North        50.4        86.8
-#>  6 2016-01-03          2 Coal         1 North        71.7        79.2
-#>  7 2016-01-01          3 Coal         1 North        77.6       138. 
-#>  8 2016-01-02          3 Coal         1 North        65.4       126. 
-#>  9 2016-01-03          3 Coal         1 North        73.6       202. 
-#> 10 2016-01-01          1 Gas          1 North       101.        204. 
+#>  1 2016-01-01          1 Coal         1 North       -11.3        23.8
+#>  2 2016-01-02          1 Coal         1 North         9.01       16.9
+#>  3 2016-01-03          1 Coal         1 North        42.1        47.2
+#>  4 2016-01-01          2 Coal         1 North        33.0        94.1
+#>  5 2016-01-02          2 Coal         1 North        43.0        88.3
+#>  6 2016-01-03          2 Coal         1 North        55.4       137. 
+#>  7 2016-01-01          3 Coal         1 North        68.6       165. 
+#>  8 2016-01-02          3 Coal         1 North        82.0       153. 
+#>  9 2016-01-03          3 Coal         1 North        77.2       158. 
+#> 10 2016-01-01          1 Gas          1 North        84.7       171. 
 #> # … with 17 more rows
 
 # we want total consumption & withdrawal per MarketDay & HourEnding.
@@ -47,15 +47,15 @@ summary_tbl
 #> # A tibble: 9 x 4
 #>   MarketDay  HourEnding hourly_total_consumption hourly_total_withdrawal
 #>   <date>          <int>                    <dbl>                   <dbl>
-#> 1 2016-01-01          1                     292.                    584.
-#> 2 2016-01-01          2                     397.                    826.
-#> 3 2016-01-01          3                     489.                    971.
-#> 4 2016-01-02          1                     306.                    661.
-#> 5 2016-01-02          2                     418.                    877.
-#> 6 2016-01-02          3                     486.                    987.
-#> 7 2016-01-03          1                     387.                    705.
-#> 8 2016-01-03          2                     455.                    867.
-#> 9 2016-01-03          3                     515.                   1146.
+#> 1 2016-01-01          1                     261.                    549.
+#> 2 2016-01-01          2                     388.                    753.
+#> 3 2016-01-01          3                     484.                    941.
+#> 4 2016-01-02          1                     319.                    680.
+#> 5 2016-01-02          2                     398.                    877.
+#> 6 2016-01-02          3                     507.                    987.
+#> 7 2016-01-03          1                     362.                    696.
+#> 8 2016-01-03          2                     464.                    947.
+#> 9 2016-01-03          3                     537.                   1075.
 
 # now we compute deltas by subtracting a lagged value from our value
 # this would be trickier to implement and reason about if our hourly_total_* columns
@@ -73,15 +73,15 @@ summary_tbl
 #> # A tibble: 9 x 6
 #>   MarketDay  HourEnding hourly_total_co… hourly_total_wi… delta_hourly_co…
 #>   <date>          <int>            <dbl>            <dbl>            <dbl>
-#> 1 2016-01-01          1             292.             584.             NA  
-#> 2 2016-01-01          2             397.             826.            105. 
-#> 3 2016-01-01          3             489.             971.             92.0
-#> 4 2016-01-02          1             306.             661.           -183. 
-#> 5 2016-01-02          2             418.             877.            112. 
-#> 6 2016-01-02          3             486.             987.             67.6
-#> 7 2016-01-03          1             387.             705.            -98.3
-#> 8 2016-01-03          2             455.             867.             67.1
-#> 9 2016-01-03          3             515.            1146.             60.4
+#> 1 2016-01-01          1             261.             549.             NA  
+#> 2 2016-01-01          2             388.             753.            127. 
+#> 3 2016-01-01          3             484.             941.             96.4
+#> 4 2016-01-02          1             319.             680.           -165. 
+#> 5 2016-01-02          2             398.             877.             78.5
+#> 6 2016-01-02          3             507.             987.            109. 
+#> 7 2016-01-03          1             362.             696.           -144. 
+#> 8 2016-01-03          2             464.             947.            102. 
+#> 9 2016-01-03          3             537.            1075.             73.2
 #> # … with 1 more variable: delta_hourly_withdrawal <dbl>
 
 # now we can join this back in to our original table
@@ -93,37 +93,37 @@ tbl
 #> # A tibble: 27 x 11
 #>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
 #>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
-#>  1 2016-01-01          1 Coal         1 North         4.47       22.0
-#>  2 2016-01-01          1 Gas          1 North       101.        204. 
-#>  3 2016-01-01          1 Hydro        1 North       187.        358. 
-#>  4 2016-01-01          2 Coal         1 North        47.6       104. 
-#>  5 2016-01-01          2 Gas          1 North       126.        263. 
-#>  6 2016-01-01          2 Hydro        1 North       224.        460. 
-#>  7 2016-01-01          3 Coal         1 North        77.6       138. 
-#>  8 2016-01-01          3 Gas          1 North       169.        326. 
-#>  9 2016-01-01          3 Hydro        1 North       242.        508. 
-#> 10 2016-01-02          1 Coal         1 North        -2.44       64.8
+#>  1 2016-01-01          1 Coal         1 North       -11.3        23.8
+#>  2 2016-01-01          1 Gas          1 North        84.7       171. 
+#>  3 2016-01-01          1 Hydro        1 North       188.        354. 
+#>  4 2016-01-01          2 Coal         1 North        33.0        94.1
+#>  5 2016-01-01          2 Gas          1 North       122.        249. 
+#>  6 2016-01-01          2 Hydro        1 North       233.        410. 
+#>  7 2016-01-01          3 Coal         1 North        68.6       165. 
+#>  8 2016-01-01          3 Gas          1 North       161.        296. 
+#>  9 2016-01-01          3 Hydro        1 North       255.        481. 
+#> 10 2016-01-02          1 Coal         1 North         9.01       16.9
 #> # … with 17 more rows, and 4 more variables:
 #> #   hourly_total_consumption <dbl>, hourly_total_withdrawal <dbl>,
 #> #   delta_hourly_consumption <dbl>, delta_hourly_withdrawal <dbl>
 
 # too many columns to show in this format, so we'll focus on comsumption-only
 tbl %>%
-  select(-contains("withdrawal"))
-#> # A tibble: 27 x 8
-#>    MarketDay  HourEnding FuelType Month Region consumption hourly_total_co…
-#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>            <dbl>
-#>  1 2016-01-01          1 Coal         1 North         4.47             292.
-#>  2 2016-01-01          1 Gas          1 North       101.               292.
-#>  3 2016-01-01          1 Hydro        1 North       187.               292.
-#>  4 2016-01-01          2 Coal         1 North        47.6              397.
-#>  5 2016-01-01          2 Gas          1 North       126.               397.
-#>  6 2016-01-01          2 Hydro        1 North       224.               397.
-#>  7 2016-01-01          3 Coal         1 North        77.6              489.
-#>  8 2016-01-01          3 Gas          1 North       169.               489.
-#>  9 2016-01-01          3 Hydro        1 North       242.               489.
-#> 10 2016-01-02          1 Coal         1 North        -2.44             306.
-#> # … with 17 more rows, and 1 more variable: delta_hourly_consumption <dbl>
+  select(-Month, -Region, -FuelType, -contains("withdrawal"))
+#> # A tibble: 27 x 5
+#>    MarketDay  HourEnding consumption hourly_total_consu… delta_hourly_cons…
+#>    <date>          <int>       <dbl>               <dbl>              <dbl>
+#>  1 2016-01-01          1      -11.3                 261.               NA  
+#>  2 2016-01-01          1       84.7                 261.               NA  
+#>  3 2016-01-01          1      188.                  261.               NA  
+#>  4 2016-01-01          2       33.0                 388.              127. 
+#>  5 2016-01-01          2      122.                  388.              127. 
+#>  6 2016-01-01          2      233.                  388.              127. 
+#>  7 2016-01-01          3       68.6                 484.               96.4
+#>  8 2016-01-01          3      161.                  484.               96.4
+#>  9 2016-01-01          3      255.                  484.               96.4
+#> 10 2016-01-02          1        9.01                319.             -165. 
+#> # … with 17 more rows
 ```
 
 <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
diff --git a/group-and-summarize.md b/group-and-summarize.md
@@ -13,87 +13,117 @@ input_tbl <- lst(
   mutate(
     MarketDay = as.Date(MarketDay, origin = "1970-01-01"),
     # skipping GenMISObyFuel, consumption_factor, withdrawal_factor
-    consumption = rnorm(nrow(.)),  # random draws from normal dist; doesn't look like your data, but doesn't matter
-    withdrawal = rnorm(nrow(.))
+    consumption = 10 * (1:n() + rnorm(n())),  # changed this up to make later steps more clear
+    withdrawal = 20 * (1:n() + rnorm(n()))
   )
 
 input_tbl
 #> # A tibble: 27 x 7
 #>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
 #>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
-#>  1 2016-01-01          1 Coal         1 North      -0.746       1.70 
-#>  2 2016-01-02          1 Coal         1 North      -0.670      -1.04 
-#>  3 2016-01-03          1 Coal         1 North      -1.07        0.634
-#>  4 2016-01-01          2 Coal         1 North      -0.375       0.534
-#>  5 2016-01-02          2 Coal         1 North      -0.0111      0.613
-#>  6 2016-01-03          2 Coal         1 North       0.537      -1.64 
-#>  7 2016-01-01          3 Coal         1 North      -1.19        0.645
-#>  8 2016-01-02          3 Coal         1 North      -0.456       0.631
-#>  9 2016-01-03          3 Coal         1 North      -0.866      -0.488
-#> 10 2016-01-01          1 Gas          1 North       0.131      -0.913
+#>  1 2016-01-01          1 Coal         1 North         4.47       22.0
+#>  2 2016-01-02          1 Coal         1 North        -2.44       64.8
+#>  3 2016-01-03          1 Coal         1 North        26.9        62.4
+#>  4 2016-01-01          2 Coal         1 North        47.6       104. 
+#>  5 2016-01-02          2 Coal         1 North        50.4        86.8
+#>  6 2016-01-03          2 Coal         1 North        71.7        79.2
+#>  7 2016-01-01          3 Coal         1 North        77.6       138. 
+#>  8 2016-01-02          3 Coal         1 North        65.4       126. 
+#>  9 2016-01-03          3 Coal         1 North        73.6       202. 
+#> 10 2016-01-01          1 Gas          1 North       101.        204. 
 #> # … with 17 more rows
 
-# we want sum consumption & withdrawal by MarketDay & HourEnding
-# will produce 1 row per MarketDay-HourEnding combo
-# used 3 values for each above, so 9 rows expected
-input_tbl %>%
+# we want total consumption & withdrawal per MarketDay & HourEnding.
+# we'll do this as a separate tibble because it'll simplify the next step
+summary_tbl <- input_tbl %>%
   group_by(MarketDay, HourEnding) %>%
   summarize(
     hourly_total_consumption = sum(consumption),
     hourly_total_withdrawal = sum(withdrawal)
   ) %>%
-  ungroup()  # important so we aren't still grouped during later work
+  ungroup()
+
+summary_tbl
 #> # A tibble: 9 x 4
 #>   MarketDay  HourEnding hourly_total_consumption hourly_total_withdrawal
 #>   <date>          <int>                    <dbl>                   <dbl>
-#> 1 2016-01-01          1                   -0.270                   1.88 
-#> 2 2016-01-01          2                   -0.840                   2.15 
-#> 3 2016-01-01          3                   -2.79                    2.35 
-#> 4 2016-01-02          1                    0.343                  -0.866
-#> 5 2016-01-02          2                   -0.552                   0.288
-#> 6 2016-01-02          3                    0.793                   0.960
-#> 7 2016-01-03          1                    0.201                  -0.292
-#> 8 2016-01-03          2                   -0.753                  -1.77 
-#> 9 2016-01-03          3                   -0.567                  -0.401
+#> 1 2016-01-01          1                     292.                    584.
+#> 2 2016-01-01          2                     397.                    826.
+#> 3 2016-01-01          3                     489.                    971.
+#> 4 2016-01-02          1                     306.                    661.
+#> 5 2016-01-02          2                     418.                    877.
+#> 6 2016-01-02          3                     486.                    987.
+#> 7 2016-01-03          1                     387.                    705.
+#> 8 2016-01-03          2                     455.                    867.
+#> 9 2016-01-03          3                     515.                   1146.
 
-# notice everything but the group cols and summarize cols is _gone_
-# that's expected! the colums I didn't add to input_tbl would also go away,
-# hence why not including them didnt matter here
+# now we compute deltas by subtracting a lagged value from our value
+# this would be trickier to implement and reason about if our hourly_total_* columns
+# were already in our bigger input_tbl
+summary_tbl <- summary_tbl %>%
+  arrange(MarketDay, HourEnding) %>%  # to be extra sure our order is chronological
+  mutate(
+    delta_hourly_consumption = hourly_total_consumption - lag(hourly_total_consumption),
+    delta_hourly_withdrawal = hourly_total_withdrawal - lag(hourly_total_withdrawal)
+  )
 
-#### BONUS ####
+# notice the NA's at the top here!
+# there's nothing to subtract; the lag starts with NA, and x - NA = NA
+summary_tbl
+#> # A tibble: 9 x 6
+#>   MarketDay  HourEnding hourly_total_co… hourly_total_wi… delta_hourly_co…
+#>   <date>          <int>            <dbl>            <dbl>            <dbl>
+#> 1 2016-01-01          1             292.             584.             NA  
+#> 2 2016-01-01          2             397.             826.            105. 
+#> 3 2016-01-01          3             489.             971.             92.0
+#> 4 2016-01-02          1             306.             661.           -183. 
+#> 5 2016-01-02          2             418.             877.            112. 
+#> 6 2016-01-02          3             486.             987.             67.6
+#> 7 2016-01-03          1             387.             705.            -98.3
+#> 8 2016-01-03          2             455.             867.             67.1
+#> 9 2016-01-03          3             515.            1146.             60.4
+#> # … with 1 more variable: delta_hourly_withdrawal <dbl>
 
-# remember count()?
-input_tbl %>% count(MarketDay, HourEnding)
-#> # A tibble: 9 x 3
-#>   MarketDay  HourEnding     n
-#>   <date>          <int> <int>
-#> 1 2016-01-01          1     3
-#> 2 2016-01-01          2     3
-#> 3 2016-01-01          3     3
-#> 4 2016-01-02          1     3
-#> 5 2016-01-02          2     3
-#> 6 2016-01-02          3     3
-#> 7 2016-01-03          1     3
-#> 8 2016-01-03          2     3
-#> 9 2016-01-03          3     3
+# now we can join this back in to our original table
+tbl <- input_tbl %>%
+  left_join(summary_tbl, by = c("MarketDay", "HourEnding")) %>%
+  arrange(MarketDay, HourEnding)
 
-# thats really shorthand for a group_by and summarize!
-input_tbl %>%
-  group_by(MarketDay, HourEnding) %>%
-  summarize(n = n()) %>%  # n() is a special function for use in certain dplyr functions; numer of rows in group
-  ungroup()
-#> # A tibble: 9 x 3
-#>   MarketDay  HourEnding     n
-#>   <date>          <int> <int>
-#> 1 2016-01-01          1     3
-#> 2 2016-01-01          2     3
-#> 3 2016-01-01          3     3
-#> 4 2016-01-02          1     3
-#> 5 2016-01-02          2     3
-#> 6 2016-01-02          3     3
-#> 7 2016-01-03          1     3
-#> 8 2016-01-03          2     3
-#> 9 2016-01-03          3     3
+tbl
+#> # A tibble: 27 x 11
+#>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
+#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
+#>  1 2016-01-01          1 Coal         1 North         4.47       22.0
+#>  2 2016-01-01          1 Gas          1 North       101.        204. 
+#>  3 2016-01-01          1 Hydro        1 North       187.        358. 
+#>  4 2016-01-01          2 Coal         1 North        47.6       104. 
+#>  5 2016-01-01          2 Gas          1 North       126.        263. 
+#>  6 2016-01-01          2 Hydro        1 North       224.        460. 
+#>  7 2016-01-01          3 Coal         1 North        77.6       138. 
+#>  8 2016-01-01          3 Gas          1 North       169.        326. 
+#>  9 2016-01-01          3 Hydro        1 North       242.        508. 
+#> 10 2016-01-02          1 Coal         1 North        -2.44       64.8
+#> # … with 17 more rows, and 4 more variables:
+#> #   hourly_total_consumption <dbl>, hourly_total_withdrawal <dbl>,
+#> #   delta_hourly_consumption <dbl>, delta_hourly_withdrawal <dbl>
+
+# too many columns to show in this format, so we'll focus on comsumption-only
+tbl %>%
+  select(-contains("withdrawal"))
+#> # A tibble: 27 x 8
+#>    MarketDay  HourEnding FuelType Month Region consumption hourly_total_co…
+#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>            <dbl>
+#>  1 2016-01-01          1 Coal         1 North         4.47             292.
+#>  2 2016-01-01          1 Gas          1 North       101.               292.
+#>  3 2016-01-01          1 Hydro        1 North       187.               292.
+#>  4 2016-01-01          2 Coal         1 North        47.6              397.
+#>  5 2016-01-01          2 Gas          1 North       126.               397.
+#>  6 2016-01-01          2 Hydro        1 North       224.               397.
+#>  7 2016-01-01          3 Coal         1 North        77.6              489.
+#>  8 2016-01-01          3 Gas          1 North       169.               489.
+#>  9 2016-01-01          3 Hydro        1 North       242.               489.
+#> 10 2016-01-02          1 Coal         1 North        -2.44             306.
+#> # … with 17 more rows, and 1 more variable: delta_hourly_consumption <dbl>
 ```
 
-<sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
+<sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
diff --git a/group-and-summarize.md b/group-and-summarize.md
@@ -0,0 +1,99 @@
+``` r
+library(tidyverse)
+
+# fake some data
+input_tbl <- lst(
+  MarketDay = as.Date(c("2016-01-01", "2016-01-02", "2016-01-03")),
+  HourEnding = 1:3,
+  FuelType = c("Coal", "Gas", "Hydro"),
+  Month = 1,  # because only Jan dates
+  Region = "North"
+) %>%
+  cross_df() %>%
+  mutate(
+    MarketDay = as.Date(MarketDay, origin = "1970-01-01"),
+    # skipping GenMISObyFuel, consumption_factor, withdrawal_factor
+    consumption = rnorm(nrow(.)),  # random draws from normal dist; doesn't look like your data, but doesn't matter
+    withdrawal = rnorm(nrow(.))
+  )
+
+input_tbl
+#> # A tibble: 27 x 7
+#>    MarketDay  HourEnding FuelType Month Region consumption withdrawal
+#>    <date>          <int> <chr>    <dbl> <chr>        <dbl>      <dbl>
+#>  1 2016-01-01          1 Coal         1 North      -0.746       1.70 
+#>  2 2016-01-02          1 Coal         1 North      -0.670      -1.04 
+#>  3 2016-01-03          1 Coal         1 North      -1.07        0.634
+#>  4 2016-01-01          2 Coal         1 North      -0.375       0.534
+#>  5 2016-01-02          2 Coal         1 North      -0.0111      0.613
+#>  6 2016-01-03          2 Coal         1 North       0.537      -1.64 
+#>  7 2016-01-01          3 Coal         1 North      -1.19        0.645
+#>  8 2016-01-02          3 Coal         1 North      -0.456       0.631
+#>  9 2016-01-03          3 Coal         1 North      -0.866      -0.488
+#> 10 2016-01-01          1 Gas          1 North       0.131      -0.913
+#> # … with 17 more rows
+
+# we want sum consumption & withdrawal by MarketDay & HourEnding
+# will produce 1 row per MarketDay-HourEnding combo
+# used 3 values for each above, so 9 rows expected
+input_tbl %>%
+  group_by(MarketDay, HourEnding) %>%
+  summarize(
+    hourly_total_consumption = sum(consumption),
+    hourly_total_withdrawal = sum(withdrawal)
+  ) %>%
+  ungroup()  # important so we aren't still grouped during later work
+#> # A tibble: 9 x 4
+#>   MarketDay  HourEnding hourly_total_consumption hourly_total_withdrawal
+#>   <date>          <int>                    <dbl>                   <dbl>
+#> 1 2016-01-01          1                   -0.270                   1.88 
+#> 2 2016-01-01          2                   -0.840                   2.15 
+#> 3 2016-01-01          3                   -2.79                    2.35 
+#> 4 2016-01-02          1                    0.343                  -0.866
+#> 5 2016-01-02          2                   -0.552                   0.288
+#> 6 2016-01-02          3                    0.793                   0.960
+#> 7 2016-01-03          1                    0.201                  -0.292
+#> 8 2016-01-03          2                   -0.753                  -1.77 
+#> 9 2016-01-03          3                   -0.567                  -0.401
+
+# notice everything but the group cols and summarize cols is _gone_
+# that's expected! the colums I didn't add to input_tbl would also go away,
+# hence why not including them didnt matter here
+
+#### BONUS ####
+
+# remember count()?
+input_tbl %>% count(MarketDay, HourEnding)
+#> # A tibble: 9 x 3
+#>   MarketDay  HourEnding     n
+#>   <date>          <int> <int>
+#> 1 2016-01-01          1     3
+#> 2 2016-01-01          2     3
+#> 3 2016-01-01          3     3
+#> 4 2016-01-02          1     3
+#> 5 2016-01-02          2     3
+#> 6 2016-01-02          3     3
+#> 7 2016-01-03          1     3
+#> 8 2016-01-03          2     3
+#> 9 2016-01-03          3     3
+
+# thats really shorthand for a group_by and summarize!
+input_tbl %>%
+  group_by(MarketDay, HourEnding) %>%
+  summarize(n = n()) %>%  # n() is a special function for use in certain dplyr functions; numer of rows in group
+  ungroup()
+#> # A tibble: 9 x 3
+#>   MarketDay  HourEnding     n
+#>   <date>          <int> <int>
+#> 1 2016-01-01          1     3
+#> 2 2016-01-01          2     3
+#> 3 2016-01-01          3     3
+#> 4 2016-01-02          1     3
+#> 5 2016-01-02          2     3
+#> 6 2016-01-02          3     3
+#> 7 2016-01-03          1     3
+#> 8 2016-01-03          2     3
+#> 9 2016-01-03          3     3
+```
+
+<sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
diff --git a/impact-factor-example.R → impact-factor-example.md b/impact-factor-example.R → impact-factor-example.md
diff --git a/impact-factor-example.R b/impact-factor-example.R
@@ -1,3 +1,4 @@
+``` r
 library(tidyverse)  # all the good stuff
 library(lubridate)
 #> 
@@ -27,16 +28,16 @@ df2016north
 #> # A tibble: 20,886 x 6
 #>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel
 #>    <date>          <dbl> <chr>    <dbl> <chr>          <int>
-#>  1 2019-01-01          1 Coal         1 North             15
-#>  2 2019-01-02          1 Coal         1 North              4
-#>  3 2019-01-03          1 Coal         1 North             19
-#>  4 2019-01-04          1 Coal         1 North              6
-#>  5 2019-01-05          1 Coal         1 North              2
+#>  1 2019-01-01          1 Coal         1 North              7
+#>  2 2019-01-02          1 Coal         1 North              7
+#>  3 2019-01-03          1 Coal         1 North             18
+#>  4 2019-01-04          1 Coal         1 North              8
+#>  5 2019-01-05          1 Coal         1 North             11
 #>  6 2019-01-06          1 Coal         1 North             20
-#>  7 2019-01-07          1 Coal         1 North             14
-#>  8 2019-01-08          1 Coal         1 North             16
-#>  9 2019-01-09          1 Coal         1 North             13
-#> 10 2019-01-10          1 Coal         1 North             15
+#>  7 2019-01-07          1 Coal         1 North             18
+#>  8 2019-01-08          1 Coal         1 North              2
+#>  9 2019-01-09          1 Coal         1 North              5
+#> 10 2019-01-10          1 Coal         1 North              7
 #> # … with 20,876 more rows
 
 # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
@@ -55,12 +56,12 @@ impact_df
 #> # A tibble: 6 x 3
 #>   FuelType Month impact_factor
 #>   <chr>    <dbl>         <int>
-#> 1 Coal         1           100
-#> 2 Gas          1           142
-#> 3 Other        1           148
-#> 4 Coal         2           129
-#> 5 Gas          2            97
-#> 6 Other        2            43
+#> 1 Coal         1            88
+#> 2 Gas          1            22
+#> 3 Other        1           113
+#> 4 Coal         2           140
+#> 5 Gas          2           130
+#> 6 Other        2            67
 
 # now we can join to get impact_factor into our df2016north tbl
 # left join means we'll keep all the columns from the left one, df2016north,
@@ -72,15 +73,44 @@ df2016north
 #> # A tibble: 20,886 x 7
 #>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel impact_factor
 #>    <date>          <dbl> <chr>    <dbl> <chr>          <int>         <int>
-#>  1 2019-01-01          1 Coal         1 North             15           100
-#>  2 2019-01-02          1 Coal         1 North              4           100
-#>  3 2019-01-03          1 Coal         1 North             19           100
-#>  4 2019-01-04          1 Coal         1 North              6           100
-#>  5 2019-01-05          1 Coal         1 North              2           100
-#>  6 2019-01-06          1 Coal         1 North             20           100
-#>  7 2019-01-07          1 Coal         1 North             14           100
-#>  8 2019-01-08          1 Coal         1 North             16           100
-#>  9 2019-01-09          1 Coal         1 North             13           100
-#> 10 2019-01-10          1 Coal         1 North             15           100
+#>  1 2019-01-01          1 Coal         1 North              7            88
+#>  2 2019-01-02          1 Coal         1 North              7            88
+#>  3 2019-01-03          1 Coal         1 North             18            88
+#>  4 2019-01-04          1 Coal         1 North              8            88
+#>  5 2019-01-05          1 Coal         1 North             11            88
+#>  6 2019-01-06          1 Coal         1 North             20            88
+#>  7 2019-01-07          1 Coal         1 North             18            88
+#>  8 2019-01-08          1 Coal         1 North              2            88
+#>  9 2019-01-09          1 Coal         1 North              5            88
+#> 10 2019-01-10          1 Coal         1 North              7            88
 #> # … with 20,876 more rows
-Created on 2019-10-30 by the reprex package (v0.3.0)
+
+# that only showed one FuelType-Month combo
+# we can randomly sample to better understand what happened
+sample_n(df2016north, 20)
+#> # A tibble: 20 x 7
+#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel impact_factor
+#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>         <int>
+#>  1 2019-02-01          2 Coal         2 North              1           140
+#>  2 2019-01-18          2 Coal         2 North              6           140
+#>  3 2019-01-31          1 Gas          2 North              3           130
+#>  4 2019-02-11          2 Other        2 North              4            67
+#>  5 2019-01-07          1 Coal         2 North             13           140
+#>  6 2019-01-18          1 Other        2 North             12            67
+#>  7 2019-01-20          1 Other        2 North             17            67
+#>  8 2019-01-10          1 Other        1 North              1           113
+#>  9 2019-01-30          2 Other        2 North             14            67
+#> 10 2019-01-23          2 Other        1 North              2           113
+#> 11 2019-01-09          1 Gas          1 North             11            22
+#> 12 2019-01-04          2 Other        1 North             13           113
+#> 13 2019-01-02          2 Coal         1 North              4            88
+#> 14 2019-02-24          2 Gas          1 North              9            22
+#> 15 2019-01-18          1 Other        1 North              5           113
+#> 16 2019-02-20          2 Coal         1 North             14            88
+#> 17 2019-02-04          1 Other        2 North             15            67
+#> 18 2019-02-26          2 Coal         1 North              1            88
+#> 19 2019-02-09          1 Other        2 North             19            67
+#> 20 2019-01-21          2 Coal         1 North             19            88
+```
+
+<sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
diff --git a/impact-factor-example.R b/impact-factor-example.R
@@ -1,4 +1,3 @@
-``` r
 library(tidyverse)  # all the good stuff
 library(lubridate)
 #> 
@@ -28,16 +27,16 @@ df2016north
 #> # A tibble: 20,886 x 6
 #>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel
 #>    <date>          <dbl> <chr>    <dbl> <chr>          <int>
-#>  1 2019-01-01          1 Coal         1 North             10
-#>  2 2019-01-02          1 Coal         1 North              1
-#>  3 2019-01-03          1 Coal         1 North             13
-#>  4 2019-01-04          1 Coal         1 North             16
-#>  5 2019-01-05          1 Coal         1 North             11
-#>  6 2019-01-06          1 Coal         1 North             17
-#>  7 2019-01-07          1 Coal         1 North             20
-#>  8 2019-01-08          1 Coal         1 North              5
-#>  9 2019-01-09          1 Coal         1 North              4
-#> 10 2019-01-10          1 Coal         1 North             20
+#>  1 2019-01-01          1 Coal         1 North             15
+#>  2 2019-01-02          1 Coal         1 North              4
+#>  3 2019-01-03          1 Coal         1 North             19
+#>  4 2019-01-04          1 Coal         1 North              6
+#>  5 2019-01-05          1 Coal         1 North              2
+#>  6 2019-01-06          1 Coal         1 North             20
+#>  7 2019-01-07          1 Coal         1 North             14
+#>  8 2019-01-08          1 Coal         1 North             16
+#>  9 2019-01-09          1 Coal         1 North             13
+#> 10 2019-01-10          1 Coal         1 North             15
 #> # … with 20,876 more rows
 
 # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
@@ -56,35 +55,32 @@ impact_df
 #> # A tibble: 6 x 3
 #>   FuelType Month impact_factor
 #>   <chr>    <dbl>         <int>
-#> 1 Coal         1            17
-#> 2 Gas          1            18
-#> 3 Other        1            24
-#> 4 Coal         2           148
-#> 5 Gas          2            28
-#> 6 Other        2           107
+#> 1 Coal         1           100
+#> 2 Gas          1           142
+#> 3 Other        1           148
+#> 4 Coal         2           129
+#> 5 Gas          2            97
+#> 6 Other        2            43
 
 # now we can join to get impact_factor into our df2016north tbl
 # left join means we'll keep all the columns from the left one, df2016north,
 # even the rows with no match in impact_df (all rows have matched in this example)
-df2016north <- left_join(df2016north, impact_df, by = c("Fuel Type", "Month"))
-#> `by` can't contain join column `Fuel Type` which is missing from LHS
+df2016north <- left_join(df2016north, impact_df, by = c("FuelType", "Month"))
 
 # peek
 df2016north
-#> # A tibble: 20,886 x 6
-#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel
-#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>
-#>  1 2019-01-01          1 Coal         1 North             10
-#>  2 2019-01-02          1 Coal         1 North              1
-#>  3 2019-01-03          1 Coal         1 North             13
-#>  4 2019-01-04          1 Coal         1 North             16
-#>  5 2019-01-05          1 Coal         1 North             11
-#>  6 2019-01-06          1 Coal         1 North             17
-#>  7 2019-01-07          1 Coal         1 North             20
-#>  8 2019-01-08          1 Coal         1 North              5
-#>  9 2019-01-09          1 Coal         1 North              4
-#> 10 2019-01-10          1 Coal         1 North             20
+#> # A tibble: 20,886 x 7
+#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel impact_factor
+#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>         <int>
+#>  1 2019-01-01          1 Coal         1 North             15           100
+#>  2 2019-01-02          1 Coal         1 North              4           100
+#>  3 2019-01-03          1 Coal         1 North             19           100
+#>  4 2019-01-04          1 Coal         1 North              6           100
+#>  5 2019-01-05          1 Coal         1 North              2           100
+#>  6 2019-01-06          1 Coal         1 North             20           100
+#>  7 2019-01-07          1 Coal         1 North             14           100
+#>  8 2019-01-08          1 Coal         1 North             16           100
+#>  9 2019-01-09          1 Coal         1 North             13           100
+#> 10 2019-01-10          1 Coal         1 North             15           100
 #> # … with 20,876 more rows
-```
-
-<sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
+Created on 2019-10-30 by the reprex package (v0.3.0)
diff --git a/impact-factor-example.R b/impact-factor-example.R
@@ -1,3 +1,4 @@
+``` r
 library(tidyverse)  # all the good stuff
 library(lubridate)
 #> 
@@ -16,24 +17,27 @@ df2016north <- lst(
   Region = "North"
 ) %>%
   cross_df() %>%
-  mutate(GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE))
+  mutate(
+    MarketDay = as_date(MarketDay),  # this one got messed up by cross_df
+    GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE)
+  )
 
 # let's take a peek
 # should be one row for every possible combo of the stuff in the lst above
 df2016north
 #> # A tibble: 20,886 x 6
-#>    MarketDay HourEnding FuelType Month Region GenMISObyFuel
-#>        <dbl>      <dbl> <chr>    <dbl> <chr>          <int>
-#>  1     17897          1 Coal         1 North              3
-#>  2     17898          1 Coal         1 North             18
-#>  3     17899          1 Coal         1 North              2
-#>  4     17900          1 Coal         1 North             18
-#>  5     17901          1 Coal         1 North             20
-#>  6     17902          1 Coal         1 North             18
-#>  7     17903          1 Coal         1 North              5
-#>  8     17904          1 Coal         1 North             14
-#>  9     17905          1 Coal         1 North             19
-#> 10     17906          1 Coal         1 North              2
+#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel
+#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>
+#>  1 2019-01-01          1 Coal         1 North             10
+#>  2 2019-01-02          1 Coal         1 North              1
+#>  3 2019-01-03          1 Coal         1 North             13
+#>  4 2019-01-04          1 Coal         1 North             16
+#>  5 2019-01-05          1 Coal         1 North             11
+#>  6 2019-01-06          1 Coal         1 North             17
+#>  7 2019-01-07          1 Coal         1 North             20
+#>  8 2019-01-08          1 Coal         1 North              5
+#>  9 2019-01-09          1 Coal         1 North              4
+#> 10 2019-01-10          1 Coal         1 North             20
 #> # … with 20,876 more rows
 
 # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
@@ -52,33 +56,35 @@ impact_df
 #> # A tibble: 6 x 3
 #>   FuelType Month impact_factor
 #>   <chr>    <dbl>         <int>
-#> 1 Coal         1            22
-#> 2 Gas          1           118
-#> 3 Other        1           133
-#> 4 Coal         2           130
-#> 5 Gas          2           114
-#> 6 Other        2            90
+#> 1 Coal         1            17
+#> 2 Gas          1            18
+#> 3 Other        1            24
+#> 4 Coal         2           148
+#> 5 Gas          2            28
+#> 6 Other        2           107
 
 # now we can join to get impact_factor into our df2016north tbl
 # left join means we'll keep all the columns from the left one, df2016north,
 # even the rows with no match in impact_df (all rows have matched in this example)
-df2016north <- left_join(df2016north, impact_df)
-#> Joining, by = c("FuelType", "Month")
+df2016north <- left_join(df2016north, impact_df, by = c("Fuel Type", "Month"))
+#> `by` can't contain join column `Fuel Type` which is missing from LHS
 
 # peek
 df2016north
-#> # A tibble: 20,886 x 7
-#>    MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor
-#>        <dbl>      <dbl> <chr>    <dbl> <chr>          <int>         <int>
-#>  1     17897          1 Coal         1 North              3            22
-#>  2     17898          1 Coal         1 North             18            22
-#>  3     17899          1 Coal         1 North              2            22
-#>  4     17900          1 Coal         1 North             18            22
-#>  5     17901          1 Coal         1 North             20            22
-#>  6     17902          1 Coal         1 North             18            22
-#>  7     17903          1 Coal         1 North              5            22
-#>  8     17904          1 Coal         1 North             14            22
-#>  9     17905          1 Coal         1 North             19            22
-#> 10     17906          1 Coal         1 North              2            22
+#> # A tibble: 20,886 x 6
+#>    MarketDay  HourEnding FuelType Month Region GenMISObyFuel
+#>    <date>          <dbl> <chr>    <dbl> <chr>          <int>
+#>  1 2019-01-01          1 Coal         1 North             10
+#>  2 2019-01-02          1 Coal         1 North              1
+#>  3 2019-01-03          1 Coal         1 North             13
+#>  4 2019-01-04          1 Coal         1 North             16
+#>  5 2019-01-05          1 Coal         1 North             11
+#>  6 2019-01-06          1 Coal         1 North             17
+#>  7 2019-01-07          1 Coal         1 North             20
+#>  8 2019-01-08          1 Coal         1 North              5
+#>  9 2019-01-09          1 Coal         1 North              4
+#> 10 2019-01-10          1 Coal         1 North             20
 #> # … with 20,876 more rows
-Created on 2019-10-30 by the reprex package (v0.3.0)
+```
+
+<sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
diff --git a/impact-factor-example.R b/impact-factor-example.R
@@ -0,0 +1,84 @@
+library(tidyverse)  # all the good stuff
+library(lubridate)
+#> 
+#> Attaching package: 'lubridate'
+#> The following object is masked from 'package:base':
+#> 
+#>     date
+
+# I want to build something kinda like your actual data, but simpler
+# fewer levels of important cols, only a few extra columns, but same general idea
+df2016north <- lst(
+  MarketDay = seq.Date(from = as_date("2019-01-01"), to = as_date("2019-02-28"), by = "day"),
+  HourEnding = c(1, 2),
+  FuelType = c("Coal", "Gas", "Other"),
+  Month = month(MarketDay),
+  Region = "North"
+) %>%
+  cross_df() %>%
+  mutate(GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE))
+
+# let's take a peek
+# should be one row for every possible combo of the stuff in the lst above
+df2016north
+#> # A tibble: 20,886 x 6
+#>    MarketDay HourEnding FuelType Month Region GenMISObyFuel
+#>        <dbl>      <dbl> <chr>    <dbl> <chr>          <int>
+#>  1     17897          1 Coal         1 North              3
+#>  2     17898          1 Coal         1 North             18
+#>  3     17899          1 Coal         1 North              2
+#>  4     17900          1 Coal         1 North             18
+#>  5     17901          1 Coal         1 North             20
+#>  6     17902          1 Coal         1 North             18
+#>  7     17903          1 Coal         1 North              5
+#>  8     17904          1 Coal         1 North             14
+#>  9     17905          1 Coal         1 North             19
+#> 10     17906          1 Coal         1 North              2
+#> # … with 20,876 more rows
+
+# to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
+# it might look like this, after we've read it in and formatted it appropriately
+impact_df <- lst(
+  FuelType = unique(df2016north$FuelType),
+  Month = unique(df2016north$Month)
+) %>%
+  cross_df() %>%
+  mutate(
+    impact_factor = sample.int(150:800, nrow(.))
+  )
+
+# take a peek
+impact_df
+#> # A tibble: 6 x 3
+#>   FuelType Month impact_factor
+#>   <chr>    <dbl>         <int>
+#> 1 Coal         1            22
+#> 2 Gas          1           118
+#> 3 Other        1           133
+#> 4 Coal         2           130
+#> 5 Gas          2           114
+#> 6 Other        2            90
+
+# now we can join to get impact_factor into our df2016north tbl
+# left join means we'll keep all the columns from the left one, df2016north,
+# even the rows with no match in impact_df (all rows have matched in this example)
+df2016north <- left_join(df2016north, impact_df)
+#> Joining, by = c("FuelType", "Month")
+
+# peek
+df2016north
+#> # A tibble: 20,886 x 7
+#>    MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor
+#>        <dbl>      <dbl> <chr>    <dbl> <chr>          <int>         <int>
+#>  1     17897          1 Coal         1 North              3            22
+#>  2     17898          1 Coal         1 North             18            22
+#>  3     17899          1 Coal         1 North              2            22
+#>  4     17900          1 Coal         1 North             18            22
+#>  5     17901          1 Coal         1 North             20            22
+#>  6     17902          1 Coal         1 North             18            22
+#>  7     17903          1 Coal         1 North              5            22
+#>  8     17904          1 Coal         1 North             14            22
+#>  9     17905          1 Coal         1 North             19            22
+#> 10     17906          1 Coal         1 North              2            22
+#> # … with 20,876 more rows
+Created on 2019-10-30 by the reprex package (v0.3.0)