Skip to content

Instantly share code, notes, and snippets.

@ClaytonJY
Last active October 31, 2019 02:59
Show Gist options
  • Save ClaytonJY/ac6853d55cb61d9a2f6885e0679bf66f to your computer and use it in GitHub Desktop.
Save ClaytonJY/ac6853d55cb61d9a2f6885e0679bf66f to your computer and use it in GitHub Desktop.

Revisions

  1. ClaytonJY revised this gist Oct 31, 2019. 1 changed file with 53 additions and 53 deletions.
    106 changes: 53 additions & 53 deletions group-and-summarize.md
    Original file line number Diff line number Diff line change
    @@ -21,16 +21,16 @@ input_tbl
    #> # A tibble: 27 x 7
    #> MarketDay HourEnding FuelType Month Region consumption withdrawal
    #> <date> <int> <chr> <dbl> <chr> <dbl> <dbl>
    #> 1 2016-01-01 1 Coal 1 North 4.47 22.0
    #> 2 2016-01-02 1 Coal 1 North -2.44 64.8
    #> 3 2016-01-03 1 Coal 1 North 26.9 62.4
    #> 4 2016-01-01 2 Coal 1 North 47.6 104.
    #> 5 2016-01-02 2 Coal 1 North 50.4 86.8
    #> 6 2016-01-03 2 Coal 1 North 71.7 79.2
    #> 7 2016-01-01 3 Coal 1 North 77.6 138.
    #> 8 2016-01-02 3 Coal 1 North 65.4 126.
    #> 9 2016-01-03 3 Coal 1 North 73.6 202.
    #> 10 2016-01-01 1 Gas 1 North 101. 204.
    #> 1 2016-01-01 1 Coal 1 North -11.3 23.8
    #> 2 2016-01-02 1 Coal 1 North 9.01 16.9
    #> 3 2016-01-03 1 Coal 1 North 42.1 47.2
    #> 4 2016-01-01 2 Coal 1 North 33.0 94.1
    #> 5 2016-01-02 2 Coal 1 North 43.0 88.3
    #> 6 2016-01-03 2 Coal 1 North 55.4 137.
    #> 7 2016-01-01 3 Coal 1 North 68.6 165.
    #> 8 2016-01-02 3 Coal 1 North 82.0 153.
    #> 9 2016-01-03 3 Coal 1 North 77.2 158.
    #> 10 2016-01-01 1 Gas 1 North 84.7 171.
    #> # … with 17 more rows

    # we want total consumption & withdrawal per MarketDay & HourEnding.
    @@ -47,15 +47,15 @@ summary_tbl
    #> # A tibble: 9 x 4
    #> MarketDay HourEnding hourly_total_consumption hourly_total_withdrawal
    #> <date> <int> <dbl> <dbl>
    #> 1 2016-01-01 1 292. 584.
    #> 2 2016-01-01 2 397. 826.
    #> 3 2016-01-01 3 489. 971.
    #> 4 2016-01-02 1 306. 661.
    #> 5 2016-01-02 2 418. 877.
    #> 6 2016-01-02 3 486. 987.
    #> 7 2016-01-03 1 387. 705.
    #> 8 2016-01-03 2 455. 867.
    #> 9 2016-01-03 3 515. 1146.
    #> 1 2016-01-01 1 261. 549.
    #> 2 2016-01-01 2 388. 753.
    #> 3 2016-01-01 3 484. 941.
    #> 4 2016-01-02 1 319. 680.
    #> 5 2016-01-02 2 398. 877.
    #> 6 2016-01-02 3 507. 987.
    #> 7 2016-01-03 1 362. 696.
    #> 8 2016-01-03 2 464. 947.
    #> 9 2016-01-03 3 537. 1075.

    # now we compute deltas by subtracting a lagged value from our value
    # this would be trickier to implement and reason about if our hourly_total_* columns
    @@ -73,15 +73,15 @@ summary_tbl
    #> # A tibble: 9 x 6
    #> MarketDay HourEnding hourly_total_co… hourly_total_wi… delta_hourly_co…
    #> <date> <int> <dbl> <dbl> <dbl>
    #> 1 2016-01-01 1 292. 584. NA
    #> 2 2016-01-01 2 397. 826. 105.
    #> 3 2016-01-01 3 489. 971. 92.0
    #> 4 2016-01-02 1 306. 661. -183.
    #> 5 2016-01-02 2 418. 877. 112.
    #> 6 2016-01-02 3 486. 987. 67.6
    #> 7 2016-01-03 1 387. 705. -98.3
    #> 8 2016-01-03 2 455. 867. 67.1
    #> 9 2016-01-03 3 515. 1146. 60.4
    #> 1 2016-01-01 1 261. 549. NA
    #> 2 2016-01-01 2 388. 753. 127.
    #> 3 2016-01-01 3 484. 941. 96.4
    #> 4 2016-01-02 1 319. 680. -165.
    #> 5 2016-01-02 2 398. 877. 78.5
    #> 6 2016-01-02 3 507. 987. 109.
    #> 7 2016-01-03 1 362. 696. -144.
    #> 8 2016-01-03 2 464. 947. 102.
    #> 9 2016-01-03 3 537. 1075. 73.2
    #> # … with 1 more variable: delta_hourly_withdrawal <dbl>

    # now we can join this back in to our original table
    @@ -93,37 +93,37 @@ tbl
    #> # A tibble: 27 x 11
    #> MarketDay HourEnding FuelType Month Region consumption withdrawal
    #> <date> <int> <chr> <dbl> <chr> <dbl> <dbl>
    #> 1 2016-01-01 1 Coal 1 North 4.47 22.0
    #> 2 2016-01-01 1 Gas 1 North 101. 204.
    #> 3 2016-01-01 1 Hydro 1 North 187. 358.
    #> 4 2016-01-01 2 Coal 1 North 47.6 104.
    #> 5 2016-01-01 2 Gas 1 North 126. 263.
    #> 6 2016-01-01 2 Hydro 1 North 224. 460.
    #> 7 2016-01-01 3 Coal 1 North 77.6 138.
    #> 8 2016-01-01 3 Gas 1 North 169. 326.
    #> 9 2016-01-01 3 Hydro 1 North 242. 508.
    #> 10 2016-01-02 1 Coal 1 North -2.44 64.8
    #> 1 2016-01-01 1 Coal 1 North -11.3 23.8
    #> 2 2016-01-01 1 Gas 1 North 84.7 171.
    #> 3 2016-01-01 1 Hydro 1 North 188. 354.
    #> 4 2016-01-01 2 Coal 1 North 33.0 94.1
    #> 5 2016-01-01 2 Gas 1 North 122. 249.
    #> 6 2016-01-01 2 Hydro 1 North 233. 410.
    #> 7 2016-01-01 3 Coal 1 North 68.6 165.
    #> 8 2016-01-01 3 Gas 1 North 161. 296.
    #> 9 2016-01-01 3 Hydro 1 North 255. 481.
    #> 10 2016-01-02 1 Coal 1 North 9.01 16.9
    #> # … with 17 more rows, and 4 more variables:
    #> # hourly_total_consumption <dbl>, hourly_total_withdrawal <dbl>,
    #> # delta_hourly_consumption <dbl>, delta_hourly_withdrawal <dbl>

    # too many columns to show in this format, so we'll focus on comsumption-only
    tbl %>%
    select(-contains("withdrawal"))
    #> # A tibble: 27 x 8
    #> MarketDay HourEnding FuelType Month Region consumption hourly_total_co
    #> <date> <int> <chr> <dbl> <chr> <dbl> <dbl>
    #> 1 2016-01-01 1 Coal 1 North 4.47 292.
    #> 2 2016-01-01 1 Gas 1 North 101. 292.
    #> 3 2016-01-01 1 Hydro 1 North 187. 292.
    #> 4 2016-01-01 2 Coal 1 North 47.6 397.
    #> 5 2016-01-01 2 Gas 1 North 126. 397.
    #> 6 2016-01-01 2 Hydro 1 North 224. 397.
    #> 7 2016-01-01 3 Coal 1 North 77.6 489.
    #> 8 2016-01-01 3 Gas 1 North 169. 489.
    #> 9 2016-01-01 3 Hydro 1 North 242. 489.
    #> 10 2016-01-02 1 Coal 1 North -2.44 306.
    #> # … with 17 more rows, and 1 more variable: delta_hourly_consumption <dbl>
    select(-Month, -Region, -FuelType, -contains("withdrawal"))
    #> # A tibble: 27 x 5
    #> MarketDay HourEnding consumption hourly_total_consu… delta_hourly_cons
    #> <date> <int> <dbl> <dbl> <dbl>
    #> 1 2016-01-01 1 -11.3 261. NA
    #> 2 2016-01-01 1 84.7 261. NA
    #> 3 2016-01-01 1 188. 261. NA
    #> 4 2016-01-01 2 33.0 388. 127.
    #> 5 2016-01-01 2 122. 388. 127.
    #> 6 2016-01-01 2 233. 388. 127.
    #> 7 2016-01-01 3 68.6 484. 96.4
    #> 8 2016-01-01 3 161. 484. 96.4
    #> 9 2016-01-01 3 255. 484. 96.4
    #> 10 2016-01-02 1 9.01 319. -165.
    #> # … with 17 more rows
    ```

    <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
  2. ClaytonJY revised this gist Oct 31, 2019. 1 changed file with 92 additions and 62 deletions.
    154 changes: 92 additions & 62 deletions group-and-summarize.md
    Original file line number Diff line number Diff line change
    @@ -13,87 +13,117 @@ input_tbl <- lst(
    mutate(
    MarketDay = as.Date(MarketDay, origin = "1970-01-01"),
    # skipping GenMISObyFuel, consumption_factor, withdrawal_factor
    consumption = rnorm(nrow(.)), # random draws from normal dist; doesn't look like your data, but doesn't matter
    withdrawal = rnorm(nrow(.))
    consumption = 10 * (1:n() + rnorm(n())), # changed this up to make later steps more clear
    withdrawal = 20 * (1:n() + rnorm(n()))
    )

    input_tbl
    #> # A tibble: 27 x 7
    #> MarketDay HourEnding FuelType Month Region consumption withdrawal
    #> <date> <int> <chr> <dbl> <chr> <dbl> <dbl>
    #> 1 2016-01-01 1 Coal 1 North -0.746 1.70
    #> 2 2016-01-02 1 Coal 1 North -0.670 -1.04
    #> 3 2016-01-03 1 Coal 1 North -1.07 0.634
    #> 4 2016-01-01 2 Coal 1 North -0.375 0.534
    #> 5 2016-01-02 2 Coal 1 North -0.0111 0.613
    #> 6 2016-01-03 2 Coal 1 North 0.537 -1.64
    #> 7 2016-01-01 3 Coal 1 North -1.19 0.645
    #> 8 2016-01-02 3 Coal 1 North -0.456 0.631
    #> 9 2016-01-03 3 Coal 1 North -0.866 -0.488
    #> 10 2016-01-01 1 Gas 1 North 0.131 -0.913
    #> 1 2016-01-01 1 Coal 1 North 4.47 22.0
    #> 2 2016-01-02 1 Coal 1 North -2.44 64.8
    #> 3 2016-01-03 1 Coal 1 North 26.9 62.4
    #> 4 2016-01-01 2 Coal 1 North 47.6 104.
    #> 5 2016-01-02 2 Coal 1 North 50.4 86.8
    #> 6 2016-01-03 2 Coal 1 North 71.7 79.2
    #> 7 2016-01-01 3 Coal 1 North 77.6 138.
    #> 8 2016-01-02 3 Coal 1 North 65.4 126.
    #> 9 2016-01-03 3 Coal 1 North 73.6 202.
    #> 10 2016-01-01 1 Gas 1 North 101. 204.
    #> # … with 17 more rows

    # we want sum consumption & withdrawal by MarketDay & HourEnding
    # will produce 1 row per MarketDay-HourEnding combo
    # used 3 values for each above, so 9 rows expected
    input_tbl %>%
    # we want total consumption & withdrawal per MarketDay & HourEnding.
    # we'll do this as a separate tibble because it'll simplify the next step
    summary_tbl <- input_tbl %>%
    group_by(MarketDay, HourEnding) %>%
    summarize(
    hourly_total_consumption = sum(consumption),
    hourly_total_withdrawal = sum(withdrawal)
    ) %>%
    ungroup() # important so we aren't still grouped during later work
    ungroup()

    summary_tbl
    #> # A tibble: 9 x 4
    #> MarketDay HourEnding hourly_total_consumption hourly_total_withdrawal
    #> <date> <int> <dbl> <dbl>
    #> 1 2016-01-01 1 -0.270 1.88
    #> 2 2016-01-01 2 -0.840 2.15
    #> 3 2016-01-01 3 -2.79 2.35
    #> 4 2016-01-02 1 0.343 -0.866
    #> 5 2016-01-02 2 -0.552 0.288
    #> 6 2016-01-02 3 0.793 0.960
    #> 7 2016-01-03 1 0.201 -0.292
    #> 8 2016-01-03 2 -0.753 -1.77
    #> 9 2016-01-03 3 -0.567 -0.401
    #> 1 2016-01-01 1 292. 584.
    #> 2 2016-01-01 2 397. 826.
    #> 3 2016-01-01 3 489. 971.
    #> 4 2016-01-02 1 306. 661.
    #> 5 2016-01-02 2 418. 877.
    #> 6 2016-01-02 3 486. 987.
    #> 7 2016-01-03 1 387. 705.
    #> 8 2016-01-03 2 455. 867.
    #> 9 2016-01-03 3 515. 1146.

    # notice everything but the group cols and summarize cols is _gone_
    # that's expected! the colums I didn't add to input_tbl would also go away,
    # hence why not including them didnt matter here
    # now we compute deltas by subtracting a lagged value from our value
    # this would be trickier to implement and reason about if our hourly_total_* columns
    # were already in our bigger input_tbl
    summary_tbl <- summary_tbl %>%
    arrange(MarketDay, HourEnding) %>% # to be extra sure our order is chronological
    mutate(
    delta_hourly_consumption = hourly_total_consumption - lag(hourly_total_consumption),
    delta_hourly_withdrawal = hourly_total_withdrawal - lag(hourly_total_withdrawal)
    )

    #### BONUS ####
    # notice the NA's at the top here!
    # there's nothing to subtract; the lag starts with NA, and x - NA = NA
    summary_tbl
    #> # A tibble: 9 x 6
    #> MarketDay HourEnding hourly_total_co… hourly_total_wi… delta_hourly_co…
    #> <date> <int> <dbl> <dbl> <dbl>
    #> 1 2016-01-01 1 292. 584. NA
    #> 2 2016-01-01 2 397. 826. 105.
    #> 3 2016-01-01 3 489. 971. 92.0
    #> 4 2016-01-02 1 306. 661. -183.
    #> 5 2016-01-02 2 418. 877. 112.
    #> 6 2016-01-02 3 486. 987. 67.6
    #> 7 2016-01-03 1 387. 705. -98.3
    #> 8 2016-01-03 2 455. 867. 67.1
    #> 9 2016-01-03 3 515. 1146. 60.4
    #> # … with 1 more variable: delta_hourly_withdrawal <dbl>

    # remember count()?
    input_tbl %>% count(MarketDay, HourEnding)
    #> # A tibble: 9 x 3
    #> MarketDay HourEnding n
    #> <date> <int> <int>
    #> 1 2016-01-01 1 3
    #> 2 2016-01-01 2 3
    #> 3 2016-01-01 3 3
    #> 4 2016-01-02 1 3
    #> 5 2016-01-02 2 3
    #> 6 2016-01-02 3 3
    #> 7 2016-01-03 1 3
    #> 8 2016-01-03 2 3
    #> 9 2016-01-03 3 3
    # now we can join this back in to our original table
    tbl <- input_tbl %>%
    left_join(summary_tbl, by = c("MarketDay", "HourEnding")) %>%
    arrange(MarketDay, HourEnding)

    # thats really shorthand for a group_by and summarize!
    input_tbl %>%
    group_by(MarketDay, HourEnding) %>%
    summarize(n = n()) %>% # n() is a special function for use in certain dplyr functions; numer of rows in group
    ungroup()
    #> # A tibble: 9 x 3
    #> MarketDay HourEnding n
    #> <date> <int> <int>
    #> 1 2016-01-01 1 3
    #> 2 2016-01-01 2 3
    #> 3 2016-01-01 3 3
    #> 4 2016-01-02 1 3
    #> 5 2016-01-02 2 3
    #> 6 2016-01-02 3 3
    #> 7 2016-01-03 1 3
    #> 8 2016-01-03 2 3
    #> 9 2016-01-03 3 3
    tbl
    #> # A tibble: 27 x 11
    #> MarketDay HourEnding FuelType Month Region consumption withdrawal
    #> <date> <int> <chr> <dbl> <chr> <dbl> <dbl>
    #> 1 2016-01-01 1 Coal 1 North 4.47 22.0
    #> 2 2016-01-01 1 Gas 1 North 101. 204.
    #> 3 2016-01-01 1 Hydro 1 North 187. 358.
    #> 4 2016-01-01 2 Coal 1 North 47.6 104.
    #> 5 2016-01-01 2 Gas 1 North 126. 263.
    #> 6 2016-01-01 2 Hydro 1 North 224. 460.
    #> 7 2016-01-01 3 Coal 1 North 77.6 138.
    #> 8 2016-01-01 3 Gas 1 North 169. 326.
    #> 9 2016-01-01 3 Hydro 1 North 242. 508.
    #> 10 2016-01-02 1 Coal 1 North -2.44 64.8
    #> # … with 17 more rows, and 4 more variables:
    #> # hourly_total_consumption <dbl>, hourly_total_withdrawal <dbl>,
    #> # delta_hourly_consumption <dbl>, delta_hourly_withdrawal <dbl>

    # too many columns to show in this format, so we'll focus on comsumption-only
    tbl %>%
    select(-contains("withdrawal"))
    #> # A tibble: 27 x 8
    #> MarketDay HourEnding FuelType Month Region consumption hourly_total_co…
    #> <date> <int> <chr> <dbl> <chr> <dbl> <dbl>
    #> 1 2016-01-01 1 Coal 1 North 4.47 292.
    #> 2 2016-01-01 1 Gas 1 North 101. 292.
    #> 3 2016-01-01 1 Hydro 1 North 187. 292.
    #> 4 2016-01-01 2 Coal 1 North 47.6 397.
    #> 5 2016-01-01 2 Gas 1 North 126. 397.
    #> 6 2016-01-01 2 Hydro 1 North 224. 397.
    #> 7 2016-01-01 3 Coal 1 North 77.6 489.
    #> 8 2016-01-01 3 Gas 1 North 169. 489.
    #> 9 2016-01-01 3 Hydro 1 North 242. 489.
    #> 10 2016-01-02 1 Coal 1 North -2.44 306.
    #> # … with 17 more rows, and 1 more variable: delta_hourly_consumption <dbl>
    ```

    <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
    <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
  3. ClaytonJY revised this gist Oct 30, 2019. 1 changed file with 99 additions and 0 deletions.
    99 changes: 99 additions & 0 deletions group-and-summarize.md
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,99 @@
    ``` r
    library(tidyverse)

    # fake some data
    input_tbl <- lst(
    MarketDay = as.Date(c("2016-01-01", "2016-01-02", "2016-01-03")),
    HourEnding = 1:3,
    FuelType = c("Coal", "Gas", "Hydro"),
    Month = 1, # because only Jan dates
    Region = "North"
    ) %>%
    cross_df() %>%
    mutate(
    MarketDay = as.Date(MarketDay, origin = "1970-01-01"),
    # skipping GenMISObyFuel, consumption_factor, withdrawal_factor
    consumption = rnorm(nrow(.)), # random draws from normal dist; doesn't look like your data, but doesn't matter
    withdrawal = rnorm(nrow(.))
    )

    input_tbl
    #> # A tibble: 27 x 7
    #> MarketDay HourEnding FuelType Month Region consumption withdrawal
    #> <date> <int> <chr> <dbl> <chr> <dbl> <dbl>
    #> 1 2016-01-01 1 Coal 1 North -0.746 1.70
    #> 2 2016-01-02 1 Coal 1 North -0.670 -1.04
    #> 3 2016-01-03 1 Coal 1 North -1.07 0.634
    #> 4 2016-01-01 2 Coal 1 North -0.375 0.534
    #> 5 2016-01-02 2 Coal 1 North -0.0111 0.613
    #> 6 2016-01-03 2 Coal 1 North 0.537 -1.64
    #> 7 2016-01-01 3 Coal 1 North -1.19 0.645
    #> 8 2016-01-02 3 Coal 1 North -0.456 0.631
    #> 9 2016-01-03 3 Coal 1 North -0.866 -0.488
    #> 10 2016-01-01 1 Gas 1 North 0.131 -0.913
    #> # … with 17 more rows

    # we want sum consumption & withdrawal by MarketDay & HourEnding
    # will produce 1 row per MarketDay-HourEnding combo
    # used 3 values for each above, so 9 rows expected
    input_tbl %>%
    group_by(MarketDay, HourEnding) %>%
    summarize(
    hourly_total_consumption = sum(consumption),
    hourly_total_withdrawal = sum(withdrawal)
    ) %>%
    ungroup() # important so we aren't still grouped during later work
    #> # A tibble: 9 x 4
    #> MarketDay HourEnding hourly_total_consumption hourly_total_withdrawal
    #> <date> <int> <dbl> <dbl>
    #> 1 2016-01-01 1 -0.270 1.88
    #> 2 2016-01-01 2 -0.840 2.15
    #> 3 2016-01-01 3 -2.79 2.35
    #> 4 2016-01-02 1 0.343 -0.866
    #> 5 2016-01-02 2 -0.552 0.288
    #> 6 2016-01-02 3 0.793 0.960
    #> 7 2016-01-03 1 0.201 -0.292
    #> 8 2016-01-03 2 -0.753 -1.77
    #> 9 2016-01-03 3 -0.567 -0.401

    # notice everything but the group cols and summarize cols is _gone_
    # that's expected! the colums I didn't add to input_tbl would also go away,
    # hence why not including them didnt matter here

    #### BONUS ####

    # remember count()?
    input_tbl %>% count(MarketDay, HourEnding)
    #> # A tibble: 9 x 3
    #> MarketDay HourEnding n
    #> <date> <int> <int>
    #> 1 2016-01-01 1 3
    #> 2 2016-01-01 2 3
    #> 3 2016-01-01 3 3
    #> 4 2016-01-02 1 3
    #> 5 2016-01-02 2 3
    #> 6 2016-01-02 3 3
    #> 7 2016-01-03 1 3
    #> 8 2016-01-03 2 3
    #> 9 2016-01-03 3 3

    # thats really shorthand for a group_by and summarize!
    input_tbl %>%
    group_by(MarketDay, HourEnding) %>%
    summarize(n = n()) %>% # n() is a special function for use in certain dplyr functions; numer of rows in group
    ungroup()
    #> # A tibble: 9 x 3
    #> MarketDay HourEnding n
    #> <date> <int> <int>
    #> 1 2016-01-01 1 3
    #> 2 2016-01-01 2 3
    #> 3 2016-01-01 3 3
    #> 4 2016-01-02 1 3
    #> 5 2016-01-02 2 3
    #> 6 2016-01-02 3 3
    #> 7 2016-01-03 1 3
    #> 8 2016-01-03 2 3
    #> 9 2016-01-03 3 3
    ```

    <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
  4. ClaytonJY renamed this gist Oct 30, 2019. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  5. ClaytonJY revised this gist Oct 30, 2019. 1 changed file with 56 additions and 26 deletions.
    82 changes: 56 additions & 26 deletions impact-factor-example.R
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,4 @@
    ``` r
    library(tidyverse) # all the good stuff
    library(lubridate)
    #>
    @@ -27,16 +28,16 @@ df2016north
    #> # A tibble: 20,886 x 6
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel
    #> <date> <dbl> <chr> <dbl> <chr> <int>
    #> 1 2019-01-01 1 Coal 1 North 15
    #> 2 2019-01-02 1 Coal 1 North 4
    #> 3 2019-01-03 1 Coal 1 North 19
    #> 4 2019-01-04 1 Coal 1 North 6
    #> 5 2019-01-05 1 Coal 1 North 2
    #> 1 2019-01-01 1 Coal 1 North 7
    #> 2 2019-01-02 1 Coal 1 North 7
    #> 3 2019-01-03 1 Coal 1 North 18
    #> 4 2019-01-04 1 Coal 1 North 8
    #> 5 2019-01-05 1 Coal 1 North 11
    #> 6 2019-01-06 1 Coal 1 North 20
    #> 7 2019-01-07 1 Coal 1 North 14
    #> 8 2019-01-08 1 Coal 1 North 16
    #> 9 2019-01-09 1 Coal 1 North 13
    #> 10 2019-01-10 1 Coal 1 North 15
    #> 7 2019-01-07 1 Coal 1 North 18
    #> 8 2019-01-08 1 Coal 1 North 2
    #> 9 2019-01-09 1 Coal 1 North 5
    #> 10 2019-01-10 1 Coal 1 North 7
    #> # … with 20,876 more rows

    # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
    @@ -55,12 +56,12 @@ impact_df
    #> # A tibble: 6 x 3
    #> FuelType Month impact_factor
    #> <chr> <dbl> <int>
    #> 1 Coal 1 100
    #> 2 Gas 1 142
    #> 3 Other 1 148
    #> 4 Coal 2 129
    #> 5 Gas 2 97
    #> 6 Other 2 43
    #> 1 Coal 1 88
    #> 2 Gas 1 22
    #> 3 Other 1 113
    #> 4 Coal 2 140
    #> 5 Gas 2 130
    #> 6 Other 2 67

    # now we can join to get impact_factor into our df2016north tbl
    # left join means we'll keep all the columns from the left one, df2016north,
    @@ -72,15 +73,44 @@ df2016north
    #> # A tibble: 20,886 x 7
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor
    #> <date> <dbl> <chr> <dbl> <chr> <int> <int>
    #> 1 2019-01-01 1 Coal 1 North 15 100
    #> 2 2019-01-02 1 Coal 1 North 4 100
    #> 3 2019-01-03 1 Coal 1 North 19 100
    #> 4 2019-01-04 1 Coal 1 North 6 100
    #> 5 2019-01-05 1 Coal 1 North 2 100
    #> 6 2019-01-06 1 Coal 1 North 20 100
    #> 7 2019-01-07 1 Coal 1 North 14 100
    #> 8 2019-01-08 1 Coal 1 North 16 100
    #> 9 2019-01-09 1 Coal 1 North 13 100
    #> 10 2019-01-10 1 Coal 1 North 15 100
    #> 1 2019-01-01 1 Coal 1 North 7 88
    #> 2 2019-01-02 1 Coal 1 North 7 88
    #> 3 2019-01-03 1 Coal 1 North 18 88
    #> 4 2019-01-04 1 Coal 1 North 8 88
    #> 5 2019-01-05 1 Coal 1 North 11 88
    #> 6 2019-01-06 1 Coal 1 North 20 88
    #> 7 2019-01-07 1 Coal 1 North 18 88
    #> 8 2019-01-08 1 Coal 1 North 2 88
    #> 9 2019-01-09 1 Coal 1 North 5 88
    #> 10 2019-01-10 1 Coal 1 North 7 88
    #> # … with 20,876 more rows
    Created on 2019-10-30 by the reprex package (v0.3.0)

    # that only showed one FuelType-Month combo
    # we can randomly sample to better understand what happened
    sample_n(df2016north, 20)
    #> # A tibble: 20 x 7
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor
    #> <date> <dbl> <chr> <dbl> <chr> <int> <int>
    #> 1 2019-02-01 2 Coal 2 North 1 140
    #> 2 2019-01-18 2 Coal 2 North 6 140
    #> 3 2019-01-31 1 Gas 2 North 3 130
    #> 4 2019-02-11 2 Other 2 North 4 67
    #> 5 2019-01-07 1 Coal 2 North 13 140
    #> 6 2019-01-18 1 Other 2 North 12 67
    #> 7 2019-01-20 1 Other 2 North 17 67
    #> 8 2019-01-10 1 Other 1 North 1 113
    #> 9 2019-01-30 2 Other 2 North 14 67
    #> 10 2019-01-23 2 Other 1 North 2 113
    #> 11 2019-01-09 1 Gas 1 North 11 22
    #> 12 2019-01-04 2 Other 1 North 13 113
    #> 13 2019-01-02 2 Coal 1 North 4 88
    #> 14 2019-02-24 2 Gas 1 North 9 22
    #> 15 2019-01-18 1 Other 1 North 5 113
    #> 16 2019-02-20 2 Coal 1 North 14 88
    #> 17 2019-02-04 1 Other 2 North 15 67
    #> 18 2019-02-26 2 Coal 1 North 1 88
    #> 19 2019-02-09 1 Other 2 North 19 67
    #> 20 2019-01-21 2 Coal 1 North 19 88
    ```

    <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
  6. ClaytonJY revised this gist Oct 30, 2019. 1 changed file with 31 additions and 35 deletions.
    66 changes: 31 additions & 35 deletions impact-factor-example.R
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,3 @@
    ``` r
    library(tidyverse) # all the good stuff
    library(lubridate)
    #>
    @@ -28,16 +27,16 @@ df2016north
    #> # A tibble: 20,886 x 6
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel
    #> <date> <dbl> <chr> <dbl> <chr> <int>
    #> 1 2019-01-01 1 Coal 1 North 10
    #> 2 2019-01-02 1 Coal 1 North 1
    #> 3 2019-01-03 1 Coal 1 North 13
    #> 4 2019-01-04 1 Coal 1 North 16
    #> 5 2019-01-05 1 Coal 1 North 11
    #> 6 2019-01-06 1 Coal 1 North 17
    #> 7 2019-01-07 1 Coal 1 North 20
    #> 8 2019-01-08 1 Coal 1 North 5
    #> 9 2019-01-09 1 Coal 1 North 4
    #> 10 2019-01-10 1 Coal 1 North 20
    #> 1 2019-01-01 1 Coal 1 North 15
    #> 2 2019-01-02 1 Coal 1 North 4
    #> 3 2019-01-03 1 Coal 1 North 19
    #> 4 2019-01-04 1 Coal 1 North 6
    #> 5 2019-01-05 1 Coal 1 North 2
    #> 6 2019-01-06 1 Coal 1 North 20
    #> 7 2019-01-07 1 Coal 1 North 14
    #> 8 2019-01-08 1 Coal 1 North 16
    #> 9 2019-01-09 1 Coal 1 North 13
    #> 10 2019-01-10 1 Coal 1 North 15
    #> # … with 20,876 more rows

    # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
    @@ -56,35 +55,32 @@ impact_df
    #> # A tibble: 6 x 3
    #> FuelType Month impact_factor
    #> <chr> <dbl> <int>
    #> 1 Coal 1 17
    #> 2 Gas 1 18
    #> 3 Other 1 24
    #> 4 Coal 2 148
    #> 5 Gas 2 28
    #> 6 Other 2 107
    #> 1 Coal 1 100
    #> 2 Gas 1 142
    #> 3 Other 1 148
    #> 4 Coal 2 129
    #> 5 Gas 2 97
    #> 6 Other 2 43

    # now we can join to get impact_factor into our df2016north tbl
    # left join means we'll keep all the columns from the left one, df2016north,
    # even the rows with no match in impact_df (all rows have matched in this example)
    df2016north <- left_join(df2016north, impact_df, by = c("Fuel Type", "Month"))
    #> `by` can't contain join column `Fuel Type` which is missing from LHS
    df2016north <- left_join(df2016north, impact_df, by = c("FuelType", "Month"))

    # peek
    df2016north
    #> # A tibble: 20,886 x 6
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel
    #> <date> <dbl> <chr> <dbl> <chr> <int>
    #> 1 2019-01-01 1 Coal 1 North 10
    #> 2 2019-01-02 1 Coal 1 North 1
    #> 3 2019-01-03 1 Coal 1 North 13
    #> 4 2019-01-04 1 Coal 1 North 16
    #> 5 2019-01-05 1 Coal 1 North 11
    #> 6 2019-01-06 1 Coal 1 North 17
    #> 7 2019-01-07 1 Coal 1 North 20
    #> 8 2019-01-08 1 Coal 1 North 5
    #> 9 2019-01-09 1 Coal 1 North 4
    #> 10 2019-01-10 1 Coal 1 North 20
    #> # A tibble: 20,886 x 7
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor
    #> <date> <dbl> <chr> <dbl> <chr> <int> <int>
    #> 1 2019-01-01 1 Coal 1 North 15 100
    #> 2 2019-01-02 1 Coal 1 North 4 100
    #> 3 2019-01-03 1 Coal 1 North 19 100
    #> 4 2019-01-04 1 Coal 1 North 6 100
    #> 5 2019-01-05 1 Coal 1 North 2 100
    #> 6 2019-01-06 1 Coal 1 North 20 100
    #> 7 2019-01-07 1 Coal 1 North 14 100
    #> 8 2019-01-08 1 Coal 1 North 16 100
    #> 9 2019-01-09 1 Coal 1 North 13 100
    #> 10 2019-01-10 1 Coal 1 North 15 100
    #> # … with 20,876 more rows
    ```

    <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
    Created on 2019-10-30 by the reprex package (v0.3.0)
  7. ClaytonJY revised this gist Oct 30, 2019. No changes.
  8. ClaytonJY revised this gist Oct 30, 2019. 1 changed file with 41 additions and 35 deletions.
    76 changes: 41 additions & 35 deletions impact-factor-example.R
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,4 @@
    ``` r
    library(tidyverse) # all the good stuff
    library(lubridate)
    #>
    @@ -16,24 +17,27 @@ df2016north <- lst(
    Region = "North"
    ) %>%
    cross_df() %>%
    mutate(GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE))
    mutate(
    MarketDay = as_date(MarketDay), # this one got messed up by cross_df
    GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE)
    )

    # let's take a peek
    # should be one row for every possible combo of the stuff in the lst above
    df2016north
    #> # A tibble: 20,886 x 6
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel
    #> <dbl> <dbl> <chr> <dbl> <chr> <int>
    #> 1 17897 1 Coal 1 North 3
    #> 2 17898 1 Coal 1 North 18
    #> 3 17899 1 Coal 1 North 2
    #> 4 17900 1 Coal 1 North 18
    #> 5 17901 1 Coal 1 North 20
    #> 6 17902 1 Coal 1 North 18
    #> 7 17903 1 Coal 1 North 5
    #> 8 17904 1 Coal 1 North 14
    #> 9 17905 1 Coal 1 North 19
    #> 10 17906 1 Coal 1 North 2
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel
    #> <date> <dbl> <chr> <dbl> <chr> <int>
    #> 1 2019-01-01 1 Coal 1 North 10
    #> 2 2019-01-02 1 Coal 1 North 1
    #> 3 2019-01-03 1 Coal 1 North 13
    #> 4 2019-01-04 1 Coal 1 North 16
    #> 5 2019-01-05 1 Coal 1 North 11
    #> 6 2019-01-06 1 Coal 1 North 17
    #> 7 2019-01-07 1 Coal 1 North 20
    #> 8 2019-01-08 1 Coal 1 North 5
    #> 9 2019-01-09 1 Coal 1 North 4
    #> 10 2019-01-10 1 Coal 1 North 20
    #> # … with 20,876 more rows

    # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
    @@ -52,33 +56,35 @@ impact_df
    #> # A tibble: 6 x 3
    #> FuelType Month impact_factor
    #> <chr> <dbl> <int>
    #> 1 Coal 1 22
    #> 2 Gas 1 118
    #> 3 Other 1 133
    #> 4 Coal 2 130
    #> 5 Gas 2 114
    #> 6 Other 2 90
    #> 1 Coal 1 17
    #> 2 Gas 1 18
    #> 3 Other 1 24
    #> 4 Coal 2 148
    #> 5 Gas 2 28
    #> 6 Other 2 107

    # now we can join to get impact_factor into our df2016north tbl
    # left join means we'll keep all the columns from the left one, df2016north,
    # even the rows with no match in impact_df (all rows have matched in this example)
    df2016north <- left_join(df2016north, impact_df)
    #> Joining, by = c("FuelType", "Month")
    df2016north <- left_join(df2016north, impact_df, by = c("Fuel Type", "Month"))
    #> `by` can't contain join column `Fuel Type` which is missing from LHS

    # peek
    df2016north
    #> # A tibble: 20,886 x 7
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor
    #> <dbl> <dbl> <chr> <dbl> <chr> <int> <int>
    #> 1 17897 1 Coal 1 North 3 22
    #> 2 17898 1 Coal 1 North 18 22
    #> 3 17899 1 Coal 1 North 2 22
    #> 4 17900 1 Coal 1 North 18 22
    #> 5 17901 1 Coal 1 North 20 22
    #> 6 17902 1 Coal 1 North 18 22
    #> 7 17903 1 Coal 1 North 5 22
    #> 8 17904 1 Coal 1 North 14 22
    #> 9 17905 1 Coal 1 North 19 22
    #> 10 17906 1 Coal 1 North 2 22
    #> # A tibble: 20,886 x 6
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel
    #> <date> <dbl> <chr> <dbl> <chr> <int>
    #> 1 2019-01-01 1 Coal 1 North 10
    #> 2 2019-01-02 1 Coal 1 North 1
    #> 3 2019-01-03 1 Coal 1 North 13
    #> 4 2019-01-04 1 Coal 1 North 16
    #> 5 2019-01-05 1 Coal 1 North 11
    #> 6 2019-01-06 1 Coal 1 North 17
    #> 7 2019-01-07 1 Coal 1 North 20
    #> 8 2019-01-08 1 Coal 1 North 5
    #> 9 2019-01-09 1 Coal 1 North 4
    #> 10 2019-01-10 1 Coal 1 North 20
    #> # … with 20,876 more rows
    Created on 2019-10-30 by the reprex package (v0.3.0)
    ```

    <sup>Created on 2019-10-30 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
  9. ClaytonJY created this gist Oct 30, 2019.
    84 changes: 84 additions & 0 deletions impact-factor-example.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,84 @@
    library(tidyverse) # all the good stuff
    library(lubridate)
    #>
    #> Attaching package: 'lubridate'
    #> The following object is masked from 'package:base':
    #>
    #> date

    # I want to build something kinda like your actual data, but simpler
    # fewer levels of important cols, only a few extra columns, but same general idea
    df2016north <- lst(
    MarketDay = seq.Date(from = as_date("2019-01-01"), to = as_date("2019-02-28"), by = "day"),
    HourEnding = c(1, 2),
    FuelType = c("Coal", "Gas", "Other"),
    Month = month(MarketDay),
    Region = "North"
    ) %>%
    cross_df() %>%
    mutate(GenMISObyFuel = sample.int(20:7000, nrow(.), replace = TRUE))

    # let's take a peek
    # should be one row for every possible combo of the stuff in the lst above
    df2016north
    #> # A tibble: 20,886 x 6
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel
    #> <dbl> <dbl> <chr> <dbl> <chr> <int>
    #> 1 17897 1 Coal 1 North 3
    #> 2 17898 1 Coal 1 North 18
    #> 3 17899 1 Coal 1 North 2
    #> 4 17900 1 Coal 1 North 18
    #> 5 17901 1 Coal 1 North 20
    #> 6 17902 1 Coal 1 North 18
    #> 7 17903 1 Coal 1 North 5
    #> 8 17904 1 Coal 1 North 14
    #> 9 17905 1 Coal 1 North 19
    #> 10 17906 1 Coal 1 North 2
    #> # … with 20,876 more rows

    # to map (FuelType, Month) to impact_factor, we'd like a table with each as a column
    # it might look like this, after we've read it in and formatted it appropriately
    impact_df <- lst(
    FuelType = unique(df2016north$FuelType),
    Month = unique(df2016north$Month)
    ) %>%
    cross_df() %>%
    mutate(
    impact_factor = sample.int(150:800, nrow(.))
    )

    # take a peek
    impact_df
    #> # A tibble: 6 x 3
    #> FuelType Month impact_factor
    #> <chr> <dbl> <int>
    #> 1 Coal 1 22
    #> 2 Gas 1 118
    #> 3 Other 1 133
    #> 4 Coal 2 130
    #> 5 Gas 2 114
    #> 6 Other 2 90

    # now we can join to get impact_factor into our df2016north tbl
    # left join means we'll keep all the columns from the left one, df2016north,
    # even the rows with no match in impact_df (all rows have matched in this example)
    df2016north <- left_join(df2016north, impact_df)
    #> Joining, by = c("FuelType", "Month")

    # peek
    df2016north
    #> # A tibble: 20,886 x 7
    #> MarketDay HourEnding FuelType Month Region GenMISObyFuel impact_factor
    #> <dbl> <dbl> <chr> <dbl> <chr> <int> <int>
    #> 1 17897 1 Coal 1 North 3 22
    #> 2 17898 1 Coal 1 North 18 22
    #> 3 17899 1 Coal 1 North 2 22
    #> 4 17900 1 Coal 1 North 18 22
    #> 5 17901 1 Coal 1 North 20 22
    #> 6 17902 1 Coal 1 North 18 22
    #> 7 17903 1 Coal 1 North 5 22
    #> 8 17904 1 Coal 1 North 14 22
    #> 9 17905 1 Coal 1 North 19 22
    #> 10 17906 1 Coal 1 North 2 22
    #> # … with 20,876 more rows
    Created on 2019-10-30 by the reprex package (v0.3.0)