## This document illustrates that type 1 sum of squares lead to increased alpha
## error rates when a predictive covariate is included in the regression model. 


# Estimate p-value for treatment (null) effect via linear regression, 
# including a covariate that is predictive of the outcome
#
# param N: sample size, default 100
# param beta_covariate: regression weight for a covariate on an outcome, default 0.5
# param coding: how is the treatment coded, defaults to 0/1
# param AOV: sum of squares method; if `FALSE`, the p-value is just estimated via `summary()` 
#    (= default); if 1, the p-value is estimated via `anova()`; if 2 or 3, the p-value
#    is estimated using type 2 or type 3 sum of squares using `car::Anova()`
# return: the p-value associated with the "treatment"
#
# Details:
# Data is generated via a regression model, predicting the outcome from a covariate
# `outcome = beta_covariate * rnorm(N) + error` 
# [`+ treatment * 0`, i.e. a null effect of treatment, is implicit]
#
covariate_regression <- function(
  N = 100, 
  beta_covariate = 0.5, 
  coding = c(0, 1), 
  AOV = FALSE
) {
  # Simulate covariate and outcome data
  data <- data.frame(covariate = rnorm(N))
  error <- rnorm(N)
  data$outcome <- beta_covariate * data$covariate + error
  # Insert treatment variable that has no effect
  data$treatment <- sample(rep_len(coding, N))
  # do regression to test for treatment effect
  model <- lm(outcome ~ treatment * covariate, data = data)
  get_p_value(model, "treatment", AOV)
}

get_p_value <- function(model, effect, AOV = 0) {
  stopifnot(AOV %in% 0:3)
  if (AOV == 0) {
    tab <- summary(model)$coefficients
    # extract p-value
    return(tab[rownames(tab) == effect, "Pr(>|t|)"])
  } else if (AOV == 1) {
    tab <- anova(model)
    return(tab[rownames(tab) == effect, "Pr(>F)"])
  }
  tab <- car::Anova(model, type = AOV)
  tab[rownames(tab) == effect, "Pr(>F)"]
}


# Standard regression, using summary to print p-value:
mean(replicate(10000, covariate_regression()) <= .05)
#> 0.0513

# Use `anova()` to print p-value:
mean(replicate(10000, covariate_regression(AOV = 1)) <= .05)
#> 0.0828

# Coding scheme does not seem to matter: 
mean(replicate(10000, covariate_regression(AOV = 1, coding = -1:1)) <= .05)
#> 0.0793

# No problem if covariate is not predictive of outcome:
mean(replicate(10000, covariate_regression(AOV = 1, beta_covariate = 0)) <= .05)
#> 0.05

# No problem for type 2 or type 3 sum of squares
mean(replicate(10000, covariate_regression(AOV = 2)) <= .05)
#> 0.0484

mean(replicate(10000, covariate_regression(AOV = 3)) <= .05)
#> 0.0531

### Problem disappears when the covariate comes first in the regression model!
# lm(outcome ~ covariate * treatment, data = data)