## This document illustrates that type 1 sum of squares lead to increased alpha ## error rates when a predictive covariate is included in the regression model. # Estimate p-value for treatment (null) effect via linear regression, # including a covariate that is predictive of the outcome # # param N: sample size, default 100 # param beta_covariate: regression weight for a covariate on an outcome, default 0.5 # param coding: how is the treatment coded, defaults to 0/1 # param AOV: sum of squares method; if `FALSE`, the p-value is just estimated via `summary()` # (= default); if 1, the p-value is estimated via `anova()`; if 2 or 3, the p-value # is estimated using type 2 or type 3 sum of squares using `car::Anova()` # return: the p-value associated with the "treatment" # # Details: # Data is generated via a regression model, predicting the outcome from a covariate # `outcome = beta_covariate * rnorm(N) + error` # [`+ treatment * 0`, i.e. a null effect of treatment, is implicit] # covariate_regression <- function( N = 100, beta_covariate = 0.5, coding = c(0, 1), AOV = FALSE ) { # Simulate covariate and outcome data data <- data.frame(covariate = rnorm(N)) error <- rnorm(N) data$outcome <- beta_covariate * data$covariate + error # Insert treatment variable that has no effect data$treatment <- sample(rep_len(coding, N)) # do regression to test for treatment effect model <- lm(outcome ~ treatment * covariate, data = data) get_p_value(model, "treatment", AOV) } get_p_value <- function(model, effect, AOV = 0) { stopifnot(AOV %in% 0:3) if (AOV == 0) { tab <- summary(model)$coefficients # extract p-value return(tab[rownames(tab) == effect, "Pr(>|t|)"]) } else if (AOV == 1) { tab <- anova(model) return(tab[rownames(tab) == effect, "Pr(>F)"]) } tab <- car::Anova(model, type = AOV) tab[rownames(tab) == effect, "Pr(>F)"] } # Standard regression, using summary to print p-value: mean(replicate(10000, covariate_regression()) <= .05) #> 0.0513 # Use `anova()` to print p-value: mean(replicate(10000, covariate_regression(AOV = 1)) <= .05) #> 0.0828 # Coding scheme does not seem to matter: mean(replicate(10000, covariate_regression(AOV = 1, coding = -1:1)) <= .05) #> 0.0793 # No problem if covariate is not predictive of outcome: mean(replicate(10000, covariate_regression(AOV = 1, beta_covariate = 0)) <= .05) #> 0.05 # No problem for type 2 or type 3 sum of squares mean(replicate(10000, covariate_regression(AOV = 2)) <= .05) #> 0.0484 mean(replicate(10000, covariate_regression(AOV = 3)) <= .05) #> 0.0531 ### Problem disappears when the covariate comes first in the regression model! # lm(outcome ~ covariate * treatment, data = data)