# Packages for this script library(tidyverse) library(lavaan) library(dagitty) library(glue) library(rstanarm) ## SIMULATION ## # here is a true data-generating process in lavaan (SEM) syntax dgp <- " A ~ 0.4*D B ~ 0.2*D + 0.8*A + -0.1*C C ~ -0.3*D + -0.3*A + 0.2*E D ~ 1 E ~ 1 F ~ 0.9*B + -0.4*C + -0.3*E " df <- simulateData(dgp, sample.nobs = 20000) dag <- lavaanToGraph(dgp) plot(dag, show.coefficients = TRUE) ## ESTIMATION ## # how can we find the adjustment set? Use DAG rules! adjustmentSets(dag, exposure = "A", outcome = "B", effect = "direct") coef(lm(B ~ A + C + D, data = df))[2] # a function to compute any of these paths with OLS ols_estimate_path <- function(df, dag, source = "A", target = "B") { adjustment_set <- adjustmentSets(dag, exposure = source, outcome = target, effect = "direct")[[1]] adj_set_string <- paste(adjustment_set, collapse = ', ') print(glue("Identifying {source} -> {target} using adjustment set {adj_set_string}")) adj_set_frm <- paste(adjustment_set, collapse = ' + ') frm <- glue("{target} ~ {source} + {adj_set_frm}") unname(coef(lm(frm, df))[2]) } ols_estimate_path(df, dag, "E", "F") ## PREDICTION ## # I will re-estimate relevant models in a Bayesian way to easily do posterior prediction / forward simulation mod_c <- stan_lm(formula = C ~ A + D, data = df, prior = NULL, algorithm = "fullrank") mod_b <- stan_lm(formula = B ~ A + C + D, data = df, prior = NULL, algorithm = "fullrank") mod_f <- stan_lm(formula = F ~ C + B + E, data = df, prior = NULL, algorithm = "fullrank") # now, let's simulate an intervention on A and find its effect on F # Adding 1.3 to A df_int <- df df_int$A <- df_int$A + 1.3 # what would C be? df_int$C <- posterior_predict(mod_c, df_int, draws = 1)[1,] tibble(x = c(df$C, df_int$C), type = rep(c("original", "intervention"), each = nrow(df))) |> ggplot(aes(x = x, fill = type)) + geom_density() + labs(title = "Distribution of C") # what would B be? df_int$B <- posterior_predict(mod_b, df_int, draws = 1)[1,] tibble(x = c(df$B, df_int$B), type = rep(c("original", "intervention"), each = nrow(df))) |> ggplot(aes(x = x, fill = type)) + geom_density() + labs(title = "Distribution of B") # finally, what would F be? df_int$F <- posterior_predict(mod_f, df_int, draws = 1)[1,] tibble(x = c(df$F, df_int$F), type = rep(c("original", "intervention"), each = nrow(df))) |> ggplot(aes(x = x, fill = type)) + geom_density() + labs(title = "Distribution of F") # the average total causal effect mean(df_int$F - df$F)