# Packages for this script
library(tidyverse)
library(lavaan)
library(dagitty)
library(glue)
library(rstanarm)


## SIMULATION ##
# here is a true data-generating process in lavaan (SEM) syntax
dgp <- "
  A ~ 0.4*D
  B ~ 0.2*D + 0.8*A + -0.1*C
  C ~ -0.3*D + -0.3*A + 0.2*E
  D ~ 1
  E ~ 1
  F ~ 0.9*B + -0.4*C + -0.3*E
"

df <- simulateData(dgp, sample.nobs = 20000)

dag <- lavaanToGraph(dgp)
plot(dag, show.coefficients = TRUE)


## ESTIMATION ##
# how can we find the adjustment set? Use DAG rules!
adjustmentSets(dag, exposure = "A", outcome = "B", effect = "direct")
coef(lm(B ~ A + C + D, data = df))[2]

# a function to compute any of these paths with OLS
ols_estimate_path <- function(df, dag, source = "A", target = "B") {
  adjustment_set <- adjustmentSets(dag, exposure = source, outcome = target, effect = "direct")[[1]]
  adj_set_string <- paste(adjustment_set, collapse = ', ')
  print(glue("Identifying {source} -> {target} using adjustment set {adj_set_string}"))
  adj_set_frm <- paste(adjustment_set, collapse = ' + ')
  frm <- glue("{target} ~ {source} + {adj_set_frm}")
  unname(coef(lm(frm, df))[2])
}

ols_estimate_path(df, dag, "E", "F")


## PREDICTION ##
# I will re-estimate relevant models in a Bayesian way to easily do posterior prediction / forward simulation
mod_c <- stan_lm(formula = C ~ A + D,     data = df, prior = NULL, algorithm = "fullrank")
mod_b <- stan_lm(formula = B ~ A + C + D, data = df, prior = NULL, algorithm = "fullrank")
mod_f <- stan_lm(formula = F ~ C + B + E, data = df, prior = NULL, algorithm = "fullrank")

# now, let's simulate an intervention on A and find its effect on F
# Adding 1.3 to A
df_int <- df
df_int$A <- df_int$A + 1.3

# what would C be?
df_int$C <- posterior_predict(mod_c, df_int, draws = 1)[1,]

tibble(x = c(df$C, df_int$C), type = rep(c("original", "intervention"), each = nrow(df))) |> 
  ggplot(aes(x = x, fill = type)) +
  geom_density() +
  labs(title = "Distribution of C")


# what would B be?
df_int$B <- posterior_predict(mod_b, df_int, draws = 1)[1,]

tibble(x = c(df$B, df_int$B), type = rep(c("original", "intervention"), each = nrow(df))) |> 
  ggplot(aes(x = x, fill = type)) +
  geom_density() +
  labs(title = "Distribution of B")

# finally, what would F be?
df_int$F <- posterior_predict(mod_f, df_int, draws = 1)[1,]

tibble(x = c(df$F, df_int$F), type = rep(c("original", "intervention"), each = nrow(df))) |> 
  ggplot(aes(x = x, fill = type)) +
  geom_density() +
  labs(title = "Distribution of F")

# the average total causal effect
mean(df_int$F - df$F)