tomhopper · October 30, 2017 14:54 · Oct 30, 2017 · Oct 30, 2017
diff --git a/central_limit_theorem.R b/central_limit_theorem.R
@@ -9,7 +9,6 @@ library(ggplot2)
 ## Data used ####
 # right-triangle distribution (peak at 0; minimum at 1)
 data_vec <- rbeta(10000, shape1 = 1, shape2 = 2);
-data_vec <- rnorm(10000)
 
 ## Function ####
 #' @title Multiple comparisons of a given data vector to the normal distribution

diff --git a/central_limit_theorem.R b/central_limit_theorem.R
@@ -0,0 +1,61 @@
+## Demonstration of central limit theorem
+## Based on code in an anonymous comment to the blog post at \url{https://sas-and-r.blogspot.com/2012/01/example-919-demonstrating-central-limit.html}
+
+## Libraries ####
+library(nortest)
+library(dplyr)
+library(ggplot2)
+
+## Data used ####
+# right-triangle distribution (peak at 0; minimum at 1)
+data_vec <- rbeta(10000, shape1 = 1, shape2 = 2);
+data_vec <- rnorm(10000)
+
+## Function ####
+#' @title Multiple comparisons of a given data vector to the normal distribution
+#' @param data_vec Data to sample for calculating means
+#' @param sample_size Sample size to use for calculating means
+#' @return A number representing the proportion of p-values < 0.05
+#' @description Given a data vector and a sample size, computes 200 means 
+#'   based on \code{sample_size}, then compares the means to the normal distribution
+#'  on the hypothesis H0: data_vec is normal using \code{sf.test()} from 
+#'  \code{library(nortest)}. Finally, computes and returns the fraction of p-values
+#'  that reject the null hypothesis. Given a non-normal distribution, the Central
+#'  Limit Theorem tells us that this proportion
+#'  should decrease as \code{sample_size} increases.
+clt <- function(data_vec, sample_size) {
+
+  means<-array(0, 200)
+  pvals <- array(0, 200)
+
+  for(i in 1:200) {
+
+    for(j in i:200) { means[j] <- mean(sample(data, sample_size)) }
+    pvals[i] <- sf.test(means)$p.val
+  }
+  length(pvals[pvals<0.05])/200
+}
+
+## Run \code{clt()} multiple times for a range of sample sizes. ####
+sample_size <- c(rep(2, times = 100), 
+                 rep(10, times = 100), 
+                 rep(100, times = 100),
+                 rep(1000, times = 100))
+
+prop_pvalue <- data_frame(size = sample_size,
+                          pvalue = rep(NA, length(sample_size)))
+for(x in 1:length(sample_size)) {
+  prop_pvalue[x, 2] <- clt(data = data_vec, sample_size = sample_size[x])
+}
+
+## Graph results ####
+prop_pvalue %>% 
+  mutate(size = as.factor(size)) %>% 
+  ggplot(aes(x = size, y = pvalue)) +
+  geom_boxplot() +
+  coord_flip() +
+  labs(title = "Central Limit Theorem",
+       x = expression(paste("Sample size, ", italic("n"))),
+       y = "Proportion of non-normal distributions") +
+  theme_minimal()
+