Created
October 30, 2017 14:54
-
-
Save tomhopper/a4b8bce5182cc742d6b09ea1b66fca7a to your computer and use it in GitHub Desktop.
Revisions
-
tomhopper revised this gist
Oct 30, 2017 . 1 changed file with 0 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -9,7 +9,6 @@ library(ggplot2) ## Data used #### # right-triangle distribution (peak at 0; minimum at 1) data_vec <- rbeta(10000, shape1 = 1, shape2 = 2); ## Function #### #' @title Multiple comparisons of a given data vector to the normal distribution -
tomhopper created this gist
Oct 30, 2017 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,61 @@ ## Demonstration of central limit theorem ## Based on code in an anonymous comment to the blog post at \url{https://sas-and-r.blogspot.com/2012/01/example-919-demonstrating-central-limit.html} ## Libraries #### library(nortest) library(dplyr) library(ggplot2) ## Data used #### # right-triangle distribution (peak at 0; minimum at 1) data_vec <- rbeta(10000, shape1 = 1, shape2 = 2); data_vec <- rnorm(10000) ## Function #### #' @title Multiple comparisons of a given data vector to the normal distribution #' @param data_vec Data to sample for calculating means #' @param sample_size Sample size to use for calculating means #' @return A number representing the proportion of p-values < 0.05 #' @description Given a data vector and a sample size, computes 200 means #' based on \code{sample_size}, then compares the means to the normal distribution #' on the hypothesis H0: data_vec is normal using \code{sf.test()} from #' \code{library(nortest)}. Finally, computes and returns the fraction of p-values #' that reject the null hypothesis. Given a non-normal distribution, the Central #' Limit Theorem tells us that this proportion #' should decrease as \code{sample_size} increases. clt <- function(data_vec, sample_size) { means<-array(0, 200) pvals <- array(0, 200) for(i in 1:200) { for(j in i:200) { means[j] <- mean(sample(data, sample_size)) } pvals[i] <- sf.test(means)$p.val } length(pvals[pvals<0.05])/200 } ## Run \code{clt()} multiple times for a range of sample sizes. #### sample_size <- c(rep(2, times = 100), rep(10, times = 100), rep(100, times = 100), rep(1000, times = 100)) prop_pvalue <- data_frame(size = sample_size, pvalue = rep(NA, length(sample_size))) for(x in 1:length(sample_size)) { prop_pvalue[x, 2] <- clt(data = data_vec, sample_size = sample_size[x]) } ## Graph results #### prop_pvalue %>% mutate(size = as.factor(size)) %>% ggplot(aes(x = size, y = pvalue)) + geom_boxplot() + coord_flip() + labs(title = "Central Limit Theorem", x = expression(paste("Sample size, ", italic("n"))), y = "Proportion of non-normal distributions") + theme_minimal()