dpasch01 · June 15, 2017 10:31 · Jun 15, 2017
diff --git a/LDA Parameter Tuning with R b/LDA Parameter Tuning with R
@@ -0,0 +1,60 @@
+#!/usr/bin/env Rscript
+# Keep only the user-supplied arguments (those following the script name).
+args <- commandArgs(trailingOnly = TRUE)
+
+# USAGE: $ Rscript --vanilla ldatuning.r input.txt 10 200
+
+# https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
+library("ldatuning")
+library(tm)
+library(SnowballC)
+
+# The script takes exactly 3 positional parameters:
+#   1. path to the text file the tuning is applied on (each line is treated
+#      as one document),
+#   2. the minimum number of topics to check,
+#   3. the maximum number of topics to check.
+if (length(args) != 3) {
+  stop("Please provide the required parameters for LDA Tuning.", call. = FALSE)
+}
+
+if (!file.exists(args[1])) {
+  stop("The file provided does not exist.", call. = FALSE)
+}
+
+lines <- readLines(args[1])
+
+# as.integer() yields NA (plus a warning) for non-numeric text. The warning is
+# silenced and the NA is rejected explicitly, so the problem is reported here
+# instead of surfacing later as an obscure error.
+from_topics <- suppressWarnings(as.integer(args[2]))
+to_topics <- suppressWarnings(as.integer(args[3]))
+
+if (is.na(from_topics) || is.na(to_topics)) {
+  stop("The minimum and maximum number of topics must be integers.",
+       call. = FALSE)
+}
+
+# An LDA model needs at least 2 topics, and the ascending topic sequence built
+# later with by = 1 cannot be generated when the maximum is below the minimum.
+if (from_topics < 2L || to_topics < from_topics) {
+  stop("The number of topics must satisfy 2 <= minimum <= maximum.",
+       call. = FALSE)
+}
+
+# Text preprocessing: every line read from the input becomes one document of a
+# tm corpus, which is normalised (lower-casing, removal of punctuation, numbers
+# and English stop words, stemming) and then converted into a DTM - Document
+# Term Matrix object.
+doc.vec <- VectorSource(lines)
+doc.corpus <- Corpus(doc.vec)
+summary(doc.corpus)
+
+# tolower() is a plain base function rather than a tm transformation, so it is
+# wrapped in content_transformer(). Passed bare, tm_map() can replace the
+# documents with plain character vectors, which DocumentTermMatrix() rejects.
+doc.corpus <- tm_map(doc.corpus, content_transformer(tolower))
+doc.corpus <- tm_map(doc.corpus, removePunctuation)
+doc.corpus <- tm_map(doc.corpus, removeNumbers)
+# NOTE(review): punctuation is stripped before the stop words are removed, so
+# contracted stop words (e.g. "don't" -> "dont") presumably no longer match the
+# stop-word list - confirm whether this order is intended.
+doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))
+
+# Stem the remaining words, then collapse the runs of whitespace left behind
+# by the removals above.
+doc.corpus <- tm_map(doc.corpus, stemDocument)
+doc.corpus <- tm_map(doc.corpus, stripWhitespace)
+
+DTM <- DocumentTermMatrix(doc.corpus)
+
+# Remove documents that have no terms left after preprocessing, so that the
+# LDATuning step can work.
+# Note: apply() coerces the DTM to a dense matrix before summing the rows,
+# which can be memory-hungry for a large corpus.
+rowTotals <- apply(DTM, 1, sum)
+DTM <- DTM[rowTotals > 0, ]
+
+# Stop with a clear message when no document survived the filtering, instead
+# of handing an empty matrix to the tuning step.
+if (nrow(DTM) == 0) {
+  stop("No non-empty documents remain after preprocessing.", call. = FALSE)
+}
+
+# Run the LDA tuning with Gibbs sampling on 2 cores for every candidate number
+# of topics, scoring the models with the Griffiths2004, CaoJuan2009, Arun2010
+# and Deveaud2014 metrics, and plot the results.
+
+# Candidate topic counts: every value from the minimum to the maximum.
+topic_counts <- seq(from = from_topics, to = to_topics, by = 1)
+tuning_metrics <- c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014")
+
+result <- FindTopicsNumber(
+  DTM,
+  topics = topic_counts,
+  metrics = tuning_metrics,
+  method = "Gibbs",
+  control = list(seed = 77),
+  mc.cores = 2L,
+  verbose = TRUE
+)
+
+FindTopicsNumber_plot(result)
No results found