Skip to content

Instantly share code, notes, and snippets.

@dpasch01
Created June 15, 2017 10:31
Show Gist options
  • Select an option

  • Save dpasch01/e4c07b0fdacb7b986fbc337da01f2309 to your computer and use it in GitHub Desktop.

Select an option

Save dpasch01/e4c07b0fdacb7b986fbc337da01f2309 to your computer and use it in GitHub Desktop.
#!/usr/bin/env Rscript
# Estimate a suitable number of LDA topics for a text file using `ldatuning`.
#
# USAGE: $ Rscript --vanilla ldatuning.r input.txt 10 200
# https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
#
# The script accepts 3 parameters:
#   1. the text file the tuning is applied on (each line is one document),
#   2. the minimum number of topics to be checked,
#   3. the maximum number of topics to be checked.
args <- commandArgs(trailingOnly = TRUE)

library("ldatuning")
library(tm)
library(SnowballC)

# ---- Argument validation ----
if (length(args) != 3) {
  stop("Please provide the required parameters for LDA Tuning.", call. = FALSE)
}
if (!file.exists(args[1])) {
  stop("The file provided does not exist.", call. = FALSE)
}

# as.integer() returns NA (plus a coercion warning) for non-numeric text.
# The warning is silenced here because the NA case is reported explicitly
# right below with a clearer message.
from_topics <- suppressWarnings(as.integer(args[2]))
to_topics <- suppressWarnings(as.integer(args[3]))
if (is.na(from_topics) || is.na(to_topics)) {
  stop(
    "The minimum and maximum number of topics must be integers.",
    call. = FALSE
  )
}
# LDA needs at least 2 topics, and seq(from, to, by = 1) fails when the
# bounds are reversed, so reject both cases up front.
if (from_topics < 2L || to_topics < from_topics) {
  stop(
    "The number of topics must satisfy 2 <= minimum <= maximum.",
    call. = FALSE
  )
}

doc_lines <- readLines(args[1])

# ---- Preprocessing ----
# Build a corpus with one document per input line, then normalise the text:
# lower-case, drop punctuation and numbers, remove English stop words, stem,
# and collapse repeated whitespace.
doc_corpus <- Corpus(VectorSource(doc_lines))
summary(doc_corpus)
# tolower is a plain character function, so it is wrapped with
# content_transformer() to keep the corpus documents intact for
# DocumentTermMatrix().
doc_corpus <- tm_map(doc_corpus, content_transformer(tolower))
doc_corpus <- tm_map(doc_corpus, removePunctuation)
doc_corpus <- tm_map(doc_corpus, removeNumbers)
doc_corpus <- tm_map(doc_corpus, removeWords, stopwords("english"))
doc_corpus <- tm_map(doc_corpus, stemDocument)
doc_corpus <- tm_map(doc_corpus, stripWhitespace)
dtm <- DocumentTermMatrix(doc_corpus)

# Remove any empty documents from the matrix so that LDATuning can work.
# The DTM is stored as a sparse triplet matrix (row index `i`, value `v`);
# reading the rows that hold a positive count avoids the dense conversion
# that apply(dtm, 1, sum) would trigger.
non_empty_docs <- sort(unique(dtm$i[dtm$v > 0]))
if (length(non_empty_docs) == 0L) {
  stop(
    "No terms are left after preprocessing; cannot run LDA Tuning.",
    call. = FALSE
  )
}
dtm <- dtm[non_empty_docs, ]

# ---- Tuning ----
# Execute the LDATuning with the metrics of Griffiths2004, CaoJuan2009,
# Arun2010 and Deveaud2014 on 2 cores with Gibbs sampling (fixed seed for
# reproducibility), and plot the results.
result <- FindTopicsNumber(
  dtm,
  topics = seq(from = from_topics, to = to_topics, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)
# No graphics device is opened explicitly, so under Rscript the plot goes to
# R's default non-interactive device (normally the file Rplots.pdf).
FindTopicsNumber_plot(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment