Created
June 15, 2017 10:31
-
-
Save dpasch01/e4c07b0fdacb7b986fbc337da01f2309 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript
# Estimate a suitable number of LDA topics for a text file with ldatuning.
#
# USAGE: $ Rscript --vanilla ldatuning.r input.txt 10 200
# https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
#
# The script accepts 3 parameters:
#   1. the text file the tuning is applied on (every line becomes one
#      document of the corpus),
#   2. the minimum number of topics to be checked (integer, at least 2),
#   3. the maximum number of topics to be checked (integer, not smaller
#      than the minimum).

library("ldatuning")
library(tm)
library(SnowballC)

# Convert one command-line value to an integer topic count, failing with a
# readable message instead of silently carrying an NA forward.
parse_topic_count <- function(value, label) {
  count <- suppressWarnings(as.integer(value))
  if (is.na(count)) {
    stop(
      "The ", label, " number of topics must be an integer, got '",
      value, "'.",
      call. = FALSE
    )
  }
  count
}

# Argument handling ----

args <- commandArgs(trailingOnly = TRUE)

if (length(args) != 3) {
  stop("Please provide the required parameters for LDA Tuning.", call. = FALSE)
}

if (!file.exists(args[1])) {
  stop("The file provided does not exist.", call. = FALSE)
}

from_topics <- parse_topic_count(args[2], "minimum")
to_topics <- parse_topic_count(args[3], "maximum")

# LDA cannot be fitted with fewer than 2 topics, so reject that up front.
if (from_topics < 2L) {
  stop("The minimum number of topics must be at least 2.", call. = FALSE)
}

# seq(from, to, by = 1) errors on a descending range; report it clearly.
if (to_topics < from_topics) {
  stop(
    "The maximum number of topics must not be smaller than the minimum.",
    call. = FALSE
  )
}

# `text_lines` rather than `lines`, to avoid masking graphics::lines().
text_lines <- readLines(args[1])

if (length(text_lines) == 0L) {
  stop("The file provided is empty.", call. = FALSE)
}

# Text processing ----
# Build the corpus, normalise the text (lower-casing, removing punctuation,
# numbers and English stop words, stemming, collapsing whitespace) and convert
# the result into a DTM - Document Term Matrix object.

doc_corpus <- Corpus(VectorSource(text_lines))
print(summary(doc_corpus))

# tolower() is a plain character function, not a tm transformation; wrapping
# it in content_transformer() keeps the corpus documents intact.
doc_corpus <- tm_map(doc_corpus, content_transformer(tolower))
doc_corpus <- tm_map(doc_corpus, removePunctuation)
doc_corpus <- tm_map(doc_corpus, removeNumbers)
doc_corpus <- tm_map(doc_corpus, removeWords, stopwords("english"))
doc_corpus <- tm_map(doc_corpus, stemDocument)
doc_corpus <- tm_map(doc_corpus, stripWhitespace)

dtm <- DocumentTermMatrix(doc_corpus)

# Remove any empty entries in the vector space so as for the LDATuning to work.
row_totals <- apply(dtm, 1, sum)
dtm <- dtm[row_totals > 0, ]

if (nrow(dtm) == 0L) {
  stop("No non-empty documents remain after preprocessing.", call. = FALSE)
}

# Topic number tuning ----
# Execute the LDATuning with the metrics of Griffiths2004, CaoJuan2009, Arun2010
# and Deveaud2014 on 2 cores with Gibbs sampling, and plot the results.

result <- FindTopicsNumber(
  dtm,
  topics = seq(from = from_topics, to = to_topics, by = 1),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 77),
  mc.cores = 2L,
  verbose = TRUE
)

FindTopicsNumber_plot(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment