Skip to content

Instantly share code, notes, and snippets.

@dpasch01
Created June 15, 2017 10:31
Show Gist options
  • Select an option

  • Save dpasch01/e4c07b0fdacb7b986fbc337da01f2309 to your computer and use it in GitHub Desktop.

Select an option

Save dpasch01/e4c07b0fdacb7b986fbc337da01f2309 to your computer and use it in GitHub Desktop.

Revisions

  1. dpasch01 created this gist Jun 15, 2017.
    60 changes: 60 additions & 0 deletions LDA Parameter Tuning with R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    #!/usr/bin/env Rscript
    # Keep only the trailing (user-supplied) command-line arguments.
    args <- commandArgs(trailingOnly = TRUE)

    # USAGE: $ Rscript --vanilla ldatuning.r input.txt 10 200

    # https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
    library("ldatuning")
    library(tm)
    library(SnowballC)

    # The script takes exactly three positional arguments:
    #   1. the path of the text file the tuning is applied to,
    #   2. the minimum number of topics to evaluate,
    #   3. the maximum number of topics to evaluate.
    if (length(args) != 3) {
      stop("Please provide the required parameters for LDA Tuning.",
           call. = FALSE)
    }

    if (!file.exists(args[1])) {
      stop("The file provided does not exist.", call. = FALSE)
    }

    lines <- readLines(args[1])

    # as.integer() yields NA (plus a warning) for non-numeric input. The warning
    # is silenced here because the NA case is reported explicitly just below.
    from_topics <- suppressWarnings(as.integer(args[2]))
    to_topics <- suppressWarnings(as.integer(args[3]))

    if (is.na(from_topics) || is.na(to_topics)) {
      stop("The minimum and maximum number of topics must be integers.",
           call. = FALSE)
    }

    # Reject ranges that cannot be evaluated: an LDA model needs at least two
    # topics, and the topic sequence is built in ascending order.
    if (from_topics < 2L || to_topics < from_topics) {
      stop("The number of topics must satisfy 2 <= minimum <= maximum.",
           call. = FALSE)
    }

    # Preprocess the text: build a tm corpus from the input lines, normalise the
    # text (lower-casing, punctuation/number/stop-word removal, stemming,
    # whitespace cleanup) and convert the result into a Document-Term Matrix.
    doc.vec <- VectorSource(lines)
    doc.corpus <- Corpus(doc.vec)
    summary(doc.corpus)

    # tolower() is a plain base function, not a tm transformation. Wrapping it in
    # content_transformer() makes tm_map() keep each element a text document
    # instead of a bare character vector, which DocumentTermMatrix() requires.
    doc.corpus <- tm_map(doc.corpus, content_transformer(tolower))
    doc.corpus <- tm_map(doc.corpus, removePunctuation)
    doc.corpus <- tm_map(doc.corpus, removeNumbers)
    # NOTE(review): punctuation is stripped before stop-word removal, so stop
    # words containing apostrophes may no longer match -- confirm this ordering
    # is intended.
    doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))

    doc.corpus <- tm_map(doc.corpus, stemDocument)
    doc.corpus <- tm_map(doc.corpus, stripWhitespace)

    DTM <- DocumentTermMatrix(doc.corpus)

    # Drop documents left without any term after preprocessing; the tuning step
    # cannot work with empty rows in the matrix.
    # NOTE(review): apply() coerces the sparse DTM to a dense matrix, which can
    # be memory-hungry for large corpora.
    rowTotals <- apply(DTM, 1, sum)
    DTM <- DTM[rowTotals > 0, ]

    # Fail early with a clear message instead of passing an empty matrix on.
    if (nrow(DTM) == 0) {
      stop("No non-empty documents remain after preprocessing.", call. = FALSE)
    }

    # Score every candidate number of topics with the Griffiths2004, CaoJuan2009,
    # Arun2010 and Deveaud2014 metrics, using Gibbs sampling with a fixed seed on
    # 2 cores, then plot the resulting metric curves.
    candidate_topics <- seq(from = from_topics, to = to_topics, by = 1)
    tuning_metrics <- c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014")

    tuning_result <- FindTopicsNumber(
      DTM,
      topics = candidate_topics,
      metrics = tuning_metrics,
      method = "Gibbs",
      control = list(seed = 77),
      mc.cores = 2L,
      verbose = TRUE
    )

    FindTopicsNumber_plot(tuning_result)