"""Benchmark a scikit-learn LDA implementation against gensim's LdaMulticore.

Both models are trained on the same bag-of-words representation of the
20 newsgroups corpus; fit time and train/test perplexity are printed for each.
"""
from time import time
import logging

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.matutils import Sparse2Corpus
#from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


def load_dataset():
    """Return the 20 newsgroups train and test documents.

    Headers, footers and quotes are stripped from every post.

    Returns
    -------
    (train, test) : tuple
        The ``.data`` attribute of the 'train' and 'test' subsets.
    """
    stripped = ('headers', 'footers', 'quotes')
    train = fetch_20newsgroups(subset='train', random_state=1,
                               remove=stripped).data
    test = fetch_20newsgroups(subset='test', random_state=1,
                              remove=stripped).data
    return train, test


def main():
    """Fit both LDA implementations and print timing and perplexity."""
    # test mode can be 'batch' or 'online'
    test_mode = 'batch'
    #test_mode = 'online'

    # params
    n_features = 2000
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics
    n_jobs = 3

    # for batch update setting
    max_iterations = 5

    # for online update setting
    kappa = 0.5  # decay in gensim
    tau0 = 1.  # offset in gensim
    batch_size = 2000  # chunk size in gensim

    train_data, test_data = load_dataset()

    # sklearn format
    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')
    train_X = vectorizer.fit_transform(train_data)
    test_X = vectorizer.transform(test_data)

    # sklearn
    # NOTE(review): these keyword arguments and the `preplexity` method used
    # below do not match the released scikit-learn API; presumably this
    # script targets a development version of the estimator -- confirm
    # before renaming anything.
    lda_sklearn = LatentDirichletAllocation(n_topics=n_topics, alpha=alpha,
                                            eta=eta, batch_size=batch_size,
                                            kappa=kappa, tau=tau0,
                                            n_jobs=n_jobs, n_docs=1e4,
                                            normalize_doc=False,
                                            random_state=0, verbose=1)

    print('run test in %s mode' % test_mode)
    t0 = time()
    if test_mode == 'batch':
        # for batch mode
        lda_sklearn.fit(train_X, max_iters=max_iterations)
    else:
        # for online mode
        lda_sklearn.partial_fit(train_X)
    print("sklearn fit in %0.3fs." % (time() - t0))

    # transform
    train_gamma = lda_sklearn.transform(train_X)
    #bound = lda_sklearn._approx_bound(train_X, train_gamma, False)
    train_perplexity = lda_sklearn.preplexity(train_X, train_gamma)
    test_gamma = lda_sklearn.transform(test_X)
    test_perplexity = lda_sklearn.preplexity(test_X, test_gamma)
    print('sklearn preplexity: train=%.3f, test=%.3f'
          % (train_perplexity, test_perplexity))

    # gensim: convert the sparse matrices to gensim corpora.
    # `vocabulary_` maps word -> column index; gensim wants index -> word.
    # `.items()` (rather than `.iteritems()`) works on Python 2 and 3.
    id2words = {idx: word for word, idx in vectorizer.vocabulary_.items()}
    train_corpus = Sparse2Corpus(train_X, documents_columns=False)
    test_corpus = Sparse2Corpus(test_X, documents_columns=False)

    t0 = time()
    if test_mode == 'batch':
        # for batch mode
        lda_gensim = LdaMulticore(train_corpus, id2word=id2words, batch=True,
                                  eval_every=1, workers=n_jobs,
                                  num_topics=n_topics, chunksize=batch_size,
                                  passes=max_iterations)
    else:
        # for online mode: reuse the same decay / offset / chunk size that
        # were given to sklearn so both libraries see identical settings.
        lda_gensim = LdaMulticore(train_corpus, id2word=id2words, batch=False,
                                  eval_every=20, decay=kappa, offset=tau0,
                                  workers=n_jobs, num_topics=n_topics,
                                  chunksize=batch_size, passes=1)
    print("gensim done in %0.3fs." % (time() - t0))
    #lda_gensim.print_topics()

    train_log_perp_gensim = lda_gensim.log_perplexity(train_corpus)
    test_log_perp_gensim = lda_gensim.log_perplexity(test_corpus)
    train_perplexity_gensim = np.exp(-1. * train_log_perp_gensim)
    test_perplexity_gensim = np.exp(-1. * test_log_perp_gensim)
    print('gensim preplexity: train=%.3f, test=%.3f'
          % (train_perplexity_gensim, test_perplexity_gensim))


if __name__ == '__main__':
    main()