Last active
August 29, 2015 14:13
-
-
Save chyikwei/34b97d4d443a0cc38a2f to your computer and use it in GitHub Desktop.
Revisions
-
chyikwei revised this gist
Jan 14, 2015 . 1 changed file with 74 additions and 31 deletions. There are no files selected for viewing
"""Benchmark scikit-learn's LatentDirichletAllocation against gensim's LdaMulticore.

Both models are fitted on the same bag-of-words representation of the
20 newsgroups corpus, in either batch or online mode, and the fit time and
train/test perplexity of each model are printed.
"""
from time import time
import logging

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from gensim.matutils import Sparse2Corpus
#from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


def load_dataset():
    """Fetch the 20 newsgroups documents.

    Headers, footers and quotes are stripped from every post.

    Returns
    -------
    (train, test) : tuple
        The ``.data`` attribute of the 'train' and 'test' subsets.
    """
    strip = ('headers', 'footers', 'quotes')
    train = fetch_20newsgroups(subset='train', random_state=1,
                               remove=strip).data
    test = fetch_20newsgroups(subset='test', random_state=1,
                              remove=strip).data
    return train, test


def main():
    """Fit both LDA implementations and print timing and perplexity."""
    # test mode can be 'batch' or 'online'
    test_mode = 'batch'
    # test_mode = 'online'

    # params
    n_features = 2000
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics
    n_jobs = 3

    # for batch update setting
    max_iterations = 5

    # for online update setting
    kappa = 0.5        # decay in gensim
    tau0 = 1.          # offset in gensim
    batch_size = 2000  # chunk size in gensim

    train_data, test_data = load_dataset()

    # sklearn format
    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')
    train_X = vectorizer.fit_transform(train_data)
    test_X = vectorizer.transform(test_data)

    # Convert the sparse matrices to gensim corpora. vocabulary_ maps
    # term -> column index; gensim wants the inverse (index -> term).
    # .items() is used instead of the Python-2-only .iteritems().
    id2words = {idx: term for term, idx in vectorizer.vocabulary_.items()}
    train_corpus = Sparse2Corpus(train_X, documents_columns=False)
    test_corpus = Sparse2Corpus(test_X, documents_columns=False)

    # sklearn
    # NOTE(review): the keyword names below (n_topics, kappa, tau, n_docs,
    # normalize_doc, max_iters) and the `preplexity` method follow the
    # LatentDirichletAllocation API this script was written against --
    # confirm they match the installed scikit-learn build.
    lda_sklearn = LatentDirichletAllocation(
        n_topics=n_topics, alpha=alpha, eta=eta, batch_size=batch_size,
        kappa=kappa, tau=tau0, n_jobs=n_jobs, n_docs=1e4,
        normalize_doc=False, random_state=0, verbose=1)

    print('run test in %s mode' % test_mode)
    t0 = time()
    if test_mode == 'batch':
        # for batch mode
        lda_sklearn.fit(train_X, max_iters=max_iterations)
    else:
        # for online mode
        lda_sklearn.partial_fit(train_X)
    print("sklearn fit in %0.3fs." % (time() - t0))

    # transform
    train_gamma = lda_sklearn.transform(train_X)
    train_preplexity = lda_sklearn.preplexity(train_X, train_gamma)
    test_gamma = lda_sklearn.transform(test_X)
    test_preplexity = lda_sklearn.preplexity(test_X, test_gamma)
    print('sklearn preplexity: train=%.3f, test=%.3f' %
          (train_preplexity, test_preplexity))

    # gensim
    # id2words, train_corpus and test_corpus built above are reused here
    # rather than being rebuilt from the same matrices a second time.
    t0 = time()
    if test_mode == 'batch':
        # for batch mode
        lda_gensim = LdaMulticore(train_corpus, id2word=id2words, batch=True,
                                  eval_every=1, workers=n_jobs,
                                  num_topics=n_topics,
                                  passes=max_iterations)
    else:
        # for online mode: use the same decay/offset/chunk size as the
        # sklearn model instead of repeating the numbers as literals.
        lda_gensim = LdaMulticore(train_corpus, id2word=id2words, batch=False,
                                  eval_every=20, chunksize=batch_size,
                                  decay=kappa, offset=tau0, workers=n_jobs,
                                  num_topics=n_topics, passes=1)
    print("gensim done in %0.3fs." % (time() - t0))
    #lda_gensim.print_topics()

    train_log_prep_gensim = lda_gensim.log_perplexity(train_corpus)
    test_log_prep_gensim = lda_gensim.log_perplexity(test_corpus)
    train_preplexity_gensim = np.exp(-1. * train_log_prep_gensim)
    test_preplexity_gensim = np.exp(-1. * test_log_prep_gensim)
    print('gensim preplexity: train=%.3f, test=%.3f' %
          (train_preplexity_gensim, test_preplexity_gensim))


if __name__ == '__main__':
    main()
chyikwei renamed this gist
Jan 12, 2015 . 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing
File renamed without changes. -
chyikwei created this gist
Jan 12, 2015 . There are no files selected for viewing
# gensim test
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals.six.moves import xrange
from sklearn.decomposition import LatentDirichletAllocation

from gensim.matutils import Sparse2Corpus
from gensim.models.ldamodel import LdaModel


def laod_dataset():
    """Fit sklearn and gensim LDA on 20 newsgroups and print their scores.

    Despite its name, this function runs the whole comparison: it loads the
    data, vectorizes it, fits both models, and prints the fit times, the
    sklearn train/test perplexity and the gensim train/test log-perplexity.
    Nothing is returned.
    """
    # params
    n_features = 2000
    n_topics = 10
    alpha = 1. / n_topics
    eta = 1. / n_topics

    strip = ('headers', 'footers', 'quotes')
    train_data = fetch_20newsgroups(subset='train', random_state=1,
                                    remove=strip).data
    # Evaluate on the held-out subset; loading 'train' twice made the
    # reported "test" numbers a second measurement of the training data.
    test_data = fetch_20newsgroups(subset='test', random_state=1,
                                   remove=strip).data

    vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                 min_df=3, stop_words='english')
    train_X = vectorizer.fit_transform(train_data)
    test_X = vectorizer.transform(test_data)

    # scikit learn
    # NOTE(review): the keyword names (n_topics, kappa, tau, n_docs,
    # normalize_doc, max_iters) and the `preplexity` method follow the
    # LatentDirichletAllocation API this script was written against --
    # confirm they match the installed scikit-learn build.
    lda_sklearn = LatentDirichletAllocation(
        n_topics=n_topics, alpha=alpha, eta=eta, kappa=0.7, tau=512.,
        n_jobs=1, n_docs=1e4, normalize_doc=False, random_state=0, verbose=0)

    # fit
    t0 = time()
    lda_sklearn.fit(train_X, max_iters=10)
    print("done in %0.3fs." % (time() - t0))

    # transform
    train_gamma = lda_sklearn.transform(train_X)
    train_preplexity = lda_sklearn.preplexity(train_X, train_gamma)
    test_gamma = lda_sklearn.transform(test_X)
    test_preplexity = lda_sklearn.preplexity(test_X, test_gamma)
    # print(...) with %-formatting works as a statement or a function call.
    print("%s %s" % (train_preplexity, test_preplexity))

    # gensim
    # vocabulary_ maps term -> column index; gensim wants index -> term.
    id2words = {idx: term for term, idx in vectorizer.vocabulary_.items()}
    train_corpus = Sparse2Corpus(train_X, documents_columns=False)
    test_corpus = Sparse2Corpus(test_X, documents_columns=False)

    t0 = time()
    lda_gensim = LdaModel(train_corpus, id2word=id2words, update_every=None,
                          num_topics=n_topics, passes=10)
    print("done in %0.3fs." % (time() - t0))
    lda_gensim.print_topics()

    train_log_prep_gensim = lda_gensim.log_perplexity(train_corpus)
    test_log_prep_gensim = lda_gensim.log_perplexity(test_corpus)
    print("%s %s" % (train_log_prep_gensim, test_log_prep_gensim))


def sklearn_test():
    """Placeholder; does nothing."""
    pass


def gensim_test():
    """Unfinished stub that wraps a sparse matrix as a gensim corpus."""
    # NOTE(review): `gensim` and `scipy_sparse_matrix` are not defined in the
    # visible part of this file, so calling this would raise NameError --
    # confirm whether the remainder of the function lives elsewhere.
    corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)