@chyikwei
Last active August 29, 2015 14:13

Revisions

  1. chyikwei revised this gist Jan 14, 2015. 1 changed file with 74 additions and 31 deletions.
    105 changes: 74 additions & 31 deletions gistfile1.py
    # gensim test
    from time import time
    import logging
    import numpy as np
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.externals.six.moves import xrange
    from sklearn.decomposition import LatentDirichletAllocation


    from gensim.matutils import Sparse2Corpus
    #from gensim.models.ldamodel import LdaModel
    from gensim.models.ldamulticore import LdaMulticore

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


    def load_dataset():
        train = fetch_20newsgroups(subset='train', random_state=1,
                                   remove=('headers', 'footers', 'quotes')).data
        test = fetch_20newsgroups(subset='test', random_state=1,
                                  remove=('headers', 'footers', 'quotes')).data
        return train, test


    def main():
        # test mode can be 'batch' or 'online'
        test_mode = 'batch'
        #test_mode = 'online'

        # params
        n_features = 2000
        n_topics = 10
        alpha = 1. / n_topics
        eta = 1. / n_topics
        n_jobs = 3
        # for batch update setting
        max_iterations = 5
        # for online update setting
        kappa = 0.5  # decay in gensim
        tau0 = 1.  # offset in gensim
        batch_size = 2000  # chunk size in gensim

        train_data, test_data = load_dataset()

        # sklearn format
        vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                     min_df=3, stop_words='english')

        train_X = vectorizer.fit_transform(train_data)
        test_X = vectorizer.transform(test_data)

        # convert sparse matrix to gensim corpus
        id2words = dict()
        for k, v in vectorizer.vocabulary_.iteritems():
            id2words[v] = k
        train_corpus = Sparse2Corpus(train_X, documents_columns=False)
        test_corpus = Sparse2Corpus(test_X, documents_columns=False)

        # sklearn
        lda_sklearn = LatentDirichletAllocation(n_topics=n_topics, alpha=alpha, eta=eta,
                                                batch_size=batch_size, kappa=kappa, tau=tau0,
                                                n_jobs=n_jobs, n_docs=1e4,
                                                normalize_doc=False, random_state=0, verbose=1)

        # fit
        print('run test in %s mode' % test_mode)

        t0 = time()
        if test_mode == 'batch':
            # for batch mode
            lda_sklearn.fit(train_X, max_iters=max_iterations)
        else:
            # for online mode
            lda_sklearn.partial_fit(train_X)

        print("sklearn fit in %0.3fs." % (time() - t0))

        # transform
        train_gamma = lda_sklearn.transform(train_X)
        #bound = lda_sklearn._approx_bound(train_X, train_gamma, False)
        train_preplexity = lda_sklearn.preplexity(train_X, train_gamma)

        test_gamma = lda_sklearn.transform(test_X)
        test_preplexity = lda_sklearn.preplexity(test_X, test_gamma)

        print('sklearn preplexity: train=%.3f, test=%.3f' % (train_preplexity, test_preplexity))

        # gensim
        t0 = time()
        if test_mode == 'batch':
            # for batch mode
            lda_gensim = LdaMulticore(train_corpus, id2word=id2words,
                                      batch=True, eval_every=1,
                                      workers=n_jobs, num_topics=n_topics, passes=max_iterations)
        else:
            # for online mode
            lda_gensim = LdaMulticore(train_corpus, id2word=id2words,
                                      batch=False, eval_every=20,
                                      decay=0.5, offset=1.0,
                                      workers=n_jobs, num_topics=n_topics,
                                      passes=1)

        print("gensim done in %0.3fs." % (time() - t0))

        #lda_gensim.print_topics()
        train_log_prep_gensim = lda_gensim.log_perplexity(train_corpus)
        test_log_prep_gensim = lda_gensim.log_perplexity(test_corpus)

        train_preplexity_gensim = np.exp(-1. * train_log_prep_gensim)
        test_preplexity_gensim = np.exp(-1. * test_log_prep_gensim)
        print('gensim preplexity: train=%.3f, test=%.3f' % (train_preplexity_gensim, test_preplexity_gensim))


    if __name__ == '__main__':
        main()
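
    For reference, a minimal sketch (using an assumed toy matrix, not data from the gist) of what Sparse2Corpus yields with documents_columns=False, i.e. when each sparse row is treated as one bag-of-words document:

    import scipy.sparse as sp
    from gensim.matutils import Sparse2Corpus

    # assumed toy matrix: 2 documents x 3 vocabulary terms
    X = sp.csr_matrix([[2, 0, 1],
                       [0, 3, 0]])
    corpus = Sparse2Corpus(X, documents_columns=False)
    for doc in corpus:
        # each document streams out as a list of (term_id, count) pairs
        print([(int(i), int(v)) for i, v in doc])
    # -> [(0, 2), (2, 1)]
    # -> [(1, 3)]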
  2. chyikwei renamed this gist Jan 12, 2015. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. chyikwei created this gist Jan 12, 2015.
    74 changes: 74 additions & 0 deletions gistfile1.txt
    # gensim test
    from time import time
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.externals.six.moves import xrange
    from sklearn.decomposition import LatentDirichletAllocation


    from gensim.matutils import Sparse2Corpus
    from gensim.models.ldamodel import LdaModel

    def laod_dataset():
        # params
        n_features = 2000
        n_topics = 10
        alpha = 1. / n_topics
        eta = 1. / n_topics

        train_data = fetch_20newsgroups(subset='train', random_state=1,
                                        remove=('headers', 'footers', 'quotes')).data
        test_data = fetch_20newsgroups(subset='train', random_state=1,
                                       remove=('headers', 'footers', 'quotes')).data

        vectorizer = CountVectorizer(max_df=0.8, max_features=n_features,
                                     min_df=3, stop_words='english')

        train_X = vectorizer.fit_transform(train_data)
        test_X = vectorizer.transform(test_data)

        # scikit learn
        lda_sklearn = LatentDirichletAllocation(n_topics=n_topics, alpha=alpha, eta=eta,
                                                kappa=0.7, tau=512., n_jobs=1, n_docs=1e4,
                                                normalize_doc=False, random_state=0, verbose=0)

        # fit
        t0 = time()
        lda_sklearn.fit(train_X, max_iters=10)
        print("done in %0.3fs." % (time() - t0))

        # transform
        train_gamma = lda_sklearn.transform(train_X)
        bound = lda_sklearn._approx_bound(train_X, train_gamma, False)
        train_preplexity = lda_sklearn.preplexity(train_X, train_gamma)

        test_gamma = lda_sklearn.transform(test_X)
        test_preplexity = lda_sklearn.preplexity(test_X, test_gamma)

        print train_preplexity, test_preplexity

        # gensim
        id2words = dict()
        for k, v in vectorizer.vocabulary_.iteritems():
            id2words[v] = k
        train_corpus = Sparse2Corpus(train_X, documents_columns=False)
        test_corpus = Sparse2Corpus(test_X, documents_columns=False)

        t0 = time()
        lda_gensim = LdaModel(train_corpus, id2word=id2words,
                              update_every=None, num_topics=10, passes=10)
        print("done in %0.3fs." % (time() - t0))

        lda_gensim.print_topics()
        train_log_prep_gensim = lda_gensim.log_perplexity(train_corpus)
        test_log_prep_gensim = lda_gensim.log_perplexity(test_corpus)

        print train_log_prep_gensim, test_log_prep_gensim


    def sklearn_test():
        pass


    def gensim_test():
        corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
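
    The revised script turns gensim's per-word log_perplexity() bound into a perplexity with np.exp(-1. * bound). A minimal sketch of that conversion, using an assumed toy value (not a measured result):

    import numpy as np

    # the script treats the returned per-word bound as a natural-log
    # quantity: perplexity = exp(-bound)
    per_word_bound = -7.5  # assumed toy value, not a real measurement
    perplexity = np.exp(-per_word_bound)
    print("perplexity: %.1f" % perplexity)  # ~1808.0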