Skip to content

Instantly share code, notes, and snippets.

@maxbellec
Last active June 22, 2022 14:36
Show Gist options
  • Save maxbellec/85d90d3d7f2f96589f1517e5c4567dc3 to your computer and use it in GitHub Desktop.
Save maxbellec/85d90d3d7f2f96589f1517e5c4567dc3 to your computer and use it in GitHub Desktop.

Revisions

  1. maxbellec revised this gist Jan 25, 2017. 1 changed file with 12 additions and 4 deletions.
    16 changes: 12 additions & 4 deletions word2vec_tf_idf_from_wikipeida.py
    Original file line number Diff line number Diff line change
    @@ -9,10 +9,18 @@
    logging.root.setLevel(level=logging.INFO)

    # Open the compressed English Wikipedia dump as a gensim corpus
    # (lemmatization switched off via lemmatize=False).
    wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)

    # Fit the TF-IDF model directly on the corpus object.
    tfidf = TfidfModel(wiki)

    # NOTE: the corpus is intentionally NOT materialised with
    # list(wiki.get_texts()) here -- that would hold every article in memory,
    # and `sentences` is bound to a streaming iterable further down anyway.
    # Save for persistence: write the corpus object and the fitted TF-IDF model
    # to the working directory so they can be reloaded without re-reading the dump.
    wiki.save('wiki.corpus')
    tfidf.save('wiki.tfidf.model')

    # word2vec
    class MySentences(object):
        """Re-iterable stream of tokenized articles taken from ``wiki``.

        Each call to ``__iter__`` starts a fresh pass over
        ``wiki.get_texts()``, so the object can be iterated several times
        (unlike a one-shot generator) without keeping the corpus in memory.
        """

        def __iter__(self):
            """Yield one article at a time as a list of text (unicode) tokens."""
            for text in wiki.get_texts():
                # Tokens may arrive as UTF-8 encoded bytes or already as text,
                # depending on the gensim version; decode only the former and
                # name the codec explicitly instead of relying on the default.
                yield [
                    word.decode('utf-8') if isinstance(word, bytes) else word
                    for word in text
                ]
    # Word2Vec hyper-parameters. `workers` uses every CPU core but one,
    # and never fewer than one.
    # NOTE(review): newer gensim releases renamed `size` to `vector_size`;
    # confirm against the installed version before changing the key.
    params = {'size': 300, 'window': 10, 'min_count': 40,
              'workers': max(1, multiprocessing.cpu_count() - 1), 'sample': 1e-3,}
    # Train exactly once: constructing Word2Vec with a corpus runs the whole
    # training, so a second identical call would only repeat the work.
    word2vec = Word2Vec(sentences, **params)
    word2vec.save('wiki.word2vec.model')
  2. maxbellec revised this gist Jan 23, 2017. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions word2vec_tf_idf_from_wikipeida.py
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,8 @@
    from gensim.corpora.wikicorpus import WikiCorpus
    from gensim.models.word2vec import Word2Vec
    from gensim.models import TfidfModel

    # enable INFO-level logging so the progress of the long-running calls below is visible
    import logging
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
  3. maxbellec revised this gist Jan 23, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion word2vec_tf_idf_from_wikipeida.py
    Original file line number Diff line number Diff line change
    @@ -8,7 +8,7 @@

    wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)

    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf = TfidfModel(wiki)

    sentences = list(wiki.get_texts())
    params = {'size': 300, 'window': 10, 'min_count': 40,
  4. maxbellec created this gist Jan 22, 2017.
    16 changes: 16 additions & 0 deletions word2vec_tf_idf_from_wikipeida.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,16 @@
    import multiprocessing
    from gensim.corpora.wikicorpus import WikiCorpus
    from gensim.models.word2vec import Word2Vec
    from gensim.models import TfidfModel
    import logging
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # Open the compressed English Wikipedia dump as a gensim corpus
    # (lemmatization switched off via lemmatize=False).
    wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)

    # NOTE(review): `mm` and `dictionary` are not defined anywhere in this
    # 16-line revision, so this statement raises NameError as written.
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # list(...) collects everything yielded by wiki.get_texts() into memory
    # before training starts.
    sentences = list(wiki.get_texts())
    # Word2Vec hyper-parameters; `workers` uses every CPU core but one,
    # and never fewer than one.
    params = {'size': 300, 'window': 10, 'min_count': 40,
    'workers': max(1, multiprocessing.cpu_count() - 1), 'sample': 1e-3,}
    # Build and train the model on the collected sentences.
    word2vec = Word2Vec(sentences, **params)