Last active
June 22, 2022 14:36
-
-
Save maxbellec/85d90d3d7f2f96589f1517e5c4567dc3 to your computer and use it in GitHub Desktop.
Revisions
-
maxbellec revised this gist
Jan 25, 2017 . 1 changed file with 12 additions and 4 deletions. There are no files selected for viewing
# Revision of Jan 25, 2017 -- diff hunk @@ -9,10 +9,18 @@ (new-file lines 9-26).
# The names used below (logging, multiprocessing, WikiCorpus, TfidfModel,
# Word2Vec) are imported above this hunk and are not repeated here.
logging.root.setLevel(level=logging.INFO)

# Tokenize the English Wikipedia dump without lemmatization.
wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)
tfidf = TfidfModel(wiki)

# save for persistence
# (the original line read wiki.save('wiki.corpus) -- the closing quote was
# missing, which made the whole script a SyntaxError)
wiki.save('wiki.corpus')
tfidf.save('wiki.tfidf.model')


# word2vec
class MySentences(object):
    """Restartable iterable over the tokenized articles of ``wiki``.

    Each call to ``__iter__`` starts a fresh pass over ``wiki.get_texts()``,
    so the object can be iterated several times (Word2Vec needs more than one
    pass over its input) without holding the corpus in memory.
    """

    def __iter__(self):
        for text in wiki.get_texts():
            # Decode explicitly as UTF-8: a bare .decode() uses ASCII on
            # Python 2 and fails on non-ASCII tokens. Tokens that are already
            # text are passed through untouched.
            # NOTE(review): whether get_texts() yields bytes or str depends on
            # the installed gensim release -- confirm for the target version.
            yield [
                word.decode('utf-8') if isinstance(word, bytes) else word
                for word in text
            ]


sentences = MySentences()

# NOTE(review): gensim 4.x is reported to rename 'size' to 'vector_size';
# the key is left as written for the 2017-era API -- confirm before upgrading.
params = {
    'size': 300,
    'window': 10,
    'min_count': 40,
    # leave one core free, but never go below a single worker
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'sample': 1e-3,
}
word2vec = Word2Vec(sentences, **params)
word2vec.save('wiki.word2vec.model')
maxbellec revised this gist
Jan 23, 2017 . 1 changed file with 2 additions and 0 deletions. There are no files selected for viewing
# Revision of Jan 23, 2017 -- diff hunk @@ -2,6 +2,8 @@ (new-file lines 2-9).
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models import TfidfModel

# logging is important to get the state of the functions
import logging

# Timestamped "time: LEVEL: message" records on the root logger.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
# Lower the root threshold to INFO so progress messages are shown.
logging.getLogger().setLevel(logging.INFO)
maxbellec revised this gist
Jan 23, 2017 . 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -8,7 +8,7 @@ wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False) tfidf = TfidfModel(wiki) sentences = list(wiki.get_texts()) params = {'size': 300, 'window': 10, 'min_count': 40, -
maxbellec created this gist
Jan 22, 2017 . There are no files selected for viewing
# Initial revision (Jan 22, 2017) -- diff hunk @@ -0,0 +1,16 @@:
# the complete first version of the script.
#
# Builds a TF-IDF model and a Word2Vec model from an English Wikipedia dump.
import logging
import multiprocessing

from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models import TfidfModel

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

# Tokenize the dump without lemmatization.
wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', lemmatize=False)

# The original call was TfidfModel(mm, id2word=dictionary, normalize=True),
# but neither `mm` nor `dictionary` is defined anywhere in the script, so it
# raised NameError. Use the corpus built above and its own dictionary.
tfidf = TfidfModel(wiki, id2word=wiki.dictionary, normalize=True)


class _WikiSentences(object):
    """Restartable iterable over the tokenized articles of ``wiki``.

    Replaces ``list(wiki.get_texts())``, which materialized every article of
    the dump in memory at once. Each ``__iter__`` call starts a new pass, so
    the object can be iterated repeatedly.
    """

    def __iter__(self):
        return iter(wiki.get_texts())


sentences = _WikiSentences()

params = {
    'size': 300,
    'window': 10,
    'min_count': 40,
    # leave one core free, but never go below a single worker
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'sample': 1e-3,
}
word2vec = Word2Vec(sentences, **params)