Last active
October 30, 2020 05:21
-
-
Save lucinda-lim/b3c7ddc5c055021c01e13aaf31845d3d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from nltk.corpus import stopwords | |
| import spacy | |
| import gensim | |
| import gensim.corpora as corpora | |
| from gensim.utils import simple_preprocess | |
| from gensim.models import CoherenceModel | |
| from pprint import pprint | |
| import pyLDAvis | |
| import pyLDAvis.gensim | |
| import matplotlib.pyplot as plt | |
| %matplotlib inline | |
| import spacy ##python3 -m spacy download en | |
| ## --------------------------------------------------------------------------------------------------------- | |
def sent_to_words(sentences):
    """Lazily tokenize every entry of ``sentences``.

    Each entry is coerced to ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per entry, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(entry), deacc=True)
        for entry in sentences
    )
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is stringified with ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering, exactly as before.

    Args:
        texts: iterable of documents (the caller in this file passes
            token lists).

    Returns:
        A list with one token list per input document, in input order,
        containing only tokens that are not in ``stop_words``.

    Note:
        Reads the module-level ``stop_words`` at call time, so it must be
        assigned before this function is called.
    """
    # Snapshot the stop words into a set once so each membership test is
    # O(1) instead of a linear scan over the list for every token.
    excluded = frozenset(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in excluded]
        for doc in texts
    ]
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of token sequences; each one is joined with single
            spaces and run through the module-level ``nlp`` pipeline.
        allowed_postags: container of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between
            calls; any container supporting ``in`` (e.g. a list) works.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input
        document, in input order.

    Note:
        Reads the module-level ``nlp`` at call time, so it must be
        assigned before this function is called with non-empty input.
    """
    return [
        [
            token.lemma_
            for token in nlp(" ".join(sent))
            if token.pos_ in allowed_postags
        ]
        for sent in texts
    ]
| ## --------------------------------------------------------------------------------------------------------- | |
# NOTE(review): `df` is not defined in this snippet; presumably a pandas
# DataFrame created earlier in the notebook -- confirm.
keywords_list = df.Document_Keywords.tolist()

# Tokenize every keyword entry (sent_to_words is a generator, so materialize it).
data_words = list(sent_to_words(keywords_list))

# remove_stopwords() reads the global `stop_words`, so assign it first.
stop_words = stopwords.words('english')
data_words = remove_stopwords(data_words)

# lemmatization() reads the global `nlp`, so assign it first.
# Load the model by its package name: the 'en' shortcut link is not
# supported by spaCy v3+, while 'en_core_web_sm' also works on v2 once the
# model is installed (python3 -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment