Skip to content

Instantly share code, notes, and snippets.

@lucinda-lim
Last active October 30, 2020 05:21
Show Gist options
  • Select an option

  • Save lucinda-lim/b3c7ddc5c055021c01e13aaf31845d3d to your computer and use it in GitHub Desktop.

Select an option

Save lucinda-lim/b3c7ddc5c055021c01e13aaf31845d3d to your computer and use it in GitHub Desktop.
from nltk.corpus import stopwords
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import spacy ##python3 -m spacy download en
## ---------------------------------------------------------------------------------------------------------
def sent_to_words(sentences):
    """Lazily tokenize each document with gensim's ``simple_preprocess``.

    Every item is coerced with ``str()`` and tokenized with ``deacc=True``
    (accent marks are stripped). Missing values -- ``None`` or a float NaN,
    which is what an empty cell typically becomes once a pandas column is
    converted to a list -- yield an empty token list instead of being
    stringified into the bogus tokens "none" / "nan". Exactly one list is
    yielded per input item, so the output stays aligned with the input.

    Args:
        sentences: Iterable of documents; non-missing items are ``str()``-ed.

    Yields:
        list[str]: The tokens of each document, in input order.
    """
    import math  # local import keeps this block self-contained

    for sentence in sentences:
        is_nan = isinstance(sentence, float) and math.isnan(sentence)
        if sentence is None or is_nan:
            # Keep the slot so row i of the output still maps to row i of the input.
            yield []
            continue
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Each document is passed through ``str()`` and ``simple_preprocess`` before
    filtering, so a document may be a raw string or an already-tokenized list
    (in which case the list's string form is tokenized again).

    Relies on the module-level ``stop_words`` collection, which must be
    defined before this function is called.

    Args:
        texts: Iterable of documents.

    Returns:
        list[list[str]]: One filtered token list per input document.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # against the stop-word list directly rescans it for every single token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp`` (which must be loaded before calling).
    A token contributes its ``lemma_`` to the result only if its ``pos_`` is
    one of ``allowed_postags``.

    Tag reference given by the original author: https://spacy.io/api/annotation

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Iterable of POS tag strings compared against
            ``token.pos_``. The default is an immutable tuple so it cannot be
            mutated across calls.

    Returns:
        list[list[str]]: One list of lemmas per input document, in order.
    """
    # Normalize once so membership tests are O(1) and a one-shot iterable of
    # tags is not exhausted after the first token.
    keep = frozenset(allowed_postags)
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in keep]
        for sent in texts
    ]
## ---------------------------------------------------------------------------------------------------------
# Pull the keyword column into a plain list.
# NOTE(review): `df` is not defined in this excerpt -- presumably a pandas
# DataFrame with a `Document_Keywords` column; confirm against the caller.
keywords_list = df.Document_Keywords.tolist()

# Tokenize every document (the generator is materialized so it can be reused).
data_words = list(sent_to_words(keywords_list))

# NLTK English stop words; needs the corpus installed (nltk.download('stopwords')).
# Must be assigned before remove_stopwords() runs, since it reads this global.
stop_words = stopwords.words('english')
data_words = remove_stopwords(data_words)

# Load the model by package name: the 'en' shortcut link was removed in
# spaCy v3, while 'en_core_web_sm' loads on both v2 and v3.
# Install with: python -m spacy download en_core_web_sm
# The parser and NER components are disabled because only lemma_/pos_ are read.
# Must be assigned before lemmatization() runs, since it reads this global.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment