lucinda-lim · October 30, 2020 05:21
diff --git a/text_preprocessing.py b/text_preprocessing.py
 from nltk.corpus import stopwords
 import spacy
 import gensim
 import gensim.corpora as corpora
 from gensim.utils import simple_preprocess
 from gensim.models import CoherenceModel
 from pprint import pprint
 import pyLDAvis
 import pyLDAvis.gensim  
 import matplotlib.pyplot as plt
 %matplotlib inline
 import spacy ##python3 -m spacy download en
 ## ---------------------------------------------------------------------------------------------------------
 def sent_to_words(sentences):
     """Lazily tokenize every item of ``sentences``.

     Each item is coerced with ``str()`` and handed to
     ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
     yielded per input item, in input order; nothing is processed until the
     generator is consumed.
     """
     yield from (
         gensim.utils.simple_preprocess(str(text), deacc=True)
         for text in sentences
     )
        
 def remove_stopwords(texts):
     """Drop stop words from every document in ``texts``.

     Each document is coerced with ``str()`` and tokenized again with
     ``simple_preprocess`` before filtering (for a token list, ``str()``
     produces its repr, which is what gets re-tokenized). Tokens present in
     the module-level ``stop_words`` collection are discarded.

     Returns a list holding one token list per input document, in input
     order.
     """
     # Build the lookup once per call: set membership is O(1), whereas
     # testing against the stop-word list rescans it for every token.
     blocked = frozenset(stop_words)
     return [
         [token for token in simple_preprocess(str(doc)) if token not in blocked]
         for doc in texts
     ]
    
 def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN')):
     """Lemmatize tokenized documents, keeping only selected parts of speech.

     https://spacy.io/api/annotation

     texts: iterable of token sequences. Each one is joined with single
         spaces and run through the module-level ``nlp`` pipeline.
     allowed_postags: container of ``token.pos_`` values to keep. The default
         is a tuple rather than a list so the default object cannot be
         mutated between calls; any container supporting ``in`` is accepted.

     Returns a list with one list of lemmas (``token.lemma_``) per input
     document, in input order.
     """
     return [
         [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
         for sent in texts
     ]
 ## ---------------------------------------------------------------------------------------------------------
 # Pipeline: tokenize -> remove stop words -> lemmatize.
 # NOTE(review): `df` is defined outside this section; presumably a pandas
 # DataFrame with a `Document_Keywords` column -- confirm against the caller.
 keywords_list = df.Document_Keywords.tolist()
 data_words = list(sent_to_words(keywords_list))
 # remove_stopwords() reads the module-level `stop_words`, so bind it first.
 stop_words = stopwords.words('english')
 data_words = remove_stopwords(data_words)
 # Load the model by package name: the 'en' shortcut link was removed in
 # spaCy 3, while 'en_core_web_sm' resolves on both 2.x and 3.x once the
 # model package is installed (python -m spacy download en_core_web_sm).
 # lemmatization() reads the module-level `nlp`, so bind it before the call.
 nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
 data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PROPN'])
	from nltk.corpus import stopwords
	import spacy
	import gensim
	import gensim.corpora as corpora
	from gensim.utils import simple_preprocess
	from gensim.models import CoherenceModel
	from pprint import pprint
	import pyLDAvis
	import pyLDAvis.gensim
	import matplotlib.pyplot as plt
	%matplotlib inline
	import spacy ##python3 -m spacy download en
	## ---------------------------------------------------------------------------------------------------------
	def sent_to_words(sentences):
	    """Lazily tokenize every item of ``sentences``.

	    Each item is coerced with ``str()`` and handed to
	    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
	    yielded per input item, in input order.
	    """
	    # The loop and yield must be nested under the def; with the body at
	    # the same column as the header the function does not parse.
	    for sentence in sentences:
	        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

	def remove_stopwords(texts):
	    """Drop stop words from every document in ``texts``.

	    Each document is coerced with ``str()`` and tokenized again with
	    ``simple_preprocess`` before filtering. Tokens present in the
	    module-level ``stop_words`` collection are discarded.

	    Returns a list holding one token list per input document, in input
	    order.
	    """
	    # Build the lookup once per call: set membership is O(1), whereas
	    # testing against the stop-word list rescans it for every token.
	    blocked = frozenset(stop_words)
	    return [
	        [token for token in simple_preprocess(str(doc)) if token not in blocked]
	        for doc in texts
	    ]

	def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN')):
	    """Lemmatize tokenized documents, keeping only selected parts of speech.

	    https://spacy.io/api/annotation

	    texts: iterable of token sequences. Each one is joined with single
	        spaces and run through the module-level ``nlp`` pipeline.
	    allowed_postags: container of ``token.pos_`` values to keep. The
	        default is a tuple rather than a list so the default object cannot
	        be mutated between calls; any container supporting ``in`` works.

	    Returns a list with one list of lemmas (``token.lemma_``) per input
	    document, in input order.
	    """
	    texts_out = []
	    for sent in texts:
	        doc = nlp(" ".join(sent))
	        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
	    return texts_out
	## ---------------------------------------------------------------------------------------------------------
	# Pipeline: tokenize -> remove stop words -> lemmatize.
	# NOTE(review): `df` is defined outside this section; presumably a pandas
	# DataFrame with a `Document_Keywords` column -- confirm against the caller.
	keywords_list = df.Document_Keywords.tolist()
	data_words = list(sent_to_words(keywords_list))
	# remove_stopwords() reads the module-level `stop_words`, so bind it first.
	stop_words = stopwords.words('english')
	data_words = remove_stopwords(data_words)
	# Load the model by package name: the 'en' shortcut link was removed in
	# spaCy 3, while 'en_core_web_sm' resolves on both 2.x and 3.x once the
	# model package is installed (python -m spacy download en_core_web_sm).
	# lemmatization() reads the module-level `nlp`, so bind it before the call.
	nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
	data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','PROPN'])
No results found