# Topic-modelling pipeline for a Japanese news corpus, concatenated from the
# original lunar1.py .. lunar5.py scripts:
#   1. load article bodies from data/text/<category>/*.txt
#   2. tokenise with MeCab, drop stop words, normalise
#   3. (optional) draw a word cloud
#   4. build count and TF-IDF matrices
#   5. grid-search the number of LDA topics

# import IPython
from IPython.display import display

# lunar1.py header
import pandas as pd
import glob

# lunar2.py header
import MeCab
import mojimoji
import os
import urllib
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so `urllib.request.urlretrieve` does not depend on some other
# package having imported it first.
import urllib.request

# lunar3.py header
from wordcloud import WordCloud
import matplotlib
# Imported explicitly instead of relying on pyplot pulling it in.
import matplotlib.font_manager
import matplotlib.pyplot as plt
import numpy as np

# lunar4.py header
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# lunar5.py header
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Shared MeCab tagger used by `preprocess_jp`.
tagger = MeCab.Tagger("-Ochasen")


# lunar1.py
# Without `recursive=True`, `**` behaves like `*`, so this matches exactly one
# directory level below data/text.
text_paths = glob.glob('data/text/**/*.txt')

texts = []
for text_path in text_paths:
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm. The context
    # manager closes the handle, which the original bare open() never did.
    with open(text_path, 'r', encoding='utf-8') as text_file:
        lines = text_file.read().split('\n')
    # The first three lines are skipped (the original read line index 2 as a
    # title but never used it); the remaining lines form the article body.
    texts.append(' '.join(lines[3:]))

news_ss = pd.Series(texts)
# display(news_ss.head())


# lunar2.py
def load_jp_stopwords(path="data/jp_stop_words.txt"):
    """Return the Japanese stop-word list, downloading it on first use.

    Args:
        path: Local cache location of the stop-word file. If the file does not
            exist it is downloaded from the SlothLib repository to this path.

    Returns:
        list: The values of the first column of the file, read with
        ``pandas.read_csv(path, header=None)``.
    """
    url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
    if os.path.exists(path):
        print('File already exists.')
    else:
        print('Downloading...')
        urllib.request.urlretrieve(url, path)
    return pd.read_csv(path, header=None)[0].tolist()


def preprocess_jp(series):
    """Tokenise, filter and normalise a Series of Japanese texts.

    Each text is parsed with the module-level MeCab ``tagger``. A token is
    kept when it is a noun, an independent verb or an independent adjective,
    and its dictionary-form field is not ``'*'``, is at least two characters
    long and is not a stop word. Kept tokens are joined with single spaces,
    lower-cased, and passed through ``mojimoji.zen_to_han``.

    Args:
        series: ``pandas.Series`` of raw texts (each element is passed
            through ``str``).

    Returns:
        pandas.Series: One space-separated token string per input text.
    """
    # A set gives O(1) membership tests; the check runs once per token.
    stop_words = frozenset(load_jp_stopwords())

    def tokenizer_func(text):
        """Return the kept tokens of *text* as one space-separated string."""
        tokens = []
        node = tagger.parseToNode(str(text))
        while node:
            features = node.feature.split(',')
            # NOTE(review): index 6 is presumably the base (dictionary) form
            # under the IPA dictionary feature layout -- confirm if another
            # dictionary is used.
            base_form = features[6]
            rejected = (
                base_form == '*'
                or len(base_form) < 2
                or base_form in stop_words
            )
            if not rejected:
                pos, pos_detail = features[0], features[1]
                # Every noun is kept (proper nouns included); verbs and
                # adjectives are kept only when tagged as independent.
                keep = pos == '名詞' or (
                    pos in ('動詞', '形容詞') and pos_detail == '自立'
                )
                if keep:
                    tokens.append(base_form)
            node = node.next
        return " ".join(tokens)

    series = series.map(tokenizer_func)

    # ---------------Normalization-----------#
    series = series.map(lambda x: x.lower())
    series = series.map(mojimoji.zen_to_han)
    return series


processed_news_ss = preprocess_jp(news_ss)
# display(processed_news_ss.head())


# lunar3.py
# Font used to render the word cloud; the path is machine-specific.
font_path = "/Library/Fonts/ipaexg.ttf"
font_property = matplotlib.font_manager.FontProperties(fname=font_path, size=24)
# font_property = matplotlib.font_manager.FontProperties(size=24)


def show_wordcloud(series):
    """Render and display a word cloud built from all texts in *series*.

    Args:
        series: ``pandas.Series`` of strings; the values are joined with
            commas into a single string before generation.
    """
    long_string = ','.join(list(series.values))
    # Create a WordCloud object
    wordcloud = WordCloud(
        font_path=font_path,
        background_color="white",
        max_words=1000,
        contour_width=3,
        contour_color='steelblue',
    )
    # Generate a word cloud
    wordcloud.generate(long_string)
    # Visualize the word cloud
    plt.imshow(wordcloud)
    plt.show()


# show_wordcloud(processed_news_ss)


# lunar4.py
count_vectorizer = CountVectorizer()
count_data = count_vectorizer.fit_transform(processed_news_ss)

tfidf_vectorizer = TfidfTransformer()
tfidf_data = tfidf_vectorizer.fit_transform(count_data)

print(tfidf_data.toarray())
print(tfidf_data.shape)


# lunar5.py
def gridsearch_best_model(tfidf_data, plot_enabled=True):
    """Grid-search the LDA topic count and return the best fitted model.

    Fits ``LatentDirichletAllocation`` through ``GridSearchCV`` for
    ``n_components`` in 4..9, prints the best parameters, the best
    cross-validated score and the best model's perplexity on *tfidf_data*,
    and optionally plots the mean test score against the topic count.

    Args:
        tfidf_data: Document-term matrix to fit on.
        plot_enabled: When true, show a plot of the rounded mean test scores
            per candidate topic count.

    Returns:
        The ``best_estimator_`` of the grid search.
    """
    # Define Search Param
    n_topics = [4, 5, 6, 7, 8, 9]
    search_params = {'n_components': n_topics}

    # Init the Model
    lda = LDA(
        max_iter=25,              # Max learning iterations
        learning_method='batch',
        random_state=0,           # Random state
        n_jobs=-1,                # Use all available CPUs
    )

    # Init Grid Search Class
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(tfidf_data)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)
    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)
    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(tfidf_data))

    # Mean cross-validated score for each candidate, rounded for plotting.
    mean_scores = [
        round(score) for score in model.cv_results_["mean_test_score"]
    ]

    if plot_enabled:
        # Show graph
        plt.figure(figsize=(12, 8))
        plt.plot(n_topics, mean_scores)
        plt.title("Choosing Optimal LDA Model")
        plt.xlabel("Numer of Topics")
        plt.ylabel("Log Likelyhood Scores")
        plt.show()

    return best_lda_model