# import IPython
from IPython.display import display

# lunar1.py header
import pandas as pd
import glob

# lunar2.py header
import MeCab
tagger = MeCab.Tagger("-Ochasen")
import mojimoji
import os
import urllib

# lunar3.py header
from wordcloud import WordCloud
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

#lunar4.py header
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# lunar5.py header
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation as LDA

#lunar1.py
# Collect the body text of every article file into a pandas Series.
# NOTE: glob is called without recursive=True, so "**" matches exactly one
# directory level here (data/text/<subdir>/<name>.txt).
text_paths = glob.glob('data/text/**/*.txt')
texts = []
for text_path in text_paths:
    # Use a context manager so the handle is closed right away instead of
    # leaking one open file per article until garbage collection.
    # NOTE(review): the corpus is assumed to be UTF-8 encoded - confirm.
    with open(text_path, 'r', encoding='utf-8') as text_file:
        lines = text_file.read().split('\n')
    # The line at index 2 is taken as the title; the lines at index 0 and 1
    # are skipped (presumably metadata - verify against the corpus).
    title = lines[2]
    # Everything from index 3 onwards is the body, flattened to one line.
    text = ' '.join(lines[3:])
    texts.append(text)

news_ss = pd.Series(texts)
# display(news_ss.head()) 

# lunar2.py
def load_jp_stopwords(path="data/jp_stop_words.txt"):
    """Return the Japanese stop-word list, downloading it on first use.

    When ``path`` already exists it is used as a local cache; otherwise the
    stop-word file is fetched from the hard-coded URL and stored at ``path``.

    Args:
        path: Location of the local copy of the stop-word file.

    Returns:
        list: Values of the first column of the file as parsed by
        ``pandas.read_csv`` with no header row.

    Raises:
        urllib.error.URLError: If the file has to be downloaded and the
            request fails.
    """
    url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
    if os.path.exists(path):
        print('File already exists.')
    else:
        print('Downloading...')
        # A bare ``import urllib`` does not load the ``request`` submodule,
        # so import it explicitly where it is needed.
        import urllib.request

        # urlretrieve does not create missing directories.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Download to a temporary name first so an interrupted transfer does
        # not leave a truncated file that later calls would treat as cached.
        partial_path = path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, path)
    return pd.read_csv(path, header=None)[0].tolist()

def preprocess_jp(series):
    """Tokenize and normalize a Series of Japanese text.

    Every element is parsed with the module-level MeCab ``tagger``. Nouns,
    and verbs/adjectives whose sub-category is '自立', are kept; their
    ``features[6]`` value is used as the token. Tokens that are '*', shorter
    than two characters, or in the stop-word list are dropped. The surviving
    tokens are joined with single spaces, lower-cased and passed through
    ``mojimoji.zen_to_han``.

    Args:
        series: pandas Series whose elements are converted with ``str()``
            before parsing.

    Returns:
        pandas Series of space-separated, normalized token strings.
    """
    # A set gives constant-time membership tests; the original list was
    # scanned linearly once for every token in the corpus.
    stop_words = set(load_jp_stopwords())

    def is_content_word(features):
        """Return True for nouns and for independent verbs/adjectives."""
        part_of_speech = features[0]
        if part_of_speech == '名詞':
            # Covers proper nouns ('固有名詞') as well as every other noun.
            return True
        return part_of_speech in ('動詞', '形容詞') and features[1] == '自立'

    def tokenizer_func(text):
        tokens = []
        node = tagger.parseToNode(str(text))
        while node:
            features = node.feature.split(',')
            # Advance now so every skip path below can simply ``continue``.
            node = node.next
            if len(features) < 7:
                # No field at index 6 to read; skip rather than raise.
                continue
            # Presumably the base (dictionary) form in the IPADIC feature
            # layout - confirm against the installed dictionary.
            base_form = features[6]
            if base_form == '*' or len(base_form) < 2 or base_form in stop_words:
                continue
            if is_content_word(features):
                tokens.append(base_form)
        return " ".join(tokens)

    series = series.map(tokenizer_func)

    # ---------------Normalization-----------#
    series = series.map(lambda x: x.lower())
    series = series.map(mojimoji.zen_to_han)

    return series
    
# Tokenize and normalize every loaded article; the result is a Series of
# space-separated token strings used by the later word-cloud and TF-IDF steps.
processed_news_ss = preprocess_jp(news_ss)
# display(processed_news_ss.head()) 

# lunar3.py
# Font file passed to WordCloud in show_wordcloud().
# NOTE(review): hard-coded absolute path (looks like a macOS font location) -
# confirm the file exists on the machine running this script.
font_path="/Library/Fonts/ipaexg.ttf"
# FontProperties built from the same font file at size 24; it is not
# referenced anywhere else in the visible part of this file.
# NOTE(review): matplotlib.font_manager is never imported explicitly here -
# presumably it is loaded as a side effect of importing matplotlib.pyplot.
font_property = matplotlib.font_manager.FontProperties(fname=font_path, size=24)
# font_property = matplotlib.font_manager.FontProperties(size=24)

def show_wordcloud(series):
    """Render and display a word cloud built from all documents in ``series``.

    The elements of ``series`` are joined with commas into one string, a
    WordCloud using the module-level ``font_path`` is generated from it, and
    the image is shown with matplotlib.

    Args:
        series: pandas Series of strings (e.g. space-separated tokens).
    """
    # One big comma-separated string containing every document.
    corpus_text = ','.join(series.values)

    # Configure the cloud; the font comes from the module-level font_path.
    cloud_options = {
        'font_path': font_path,
        'background_color': "white",
        'max_words': 1000,
        'contour_width': 3,
        'contour_color': 'steelblue',
    }
    cloud = WordCloud(**cloud_options)

    # Build the cloud from the text, then draw it and open the figure.
    cloud.generate(corpus_text)
    plt.imshow(cloud)
    plt.show()

# show_wordcloud(processed_news_ss) 

# lunar4.py
# Turn the space-separated token strings into a document-term count matrix.
count_vectorizer = CountVectorizer()
count_data = count_vectorizer.fit_transform(processed_news_ss)
# Despite its name, tfidf_vectorizer is a TfidfTransformer: it re-weights the
# count matrix produced above rather than vectorizing raw text itself.
tfidf_vectorizer = TfidfTransformer()
tfidf_data = tfidf_vectorizer.fit_transform(count_data)
# NOTE(review): toarray() builds the full dense matrix just to print it;
# presumably fine for a small corpus, but memory-hungry for a large one.
print(tfidf_data.toarray()) 
# (number of documents, vocabulary size)
print(tfidf_data.shape)

# lunar5.py
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation as LDA

def gridsearch_best_model(tfidf_data, plot_enabled=True, n_topics=None):
    """Grid-search the number of LDA topics and return the best fitted model.

    An LDA model is cross-validated with ``GridSearchCV`` for each candidate
    ``n_components`` value. The best parameters, the best mean
    cross-validation score and the perplexity of the best model on
    ``tfidf_data`` are printed.

    Args:
        tfidf_data: Document-term matrix passed to ``GridSearchCV.fit``.
            NOTE(review): the caller passes TF-IDF weights; LDA is usually
            fitted on raw counts - confirm this is intended.
        plot_enabled: When true, plot the rounded mean cross-validation score
            against the number of topics.
        n_topics: Iterable of candidate topic counts. Defaults to
            ``[4, 5, 6, 7, 8, 9]`` when omitted.

    Returns:
        The ``best_estimator_`` selected by the grid search.

    Raises:
        ValueError: If ``n_topics`` is given but contains no candidates.
    """
    # Define Search Param
    if n_topics is None:
        topic_counts = [4, 5, 6, 7, 8, 9]
    else:
        topic_counts = list(n_topics)
    if not topic_counts:
        raise ValueError("n_topics must contain at least one candidate")
    search_params = {'n_components': topic_counts}

    # Init the Model
    lda = LDA(
        max_iter=25,              # Max learning iterations
        learning_method='batch',
        random_state=0,           # Fixed seed for reproducible fits
        n_jobs=-1,                # Use all available CPUs
    )

    # Init Grid Search Class; with no explicit ``scoring`` the estimator's
    # own ``score`` method ranks the candidates.
    model = GridSearchCV(lda, param_grid=search_params)

    # Do the Grid Search
    model.fit(tfidf_data)

    # Best Model
    best_lda_model = model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(tfidf_data))

    if plot_enabled:
        # The rounded scores are only needed for the plot, so compute them
        # here rather than on every call.
        log_likelihood_scores = [
            round(score) for score in model.cv_results_["mean_test_score"]
        ]

        # Show graph
        plt.figure(figsize=(12, 8))
        plt.plot(topic_counts, log_likelihood_scores)
        plt.title("Choosing Optimal LDA Model")
        plt.xlabel("Numer of Topics")
        plt.ylabel("Log Likelyhood Scores")
        plt.show()

    return best_lda_model