# some convenience functions here, nothing new
"""Convenience helpers for building and evaluating a text-classification pipeline.

Usage:
    from easypipe import easy_pipeline
    from easypipe import print_metrics
    data_folder = "data-hold/20news"
    p = easy_pipeline()
    print_metrics(p, data_folder)
"""

import sys

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
import numpy


## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(vect='tfidf', min_df=0.01, max_df=0.95,
                  stop_words=None, decode_error='ignore'):
    """Build a vectorizer + Multinomial Naive Bayes pipeline.

    Parameters
    ----------
    vect : str
        'tfidf' selects TfidfVectorizer; any other value selects
        CountVectorizer.
    min_df, max_df : float
        Document-frequency cutoffs passed through to the vectorizer.
    stop_words : str, list, or None
        Stop-word setting passed through to the vectorizer.
    decode_error : str
        How the vectorizer handles bytes it cannot decode
        ('ignore' avoids crashes on messy corpora such as 20news).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ('vect', vectorizer) then ('clf', MultinomialNB()).
    """
    if vect == 'tfidf':
        V = TfidfVectorizer
    else:
        V = CountVectorizer
    pipeline = Pipeline([
        ('vect', V(min_df=min_df, max_df=max_df,
                   stop_words=stop_words, decode_error=decode_error)),
        ('clf', MultinomialNB()),
    ])
    return pipeline


## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, data_folder, test_size=0.25):
    """Fit `pipeline` on a train split of `data_folder` and report metrics.

    Loads a label-per-subfolder corpus with load_files, holds out
    `test_size` of it, fits the pipeline, then prints a per-label
    precision/recall/F1 report followed by the ten highest-weighted
    features for each label.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        A ('vect', vectorizer), ('clf', MultinomialNB) pipeline, e.g.
        from easy_pipeline().
    data_folder : str
        Directory laid out for sklearn.datasets.load_files
        (one subdirectory per class).
    test_size : float
        Fraction of documents held out for evaluation.
    """
    dataset = load_files(data_folder, shuffle=False)
    # random_state=None keeps the original behavior: a different
    # random split on every call.
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size,
        random_state=None)
    pipeline.fit(docs_train, y_train)
    y_predicted = pipeline.predict(docs_test)

    # print report
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

    ## print out top 10 words
    clf = pipeline.steps[1][1]
    vect = pipeline.steps[0][1]
    # get_feature_names() was removed in scikit-learn 1.2; the
    # replacement returns a numpy array, which indexes the same way.
    feature_names = vect.get_feature_names_out()
    for i, class_label in enumerate(dataset.target_names):
        # MultinomialNB.coef_ was removed in scikit-learn 1.0; the
        # per-class feature log probabilities are feature_log_prob_.
        # argsort ascending, so the last 10 are the highest-weighted.
        topt = numpy.argsort(clf.feature_log_prob_[i])[-10:]
        print("%s: %s" % (class_label,
                          ", ".join(feature_names[j] for j in topt)))