# Convenience helpers for building and evaluating a simple text-classification
# pipeline with scikit-learn.  Nothing new here.
#
# usage:
#
#   data_folder = "stuff/"
#   dataset = load_files(data_folder, shuffle=False)

import sys  # NOTE(review): unused here — presumably kept for callers; confirm before removing

import numpy
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in model_selection.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(min_df=0.01, max_df=0.95, stop_words=None):
    """Build a CountVectorizer -> MultinomialNB classification pipeline.

    Parameters
    ----------
    min_df, max_df : float
        Document-frequency cutoffs forwarded to CountVectorizer: terms in
        fewer than ``min_df`` or more than ``max_df`` (as a proportion of
        documents) are dropped from the vocabulary.
    stop_words : str, list, or None
        Forwarded to CountVectorizer (e.g. ``'english'``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``('vect', ...)`` and ``('clf', ...)``.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(min_df=min_df, max_df=max_df,
                                 stop_words=stop_words)),
        ('clf', MultinomialNB()),
    ])
    return pipeline


## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, dataset, test_size=0.25):
    """Fit ``pipeline`` on a random split of ``dataset`` and print metrics.

    Prints a per-label precision/recall/F1 classification report, followed
    by the 10 highest-weighted features for each label.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        As produced by :func:`easy_pipeline` — step 0 must be the
        vectorizer and step 1 the naive-Bayes classifier.
    dataset : sklearn bunch
        Object with ``.data``, ``.target`` and ``.target_names`` attributes
        (e.g. the result of ``load_files``).
    test_size : float
        Fraction of the data held out for the test report.
    """
    # random_state=None: a different random split on every call.
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size, random_state=None)

    pipeline.fit(docs_train, y_train)
    y_predicted = pipeline.predict(docs_test)

    # print report
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

    ## print out top 10 words per label.
    # MultinomialNB.coef_ was removed in scikit-learn 1.1;
    # feature_log_prob_ holds the same per-class log probabilities and,
    # unlike the old coef_, is always (n_classes, n_features) — so this
    # also works for binary problems.
    clf = pipeline.steps[1][1]
    vect = pipeline.steps[0][1]
    # get_feature_names() was removed in scikit-learn 1.2.
    feature_names = vect.get_feature_names_out()
    for i, class_label in enumerate(dataset.target_names):
        topt = numpy.argsort(clf.feature_log_prob_[i])[-10:]
        print("%s: %s" % (class_label,
                          ", ".join(feature_names[j] for j in topt)))