# Convenience helpers for building and evaluating a simple text-classification
# pipeline with scikit-learn.  Nothing new here.
#
# usage:
#
#   data_folder = "stuff/"
#   dataset = load_files(data_folder, shuffle=False)

import sys  # NOTE(review): unused here — presumably kept for callers; confirm before removing

import numpy
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in model_selection.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(min_df=0.01, max_df=0.95, stop_words=None):
    """Build a CountVectorizer -> MultinomialNB classification pipeline.

    Parameters
    ----------
    min_df, max_df : float
        Document-frequency cutoffs forwarded to CountVectorizer: terms in
        fewer than ``min_df`` or more than ``max_df`` (as a proportion of
        documents) are dropped from the vocabulary.
    stop_words : str, list, or None
        Forwarded to CountVectorizer (e.g. ``'english'``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``('vect', ...)`` and ``('clf', ...)``.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(min_df=min_df, max_df=max_df,
                                 stop_words=stop_words)),
        ('clf', MultinomialNB()),
    ])
    return pipeline


## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, dataset, test_size=0.25):
    """Fit ``pipeline`` on a random split of ``dataset`` and print metrics.

    Prints a per-label precision/recall/F1 classification report, followed
    by the 10 highest-weighted features for each label.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        As produced by :func:`easy_pipeline` — step 0 must be the
        vectorizer and step 1 the naive-Bayes classifier.
    dataset : sklearn bunch
        Object with ``.data``, ``.target`` and ``.target_names`` attributes
        (e.g. the result of ``load_files``).
    test_size : float
        Fraction of the data held out for the test report.
    """
    # random_state=None: a different random split on every call.
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size, random_state=None)

    pipeline.fit(docs_train, y_train)
    y_predicted = pipeline.predict(docs_test)

    # print report
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

    ## print out top 10 words per label.
    # MultinomialNB.coef_ was removed in scikit-learn 1.1;
    # feature_log_prob_ holds the same per-class log probabilities and,
    # unlike the old coef_, is always (n_classes, n_features) — so this
    # also works for binary problems.
    clf = pipeline.steps[1][1]
    vect = pipeline.steps[0][1]
    # get_feature_names() was removed in scikit-learn 1.2.
    feature_names = vect.get_feature_names_out()
    for i, class_label in enumerate(dataset.target_names):
        topt = numpy.argsort(clf.feature_log_prob_[i])[-10:]
        print("%s: %s" % (class_label,
                          ", ".join(feature_names[j] for j in topt)))