# some convenience functions here, nothing new
"""Convenience helpers for building and evaluating a text-classification pipeline.

Usage:
    from easypipe import easy_pipeline
    from easypipe import print_metrics
    data_folder = "data-hold/20news"
    p = easy_pipeline()
    print_metrics(p, data_folder)
"""

import sys

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
import numpy


## make a Multinomial-NB and CountVectorized pipeline
def easy_pipeline(vect='tfidf', min_df=0.01, max_df=0.95,
                  stop_words=None, decode_error='ignore'):
    """Build a vectorizer + Multinomial Naive Bayes pipeline.

    Parameters
    ----------
    vect : str
        'tfidf' selects TfidfVectorizer; any other value selects
        CountVectorizer.
    min_df, max_df : float
        Document-frequency cutoffs passed through to the vectorizer.
    stop_words : str, list, or None
        Stop-word setting passed through to the vectorizer.
    decode_error : str
        How the vectorizer handles bytes it cannot decode
        ('ignore' avoids crashes on messy corpora such as 20news).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ('vect', vectorizer) then ('clf', MultinomialNB()).
    """
    if vect == 'tfidf':
        V = TfidfVectorizer
    else:
        V = CountVectorizer
    pipeline = Pipeline([
        ('vect', V(min_df=min_df, max_df=max_df,
                   stop_words=stop_words, decode_error=decode_error)),
        ('clf', MultinomialNB()),
    ])
    return pipeline


## Print the precision/recall/F1 numbers per label, and also
## the top-10 most informative features per label
def print_metrics(pipeline, data_folder, test_size=0.25):
    """Fit `pipeline` on a train split of `data_folder` and report metrics.

    Loads a label-per-subfolder corpus with load_files, holds out
    `test_size` of it, fits the pipeline, then prints a per-label
    precision/recall/F1 report followed by the ten highest-weighted
    features for each label.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        A ('vect', vectorizer), ('clf', MultinomialNB) pipeline, e.g.
        from easy_pipeline().
    data_folder : str
        Directory laid out for sklearn.datasets.load_files
        (one subdirectory per class).
    test_size : float
        Fraction of documents held out for evaluation.
    """
    dataset = load_files(data_folder, shuffle=False)
    # random_state=None keeps the original behavior: a different
    # random split on every call.
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size,
        random_state=None)
    pipeline.fit(docs_train, y_train)
    y_predicted = pipeline.predict(docs_test)

    # print report
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

    ## print out top 10 words
    clf = pipeline.steps[1][1]
    vect = pipeline.steps[0][1]
    # get_feature_names() was removed in scikit-learn 1.2; the
    # replacement returns a numpy array, which indexes the same way.
    feature_names = vect.get_feature_names_out()
    for i, class_label in enumerate(dataset.target_names):
        # MultinomialNB.coef_ was removed in scikit-learn 1.0; the
        # per-class feature log probabilities are feature_log_prob_.
        # argsort ascending, so the last 10 are the highest-weighted.
        topt = numpy.argsort(clf.feature_log_prob_[i])[-10:]
        print("%s: %s" % (class_label,
                          ", ".join(feature_names[j] for j in topt)))