# Columnist attribution using just CountVectorizer (document-frequency
# thresholds of 0.05 to 0.75) and Multinomial Naive Bayes.
#
# The corpus is read from ./data-hold/cleaned/, which load_files() expects
# to contain one sub-folder per class; the folder names become the labels.

import sys

import numpy
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the pre-model_selection module.
    from sklearn.cross_validation import train_test_split

data_folder = "./data-hold/cleaned/"
dataset = load_files(data_folder, shuffle=False)

# random_state=None means the 75/25 split differs on every run, so the
# figures recorded below come from one particular run and will not be
# reproduced exactly.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=None)

# Only keep terms that occur in at least 5% and at most 75% of the documents.
pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=0.05, max_df=0.75)),
    ('clf', MultinomialNB()),
])
pipeline.fit(docs_train, y_train)

y_predicted = pipeline.predict(docs_test)
print(metrics.classification_report(
    y_test, y_predicted, target_names=dataset.target_names))

################ Performance report
#                    precision    recall  f1-score   support
#
#    charles-m-blow       0.87      0.71      0.78        82
#      david-brooks       0.93      0.75      0.83       187
#       frank-bruni       0.69      0.83      0.76        60
#      gail-collins       0.84      0.89      0.87       169
#        joe-nocera       0.80      0.83      0.81        81
#      maureen-dowd       0.76      0.88      0.81       128
#  nicholas-kristof       0.84      0.83      0.83       123
#      paul-krugman       0.90      0.91      0.90       154
#       roger-cohen       0.80      0.86      0.83       115
#      ross-douthat       0.77      0.71      0.74        48
# thomas-l-friedman       0.87      0.87      0.87       116
#
#       avg / total       0.84      0.83      0.83      1263

## print out top 10 most informative features
vect = pipeline.named_steps['vect']
clf = pipeline.named_steps['clf']

# Fetch the vocabulary once instead of once per printed feature, and prefer
# the current accessor while still working where only the old one exists.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# feature_log_prob_ holds one row of per-term log-probabilities per class.
# It is the matrix the old `coef_` alias exposed for multiclass models, but
# it is still available and also has one row per class when there are only
# two classes.
class_weights = clf.feature_log_prob_

# Rows follow clf.classes_ (the targets actually seen while fitting), so map
# each row back to its label through that array rather than assuming row i
# is target_names[i].
for row, target in enumerate(clf.classes_):
    class_label = dataset.target_names[target]
    # argsort is ascending: the last 10 indices are the highest-weighted
    # terms, printed from weakest to strongest.
    topt = numpy.argsort(class_weights[row])[-10:]
    print("%s: %s" % (
        class_label, ", ".join(feature_names[j] for j in topt)))

# charles-m-blow: republicans, said, some, our, most, those, were, obama, president, percent
# david-brooks: new, government, now, them, over, some, these, do, were, obama
# frank-bruni: last, many, just, re, them, had, him, said, her, she
# gail-collins: get, state, mr, do, her, were, she, said, new, had
# joe-nocera: do, them, years, other, said, she, new, were, its, had
# maureen-dowd: hillary, even, were, him, had, president, said, her, obama, she
# nicholas-kristof: my, also, some, because, our, year, said, had, her, she
# paul-krugman: government, were, health, much, obama, economic, economy, now, even, mr
# roger-cohen: had, states, united, american, now, israel, world, iran, obama, its
# ross-douthat: well, because, new, many, even, just, party, our, its, obama
# thomas-l-friedman: president, america, them, how, its, now, just, do, world, our