The use of TF-IDF and LinearSVC is copied verbatim from the scikit-learn text analysis tutorial and applied here to about 5,000 columns gathered from 11 NYT columnists (for example, Maureen Dowd's columns as listed on /column/maureen-dowd).
import sys

from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

try:
    # Current location of train_test_split (scikit-learn 0.18 and later).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split
# Train and evaluate an author classifier: TF-IDF features fed to a linear SVM.
# load_files treats every sub-folder of data_folder as one class and uses the
# folder names as target_names (here presumably one folder per columnist).
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle=True)

# Hold out 25% of the documents for testing. random_state=None means the
# split (and therefore the exact scores) changes from run to run.
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)

sh_pipeline = Pipeline([
    # Drop terms found in fewer than 3 documents or in more than 95% of them.
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)

# print the results
print(metrics.classification_report(
    sh_y_test, sh_y_predicted, target_names=sh_dataset.target_names))

# Initial results:
precision recall f1-score support
charles-m-blow 0.99 0.94 0.96 81
david-brooks 0.98 0.98 0.98 169
frank-bruni 1.00 0.98 0.99 64
gail-collins 0.99 0.98 0.98 167
joe-nocera 0.95 0.95 0.95 76
maureen-dowd 0.95 0.98 0.96 125
nicholas-kristof 0.93 0.96 0.95 134
paul-krugman 0.98 0.99 0.98 157
roger-cohen 0.99 0.99 0.99 115
ross-douthat 1.00 0.94 0.97 49
thomas-l-friedman 0.98 0.98 0.98 126
avg / total 0.97 0.97 0.97 1263
import numpy as np

# List the 20 highest-weighted terms for each class of the fitted pipeline.
# This reuses sh_pipeline and sh_dataset from the training snippet; the
# original referred to undefined names `pipeline` and `dataset`.
clf = sh_pipeline.named_steps['clf']
vect = sh_pipeline.named_steps['vect']

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older releases.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

class_labels = sh_dataset.target_names
for i, class_label in enumerate(class_labels):
    # argsort is ascending, so the last 20 indices are the largest
    # coefficients for class i, printed weakest-to-strongest.
    topt = np.argsort(clf.coef_[i])[-20:]
    print("%s: %s" % (class_label,
                      " ".join(feature_names[j] for j in topt)))

# Results:
charles-m-blow: zimmerman sequester week pew thankful gallup trayvon wednesday those pointed officer president continued nearly report furthermore poll must released according
david-brooks: moral series each these few speech then self cooper he culture lewinsky percent will past kerry people sort they are
frank-bruni: ones less monday there just he zelizer whose wasn evangelical isn colorado its many or last re them gay which
gail-collins: idea since perhaps giuliani all been guy ginsburg actually totally quiz who definitely was presidential going nobody pretty everybody really
joe-nocera: luke course money caro executive thus which article though indeed gun athletes retirement detainees joe football its company instance had
maureen-dowd: noting rice mushy put up poppy wrote old who christmas adding replied cheney tuesday hillary white even president said washington
nicholas-kristof: jesus isn notes my girls often united sudan then moldova one mr sometimes year found partly also yet may likewise
paul-krugman: thing which investors mainly aren isn answer even bad large claim administration example financial declared insurance fact what however mr
roger-cohen: french from century where obama course holbrooke minister perhaps land cannot words adderall before must states me has united london
ross-douthat: christian promise though post internet last critics liberals liberalism rather sweeping religious might instance instead kind well daniels liberal era
thomas-l-friedman: therefore will simon how watson putin just sandel arab more their anymore need regime israel our energy america added today