The use of TF-IDF and LinearSVC is copied verbatim from the scikit-learn text analysis tutorial, applied here to about 5,000 columns gathered from 11 NYT columnists (for example, from /column/maureen-dowd):
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

# load the cleaned columns; each columnist's folder name becomes a class label
data_folder = "./data-hold/cleaned/"
sh_dataset = load_files(data_folder, shuffle=True)
# hold out 25% of the columns for testing
sh_docs_train, sh_docs_test, sh_y_train, sh_y_test = train_test_split(
    sh_dataset.data, sh_dataset.target, test_size=0.25, random_state=None)
# TF-IDF vectorizer feeding a linear support vector classifier
sh_pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
    ('clf', LinearSVC(C=1000)),
])
sh_pipeline.fit(sh_docs_train, sh_y_train)
sh_y_predicted = sh_pipeline.predict(sh_docs_test)
# print the results
print(metrics.classification_report(sh_y_test, sh_y_predicted,
                                    target_names=sh_dataset.target_names))
Initial results:

                   precision    recall  f1-score   support

   charles-m-blow       0.99      0.94      0.96        81
     david-brooks       0.98      0.98      0.98       169
      frank-bruni       1.00      0.98      0.99        64
     gail-collins       0.99      0.98      0.98       167
       joe-nocera       0.95      0.95      0.95        76
     maureen-dowd       0.95      0.98      0.96       125
 nicholas-kristof       0.93      0.96      0.95       134
     paul-krugman       0.98      0.99      0.98       157
      roger-cohen       0.99      0.99      0.99       115
     ross-douthat       1.00      0.94      0.97        49
thomas-l-friedman       0.98      0.98      0.98       126

      avg / total       0.97      0.97      0.97      1263
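
One way to sanity-check what the pipeline has latched onto is to look at the highest-weighted TF-IDF terms for each columnist. The following is a minimal sketch, assuming the pipeline above has already been fitted; get_feature_names_out() requires scikit-learn 1.0+ (older versions use get_feature_names()).

import numpy as np

# pull the fitted vectorizer and classifier back out of the pipeline
vect = sh_pipeline.named_steps['vect']
clf = sh_pipeline.named_steps['clf']
feature_names = vect.get_feature_names_out()

# LinearSVC fits one weight vector per columnist (one-vs-rest), so the
# largest coefficients are the terms most indicative of that columnist
for i, columnist in enumerate(sh_dataset.target_names):
    top_terms = np.argsort(clf.coef_[i])[-10:][::-1]
    print(columnist, ":", ", ".join(feature_names[j] for j in top_terms))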