""" A plain linear SVM for text classification """ import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import SVC, LinearSVC from sklearn import metrics from sklearn.cross_validation import StratifiedKFold from sklearn.datasets import fetch_20newsgroups if __name__ == "__main__": categories = [ 'alt.atheism', 'talk.religion.misc', ] data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42) X = data_train.data y = np.array(data_train.target) kf = StratifiedKFold(y, k=5, indices=True) vectorizer = TfidfVectorizer() clf = SVC(kernel="linear") #clf = LinearSVC() for train_index, test_index in kf: text_train = [] for i in train_index: text_train.append(X[i]) text_test = [] for i in test_index: text_test.append(X[i]) y_train, y_test = y[train_index], y[test_index] X_train = vectorizer.fit_transform(text_train) clf.fit(X_train, y_train) X_test = vectorizer.transform(text_test) y_predict = clf.predict(X_test) print metrics.confusion_matrix(y_test, y_predict) print metrics.classification_report(y_test, y_predict)