@pprett
Created May 24, 2011 12:00

Revisions

  1. pprett revised this gist May 24, 2011. 1 changed file with 5 additions and 4 deletions.
    9 changes: 5 additions & 4 deletions linearsvc_vs_svc.py
    @@ -14,6 +14,7 @@
     from scikits.learn.grid_search import GridSearchCV
     from scikits.learn.metrics.metrics import f1_score
     from scikits.learn.cross_val import StratifiedKFold
    +from scikits.learn.preprocessing import Scaler
     
     # Initialize default C and gamma values
     C_start, C_end, C_step = -3, 4, 2
    @@ -33,10 +34,10 @@
         train, test = iter(StratifiedKFold(Y, 2, indices=True)).next()
     
         # standardize data - try to comment this out to see the effect!
    -    mean, std = X[train].mean(axis=0), X[train].std(axis=0)
    -    std[std == 0.0] = 1.0
    -    X[train] = (X[train] - mean) / std
    -    X[test] = (X[test] - mean) / std
    +    scaler = Scaler()
    +    scaler.fit(X[train])
    +    X[train] = scaler.transform(X[train], copy=False)
    +    X[test] = scaler.transform(X[test], copy=False)
     
         # make X sparse
         X = sparse.csr_matrix(X)
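
    Note: scikits.learn is the pre-2012 package name for scikit-learn, and the
    Scaler class introduced in this revision was later renamed StandardScaler.
    A minimal sketch of the same standardization step against a current
    scikit-learn install (the modern names are an assumption about the
    reader's environment, not part of the gist):

        from sklearn.preprocessing import StandardScaler

        scaler = StandardScaler()
        scaler.fit(X[train])                   # statistics from the training fold only
        X[train] = scaler.transform(X[train])
        X[test] = scaler.transform(X[test])    # reuse the training-fold mean/std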
  2. pprett created this gist May 24, 2011.
    79 changes: 79 additions & 0 deletions linearsvc_vs_svc.py
    @@ -0,0 +1,79 @@
    """High difference in classifier accuracies with LinearSVC and SVC.
    Get data.npz from [1].
    [1] https://docs.google.com/leaf?id=0B1BhwRZOwyxRZTcxZDA1OWMtZjZkMy00YjgxLWI3ZTMtZjJkNGIyODAyOTQy&hl=en_US
    """
    print __doc__

    import numpy as np
    from functools import partial

    from scipy import sparse
    from scikits.learn import svm
    from scikits.learn.grid_search import GridSearchCV
    from scikits.learn.metrics.metrics import f1_score
    from scikits.learn.cross_val import StratifiedKFold

    # Initialize default C and gamma values
    C_start, C_end, C_step = -3, 4, 2


    if __name__ == "__main__":
    cross_fold = 10

    A = np.load("data.npz")

    Y = A["arr_1"]
    X = A["arr_0"]
    print "X.shape=", X.shape
    print "Y.shape=", Y.shape

    folds = StratifiedKFold(Y, cross_fold, indices=True)
    train, test = iter(StratifiedKFold(Y, 2, indices=True)).next()

    # standardize data - try to comment this out to see the effect!
    mean, std = X[train].mean(axis=0), X[train].std(axis=0)
    std[std == 0.0] = 1.0
    X[train] = (X[train] - mean) / std
    X[test] = (X[test] - mean) / std

    # make X sparse
    X = sparse.csr_matrix(X)

    # Generate grid search values for C, gamma
    C_val = 2. ** np.arange(C_start, C_end + C_step, C_step)
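        # with C_start, C_end, C_step = -3, 4, 2 this evaluates to
        # 2. ** [-3, -1, 1, 3, 5], i.e. [0.125, 0.5, 2., 8., 32.]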
        tol_val = [0.1, 0.01, 0.001, 0.0001]
        params = {'C': C_val, 'tol': tol_val}

        for clf_class in [svm.sparse.LinearSVC,
                          partial(svm.sparse.SVC, kernel="linear")]:
            grid_clf = clf_class()
            print "_" * 80
            print grid_clf
            print

            grid_search = GridSearchCV(grid_clf, params, score_func=f1_score)
            grid_search.fit(X[train], Y[train],
                            cv=StratifiedKFold(Y[train], 10, indices=True))
            y_true, y_pred = Y[test], grid_search.predict(X[test])

            print "Classification report for the best estimator:"
            print grid_search.best_estimator

            print "Tuned with optimal f1-score: %0.3f" % f1_score(y_true, y_pred)

            print "Best score: %0.3f" % grid_search.best_score

            best_parameters = grid_search.best_estimator._get_params()
            print "Best C: %0.3f" % best_parameters['C']
            print "Best tolerance: %0.16f" % best_parameters['tol']

            clf = clf_class(C=best_parameters['C'], tol=best_parameters['tol'])
            print clf
            clf.fit(X[train], Y[train])
            y_pred = clf.predict(X[test])
            print "Accuracy:\t%.4f" % (y_true == y_pred).mean()
            print "F-Score:\t%.4f" % f1_score(y_true, y_pred)