Created
          May 24, 2011 12:00 
        
      - 
      
 - 
        
Save pprett/988586 to your computer and use it in GitHub Desktop.  
Revisions
- 
        
pprett revised this gist
May 24, 2011 . 1 changed file with 5 additions and 4 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -14,6 +14,7 @@ from scikits.learn.grid_search import GridSearchCV from scikits.learn.metrics.metrics import f1_score from scikits.learn.cross_val import StratifiedKFold from scikits.learn.preprocessing import Scaler # Initialize default C and gamma values C_start, C_end, C_step = -3, 4, 2 @@ -33,10 +34,10 @@ train, test = iter(StratifiedKFold(Y, 2, indices=True)).next() # standardize data - try to comment this out to see the effect! scaler = Scaler() scaler.fit(X[train]) X[train] = scaler.transform(X[train], copy=False) X[test] = scaler.transform(X[test], copy=False) # make X sparse X = sparse.csr_matrix(X)  - 
        
pprett created this gist
May 24, 2011 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,79 @@ """High difference in classifier accuracies with LinearSVC and SVC. Get data.npz from [1]. [1] https://docs.google.com/leaf?id=0B1BhwRZOwyxRZTcxZDA1OWMtZjZkMy00YjgxLWI3ZTMtZjJkNGIyODAyOTQy&hl=en_US """ print __doc__ import numpy as np from functools import partial from scipy import sparse from scikits.learn import svm from scikits.learn.grid_search import GridSearchCV from scikits.learn.metrics.metrics import f1_score from scikits.learn.cross_val import StratifiedKFold # Initialize default C and gamma values C_start, C_end, C_step = -3, 4, 2 if __name__ == "__main__": cross_fold = 10 A = np.load("data.npz") Y = A["arr_1"] X = A["arr_0"] print "X.shape=", X.shape print "Y.shape=", Y.shape folds = StratifiedKFold(Y, cross_fold, indices=True) train, test = iter(StratifiedKFold(Y, 2, indices=True)).next() # standardize data - try to comment this out to see the effect! mean, std = X[train].mean(axis=0), X[train].std(axis=0) std[std == 0.0] = 1.0 X[train] = (X[train] - mean) / std X[test] = (X[test] - mean) / std # make X sparse X = sparse.csr_matrix(X) # Generate grid search values for C, gamma C_val = 2. ** np.arange(C_start, C_end + C_step, C_step) tol_val = [0.1, 0.01, 0.001, 0.0001] params = {'C': C_val, 'tol': tol_val} for clf_class in [svm.sparse.LinearSVC, partial(svm.sparse.SVC, kernel="linear")]: grid_clf = clf_class() print "_" * 80 print grid_clf print grid_search = GridSearchCV(grid_clf, params, score_func=f1_score) grid_search.fit(X[train], Y[train], cv=StratifiedKFold(Y[train], 10, indices=True)) y_true, y_pred = Y[test], grid_search.predict(X[test]) print "Classification report for the best estimator: " print grid_search.best_estimator print "Tuned for with optimal f1-score: %0.3f" % f1_score(y_true, y_pred) print "Best score: %0.3f" % grid_search.best_score best_parameters = grid_search.best_estimator._get_params() print "Best C: %0.3f " % best_parameters['C'] print "Best tolerance: %0.16f " % best_parameters['tol'] clf = clf_class(C=best_parameters['C'], tol=best_parameters['tol']) print clf clf.fit(X[train], Y[train]) y_pred = clf.predict(X[test]) print "Accuracy:\t%.4f" % (y_true == y_pred).mean() print "F-Score:\t%.4f" % f1_score(y_true, y_pred)