import numpy as np
import matplotlib.pyplot as pl
import pandas as pd

from sklearn import svm
from sklearn import linear_model
from sklearn import tree
from sklearn.metrics import confusion_matrix

x_min, x_max = 0, 15
y_min, y_max = 0, 10
step = .1

# to build the dataset, create a grid of every possible point,
# then color each point red or green based on its distance from the flag's center
xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))

df = pd.DataFrame(data={'x': xx.ravel(), 'y': yy.ravel()})
df['color_gauge'] = (df.x - 7.5)**2 + (df.y - 5)**2
df['color'] = df.color_gauge.apply(lambda x: "red" if x <= 15 else "green")
df['color_as_int'] = df.color.apply(lambda x: 0 if x == "red" else 1)

print("Points on flag:")
print(df.groupby('color').size())
print()

figure = 1
# plot a figure for the entire dataset
for color in df.color.unique():
    idx = df.color == color
    pl.subplot(2, 2, figure)
    pl.scatter(df[idx].x, df[idx].y, color=color)
    pl.title('Actual')

# hold out everything with x >= 10 as the test set
train_idx = df.x < 10
train = df[train_idx].copy()
test = df[~train_idx].copy()

print("Training Set Size: %d" % len(train))
print("Test Set Size: %d" % len(test))

# train using the x and y position coordinates
cols = ["x", "y"]

clfs = {
    "SVM": svm.SVC(),  # default RBF kernel
    "Logistic": linear_model.LogisticRegression(),
    "Decision Tree": tree.DecisionTreeClassifier()
}

# race the different classifiers and plot the results
for clf_name, clf in clfs.items():
    figure += 1

    # train the classifier
    clf.fit(train[cols], train.color_as_int)

    # get the predicted values for the test set
    test['predicted_color_as_int'] = clf.predict(test[cols])
    test['pred_color'] = test.predicted_color_as_int.apply(lambda x: "red" if x == 0 else "green")

    # create a new subplot on the plot
    pl.subplot(2, 2, figure)

    # plot each predicted color
    for color in test.pred_color.unique():
        # plot only rows where pred_color is equal to color
        idx = test.pred_color == color
        pl.scatter(test[idx].x, test[idx].y, color=color)

    # plot the training set as well
    for color in train.color.unique():
        idx = train.color == color
        pl.scatter(train[idx].x, train[idx].y, color=color)

    # add a dotted line to show the boundary between the training and test set
    # (everything to the right of the line is in the test set)
    # this plots a vertical line at x = 10
    train_line_y = np.linspace(y_min, y_max)          # evenly spaced array from 0 to 10
    train_line_x = np.repeat(10, len(train_line_y))   # repeat 10 (the training-set threshold) n times

    # add a black, dotted line to the subplot
    pl.plot(train_line_x, train_line_y, 'k--')
    pl.title(clf_name)

    print("Confusion Matrix for %s:" % clf_name)
    print(confusion_matrix(test.color, test.pred_color))

pl.show()
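
# Optional follow-up sketch (an addition, not part of the original script): report a single
# accuracy number per classifier alongside its confusion matrix. This assumes the models in
# `clfs` have already been fitted above and uses sklearn.metrics.accuracy_score.
from sklearn.metrics import accuracy_score

for clf_name, clf in clfs.items():
    # map the integer predictions back to color labels so they compare against test.color
    pred_colors = np.where(clf.predict(test[cols]) == 0, "red", "green")
    print("Accuracy for %s: %.3f" % (clf_name, accuracy_score(test.color, pred_colors)))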