Mashimo · April 29, 2018 22:39 · May 15, 2017 · May 15, 2017 · May 15, 2017 · May 15, 2017
diff --git a/Decision Tree.md → Decision Tree b/Decision Tree.md → Decision Tree
diff --git a/Decision Tree.md b/Decision Tree.md
@@ -0,0 +1,3 @@
+Decision trees are supervised, probabilistic machine learning classifiers that are often used as decision support tools. Like any other classifier, they are capable of predicting the label of a sample, and they do this by examining the probabilistic outcomes of your samples' features.  
+Decision trees are one of the oldest and most used machine learning algorithms, perhaps even pre-dating machine learning. They're very popular and have been around for decades. Following through with sequential cause-and-effect decisions comes very naturally.  
+Decision trees are a good tool to use when you want backing evidence to support a decision.
diff --git a/mushroomsTree.py b/mushroomsTree.py
@@ -0,0 +1,87 @@
+"""
+Use decision trees to peruse The Mushroom Data Set, drawn from the Audubon 
+Society Field Guide to North American Mushrooms (1981). The data set details 
+mushrooms described in terms of many physical characteristics, such as cap size 
+and stalk length, along with a classification of poisonous or edible.
+
+As a standard disclaimer, if you eat a random mushroom you find, you are doing 
+so at your own risk.
+"""
+import pandas as pd
+from sklearn import tree
+from sklearn.model_selection import train_test_split
+
+
+# The dataset description is here:
+#    https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names
+
+
+#
+# Load the mushroom dataset into dataframe 'X'.
+# The file has no header row (header=None); the column names below follow
+# the header information on the dataset's website at the UCI ML Repo.
+# Values recorded as '?' are read as NaN.
+#
+colNames = ['label', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
+            'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
+            'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
+            'stalk-surface-below-ring', 'stalk-color-above-ring',
+            'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
+            'ring-type', 'spore-print-color', 'population', 'habitat']
+X = pd.read_csv("Datasets/agaricus-lepiota.data", header=None, na_values='?',
+                names=colNames)
+
+
+#
+# Drop every row that contains a NaN, then report the remaining shape.
+#
+X.dropna(axis=0, inplace=True)
+print(X.shape)
+
+
+#
+# Move the labels out of X into 'y' and encode them:
+# poisonous ('p') -> 0, edible ('e') -> 1.
+#
+y = X.pop('label').map({'p': 0, 'e': 1})
+
+
+#
+# Encode the entire dataset as dummy (indicator) variables.
+#
+X = pd.get_dummies(X)
+
+
+#
+# Split the data into train / test sets (30% held out for testing).
+#
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
+                                                    random_state=7)
+
+
+#
+# Create a decision tree classifier (no parameters set) and train it on
+# the training data / labels.
+#
+model = tree.DecisionTreeClassifier()
+model.fit(X_train, y_train)
+
+
+#
+# Score the classifier on the testing data / labels.
+#
+score = model.score(X_test, y_test)
+print("High-Dimensionality Score: ", round((score*100), 3))
+
+
+# RESULT:
+# The top two features to consider when deciding whether a mushroom is
+# edible or not: Odor and Gill Size.
+#
+# Output a .DOT file.
+# .DOT files can be rendered to .PNGs if Graphviz is installed
+# (e.g. `brew install graphviz`). If you can't install it, use
+# http://webgraphviz.com/.
+#
+# export_graphviz takes the fitted estimator itself, not its internal tree_.
+tree.export_graphviz(model, out_file='tree.dot', feature_names=list(X.columns))
diff --git a/tree.py b/tree.py
@@ -0,0 +1,197 @@
+"""
+Revisit UCI's wheat-seeds dataset with decision trees, to benchmark how long 
+it takes to train and predict with decision trees relative to the speed of 
+KNeighbors and SVC, as well as compare the decision boundary plots produced by it.
+"""
+
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+
+import pandas as pd
+import numpy as np 
+import time
+
+
+#
+# INFO: Parameters. You can adjust them.
+# Number of times benchmark() repeats the fit loop and the score loop.
+
+iterations = 100   
+
+#
+# INFO: When True, drawPlots() skips every cell with col > row, so only half
+# of the square plot matrix is drawn. Set to False to draw the full matrix.
+FAST_DRAW = True
+
+
+
+def drawPlots(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
+  """Draw a matrix of 2D decision-boundary plots, one per pair of features.
+
+  For each plotted pair of columns, `model` is re-fit on just those two
+  training features, its predictions over a mesh are drawn as filled
+  contours, the training samples are scattered on top and the test-set score
+  is written under the plot. Diagonal cells only show the feature name, and
+  the best 2D score is printed at the end.
+
+  Args:
+    model: classifier with fit / predict / score; it is left fit on the
+      last feature pair plotted.
+    X_train, X_test: feature DataFrames (assumed to share column order).
+    y_train, y_test: integer class labels; y_train values index the
+      three-entry colour list, so they are expected to be 0, 1 or 2.
+    wintitle: title of the figure window.
+  """
+  mpl.style.use('ggplot')  # Look pretty
+  padding = 3       # margin added around the data range on each axis
+  resolution = 0.5  # step of the prediction mesh
+  max_2d_score = 0
+  y_colors = ['#ff0000', '#00ff00', '#0000ff']
+  my_cmap = mpl.colors.ListedColormap(['#ffaaaa', '#aaffaa', '#aaaaff'])
+  colors = [y_colors[i] for i in y_train]
+  num_columns = len(X_train.columns)
+
+  fig = plt.figure()
+  # The window title is set on the figure manager (canvas method was removed)
+  fig.canvas.manager.set_window_title(wintitle)
+
+  for col in range(num_columns):
+    for row in range(num_columns):
+      if FAST_DRAW and col > row:
+        continue  # Easy out: this cell is not drawn at all
+
+      # 1-based subplot index of cell (col, row) in the square grid
+      ax = plt.subplot(num_columns, num_columns, col * num_columns + row + 1)
+      plt.xticks(())
+      plt.yticks(())
+
+      if col == row:  # Intersection: only show the feature name
+        plt.text(0.5, 0.5, X_train.columns[row], verticalalignment='center',
+                 horizontalalignment='center', fontsize=12)
+        continue
+
+      # Only select two features to display, then train the model.
+      # .iloc replaces the positional use of .ix, which pandas has removed.
+      X_train_bag = X_train.iloc[:, [row, col]]
+      X_test_bag = X_test.iloc[:, [row, col]]
+      model.fit(X_train_bag, y_train)
+
+      # Create a mesh to plot in and use its extent as the plot boundaries
+      xs, ys = X_train_bag.iloc[:, 0], X_train_bag.iloc[:, 1]
+      xx, yy = np.meshgrid(
+          np.arange(xs.min() - padding, xs.max() + padding, resolution),
+          np.arange(ys.min() - padding, ys.max() + padding, resolution))
+      plt.xlim(xx.min(), xx.max())
+      plt.ylim(yy.min(), yy.max())
+
+      # Prepare the contour
+      Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
+      plt.contourf(xx, yy, Z, cmap=my_cmap, alpha=0.8)
+      plt.scatter(xs, ys, c=colors, alpha=0.5)
+
+      score = round(model.score(X_test_bag, y_test) * 100, 3)
+      plt.text(0.5, 0, "Score: {0}".format(score), transform=ax.transAxes,
+               horizontalalignment='center', fontsize=8)
+      max_2d_score = max(max_2d_score, score)
+
+  print("Max 2D Score: ", max_2d_score)
+  fig.set_tight_layout(True)
+
+
+def benchmark(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
+  """Time repeated fitting and scoring of `model` and print the results.
+
+  The model is fit on the training data, then scored on the test data,
+  `iterations` (module-level setting) times each. The repetition only serves
+  to get a more accurate measurement of the time each step takes.
+
+  Args:
+    model: estimator providing fit(X, y) and score(X, y).
+    X_train, y_train: training data / labels.
+    X_test, y_test: testing data / labels.
+    wintitle: name used in the printed results header.
+  """
+  print('\n\n' + wintitle + ' Results')
+
+  # perf_counter() is the clock meant for timing short durations; time.time()
+  # is wall-clock time and can jump when the system clock is adjusted.
+  start = time.perf_counter()
+  for _ in range(iterations):
+    model.fit(X_train, y_train)
+  print("{0} Iterations Training Time: ".format(iterations), time.perf_counter() - start)
+
+  scoreBch = 0
+  start = time.perf_counter()
+  for _ in range(iterations):
+    scoreBch = model.score(X_test, y_test)
+  print("{0} Iterations Scoring Time: ".format(iterations), time.perf_counter() - start)
+  print("High-Dimensionality Score: ", round((scoreBch*100), 3))
+
+
+
+#
+# Load up the wheat dataset into dataframe 'df', indexed by its 'id' column
+#
+df = pd.read_csv("Datasets/wheat.data", index_col='id')
+
+
+# INFO: An easy way to show which rows have nans in them
+print(df[df.isnull().any(axis=1)])
+
+
+#
+# Go ahead and drop any row with a nan
+#
+df.dropna(axis=0, inplace=True)
+
+
+#
+# INFO: In the future, you might try setting the nan values to the
+# mean value of that column instead. Now that you have the labels, the
+# mean should be calculated per class rather than across all classes.
+#
+
+
+
+#
+# Move the labels out of the dataset into 'labels' and encode them:
+# canadian -> 0, kama -> 1, rosa -> 2
+#
+labels = df.pop('wheat_type').map({'canadian': 0, 'kama': 1, 'rosa': 2})
+
+
+#
+# Split data into test / train sets (30% held out for testing)
+#
+from sklearn.model_selection import train_test_split
+
+
+X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.3,
+                                                    random_state=7)
+
+
+#
+# Create a decision tree classifier
+#
+from sklearn import tree
+
+# Reminder: only max_depth and random_state are overridden below.
+# scikit-learn's own defaults are criterion='gini', splitter='best',
+# max_depth=None (no depth limit), min_samples_split=2, min_samples_leaf=1,
+# min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None,
+# class_weight=None and random_state=None.
+model = tree.DecisionTreeClassifier(max_depth=6, random_state=2)
+# NOTE: benchmark() fits the model again on the same data.
+model.fit(X_train, y_train)
+
+
+
+#
+# Time the classifier, then draw its 2D decision-boundary plots
+#
+benchmark(model, X_train, X_test, y_train, y_test, 'Tree')
+drawPlots(model, X_train, X_test, y_train, y_test, 'Tree')
+
+# Show the figure window
+plt.show()
+