"""
Revisite UCI's wheat-seeds dataset with decision trees, to benchmark how long 
it takes to train and predict with decision trees relative to the speed of 
KNeighbors and SVC, as well as compare the decision boundary plots produced by it.
"""

import matplotlib as mpl
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np 
import time


# 
# INFO: Parameters.
# You can adjust them 

iterations = 100   

#
# INFO: You can set this to false if you want to
# draw the full square matrix
FAST_DRAW = True


def drawPlots(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
  # INFO: A convenience function to break any higher-dimensional space down
  # And view cross sections of it.

  mpl.style.use('ggplot') # Look Pretty

  padding = 3
  resolution = 0.5
  max_2d_score = 0
  score = 0


  y_colors = ['#ff0000', '#00ff00', '#0000ff']
  my_cmap = mpl.colors.ListedColormap(['#ffaaaa', '#aaffaa', '#aaaaff'])
  colors = [y_colors[i] for i in y_train]
  num_columns = len(X_train.columns)

  fig = plt.figure()
  fig.canvas.set_window_title(wintitle)
  
  cnt = 0
  for col in range(num_columns):
    for row in range(num_columns):
      # Easy out
      if FAST_DRAW and col > row:
        cnt += 1
        continue

      ax = plt.subplot(num_columns, num_columns, cnt + 1)
      plt.xticks(())
      plt.yticks(())

          # Intersection:
      if col == row:
        plt.text(0.5, 0.5, X_train.columns[row], verticalalignment='center', 
                 horizontalalignment='center', fontsize=12)
        cnt += 1
        continue


          # Only select two features to display, then train the model
      X_train_bag = X_train.ix[:, [row,col]]
      X_test_bag = X_test.ix[:, [row,col]]
      model.fit(X_train_bag, y_train)

          # Create a mesh to plot in
      x_min, x_max = X_train_bag.ix[:, 0].min() - padding, X_train_bag.ix[:, 0].max() + padding
      y_min, y_max = X_train_bag.ix[:, 1].min() - padding, X_train_bag.ix[:, 1].max() + padding
      xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                           np.arange(y_min, y_max, resolution))

          # Plot Boundaries
      plt.xlim(xx.min(), xx.max())
      plt.ylim(yy.min(), yy.max())

          # Prepare the contour
      Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
      Z = Z.reshape(xx.shape)
      plt.contourf(xx, yy, Z, cmap=my_cmap, alpha=0.8)
      plt.scatter(X_train_bag.ix[:, 0], X_train_bag.ix[:, 1], c=colors, alpha=0.5)


      score = round(model.score(X_test_bag, y_test) * 100, 3)
      plt.text(0.5, 0, "Score: {0}".format(score), transform = ax.transAxes, 
               horizontalalignment='center', fontsize=8)
      max_2d_score = score if score > max_2d_score else max_2d_score

      cnt += 1

  print ("Max 2D Score: ", max_2d_score)
  fig.set_tight_layout(True)


def benchmark(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
  print ('\n\n' + wintitle + ' Results')
  
  # the only purpose of doing many iterations is to get a more accurate 
  # count of the time it took for each classifier
  s = time.time()
  for i in range(iterations):
    #
    # : train the classifier on the training data / labels:
    #
    model.fit(X_train, y_train) 

    
  print ("{0} Iterations Training Time: ".format(iterations), time.time() - s)


  scoreBch = 0

  s = time.time()
  for i in range(iterations):
    #
    # : score the classifier on the testing data / labels:
    #
    scoreBch = model.score(X_test, y_test)

    
  print ("{0} Iterations Scoring Time: ".format(iterations), time.time() - s)
  print ("High-Dimensionality Score: ", round((scoreBch*100), 3))


# 
# : Load up the wheat dataset into dataframe 'X'
#
df = pd.read_csv("Datasets/wheat.data", index_col='id')


# INFO: An easy way to show which rows have nans in them
print (df[pd.isnull(df).any(axis=1)])


# 
# : Go ahead and drop any row with a nan
#
df.dropna(axis=0, inplace=True)


# 
# INFO: # In the future, you might try setting the nan values to the
# mean value of that column, the mean should only be calculated for
# the specific class rather than across all classes, now that you
# have the labels


#
# : Copy the labels out of the dset into variable 'y' then Remove
# them from X. Encode the labels -- canadian:0, kama:1, and rosa:2
#
labels = df.wheat_type.copy() # copy “y” values out
df.drop(['wheat_type'], axis=1, inplace=True) # drop output column 

labels = labels.map({'canadian':0, 'kama':1, 'rosa':2})


# 
# : Split data into test / train sets
#
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.3, 
                                                    random_state=7)


#
# : Create a decision tree classifier 
#
from sklearn import tree
"""
Reminder. Decision tree classifier - default values:

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=9,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
"""    
model = tree.DecisionTreeClassifier(max_depth=6, random_state=2)
model.fit(X_train, y_train)


benchmark(model, X_train, X_test, y_train, y_test, 'Tree')
drawPlots(model, X_train, X_test, y_train, y_test, 'Tree')

plt.show()