Mashimo · April 29, 2018 22:37 · Jan 19, 2018 · May 10, 2017 · May 10, 2017 · May 10, 2017
diff --git a/Regression → Regression b/Regression → Regression
diff --git a/sound.py b/sound.py
@@ -24,7 +24,7 @@
 # we mean the actually "features" of the audio file.
 #
 #
-# The goal of this lab is to use multi-target, linear regression to generate
+# The goal of this gist is to use multi-target, linear regression to generate
 # by extrapolation, the missing portion of the test audio file.
 #
 # Each one audio_sample features will be the output of an equation,
@@ -39,7 +39,7 @@
 
 
 #
-# : Play with this. This is how much of the audio file will
+# Play with this. This is how much of the audio file will
 # be provided, in percent. The remaining percent of the file will
 # be generated via linear extrapolation.
 Provided_Portion = 0.25
@@ -52,7 +52,7 @@
 
 
 #
-# : Create a regular ol' Python List called 'zero'
+# Create a regular ol' Python List called 'zero'
 # Loop through the dataset and load up all 50 of the 0_jackson*.wav files
 # For each audio file, simply append the audio data (not the sample_rate,
 # just the data!) to the Python list 'zero':
@@ -68,7 +68,7 @@
 
 
 # 
-# : Just for a second, convert zero into a DataFrame and set the dtype to 
+# convert zero into a DataFrame and set the dtype to 
 # np.int16, since the input audio files are 16
 # bits per sample. This is important otherwise the produced audio samples 
 # will be encoded as 64 bits per sample and will be too short.
@@ -91,7 +91,7 @@
 
 zero = zeroDF.values # this is a list
 #
-# : It's important to know how (many audio_samples samples) long the
+# It's important to know how (many audio_samples samples) long the
 # data is now. 'zero' is currently shaped [n_samples, n_audio_samples]
 #
 
@@ -100,7 +100,7 @@
 
 
 #
-# : Create the linear regression model 
+# Create the linear regression model 
 #
 model = linear_model.LinearRegression()
 
@@ -122,7 +122,7 @@
 
 
 # 
-# : Print out the shape of train, and the shape of test
+# Print out the shape of train, and the shape of test
 # train will be shaped: [n_samples, n_audio_samples], where
 # n_audio_samples are the 'features' of the audio file
 # test will be shaped [n_audio_features], since it is a single
@@ -147,12 +147,12 @@
 # half of, so that you can compare it to the 'patched' clip once 
 # you've generated it. 
 # this assume the sample rate is always the same for all samples
-wavfile.write('Original Test Clip.wav', sample_rate, test)
+wavfile.write('OriginalTestClip.wav', sample_rate, test)
 
 
 
 #
-# : Prepare the TEST data by creating a slice called X_test. It
+# Prepare the TEST data by creating a slice called X_test. It
 # should have Provided_Portion * n_audio_samples audio sample features,
 # taken from the test audio file, currently stored in the variable
 # 'test'. 
@@ -162,7 +162,7 @@
 
 
 #
-# : If the first Provided_Portion * n_audio_samples features were
+# If the first Provided_Portion * n_audio_samples features were
 # stored in X_test, then we need to also grab the *remaining* audio
 # features and store it in y_test. With the remaining features stored
 # in there, we will be able to R^2 "score" how well our algorithm did
@@ -172,42 +172,31 @@
 
 
 # 
-# : Duplicate the same process for X_train, y_train. The only
-# differences being: 1) we will be getting the audio data from
-# 'train' instead of from 'test', 2) Remember the shape of train that
-# you printed out earlier? You want to do this slicing but for ALL
-# samples (observations). For each observation, you want to slice
-# the first Provided_Portion * n_audio_samples audio features into
-# X_train, and the remaining go into y_test. All of this should be
-# accomplishable using regular indexing in two lines of code.
+# Duplicate the same process for X_train, y_train. 
 #
 X_train = train[:, 0:test_samples] # first ones
 y_train = train[:, test_samples:]
 
 # 
-# : SciKit-Learn gets mad if you don't supply your training
+# SciKit-Learn gets mad if you don't supply your training
 # data in the form of a 2D arrays: [n_samples, n_features].
 #
 # So if you only have one SAMPLE, such as is our case with X_test, 
 # and y_test, then by calling .reshape(1, -1), you can turn
 # [n_features] into [1, n_features].
 #
-# On the other hand, if you only have one FEATURE, which currently
-# doesn't apply, you can call .reshape(-1, 1) on your data to turn
-# [n_samples] into [n_samples, 1]:
 #
 X_test = X_test.reshape(1,-1)
 y_test = y_test.reshape(1,-1)
 
 #
-# : Fit the model using the training data and label:
+# Fit the model using the training data and label:
 #
 model.fit(X_train, y_train)
 
 
 # 
-# : Use the model to predict the 'label' of X_test. Store the
-# resulting prediction 
+# Use the model to predict the 'label' of X_test. 
 #
 y_test_prediction = model.predict(X_test)
 
@@ -219,7 +208,7 @@
 
 
 # 
-# : Score how well the prediction would do for some good laughs,
+# Score how well the prediction would do for some good laughs,
 # by passing in the test data and test label (y_test).
 #
 score = model.score(X_test, y_test)

diff --git a/sound.py b/sound.py
@@ -0,0 +1,235 @@
+"""
+Use linear regression to recover or 'fill out' a completely deleted portion of an audio file!
+This will be using The FSDD, Free-Spoken-Digits-Dataset, an audio dataset put together by Zohar Jackson:
+cleaned up audio (no dead-space, roughly same length, same bitrate, same samples-per-second rate, etc) samples ready for machine learning.
+"""
+import numpy as np
+import pandas as pd
+import os
+
+from sklearn import linear_model
+
+import scipy.io.wavfile as wavfile
+
+
+#
+# INFO:
+# Samples = Observations. Each audio file  is a single sample
+#           in the dataset.
+#
+# Audio Samples = https://en.wikipedia.org/wiki/Sampling_(signal_processing)
+# Each .wav file is actually just a bunch of numeric samples, "sampled"
+# from the analog signal. Sampling is a type of discretization. When we
+# mention 'samples', we mean observations. When we mention 'audio samples',
+# we mean the actual "features" of the audio file.
+#
+#
+# The goal of this lab is to use multi-target, linear regression to generate
+# by extrapolation, the missing portion of the test audio file.
+#
+# Each audio_sample feature will be the output of an equation,
+# which is a function of the provided portion of the audio_samples:
+#
+#    missing_samples = f(provided_samples)
+#
+# You can experiment with how much of the audio you want to chop off
+# and have the computer generate using the Provided_Portion parameter.
+
+
+
+
+#
+# Play with this. This is how much of the audio file will be provided,
+# expressed as a fraction between 0 and 1 (0.25 means the first 25%).
+# The remaining part of the file will be generated via linear extrapolation.
+Provided_Portion = 0.25
+
+
+
+# INFO: You have to download the dataset (audio files) from the website:
+# https://github.com/Jakobovski/free-spoken-digit-dataset
+
+
+
+#
+# Create a regular Python list called 'zero'.
+# Loop through the dataset and load up all 50 of the 0_jackson*.wav files.
+# For each audio file, append only the audio data (not the sample_rate)
+# to the list. The directory listing is sorted because os.listdir()
+# returns entries in arbitrary order, and the seeded random pick of the
+# test clip further down is only reproducible if the clips are always
+# loaded in the same order.
+#
+zero = []
+directory = "Datasets/free-spoken-digit-dataset-master/recordings/"
+for fname in sorted(os.listdir(directory)):
+    if fname.startswith("0_jackson") and fname.endswith(".wav"):
+        fullname = os.path.join(directory, fname)
+        # sample_rate is overwritten on every pass; the script assumes
+        # all clips share the same rate and reuses the last one later.
+        sample_rate, data = wavfile.read(fullname)
+        zero.append(data)
+
+# Fail early with a clear message instead of a NameError on sample_rate
+# much later in the script.
+if not zero:
+    raise FileNotFoundError("No 0_jackson*.wav recordings found in " + directory)
+
+
+
+#
+# Just for a second, convert zero into a DataFrame.
+# Since these audio clips are unfortunately not length-normalized,
+# we're going to have to just hard chop them to all be the same length.
+# Pandas inserts NaNs wherever needed to make zero a perfectly
+# rectangular [n_observed_samples, n_audio_samples] table, so drop
+# every column (axis=1) that contains a NaN.
+#
+zeroDF = pd.DataFrame(zero)
+
+if zeroDF.isnull().values.any():
+  print("Preprocessing data: dropping all NaN")
+  zeroDF = zeroDF.dropna(axis=1)
+else:
+  print("Preprocessing data: No NaN found!")
+
+#
+# Convert zero back into an NDArray and only now set the dtype to
+# np.int16, since the input audio files are 16 bits per sample. The
+# cast has to happen after the NaN columns are gone because an integer
+# dtype cannot represent NaN. This is important, otherwise the produced
+# audio samples will be encoded as 64 bits per sample and will be too
+# short.
+#
+zero = zeroDF.values.astype(np.int16) # this is an NDArray
+
+#
+# It's important to know how long (how many audio samples) the data is
+# now. 'zero' is currently shaped [n_samples, n_audio_samples]
+#
+n_audio_samples = zero.shape[1]
+
+
+
+#
+# Create the linear regression model
+#
+model = linear_model.LinearRegression()
+
+
+
+#
+# INFO: There are 50 takes of each clip. Pull out just one of them,
+# randomly; that one will NOT be used in the training of the model.
+# In other words, the one file we test / score on is an unseen sample,
+# independent of the rest of the training set. The generator is built
+# from a fixed seed (7).
+from sklearn.utils.validation import check_random_state
+
+rng = check_random_state(7)
+random_idx = rng.randint(zero.shape[0])
+
+# Boolean row mask that is False only for the held-out take
+keep_rows = np.arange(zero.shape[0]) != random_idx
+train = zero[keep_rows]   # every take except the held-out one
+test = zero[random_idx]   # the test sample
+
+
+#
+# Print out the shape of train, and the shape of test.
+# train is shaped [n_samples, n_audio_samples], where n_audio_samples
+# are the 'features' of the audio file.
+# test is shaped [n_audio_samples], since it is a single sample
+# (audio file, i.e. one observation).
+#
+for subset in (train, test):
+    print(subset.shape)
+
+
+#
+# INFO: The test data will have two parts, X_test and y_test. X_test is
+# the first portion of the test audio file, which is given to the
+# computer as input. y_test, the "label" if you will, is the remaining
+# portion of the audio file. The computer then uses linear regression
+# to derive the missing portion of the sound file based on the training
+# data it has received.
+
+
+
+#
+# Save the original 'test' clip, the one whose tail is about to be cut
+# off, so that it can be compared to the 'patched' clip once that has
+# been generated.
+# This assumes the sample rate is the same for all samples.
+wavfile.write('Original Test Clip.wav', sample_rate, test)
+
+
+
+#
+# Number of leading audio samples that are handed to the model as
+# input: Provided_Portion * n_audio_samples, truncated to an integer.
+#
+test_samples = int(Provided_Portion * n_audio_samples)
+
+
+#
+# Prepare the TEST data. Cut the test clip, currently stored in the
+# variable 'test', at test_samples: the first piece becomes X_test (the
+# provided input) and the *remaining* audio features become y_test.
+# With the remaining features stored in y_test, we are able to R^2
+# "score" how well the algorithm did in completing the sound file.
+#
+X_test, y_test = np.split(test, [test_samples])
+
+
+#
+# Duplicate the same process for X_train, y_train. The only
+# differences being: 1) the audio data comes from 'train' instead of
+# from 'test', 2) train is 2D, so the cut is made along the audio-sample
+# axis (axis=1) for ALL samples (observations) at once. For each
+# observation, the first Provided_Portion * n_audio_samples audio
+# features go into X_train, and the remaining ones go into y_train.
+#
+X_train, y_train = np.split(train, [test_samples], axis=1)
+
+#
+# SciKit-Learn gets mad if you don't supply your data in the form of
+# 2D arrays: [n_samples, n_features].
+#
+# X_test and y_test each hold only one SAMPLE, so give them a leading
+# axis of length one to turn [n_features] into [1, n_features].
+#
+# On the other hand, if you only have one FEATURE, which currently
+# doesn't apply, you can call .reshape(-1, 1) on your data to turn
+# [n_samples] into [n_samples, 1].
+#
+X_test = X_test[np.newaxis, :]
+y_test = y_test[np.newaxis, :]
+
+#
+# Fit the model using the training data and labels:
+#
+model.fit(X_train, y_train)
+
+
+#
+# Use the model to predict the 'label' of X_test, i.e. the missing
+# portion of the test clip, and store the resulting prediction.
+#
+y_test_prediction = model.predict(X_test)
+
+
+# INFO: SciKit-Learn will use float64 to generate the predictions,
+# so take those values back to int16. The regression output is not
+# bounded, and a plain astype() does not saturate out-of-range values,
+# so limit the predictions to the int16 range before casting.
+int16_range = np.iinfo(np.int16)
+y_test_prediction = np.clip(y_test_prediction, int16_range.min, int16_range.max)
+y_test_prediction = y_test_prediction.astype(dtype=np.int16)
+
+
+
+#
+# Score how well the prediction would do for some good laughs,
+# by passing in the test data and test label (y_test).
+# NOTE(review): X_test holds a single sample here; R^2 may not be
+# well-defined with fewer than two samples -- confirm against the
+# scikit-learn version in use.
+#
+score = model.score(X_test, y_test)
+print ("Extrapolation R^2 Score: ", score)
+
+
+#
+# First, take the first Provided_Portion portion of the test clip, the
+# part that was fed into the linear regression model. Then, stitch that
+# together with the abomination the predictor model generated,
+# and then save the completed audio clip. Both pieces are shaped
+# [1, n], hence the [0] to get back a flat clip:
+completed_clip = np.hstack((X_test, y_test_prediction))
+wavfile.write('Extrapolated Clip.wav', sample_rate, completed_clip[0])
diff --git a/predictLifeExpectancy.py b/predictLifeExpectancy.py
@@ -1,3 +1,7 @@
+"""
+Using linear regression, extrapolate how long people will live in the future. 
+The dataset "Life expectancy at birth, at age 65, and at age 75, by sex, race, and origin" is provided courtesy of the Center for Disease Control and Prevention's National Center for Health Statistics; page: http://www.cdc.gov/nchs/data_access/ftp_data.htm
+"""
 import pandas as pd
 import matplotlib
 import matplotlib.pyplot as plt

diff --git a/Regression → Regression b/Regression → Regression
diff --git a/college-acceptance.py b/college-acceptance.py
@@ -1,3 +1,11 @@
+"""
+There are four relationships we are interested in modeling:
+
+1. The number of accepted students, expressed as a function of: the amount charged for room and board
+2. The number of accepted students, expressed as a function of: the number of enrolled students per college
+3. The number of accepted students, expressed as a function of: the number of full-time undergraduate students per college
+4. The number of accepted students, expressed as a function of: the amount charged for room and board coupled with the number of enrolled students.
+"""
 import pandas as pd
 import numpy as np
 import matplotlib

diff --git a/college-acceptance.py b/college-acceptance.py
@@ -0,0 +1,233 @@
+import pandas as pd
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+
+from sklearn import linear_model
+from sklearn.model_selection import train_test_split
+
+
+matplotlib.style.use('ggplot') # Look Pretty
+
+
+def drawLine(model, X_test, y_test, title, R2):
+  """Plot the test observations against the regression line.
+
+  model  -- fitted regressor; its predict() and intercept_ are used
+  X_test -- test inputs, also used as the x coordinates of the plot
+  y_test -- true labels for X_test
+  title  -- plot title; the R2 value is appended to it
+  R2     -- score shown in the title and printed to the console
+  """
+  axes = plt.figure().add_subplot(111)
+  fitted = model.predict(X_test)
+
+  # Observations as green dots, the model's predictions as a line
+  axes.scatter(X_test, y_test, c='g', marker='o')
+  axes.plot(X_test, fitted, color='orange', linewidth=1, alpha=0.7)
+
+  title += " R2: " + str(R2)
+  axes.set_title(title)
+  print (title)
+  print ("Intercept(s): ", model.intercept_)
+
+  plt.show()
+
+def drawPlane(model, X_test, y_test, title, R2):
+  """Plot the test observations against the regression plane.
+
+  model  -- fitted regressor taking two input features; its predict()
+            and intercept_ are used
+  X_test -- test inputs with two columns (DataFrame, NDArray or list)
+  y_test -- true labels for X_test, drawn on the z axis
+  title  -- plot title; the R2 value is appended to it
+  R2     -- score shown in the title and printed to the console
+  """
+  fig = plt.figure()
+  # Ask the figure for the 3D axes instead of calling Axes3D(fig):
+  # recent matplotlib releases no longer attach an axes built that way
+  # to the figure, which leaves the plot window empty.
+  ax = fig.add_subplot(111, projection='3d')
+  ax.set_zlabel('prediction')
+
+  # You might have passed in a DataFrame, a Series (slice),
+  # an NDArray, or a Python List... so let's keep it simple:
+  X_test = np.array(X_test)
+  col1 = X_test[:,0]
+  col2 = X_test[:,1]
+
+  # Set up a Grid. We could have predicted on the actual
+  # col1, col2 values directly; but that would have generated
+  # a mesh with WAY too fine a grid, which would have detracted
+  # from the visualization
+  x_min, x_max = col1.min(), col1.max()
+  y_min, y_max = col2.min(), col2.max()
+  x = np.arange(x_min, x_max, (x_max-x_min) / 10)
+  y = np.arange(y_min, y_max, (y_max-y_min) / 10)
+  x, y = np.meshgrid(x, y)
+
+  # Predict based on possible input values that span the domain
+  # of the x and y inputs:
+  z = model.predict(  np.c_[x.ravel(), y.ravel()]  )
+  z = z.reshape(x.shape)
+
+  ax.scatter(col1, col2, y_test, c='g', marker='o')
+  ax.plot_wireframe(x, y, z, color='orange', alpha=0.7)
+
+  title += " R2: " + str(R2)
+  ax.set_title(title)
+  print (title)
+  print ("Intercept(s): ", model.intercept_)
+
+  plt.show()
+
+
+
+#
+# INFO: Let's get started!
+
+
+# The first column of the file is unlabeled and unique (it holds the
+# name of each college). This is a HINT that it must be the index
+# column. If Pandas is not told that an index column already exists,
+# it creates a new one, which would be undesirable here.
+#
+# Load up the College dataset into a variable called X, using the
+# first column as the index:
+#
+X = pd.read_csv("Datasets/college.csv", index_col=0)
+
+
+#
+# The .map() method is like .apply(), but instead of taking in a
+# lambda / function, it takes a mapping of keys:values. Here it
+# turns the Yes / No strings of the Private column into 1 / 0.
+
+X['Private'] = X['Private'].map({'Yes': 1, 'No': 0})
+
+
+#
+# Create the linear regression model
+#
+model = linear_model.LinearRegression()
+
+#
+# INFO: The first relationship we're interested in is the
+# number of accepted students, as a function of the amount
+# charged for room and board.
+
+#
+# Using indexing, create two slices (series). One will just
+# store the room and board column, the other will store the accepted
+# students column. Then use train_test_split to cut the data up
+# into X_train, X_test, y_train, y_test, with a test_size of 30% and
+# a random_state of 7.
+#
+X_rb  = X['Room.Board'] # series
+y = X['Accept']
+
+X_train, X_test, y_train, y_test = train_test_split(X_rb, y, 
+                                                    test_size=0.3, random_state=7)
+
+#
+# Fit and score the model appropriately.
+#
+# fit(), score() and predict() expect 2d arrays. A Series has no
+# reshape() method in current pandas, so reshape the underlying
+# NDArray (.values) into [n_samples, 1] instead.
+model.fit(X_train.values.reshape(-1,1), y_train)
+X_test = X_test.values.reshape(-1,1)
+score = model.score(X_test, y_test)
+
+drawLine(model, X_test, y_test, "Accept(Room&Board)", score)
+
+
+
+
+# 
+# Model the number of accepted students as a function of the number
+# of enrolled students per college, using the same split settings
+# (test_size of 30%, random_state of 7).
+#
+X_en  = X['Enroll'] # series
+X_train, X_test, y_train, y_test = train_test_split(X_en, y, 
+                                                    test_size=0.3, random_state=7)
+
+# A Series has no reshape() method in current pandas, so reshape the
+# underlying NDArray (.values) into the 2d [n_samples, 1] form.
+model.fit(X_train.values.reshape(-1,1), y_train)
+X_test = X_test.values.reshape(-1,1)
+score = model.score(X_test, y_test)
+
+drawLine(model, X_test, y_test, "Accept(Enroll)", score)
+
+
+
+# 
+# Model the number of accepted students as a function of the number
+# of full-time undergraduate students per college (F.Undergrad), using
+# the same split settings (test_size of 30%, random_state of 7).
+#
+X_fu  = X['F.Undergrad'] # series
+X_train, X_test, y_train, y_test = train_test_split(X_fu, y, 
+                                                    test_size=0.3, random_state=7)
+
+# A Series has no reshape() method in current pandas, so reshape the
+# underlying NDArray (.values) into the 2d [n_samples, 1] form.
+model.fit(X_train.values.reshape(-1,1), y_train)
+X_test = X_test.values.reshape(-1,1)
+score = model.score(X_test, y_test)
+
+drawLine(model, X_test, y_test, "Accept(F.Undergrad)", score)
+
+
+#
+# Duplicate the process above (almost). This time is a bit more
+# complicated: instead of modeling one feature as a function of
+# another, do multivariate linear regression to model one feature as
+# a function of TWO other features.
+#
+# Model the number of accepted students as a function of the amount
+# charged for room and board AND the number of enrolled students. To
+# do this, instead of creating a regular slice for a single-feature
+# input, create a slice that contains both input columns. The training
+# labels remain a single slice. The two-column slice is a data frame,
+# so it is passed to fit() / score() without any reshape.
+#
+input_columns = ['Room.Board', 'Enroll']
+X_rb_en = X[input_columns] # data frame
+X_train, X_test, y_train, y_test = train_test_split(
+    X_rb_en, y, test_size=0.3, random_state=7)
+
+model.fit(X_train, y_train)
+score = model.score(X_test, y_test)
+
+drawPlane(model, X_test, y_test, "Accept(Room&Board,Enroll)", score)
+
+
+
+# INFO + HINT On Fitting, Scoring, and Predicting:
+# When you use .fit(), .score(), and .predict() on
+# your model, SciKit-Learn expects your training data to be in
+# spreadsheet (2D Array-Like) form. This means you can't simply
+# pass in a 1D Array (slice) and get away with it.
+#
+# To properly prep your data, you have to pass in a 2D Numpy Array,
+# or a dataframe. But what happens if you really only want to pass
+# in a single feature?
+#
+# If you slice your dataframe using df[['ColumnName']] syntax, the
+# result that comes back is actually a *dataframe*. Go ahead and do
+# a type() on it to check it out. Since it's already a dataframe,
+# you're good -- no further changes needed.
+#
+# But if you slice your dataframe using the df.ColumnName syntax,
+# OR if you call df['ColumnName'], the result that comes back is
+# actually a series (1D Array)! This will cause SKLearn to bug out.
+# So if you are slicing using either of those two techniques, before
+# sending your training or testing data to .fit / .score, do a
+# my_column = my_column.values.reshape(-1,1). This will convert your
+# 1D array of [n_samples] to a 2D array shaped like [n_samples, 1]:
+# a single feature, with many samples.
+#
+# If you did something like my_column = [my_column], that would produce
+# an array in the shape of [1, n_samples], which is incorrect because
+# SKLearn expects your data to be arranged as [n_samples, n_features].
+# Keep in mind, all of the above only relates to your "X" or input
+# data, and does not apply to your "y" or labels.
+
+
+
+#
+# Extra
+# ========================
+#
+# 
+# What happens if you apply scaling to your data before doing 
+# linear regression? Would it alter the quality of your results?
+# Do the scalers that work on a per-feature basis, such as MinMaxScaler,
+# behave differently than those that work on a multi-feature basis, such
+# as normalize? And moreover, once your features have been scaled, you
+# won't be able to use the resulting regression directly... unless you're
+# able to .inverse_transform() the scaling. Do all of the SciKit-Learn
+# scalers support that?
+#
+
+
diff --git a/Regression b/Regression
@@ -0,0 +1 @@
+Examples of regression models for prediction
diff --git a/predictLifeExpectancy.py b/predictLifeExpectancy.py
@@ -0,0 +1,140 @@
+import pandas as pd
+import matplotlib
+import matplotlib.pyplot as plt
+
+from sklearn import linear_model
+
+matplotlib.style.use('ggplot') # Look Pretty
+
+
+def drawLine(model, X_test, y_test, title):
+  """Plot observations against the regression line and print extrapolations.
+
+  model  -- fitted regressor with a single input feature (the year);
+            its predict() and score() are used
+  X_test -- inputs to plot and score against, as a 2D array-like
+  y_test -- labels matching X_test
+  title  -- group name used in the console output and the plot title;
+            the R2 score is appended to it for the plot
+  """
+  fig = plt.figure()
+  ax = fig.add_subplot(111)
+  ax.scatter(X_test, y_test, c='g', marker='o')
+  ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)
+
+  # Print the model's estimate for each of the three target years
+  for year in (2014, 2030, 2045):
+    print ("Est " + str(year) + " " + title + " Life Expectancy: ", model.predict([[year]])[0])
+
+  score = model.score(X_test, y_test)
+  title += " R2: " + str(score)
+  ax.set_title(title)
+
+
+  plt.show()
+
+
+#
+# Load up the data into a variable called 'X'. The file is
+# tab-separated.
+#
+X = pd.read_csv("Datasets/life_expectancy.csv", sep='\t')
+
+#
+# Create the linear regression model
+#
+model = linear_model.LinearRegression()
+
+#
+# Slice out the data manually (i.e. not using train_test_split).
+# X_train holds the year values LESS than 1986, and y_trainWM the
+# corresponding WhiteMale age values.
+#
+# INFO You might also want to read the note about slicing at the bottom
+# of this document.
+#
+before_1986 = X['Year'] < 1986
+X_train = X.loc[before_1986, ['Year']] # data frame
+y_trainWM = X.loc[before_1986, 'WhiteMale'] # series
+
+
+#
+# Train the model, then pass it into drawLine with the training
+# set and labels. drawLine will output to the console a 2014
+# extrapolation / approximation for what it believes the WhiteMale's
+# life expectancy in the U.S. will be... given the pre-1986 data it was
+# trained with. It'll also produce a 2030 and 2045 extrapolation.
+#
+model.fit(X_train, y_trainWM)
+
+drawLine(model, X_train, y_trainWM, "WhiteMale")
+
+#
+# Print the actual 2014 WhiteMale life expectancy
+
+print("Actual 2014 WhiteMale Life Expectancy = ",
+      X.loc[X['Year'] == 2014, 'WhiteMale'].iloc[0])
+
+
+# 
+# Repeat the process, but this time for BlackFemale instead of
+# WhiteMale: create the label slice for the years before 1986, fit the
+# model on the same X_train, and call drawLine. Lastly, print out the
+# actual 2014 BlackFemale life expectancy.
+#
+y_trainBF = X.loc[X['Year'] < 1986, 'BlackFemale'] # series
+model.fit(X_train, y_trainBF)
+
+drawLine(model, X_train, y_trainBF, "BlackFemale")
+
+print("Actual 2014 BlackFemale Life Expectancy = ",
+      X.loc[X['Year'] == 2014, 'BlackFemale'].iloc[0])
+
+
+#
+# Lastly, print out a correlation matrix for the entire
+# dataset, and display a visualization of the correlation
+# matrix
+#
+print("Correlation matrix: ")
+# Compute the matrix once and reuse it for printing and plotting
+corr = X.corr()
+print(corr)
+
+fig, ax = plt.subplots()
+im = ax.imshow(corr, cmap=plt.cm.Blues, interpolation='nearest')
+plt.colorbar(im)
+# Take the tick labels from the matrix itself so they always line up
+# with the cells that are actually drawn
+tick_marks = list(range(len(corr.columns)))
+plt.xticks(tick_marks, corr.columns, rotation='vertical')
+plt.yticks(tick_marks, corr.columns)
+
+ax.set_title("Correlation matrix")
+
+plt.show()
+
+
+
+
+#
+# INFO On Fitting, Scoring, and Predicting:
+#
+# When you use .fit(), .score(), and .predict() on
+# your model, SciKit-Learn expects your training data to be in
+# spreadsheet (2D Array-Like) form. This means you can't simply
+# pass in a 1D Array (slice) and get away with it.
+#
+# To properly prep your data, you have to pass in a 2D Numpy Array,
+# or a dataframe. But what happens if you really only want to pass
+# in a single feature?
+#
+# If you slice your dataframe using df[['ColumnName']] syntax, the
+# result that comes back is actually a *dataframe*. Go ahead and do
+# a type() on it to check it out. Since it's already a dataframe,
+# you're good -- no further changes needed.
+#
+# But if you slice your dataframe using the df.ColumnName syntax,
+# OR if you call df['ColumnName'], the result that comes back is
+# actually a series (1D Array)! This will cause SKLearn to bug out.
+# So if you are slicing using either of those two techniques, before
+# sending your training or testing data to .fit / .score, do a
+# my_column = my_column.values.reshape(-1,1). This will convert your
+# 1D array of [n_samples] to a 2D array shaped like [n_samples, 1]:
+# a single feature, with many samples.
+#
+# If you did something like my_column = [my_column], that would produce
+# an array in the shape of [1, n_samples], which is incorrect because
+# SKLearn expects your data to be arranged as [n_samples, n_features].
+# Keep in mind, all of the above only relates to your "X" or input
+# data, and does not apply to your "y" or labels.
+