Regression
Examples of regression models for prediction
"""
There are four relationships we are interested in modeling:
1.The amount charged for room and board, expressed as a function of the number of: accepted students
2.The number of enrolled students per college, expressed as a function of the number of: accepted students
3.The number of failed undergraduate students per college, expressed as a function of: the number of accepted students
4.The amount charged for room and board coupled with the number of enrolled students, expressed as a function of: the number of accepted students.
"""
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import linear_model
from sklearn.model_selection import train_test_split
matplotlib.style.use('ggplot') # Look Pretty
def drawLine(model, X_test, y_test, title, R2):
    # This convenience method takes care of plotting the test observations,
    # comparing them to the regression line, and displaying the R2 coefficient.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)
    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)
    plt.show()
def drawPlane(model, X_test, y_test, title, R2):
    # This convenience method takes care of plotting the test observations,
    # comparing them to the regression plane, and displaying the R2 coefficient.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel('prediction')
    # You might have passed in a DataFrame, a Series (slice),
    # an NDArray, or a Python list... so let's keep it simple:
    X_test = np.array(X_test)
    col1 = X_test[:, 0]
    col2 = X_test[:, 1]
    # Set up a grid. We could have predicted on the actual col1, col2
    # values directly, but that would have generated a mesh with WAY too
    # fine a grid, which would have detracted from the visualization.
    x_min, x_max = col1.min(), col1.max()
    y_min, y_max = col2.min(), col2.max()
    x = np.arange(x_min, x_max, (x_max - x_min) / 10)
    y = np.arange(y_min, y_max, (y_max - y_min) / 10)
    x, y = np.meshgrid(x, y)
    # Predict based on possible input values that span the domain
    # of the x and y inputs:
    z = model.predict(np.c_[x.ravel(), y.ravel()])
    z = z.reshape(x.shape)
    ax.scatter(col1, col2, y_test, c='g', marker='o')
    ax.plot_wireframe(x, y, z, color='orange', alpha=0.7)
    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)
    plt.show()
#
# INFO: Let's get started!
# The first column is both unique (the name of each college) and
# unlabeled. This is a HINT that it must be the index column. If you
# do not indicate to Pandas that you already have an index column,
# it will create one for you, which would be undesirable since you
# already have one.
#
# Load up the College dataset into a variable called X:
#
X = pd.read_csv("Datasets/college.csv", index_col=0)
#
# The .map() method is like .apply(), but instead of taking in a
# lambda / function, you simply provide a mapping of keys:values.
X.Private = X.Private.map({'Yes':1, 'No':0})
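# Just for illustration (a hedged aside, not part of the original
# assignment): the same mapping could be written with .apply and a lambda.
# It is left commented out because Private has already been mapped above.
# X.Private = X.Private.apply(lambda v: 1 if v == 'Yes' else 0)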
#
# : Create the linear regression model
#
model = linear_model.LinearRegression()
#
# INFO: The first relationship we're interested in is the
# number of accepted students, as a function of the amount
# charged for room and board.
#
# : Using indexing, create two slices (series). One will just
# store the room and board column, the other will store the accepted
# students column. Then use train_test_split to cut the data up
# into X_train, X_test, y_train, y_test, with a test_size of 30% and
# a random_state of 7.
#
X_rb = X['Room.Board'] # series
y = X['Accept']
X_train, X_test, y_train, y_test = train_test_split(X_rb, y,
                                                    test_size=0.3, random_state=7)
#
# : Fit and score the model appropriately.
#
# fit(), score() and predict() expect 2D arrays; .values.reshape(-1, 1)
# turns a pandas Series into an [n_samples, 1] NumPy array.
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Room&Board)", score)
#
# : Duplicate the process above; this time, model the number of
# accepted students, as a function of the number of enrolled students
# per college.
#
X_en = X['Enroll'] # series
X_train, X_test, y_train, y_test = train_test_split(X_en, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Enroll)", score)
#
# : Duplicate the process above; this time, model the number of
# accepted students as a function of the number of failed undergraduate
# students per college.
#
X_fu = X['F.Undergrad'] # series
X_train, X_test, y_train, y_test = train_test_split(X_fu, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(F.Undergrad)", score)
#
# : Duplicate the process above (almost). This time it's going to be
# a bit more complicated. Instead of modeling one feature as a function
# of another, we will attempt multivariate linear regression:
# modeling one feature as a function of TWO other features.
#
# Model the amount charged for room and board AND the number of enrolled
# students, as a function of the number of accepted students. To do
# this, instead of creating a regular slice for a single-feature input,
# simply create a slice that contains both columns you wish to use as
# inputs. Your training labels will remain a single slice.
#
X_rb_en = X[['Room.Board', 'Enroll']] # data frame
X_train, X_test, y_train, y_test = train_test_split(X_rb_en, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
drawPlane(model, X_test, y_test, "Accept(Room&Board,Enroll)", score)
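# For reference (a small addition, not part of the original assignment): the
# fitted multivariate model exposes one coefficient per input feature.
print("Coefficients [Room.Board, Enroll]: ", model.coef_)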
# INFO + HINT On Fitting, Scoring, and Predicting:
# When you use .fit(), .score(), and .predict() on
# your model, SciKit-Learn expects your training data to be in
# spreadsheet (2D Array-Like) form. This means you can't simply
# pass in a 1D Array (slice) and get away with it.
#
# To properly prep your data, you have to pass in a 2D Numpy Array,
# or a dataframe. But what happens if you really only want to pass
# in a single feature?
#
# If you slice your dataframe using df[['ColumnName']] syntax, the
# result that comes back is actually a *dataframe*. Go ahead and do
# a type() on it to check it out. Since it's already a dataframe,
# you're good -- no further changes needed.
#
# But if you slice your dataframe using the df.ColumnName syntax,
# OR if you call df['ColumnName'], the result that comes back is
# actually a series (1D Array)! This will cause SKLearn to bug out.
# So if you are slicing using either of those two techniques, before
# sending your training or testing data to .fit / .score, do a
# my_column = my_column.values.reshape(-1, 1). This will convert your
# 1D array of [n_samples] into a 2D array shaped like [n_samples, 1]:
# a single feature, with many samples.
#
# If you did something like my_column = [my_column], that would produce
# an array in the shape of [1, n_samples], which is incorrect because
# SKLearn expects your data to be arranged as [n_samples, n_features].
# Keep in mind, all of the above only relates to your "X" or input
# data, and does not apply to your "y" or labels.
#
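# A quick illustration of the shapes involved (a minimal sketch using the
# X dataframe loaded above, with column names as in college.csv):
print("Series slice shape:   ", X['Enroll'].shape)                        # (n_samples,)
print("Reshaped slice shape: ", X['Enroll'].values.reshape(-1, 1).shape)  # (n_samples, 1)
print("DataFrame slice shape:", X[['Enroll']].shape)                      # (n_samples, 1)
#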
# Extra
# ========================
#
#
# What happens if you apply scaling to your data before doing
# linear regression? Would it alter the quality of your results?
# Do the scalers that work on a per-feature basis, such as MinMaxScaler,
# behave differently than those that work across features, such as
# normalize? And moreover, once your features have been scaled, you
# won't be able to use the resulting regression directly... unless you're
# able to .inverse_transform() the scaling. Do all of the SciKit-Learn
# scalers support that?
#
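# One way to explore this (a minimal sketch reusing the X / y slices and the
# train/test split parameters from above; the resulting score is not claimed
# here, run it and compare against the unscaled R2):
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X[['Room.Board', 'Enroll']])
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X_scaled, y,
                                                        test_size=0.3, random_state=7)
scaled_model = linear_model.LinearRegression().fit(Xs_train, ys_train)
print("R2 with MinMaxScaler: ", scaled_model.score(Xs_test, ys_test))
# MinMaxScaler (like StandardScaler) is a fitted transformer and supports
# .inverse_transform(); sklearn.preprocessing.normalize, by contrast, is a
# plain function with no fitted state, so there is nothing to invert.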
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import linear_model
matplotlib.style.use('ggplot') # Look Pretty
def drawLine(model, X_test, y_test, title):
    # This convenience method takes care of plotting the test observations,
    # comparing them to the regression line, and displaying the R2 coefficient.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)
    print("Est 2014 " + title + " Life Expectancy: ", model.predict([[2014]])[0])
    print("Est 2030 " + title + " Life Expectancy: ", model.predict([[2030]])[0])
    print("Est 2045 " + title + " Life Expectancy: ", model.predict([[2045]])[0])
    score = model.score(X_test, y_test)
    title += " R2: " + str(score)
    ax.set_title(title)
    plt.show()
#
# : Load up the data into a variable called 'X'.
#
X = pd.read_csv("Datasets/life_expectancy.csv", sep='\t')
#
# : Create the linear regression model
#
model = linear_model.LinearRegression()
#
# : Slice out the data manually (i.e., not using train_test_split).
# Set X_train to be the year values LESS than 1986, and y_train to be
# the corresponding WhiteMale age values.
#
# INFO You might also want to read the note about slicing on the bottom
# of this document.
#
X_train = X.loc[X['Year']<1986, ['Year']] # data frame
y_trainWM = X[X['Year'] < 1986].WhiteMale # series
#
# : Train the model, then pass it into drawLine with the training
# set and labels. drawLine will output to the console a 2014
# extrapolation / approximation of what it believes the WhiteMale
# life expectancy in the U.S. will be, given the pre-1986 data you
# trained it with. It'll also produce 2030 and 2045 extrapolations.
#
model.fit(X_train, y_trainWM)
drawLine(model, X_train, y_trainWM, "WhiteMale")
#
# : Print the actual 2014 WhiteMale life expectancy
print("Actual 2014 WhiteMale Life Expectancy = ",
X[X.Year == 2014].WhiteMale.iloc[0])
#
# : Repeat the process, but instead of for WhiteMale, this time
# select BlackFemale. Create a slice for BlackFemales, fit the
# model, and then call drawLine. Lastly, print out the actual 2014
# BlackFemale life expectancy
#
y_trainBF = X[X['Year'] < 1986].BlackFemale # series
model.fit(X_train, y_trainBF)
drawLine(model, X_train, y_trainBF, "BlackFemale")
print("Actual 2014 BlackFemale Life Expectancy = ",
X[X.Year == 2014].BlackFemale.iloc[0])
#
# : Lastly, print out a correlation matrix for the entire
# dataset, and display a visualization of the correlation
# matrix
#
print("Correlation matrix: ")
print(X.corr())
fig, ax = plt.subplots()
im = ax.imshow(X.corr(), cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar(im)
tick_marks = [i for i in range(len(X.columns))]
plt.xticks(tick_marks, X.columns, rotation='vertical')
plt.yticks(tick_marks, X.columns)
ax.set_title("Correlation matrix")
plt.show()
#
# INFO On Fitting, Scoring, and Predicting:
# See the identical note near the end of the first script above; the same
# [n_samples, n_features] guidance applies here.