"""
Linear-regression experiments on the College dataset.

Four relationships are modeled. In every one of them the number of
accepted students (the 'Accept' column) is the value being predicted:

1. Accepted students, expressed as a function of the amount charged for
   room and board.
2. Accepted students, expressed as a function of the number of enrolled
   students per college.
3. Accepted students, expressed as a function of the 'F.Undergrad'
   column. The original notes call this "failed undergraduate students";
   NOTE(review): the column presumably holds *full-time* undergraduates --
   confirm against the dataset description.
4. Accepted students, expressed as a function of the amount charged for
   room and board coupled with the number of enrolled students.
"""

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Kept even though the name is no longer referenced directly: on older
# Matplotlib releases this import is what registers the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

from sklearn import linear_model
from sklearn.model_selection import train_test_split

matplotlib.style.use('ggplot')  # Look Pretty


def drawLine(model, X_test, y_test, title, R2):
    """Plot the test observations against the fitted regression line.

    Draws a scatter of the test samples, overlays the model's predictions
    for those same samples, puts the R2 score in the plot title, prints
    the title and the model intercept, and shows the figure.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``intercept_``
    X_test : 2D array-like shaped [n_samples, 1] (a single feature)
    y_test : array-like of the true target values for ``X_test``
    title : str, text placed before the R2 value in the plot title
    R2 : the score to display (converted with ``str``)
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test),
            color='orange', linewidth=1, alpha=0.7)

    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)

    plt.show()


def drawPlane(model, X_test, y_test, title, R2):
    """Plot the test observations against the fitted regression plane.

    Draws a 3D scatter of the test samples (two features plus target),
    overlays a wireframe of the model's predictions over a coarse grid
    covering both features, puts the R2 score in the plot title, prints
    the title and the model intercept, and shows the figure.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``intercept_``
    X_test : 2D array-like shaped [n_samples, 2]; may be a DataFrame,
        an ndarray or a list of rows (it is converted with ``np.array``)
    y_test : array-like of the true target values for ``X_test``
    title : str, text placed before the R2 value in the plot title
    R2 : the score to display (converted with ``str``)
    """
    fig = plt.figure()
    # Ask the figure for a 3D subplot rather than calling Axes3D(fig):
    # recent Matplotlib releases no longer attach a directly constructed
    # Axes3D to the figure, which leaves the window empty.
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel('prediction')

    # You might have passed in a DataFrame, a Series (slice),
    # an NDArray, or a Python List... so let's keep it simple:
    X_test = np.array(X_test)
    col1 = X_test[:, 0]
    col2 = X_test[:, 1]

    # Set up a Grid. We could have predicted on the actual
    # col1, col2 values directly; but that would have generated
    # a mesh with WAY too fine a grid, which would have detracted
    # from the visualization
    x_min, x_max = col1.min(), col1.max()
    y_min, y_max = col2.min(), col2.max()

    # linspace (10 intervals -> 11 points) includes the maximum, so the
    # wireframe spans the whole observed domain, and it does not fail with
    # a zero step when a feature happens to be constant.
    n_points = 11
    x = np.linspace(x_min, x_max, n_points)
    y = np.linspace(y_min, y_max, n_points)
    x, y = np.meshgrid(x, y)

    # Predict based on possible input values that span the domain
    # of the x and y inputs:
    z = model.predict(np.c_[x.ravel(), y.ravel()])
    z = z.reshape(x.shape)

    ax.scatter(col1, col2, y_test, c='g', marker='o')
    ax.plot_wireframe(x, y, z, color='orange', alpha=0.7)

    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)

    plt.show()


#
# INFO: Let's get started!


# The first column is both unique (the name of each college) as well as
# unlabeled. This is a HINT that it must be the index column. If you do
# not indicate to Pandas that you already have an index column, it'll
# create one for you, which would be undesirable since you already have
# one.
#
# Load up the College dataset into a variable called X:
#
X = pd.read_csv("Datasets/college.csv", index_col=0)

#
# The .map() method is like .apply(), but instead of taking in a
# lambda / function, you simply provide a mapping of keys:values.
X.Private = X.Private.map({'Yes': 1, 'No': 0})


#
# Create the linear regression model
#
model = linear_model.LinearRegression()


#
# INFO: The first relationship we're interested in is the
# number of accepted students, as a function of the amount
# charged for room and board.
#
# Using indexing, create two slices (series). One will just
# store the room and board column, the other will store the accepted
# students column. Then use train_test_split to cut the data up
# into X_train, X_test, y_train, y_test, with a test_size of 30% and
# a random_state of 7.
#
X_rb = X['Room.Board']  # series
y = X['Accept']
X_train, X_test, y_train, y_test = train_test_split(
    X_rb, y, test_size=0.3, random_state=7)

#
# Fit and score the model appropriately.
#
# fit(), score() and predict() expect 2D arrays. Series.reshape() is not
# available on current pandas, so reshape the underlying NumPy array.
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Room&Board)", score)


#
# Duplicate the process above; this time, model the number of
# accepted students, as a function of the number of enrolled students
# per college.
#
X_en = X['Enroll']  # series
X_train, X_test, y_train, y_test = train_test_split(
    X_en, y, test_size=0.3, random_state=7)

X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Enroll)", score)


#
# Duplicate the process above; this time, model the number of
# accepted students, as a function of the 'F.Undergrad' column
# (NOTE(review): presumably full-time undergraduates per college, not
# "failed" ones as the original note said -- confirm).
#
X_fu = X['F.Undergrad']  # series
X_train, X_test, y_train, y_test = train_test_split(
    X_fu, y, test_size=0.3, random_state=7)

X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(F.Undergrad)", score)


#
# Duplicate the process above (almost). This time is going to be
# a bit more complicated. Instead of modeling one feature as a function
# of another, we will attempt to do multivariate linear regression to
# model one feature as a function of TWO other features.
#
# Model the number of accepted students as a function of the amount
# charged for room and board AND the number of enrolled students. To do
# this, instead of creating a regular slice for a single-feature input,
# simply create a slice that contains both columns you wish to use as
# inputs. Your training labels will remain a single slice.
#
X_rb_en = X[['Room.Board', 'Enroll']]  # data frame
X_train, X_test, y_train, y_test = train_test_split(
    X_rb_en, y, test_size=0.3, random_state=7)

# drawPlane() predicts on a plain NumPy grid, so fit and score on the raw
# arrays as well; mixing a DataFrame fit with array predictions makes
# newer scikit-learn releases warn about missing feature names.
X_train = X_train.values
X_test = X_test.values
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
drawPlane(model, X_test, y_test, "Accept(Room&Board,Enroll)", score)


# INFO + HINT On Fitting, Scoring, and Predicting:
# When you use .fit(), .score(), and .predict() on
# your model, SciKit-Learn expects your training data to be in
# spreadsheet (2D Array-Like) form. This means you can't simply
# pass in a 1D Array (slice) and get away with it.
#
# To properly prep your data, you have to pass in a 2D Numpy Array,
# or a dataframe. But what happens if you really only want to pass
# in a single feature?
#
# If you slice your dataframe using df[['ColumnName']] syntax, the
# result that comes back is actually a *dataframe*. Go ahead and do
# a type() on it to check it out. Since it's already a dataframe,
# you're good -- no further changes needed.
#
# But if you slice your dataframe using the df.ColumnName syntax,
# OR if you call df['ColumnName'], the result that comes back is
# actually a series (1D Array)! This will cause SKLearn to bug out.
# So if you are slicing using either of those two techniques, before
# sending your training or testing data to .fit / .score, do a
# my_column = my_column.values.reshape(-1,1). This will convert your 1D
# array of [n_samples], to a 2D array shaped like [n_samples, 1].
# A single feature, with many samples.
#
# If you did something like my_column = [my_column], that would produce
# an array in the shape of [1, n_samples], which is incorrect because
# SKLearn expects your data to be arranged as [n_samples, n_features].
# Keep in mind, all of the above only relates to your "X" or input
# data, and does not apply to your "y" or labels.


#
# Extra
# ========================
#
# What happens if you apply scaling to your data before doing
# linear regression? Would it alter the quality of your results?
# Do the scalers that work on a per-feature basis, such as MinMaxScaler,
# behave differently than those that work on a multi-feature basis, such
# as normalize? And moreover, once your features have been scaled, you
# won't be able to use the resulting regression directly... unless you're
# able to .inverse_transform() the scaling. Do all of the SciKit-Learn
# scalers support that?
#