Regression
Examples of regression models for prediction
| """ | |
| There are four relationships we are interested in modeling: | |
| 1.The amount charged for room and board, expressed as a function of the number of: accepted students | |
| 2.The number of enrolled students per college, expressed as a function of the number of: accepted students | |
| 3.The number of failed undergraduate students per college, expressed as a function of: the number of accepted students | |
| 4.The amount charged for room and board coupled with the number of enrolled students, expressed as a function of: the number of accepted students. | |
| """ | |
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
from sklearn import linear_model
from sklearn.model_selection import train_test_split

matplotlib.style.use('ggplot')  # Look Pretty
def drawLine(model, X_test, y_test, title, R2):
    # This convenience method will take care of plotting the
    # test observations, comparing them to the regression line,
    # and displaying the R2 coefficient
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)

    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)
    plt.show()
def drawPlane(model, X_test, y_test, title, R2):
    # This convenience method will take care of plotting the
    # test observations, comparing them to the regression plane,
    # and displaying the R2 coefficient
    fig = plt.figure()
    # Instantiating Axes3D(fig) directly is deprecated in newer
    # matplotlib; request the 3d projection through add_subplot instead:
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel('prediction')

    # You might have passed in a DataFrame, a Series (slice),
    # an NDArray, or a Python List... so let's keep it simple:
    X_test = np.array(X_test)
    col1 = X_test[:, 0]
    col2 = X_test[:, 1]

    # Set up a grid. We could have predicted on the actual
    # col1, col2 values directly; but that would have generated
    # a mesh with WAY too fine a grid, which would have detracted
    # from the visualization
    x_min, x_max = col1.min(), col1.max()
    y_min, y_max = col2.min(), col2.max()
    x = np.arange(x_min, x_max, (x_max - x_min) / 10)
    y = np.arange(y_min, y_max, (y_max - y_min) / 10)
    x, y = np.meshgrid(x, y)

    # Predict based on possible input values that span the domain
    # of the x and y inputs:
    z = model.predict(np.c_[x.ravel(), y.ravel()])
    z = z.reshape(x.shape)

    ax.scatter(col1, col2, y_test, c='g', marker='o')
    ax.plot_wireframe(x, y, z, color='orange', alpha=0.7)
    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)
    plt.show()
#
# INFO: Let's get started!
# The first column is both unique (the name of each college) and
# unlabeled. This is a HINT that it must be the index column. If you
# do not indicate to Pandas that you already have an index column,
# it'll create one for you, which would be undesirable since you
# already have one.
#
# Load up the College dataset into a variable called X:
#
X = pd.read_csv("Datasets/college.csv", index_col=0)
#
# The .map() method is like .apply(), but instead of taking in a
# lambda / function, you simply provide a mapping of keys:values.
X.Private = X.Private.map({'Yes': 1, 'No': 0})
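# For comparison, the same binarization expressed with .apply() and a
# lambda -- equivalent in effect, just more verbose than the .map()
# one-liner (left commented out so it isn't run twice on 1/0 values):
# X.Private = X.Private.apply(lambda v: 1 if v == 'Yes' else 0)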
#
# : Create the linear regression model
#
model = linear_model.LinearRegression()
#
# INFO: The first relationship we're interested in is the
# number of accepted students, as a function of the amount
# charged for room and board.
#
# : Using indexing, create two slices (series). One will just
# store the room and board column, the other will store the accepted
# students column. Then use train_test_split to cut the data up
# into X_train, X_test, y_train, y_test, with a test_size of 30% and
# a random_state of 7.
#
X_rb = X['Room.Board']  # series
y = X['Accept']
X_train, X_test, y_train, y_test = train_test_split(X_rb, y,
                                                    test_size=0.3, random_state=7)
#
# : Fit and score the model appropriately.
#
# fit(), score() and predict() expect 2D arrays, so convert the 1D
# series through .values before reshaping (pandas Series itself no
# longer has a .reshape method):
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Room&Board)", score)
#
# : Duplicate the process above; this time, model the number of
# accepted students as a function of the number of enrolled students
# per college.
#
X_en = X['Enroll']  # series
X_train, X_test, y_train, y_test = train_test_split(X_en, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(Enroll)", score)
#
# : Duplicate the process above; this time, model the number of
# accepted students as a function of the number of full-time
# undergraduate students per college.
#
X_fu = X['F.Undergrad']  # series
X_train, X_test, y_train, y_test = train_test_split(X_fu, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train.values.reshape(-1, 1), y_train)
X_test = X_test.values.reshape(-1, 1)
score = model.score(X_test, y_test)
drawLine(model, X_test, y_test, "Accept(F.Undergrad)", score)
#
# : Duplicate the process above (almost). This time is going to be
# a bit more complicated. Instead of modeling one feature as a function
# of another, we will attempt multivariate linear regression to
# model one feature as a function of TWO other features.
#
# Model the number of accepted students as a function of the amount
# charged for room and board AND the number of enrolled students. To do
# this, instead of creating a regular slice for a single-feature input,
# simply create a slice that contains both columns you wish to use as
# inputs. Your training labels will remain a single slice.
#
X_rb_en = X[['Room.Board', 'Enroll']]  # data frame
X_train, X_test, y_train, y_test = train_test_split(X_rb_en, y,
                                                    test_size=0.3, random_state=7)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
drawPlane(model, X_test, y_test, "Accept(Room&Board,Enroll)", score)
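# With two input features the fitted model is a plane:
#   Accept ~ intercept + coef1 * Room.Board + coef2 * Enroll
# Pairing each coefficient with its column name makes that explicit:
for name, coef in zip(X_rb_en.columns, model.coef_):
    print(name, ":", coef)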
# INFO + HINT On Fitting, Scoring, and Predicting:
# When you use .fit(), .score(), and .predict() on
# your model, SciKit-Learn expects your training data to be in
# spreadsheet (2D Array-Like) form. This means you can't simply
# pass in a 1D Array (slice) and get away with it.
#
# To properly prep your data, you have to pass in a 2D Numpy Array,
# or a dataframe. But what happens if you really only want to pass
# in a single feature?
#
# If you slice your dataframe using df[['ColumnName']] syntax, the
# result that comes back is actually a *dataframe*. Go ahead and do
# a type() on it to check it out. Since it's already a dataframe,
# you're good -- no further changes needed.
#
# But if you slice your dataframe using the df.ColumnName syntax,
# OR if you call df['ColumnName'], the result that comes back is
# actually a series (1D Array)! This will cause SKLearn to bug out.
# So if you are slicing using either of those two techniques, before
# sending your training or testing data to .fit / .score, do a
# my_column = my_column.values.reshape(-1,1). This will convert your
# 1D array of [n_samples] to a 2D array shaped like [n_samples, 1].
# A single feature, with many samples.
#
# If you did something like my_column = [my_column], that would produce
# an array in the shape of [1, n_samples], which is incorrect because
# SKLearn expects your data to be arranged as [n_samples, n_features].
# Keep in mind, all of the above only relates to your "X" or input
# data, and does not apply to your "y" or labels.
#
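# A small shape sketch of the rule above, reusing the dataframe already
# loaded (the variable names are purely illustrative):
single_series = X['Room.Board']    # shape (n_samples,)   -> 1D, rejected by .fit()
single_frame = X[['Room.Board']]   # shape (n_samples, 1) -> 2D, accepted as-is
as_2d = single_series.values.reshape(-1, 1)  # also (n_samples, 1)
print(single_series.shape, single_frame.shape, as_2d.shape)
#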
# Extra
# ========================
#
# What happens if you apply scaling to your data before doing
# linear regression? Would it alter the quality of your results?
# Do the scalers that work on a per-feature basis, such as MinMaxScaler,
# behave differently than those that work on a per-sample basis, such
# as normalize? And moreover, once your features have been scaled, you
# won't be able to use the resulting regression directly... unless you're
# able to .inverse_transform() the scaling. Do all of the SciKit-Learn
# scalers support that?
#
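# A minimal sketch of one way to explore the scaling question, assuming
# a scikit-learn version that ships Pipeline and MinMaxScaler (the
# variable name scaled_model is illustrative). For ordinary linear
# regression, per-feature min-max scaling is just an affine change of
# the inputs, so the test R2 should come out essentially unchanged:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
scaled_model = make_pipeline(MinMaxScaler(), linear_model.LinearRegression())
scaled_model.fit(X_train, y_train)
print("Scaled R2:", scaled_model.score(X_test, y_test))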
Linear regression extrapolation of US life expectancy data
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn import linear_model

matplotlib.style.use('ggplot')  # Look Pretty
def drawLine(model, X_test, y_test, title):
    # This convenience method will take care of plotting the
    # test observations, comparing them to the regression line,
    # and displaying the R2 coefficient
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(X_test, y_test, c='g', marker='o')
    ax.plot(X_test, model.predict(X_test), color='orange', linewidth=1, alpha=0.7)

    print("Est 2014 " + title + " Life Expectancy: ", model.predict([[2014]])[0])
    print("Est 2030 " + title + " Life Expectancy: ", model.predict([[2030]])[0])
    print("Est 2045 " + title + " Life Expectancy: ", model.predict([[2045]])[0])

    score = model.score(X_test, y_test)
    title += " R2: " + str(score)
    ax.set_title(title)
    plt.show()
#
# : Load up the data into a variable called 'X'.
#
X = pd.read_csv("Datasets/life_expectancy.csv", sep='\t')

#
# : Create the linear regression model
#
model = linear_model.LinearRegression()
#
# : Slice out the data manually (i.e., not using train_test_split).
# Set X_train to be the year values LESS than 1986, and y_train to be
# the corresponding WhiteMale age values.
#
# INFO You might also want to read the note about slicing at the end
# of the first file above.
#
X_train = X.loc[X['Year'] < 1986, ['Year']]  # data frame
y_trainWM = X[X['Year'] < 1986].WhiteMale  # series
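# Both slices select the same pre-1986 rows; only their dimensionality
# differs (DataFrame for the input, Series for the labels), which is
# exactly what .fit() expects. An optional check:
print(type(X_train), X_train.shape)      # DataFrame, (n_samples, 1)
print(type(y_trainWM), y_trainWM.shape)  # Series, (n_samples,)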
#
# : Train the model, then pass it into drawLine with the training
# set and labels. drawLine will output to the console a 2014
# extrapolation / approximation of what it believes the WhiteMale
# life expectancy in the U.S. will be... given the pre-1986 data you
# trained it with. It'll also produce a 2030 and 2045 extrapolation.
#
model.fit(X_train, y_trainWM)
drawLine(model, X_train, y_trainWM, "WhiteMale")
#
# : Print the actual 2014 WhiteMale life expectancy
#
print("Actual 2014 WhiteMale Life Expectancy = ",
      X[X.Year == 2014].WhiteMale.iloc[0])
#
# : Repeat the process, but instead of WhiteMale, this time
# select BlackFemale. Create a slice for BlackFemale, fit the
# model, and then call drawLine. Lastly, print out the actual 2014
# BlackFemale life expectancy.
#
y_trainBF = X[X['Year'] < 1986].BlackFemale  # series
model.fit(X_train, y_trainBF)
drawLine(model, X_train, y_trainBF, "BlackFemale")
print("Actual 2014 BlackFemale Life Expectancy = ",
      X[X.Year == 2014].BlackFemale.iloc[0])
#
# : Lastly, print out a correlation matrix for the entire
# dataset, and display a visualization of the correlation
# matrix
#
print("Correlation matrix: ")
print(X.corr())

fig, ax = plt.subplots()
im = ax.imshow(X.corr(), cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar(im)
tick_marks = range(len(X.columns))
plt.xticks(tick_marks, X.columns, rotation='vertical')
plt.yticks(tick_marks, X.columns)
ax.set_title("Correlation matrix")
plt.show()
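# An optional follow-up sketch: rank the pairwise correlations
# programmatically instead of eyeballing the heatmap. Unstacking
# flattens the matrix into a Series keyed by column pairs; dropping
# values equal to 1.0 removes the trivial self-correlations (note each
# remaining pair still appears twice, once in each order):
corr_pairs = X.corr().unstack()
corr_pairs = corr_pairs[corr_pairs < 1.0].sort_values(ascending=False)
print(corr_pairs.head(5))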
#
# INFO On Fitting, Scoring, and Predicting: the note at the end of the
# first file above applies here unchanged -- SciKit-Learn expects "X"
# inputs in 2D [n_samples, n_features] form, while "y" labels may stay 1D.
#