-
-
Save fstfwd/131ebf6183273f5f09eac14bbcc6053c to your computer and use it in GitHub Desktop.
(Credit Prediction ML)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Use a general function that returns multiple values | |
def var_summary(x):
    """Return a one-column statistical summary as a pandas Series.

    Parameters
    ----------
    x : pandas.Series
        The column to summarise.

    Returns
    -------
    pandas.Series
        Values labelled N (non-null count), NMISS (null count), SUM, MEAN,
        MEDIAN, STD, VAR, MIN, the percentiles P1, P5, P10, P25, P50, P75,
        P90, P95, P99, and MAX, in that order.
    """
    percentile_levels = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
    # Drop missing values once and compute every percentile in a single
    # call instead of repeating dropna()/quantile() for each level.
    percentile_values = x.dropna().quantile(percentile_levels).tolist()

    return pd.Series(
        [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
         x.std(), x.var(), x.min(), *percentile_values, x.max()],
        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99',
               'MAX'],
    )
# Summarise every numeric column; transpose so each row is one variable
# and each column is one statistic.
num_summary = custdata_df_num.apply(var_summary).T
num_summary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Seaborn scatter plot with regression line | |
| sns.lmplot(x='income', y='totalspend', data=custdata_df) | |
# Columns that are treated as categorical variables.
categorical_columns = [
    'region', 'townsize', 'gender', 'agecat', 'edcat', 'birthmonth',
    'jobcat', 'union', 'employ', 'empcat', 'retire',
    'inccat', 'default', 'jobsat', 'marital', 'spousedcat', 'homeown',
    'hometype', 'address', 'addresscat', 'cars', 'carown',
    'cartype', 'carcatvalue', 'carbought', 'carbuy', 'commute',
    'commutecat', 'commutecar',
    'commutemotorcycle', 'commutecarpool', 'commutebus', 'commuterail',
    'commutepublic', 'commutebike', 'commutewalk',
    'commutenonmotor', 'telecommute', 'reason', 'polview', 'polparty',
    'polcontrib', 'vote', 'card', 'cardtype', 'cardbenefit',
    'cardfee', 'cardtenure', 'cardtenurecat', 'card2', 'card2type',
    'card2benefit', 'card2fee', 'card2tenure', 'card2tenurecat',
    'active', 'bfast', 'churn', 'tollfree', 'equip', 'callcard',
    'wireless', 'multline', 'voice', 'pager', 'internet', 'callid',
    'callwait', 'forward', 'confer', 'ebill', 'owntv', 'ownvcr', 'owndvd',
    'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax',
    'news', 'response_01', 'response_02', 'response_03',
]

# Cast each listed column to the 'object' dtype, one column at a time.
for column in categorical_columns:
    custdata_df[column] = custdata_df[column].astype('object')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Drop cardspent (first card spent amount) and card2spent (second card spent amount) because they are not adding any value | |
| # Also drop "custid" because the customer id is unique. NOTE: "birthmonth" is also said to add no value, but it is not in the drop list below | |
| custdata_df.drop(["cardspent","card2spent","custid"],axis=1, inplace=True) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # import relevant modules | |
| import sys | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import statsmodels.formula.api as sn | |
| import scipy.stats as stats | |
| from matplotlib.backends.backend_pdf import PdfPages | |
| from sklearn.model_selection import train_test_split | |
| from sklearn import metrics | |
| from sklearn.linear_model import LogisticRegression | |
| from statsmodels.stats.outliers_influence import variance_inflation_factor | |
| from patsy import dmatrices | |
| %matplotlib inline | |
| # Ignore warnings | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| # Settings | |
| pd.set_option('display.max_columns', None) | |
| np.set_printoptions(threshold=sys.maxsize) | |
| np.set_printoptions(precision=3) | |
| sns.set(style="darkgrid") | |
| plt.rcParams['axes.labelsize'] = 14 | |
| plt.rcParams['xtick.labelsize'] = 12 | |
| plt.rcParams['ytick.labelsize'] = 12 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| custdata_df = pd.read_excel("Data Set.xlsx", sheet_name="customer_dbase") | |
| custdata_df.sample(5) | |
| # Find column information in the dataframe. | |
| custdata_df.columns | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the columns of custdata_df into numeric and object-dtype groups.
# The column -> dtype mapping is built once and reused, instead of being
# rebuilt for every column inside each comprehension.
column_dtypes = dict(custdata_df.dtypes)

# List of numeric column names.
numeric_var_names = [
    name for name, dtype in column_dtypes.items()
    if dtype in ['float64', 'int64', 'float32', 'int32']
]

# List of categorical (object-dtype) column names.
cat_var_names = [
    name for name, dtype in column_dtypes.items()
    if dtype in ['object']
]

# Print both lists of column names.
print(numeric_var_names)
print(cat_var_names)

# Select the numeric columns of custdata_df into their own dataframe.
custdata_df_num = custdata_df[numeric_var_names]
custdata_df_num.head(5)

# Select the categorical columns of custdata_df into their own dataframe.
custdata_df_cat = custdata_df[cat_var_names]
custdata_df_cat.head(5)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # To create Y, sum cardspent (first card spent amount) and card2spent (second card spent amount) | |
| custdata_df['totalspend'] = custdata_df['cardspent'] + custdata_df['card2spent'] | |
| custdata_df.head() | |
| # Now Run pandas profiling to see the data audit reports | |
| import pandas_profiling | |
| pandas_profiling.ProfileReport(custdata_df) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment