fstfwd · June 15, 2023 14:59
diff --git a/DataAudit b/DataAudit
 # Use a general function that returns multiple values
 def var_summary(x):
     """Return a labelled set of audit statistics for one Series.

     Parameters
     ----------
     x : pandas.Series
         Column to summarise (assumed numeric; missing values are allowed).

     Returns
     -------
     pandas.Series
         Values labelled N, NMISS, SUM, MEAN, MEDIAN, STD, VAR, MIN,
         P1, P5, P10, P25, P50, P75, P90, P95, P99 and MAX, in that order.
     """
     # Drop missing values once and request every percentile in a single
     # call instead of repeating ``x.dropna().quantile(q)`` for each one.
     percentiles = x.dropna().quantile(
         [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
     )
     return pd.Series(
         [
             x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
             x.std(), x.var(), x.min(),
             *percentiles.tolist(),
             x.max(),
         ],
         index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
                'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99',
                'MAX'],
     )

 # Run var_summary on every column of the numeric frame, then transpose
 # so that each row of the result is one variable.
 num_summary = custdata_df_num.apply(var_summary).T

 # Bare expression: presumably left here so the notebook cell displays it.
 num_summary
diff --git a/DataLinearity b/DataLinearity
 # Scatter plot of totalspend against income with a fitted regression line
 sns.lmplot(data=custdata_df, x='income', y='totalspend')

 # Columns to be handled as categorical variables.
 categorical_columns = [
     'region', 'townsize', 'gender', 'agecat', 'edcat', 'birthmonth', 'jobcat', 'union', 'employ', 'empcat', 'retire',
     'inccat', 'default', 'jobsat', 'marital', 'spousedcat', 'homeown', 'hometype', 'address', 'addresscat', 'cars', 'carown',
     'cartype', 'carcatvalue', 'carbought', 'carbuy', 'commute', 'commutecat', 'commutecar',
     'commutemotorcycle', 'commutecarpool', 'commutebus', 'commuterail', 'commutepublic', 'commutebike', 'commutewalk',
     'commutenonmotor', 'telecommute', 'reason', 'polview', 'polparty', 'polcontrib', 'vote', 'card', 'cardtype', 'cardbenefit',
     'cardfee', 'cardtenure', 'cardtenurecat', 'card2', 'card2type', 'card2benefit', 'card2fee', 'card2tenure', 'card2tenurecat',
     'active', 'bfast', 'churn', 'tollfree', 'equip', 'callcard', 'wireless', 'multline', 'voice', 'pager', 'internet', 'callid',
     'callwait', 'forward', 'confer', 'ebill', 'owntv', 'ownvcr', 'owndvd', 'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax',
     'news', 'response_01', 'response_02', 'response_03',
 ]

 # Cast each listed column to dtype 'object', one column at a time.
 for column in categorical_columns:
     custdata_df[column] = custdata_df[column].astype('object')
diff --git a/DropVariable b/DropVariable
 # Drop cardspent (first card spend amount) and card2spent (second card
 # spend amount), and drop custid because it is a unique identifier.
 # NOTE(review): the earlier comment also listed "birthmonth" as dropped,
 # but it is not in the list below - confirm which one is intended.

 custdata_df.drop(columns=["cardspent","card2spent","custid"], inplace=True)
diff --git a/Import.txt b/Import.txt
 # import relevant modules
 import sys
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 import statsmodels.formula.api as sn
 import scipy.stats as stats
 from matplotlib.backends.backend_pdf import PdfPages
 from sklearn.model_selection import train_test_split
 from sklearn import metrics
 from sklearn.linear_model import LogisticRegression
 from statsmodels.stats.outliers_influence import variance_inflation_factor
 from patsy import dmatrices
 %matplotlib inline

 # Suppress every warning from here on
 import warnings
 warnings.filterwarnings(action='ignore')

 # Display / plotting settings
 pd.set_option('display.max_columns', None)
 # One call sets both the summarisation threshold and the print precision
 np.set_printoptions(threshold=sys.maxsize, precision=3)
 sns.set(style="darkgrid")
 # Font sizes are applied after sns.set(), matching the original order
 plt.rcParams.update({
     'axes.labelsize': 14,
     'xtick.labelsize': 12,
     'ytick.labelsize': 12,
 })
diff --git a/LoadNinspect b/LoadNinspect
 # Read the "customer_dbase" sheet of the workbook into a DataFrame
 custdata_df = pd.read_excel(io="Data Set.xlsx", sheet_name="customer_dbase")
 # Look at five randomly sampled rows
 custdata_df.sample(n=5)

 # Column labels of the DataFrame
 custdata_df.columns

diff --git a/VariableSeparation b/VariableSeparation
 # Names of the numeric columns (float/int dtypes), as a list.
 # The (name, dtype) pairs are iterated once; the previous form rebuilt
 # dict(custdata_df.dtypes) for every single column.
 numeric_var_names = [name for name, dtype in custdata_df.dtypes.items()
                      if dtype in ['float64', 'int64', 'float32', 'int32']]

 # Names of the columns whose dtype is 'object' (treated as categorical)
 cat_var_names = [name for name, dtype in custdata_df.dtypes.items()
                  if dtype in ['object']]

 # Print both lists of column names
 print(numeric_var_names)
 print(cat_var_names)

 # Select the numeric columns of custdata_df and preview the first 5 rows

 custdata_df_num = custdata_df[numeric_var_names]
 custdata_df_num.head(5)

 # Select the categorical columns of custdata_df and preview the first 5 rows

 custdata_df_cat = custdata_df[cat_var_names]
 custdata_df_cat.head(5)
diff --git a/Ycolumn b/Ycolumn
 # Build the target Y: cardspent (first card spend amount) plus
 # card2spent (second card spend amount), stored as 'totalspend'.
 custdata_df['totalspend'] = custdata_df['cardspent'].add(custdata_df['card2spent'])

 # Preview the first rows of the frame
 custdata_df.head()

 # Run pandas profiling to produce the data audit report

 import pandas_profiling
 pandas_profiling.ProfileReport(custdata_df)
	# Use a general function that returns multiple values
	def var_summary(x):
	    """Return a labelled set of audit statistics for one Series.

	    Parameters
	    ----------
	    x : pandas.Series
	        Column to summarise (assumed numeric; missing values are allowed).

	    Returns
	    -------
	    pandas.Series
	        Values labelled N, NMISS, SUM, MEAN, MEDIAN, STD, VAR, MIN,
	        P1, P5, P10, P25, P50, P75, P90, P95, P99 and MAX, in that order.
	    """
	    # Drop missing values once and request every percentile in a single
	    # call instead of repeating ``x.dropna().quantile(q)`` for each one.
	    percentiles = x.dropna().quantile(
	        [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
	    )
	    return pd.Series(
	        [
	            x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
	            x.std(), x.var(), x.min(),
	            *percentiles.tolist(),
	            x.max(),
	        ],
	        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
	               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99',
	               'MAX'],
	    )

	# Run var_summary on every column of the numeric frame, then transpose
	# so that each row of the result is one variable.
	num_summary = custdata_df_num.apply(var_summary).T

	# Bare expression: presumably left here so the notebook cell displays it.
	num_summary
	# Scatter plot of totalspend against income with a fitted regression line
	sns.lmplot(data=custdata_df, x='income', y='totalspend')

	# Columns to be handled as categorical variables.
	categorical_columns = [
	    'region', 'townsize', 'gender', 'agecat', 'edcat', 'birthmonth', 'jobcat', 'union', 'employ', 'empcat', 'retire',
	    'inccat', 'default', 'jobsat', 'marital', 'spousedcat', 'homeown', 'hometype', 'address', 'addresscat', 'cars', 'carown',
	    'cartype', 'carcatvalue', 'carbought', 'carbuy', 'commute', 'commutecat', 'commutecar',
	    'commutemotorcycle', 'commutecarpool', 'commutebus', 'commuterail', 'commutepublic', 'commutebike', 'commutewalk',
	    'commutenonmotor', 'telecommute', 'reason', 'polview', 'polparty', 'polcontrib', 'vote', 'card', 'cardtype', 'cardbenefit',
	    'cardfee', 'cardtenure', 'cardtenurecat', 'card2', 'card2type', 'card2benefit', 'card2fee', 'card2tenure', 'card2tenurecat',
	    'active', 'bfast', 'churn', 'tollfree', 'equip', 'callcard', 'wireless', 'multline', 'voice', 'pager', 'internet', 'callid',
	    'callwait', 'forward', 'confer', 'ebill', 'owntv', 'ownvcr', 'owndvd', 'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax',
	    'news', 'response_01', 'response_02', 'response_03',
	]

	# Cast each listed column to dtype 'object'. The assignment is the loop
	# body and must be indented under the ``for`` (the indentation was lost).
	for column in categorical_columns:
	    custdata_df[column] = custdata_df[column].astype('object')
	# Drop cardspent (first card spend amount) and card2spent (second card
	# spend amount), and drop custid because it is a unique identifier.
	# NOTE(review): the earlier comment also listed "birthmonth" as dropped,
	# but it is not in the list below - confirm which one is intended.

	custdata_df.drop(columns=["cardspent","card2spent","custid"], inplace=True)
	# import relevant modules
	import sys
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	import statsmodels.formula.api as sn
	import scipy.stats as stats
	from matplotlib.backends.backend_pdf import PdfPages
	from sklearn.model_selection import train_test_split
	from sklearn import metrics
	from sklearn.linear_model import LogisticRegression
	from statsmodels.stats.outliers_influence import variance_inflation_factor
	from patsy import dmatrices
	%matplotlib inline

	# Suppress every warning from here on
	import warnings
	warnings.filterwarnings(action='ignore')

	# Display / plotting settings
	pd.set_option('display.max_columns', None)
	# One call sets both the summarisation threshold and the print precision
	np.set_printoptions(threshold=sys.maxsize, precision=3)
	sns.set(style="darkgrid")
	# Font sizes are applied after sns.set(), matching the original order
	plt.rcParams.update({
	    'axes.labelsize': 14,
	    'xtick.labelsize': 12,
	    'ytick.labelsize': 12,
	})
	# Read the "customer_dbase" sheet of the workbook into a DataFrame
	custdata_df = pd.read_excel(io="Data Set.xlsx", sheet_name="customer_dbase")
	# Look at five randomly sampled rows
	custdata_df.sample(n=5)

	# Column labels of the DataFrame
	custdata_df.columns
	# Names of the numeric columns (float/int dtypes), as a list.
	# The (name, dtype) pairs are iterated once; the previous form rebuilt
	# dict(custdata_df.dtypes) for every single column.
	numeric_var_names = [name for name, dtype in custdata_df.dtypes.items()
	                     if dtype in ['float64', 'int64', 'float32', 'int32']]

	# Names of the columns whose dtype is 'object' (treated as categorical)
	cat_var_names = [name for name, dtype in custdata_df.dtypes.items()
	                 if dtype in ['object']]

	# Print both lists of column names
	print(numeric_var_names)
	print(cat_var_names)

	# Select the numeric columns of custdata_df and preview the first 5 rows

	custdata_df_num = custdata_df[numeric_var_names]
	custdata_df_num.head(5)

	# Select the categorical columns of custdata_df and preview the first 5 rows

	custdata_df_cat = custdata_df[cat_var_names]
	custdata_df_cat.head(5)
	# Build the target Y: cardspent (first card spend amount) plus
	# card2spent (second card spend amount), stored as 'totalspend'.
	custdata_df['totalspend'] = custdata_df['cardspent'].add(custdata_df['card2spent'])

	# Preview the first rows of the frame
	custdata_df.head()

	# Run pandas profiling to produce the data audit report

	import pandas_profiling
	pandas_profiling.ProfileReport(custdata_df)