Skip to content

Instantly share code, notes, and snippets.

@fstfwd
Forked from aimanyaasin/DataAudit
Created June 15, 2023 14:59
Show Gist options
  • Save fstfwd/131ebf6183273f5f09eac14bbcc6053c to your computer and use it in GitHub Desktop.
Save fstfwd/131ebf6183273f5f09eac14bbcc6053c to your computer and use it in GitHub Desktop.
(Credit Prediction ML)
# Summarise a single column as one labelled Series of descriptive statistics
def var_summary(x):
    """Return descriptive statistics for one pandas Series.

    Args:
        x: pandas Series of numeric values; missing values are allowed.

    Returns:
        pandas Series indexed by 'N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN',
        'STD', 'VAR', 'MIN', 'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90',
        'P95', 'P99', 'MAX' (in that order).
    """
    # Drop missing values once and ask for every percentile in a single
    # call, rather than repeating dropna()/quantile() for each cut point.
    percentiles = x.dropna().quantile(
        [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
    )
    return pd.Series(
        [
            x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
            x.std(), x.var(), x.min(),
            *percentiles.tolist(),
            x.max(),
        ],
        index=['N', 'NMISS', 'SUM', 'MEAN', 'MEDIAN', 'STD', 'VAR', 'MIN',
               'P1', 'P5', 'P10', 'P25', 'P50', 'P75', 'P90', 'P95', 'P99',
               'MAX'],
    )
# Run var_summary on every numeric column, then transpose so that each
# row is a variable and each column is a statistic.
num_summary = custdata_df_num.apply(var_summary).transpose()
num_summary
# Scatter plot of total spend against income with a fitted regression line
sns.lmplot(data=custdata_df, x='income', y='totalspend')
# Columns to be treated as categorical. They are cast to object dtype below;
# elsewhere in this file, object-dtype columns are the ones collected as
# categorical variables.
categorical_columns = [
    'region','townsize','gender','agecat','edcat','birthmonth','jobcat','union','employ','empcat','retire',
    'inccat','default','jobsat','marital','spousedcat','homeown','hometype','address','addresscat','cars','carown',
    'cartype','carcatvalue','carbought','carbuy','commute','commutecat','commutecar',
    'commutemotorcycle','commutecarpool','commutebus','commuterail','commutepublic','commutebike','commutewalk',
    'commutenonmotor','telecommute','reason','polview','polparty','polcontrib','vote','card','cardtype','cardbenefit',
    'cardfee','cardtenure','cardtenurecat','card2','card2type','card2benefit','card2fee','card2tenure','card2tenurecat',
    'active','bfast','churn','tollfree','equip','callcard','wireless','multline','voice','pager','internet','callid',
    'callwait','forward','confer','ebill','owntv','ownvcr','owndvd','owncd','ownpda','ownpc','ownipod','owngame','ownfax',
    'news','response_01','response_02','response_03',
]

# Re-assign each column as object dtype, one column at a time.
for column in categorical_columns:
    custdata_df[column] = custdata_df[column].astype('object')
# Remove the per-card spend columns (cardspent = first card, card2spent =
# second card) and the unique customer identifier (custid).
# NOTE: 'totalspend' is computed elsewhere in this file from cardspent and
# card2spent, so this cell must run after that column has been created.
# NOTE(review): an earlier comment here said "birthmonth" is dropped as well,
# but it is not in the list below - confirm which is intended.
custdata_df.drop(columns=["cardspent", "card2spent", "custid"], inplace=True)
# import relevant modules
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sn
import scipy.stats as stats
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
%matplotlib inline
# Suppress every warning for the rest of the session
import warnings
warnings.filterwarnings(action='ignore')

# Display settings: show all DataFrame columns, print full NumPy arrays
# with three digits of precision.
pd.options.display.max_columns = None
np.set_printoptions(threshold=sys.maxsize, precision=3)

# Apply the seaborn theme first, then override the label sizes on top of it.
sns.set(style="darkgrid")
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})
# Load the "customer_dbase" sheet of the workbook and preview five random rows
custdata_df = pd.read_excel(io="Data Set.xlsx", sheet_name="customer_dbase")
custdata_df.sample(n=5)

# List the column labels of the loaded DataFrame
custdata_df.columns
# Map each column name to its dtype once, instead of rebuilding the mapping
# for every column inside the comprehensions.
column_dtypes = dict(custdata_df.dtypes)

# Names of the numeric columns (float64 / int64 / float32 / int32), as a list
numeric_var_names = [
    name for name, dtype in column_dtypes.items()
    if dtype in ['float64', 'int64', 'float32', 'int32']
]

# Names of the categorical columns (object dtype), as a list
cat_var_names = [
    name for name, dtype in column_dtypes.items()
    if dtype in ['object']
]

# Print both lists of column names
print(numeric_var_names)
print(cat_var_names)
# Subset of custdata_df holding only the numeric columns; preview the
# first five rows.
custdata_df_num = custdata_df[numeric_var_names]
custdata_df_num.head()

# Subset of custdata_df holding only the categorical columns; preview the
# first five rows.
custdata_df_cat = custdata_df[cat_var_names]
custdata_df_cat.head()
# Build the target variable Y: total spend is the sum of the amount spent on
# the first card (cardspent) and on the second card (card2spent).
custdata_df['totalspend'] = custdata_df['cardspent'].add(custdata_df['card2spent'])
custdata_df.head(n=5)
# Build a pandas-profiling report over the whole DataFrame as a data audit.
# The report object is the last expression of the cell, so a notebook will
# render it inline.
# NOTE(review): the pandas_profiling package has reportedly been renamed
# upstream (ydata-profiling) - confirm which one the environment provides.
import pandas_profiling
pandas_profiling.ProfileReport(custdata_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment