# Import the useful libraries.
import numpy
import pandas
import seaborn
import matplotlib.pyplot

# Render plots inline when running under IPython/Jupyter. The notebook magic
# `%matplotlib inline` is not valid Python syntax, so the equivalent call is
# used instead and skipped when no IPython shell is available.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass

# Load the data set
data = pandas.read_csv("mode_of_transportation.csv")

# Preview the data (displays in a notebook cell; no output as a plain script)
data.head()


# HELPER METHODS
def merge_columns(columns, name, d=None):
    """Merge several columns into one '_'-joined string column, in place.

    Every value is converted with ``str()``, so a missing value becomes text
    (NaN becomes ``'nan'``) and the merged column never contains nulls.
    The source columns are removed from the frame.

    Args:
        columns: Names of the columns to merge, in join order.
        name: Name of the merged column. When falsy, the source names joined
            with ',' are used instead.
        d: Frame to modify. Defaults to the module-level ``data``.
    """
    frame = data if d is None else d
    columns = list(columns)
    if not columns:
        return

    new_column = name if name else ",".join(columns)

    parts = [frame[column].apply(str) for column in columns]
    merged = parts[0]
    for part in parts[1:]:
        merged = merged + "_" + part

    # Drop the sources before assigning the result so that a merged name
    # equal to one of the source names does not delete the new column.
    frame.drop(columns=columns, inplace=True)
    frame[new_column] = merged


def drop_missing_values(key, d):
    """Return a copy of ``d`` keeping only the rows where ``d[key]`` is not null."""
    return d[d[key].notna()].copy()


def impute_missing_values(key, d):
    """Fill nulls in ``d[key]`` with the column's first mode, modifying ``d``.

    A column with no non-null values has no mode and is left unchanged.
    """
    modes = d[key].mode()
    if modes.empty:
        return
    # Assign back to the frame: a chained ``d[key].fillna(..., inplace=True)``
    # acts on a temporary object and does not update ``d`` under pandas
    # Copy-on-Write.
    d[key] = d[key].fillna(modes.iloc[0])


columns_to_drop = [
    "ind_id",
    "ind_definition",
    "county_fips",
    "version",
]

# Drop the columns that are not used in analysis
data.drop(columns=columns_to_drop, inplace=True)

# Total rows and columns [row, column]
print(data.shape)

# Checking the missing values
data.isnull().sum()

# Drop rows that have missing geotype value
data = drop_missing_values("geotype", data)

# Verifying missing values
data.isnull().sum()

# Merge closely related columns
merge_columns(["race_eth_code", "race_eth_name"], "racecode_and_name")
merge_columns(["geotype", "geotypevalue", "geoname"], "geotype_value_and_name")
# merge_columns(['region_name', 'region_code'], 'regionname_and_code')

# Checking the missing values
data.isnull().sum()
# print(data.shape)

# Printing the Dataset
# data