# Import the libraries used for data handling and plotting.
import numpy
import pandas
import seaborn
import matplotlib.pyplot
%matplotlib inline


# Read the raw CSV into the module-level DataFrame `data`; the helpers and
# cleaning steps further down all operate on this frame.
data = pandas.read_csv(filepath_or_buffer="mode_of_transportation.csv")

# Preview the first five rows of the data set.
data.head(n=5)


# HELPER FUNCTIONS


def merge_columns(columns, name=None):
    """Merge several columns of the module-level ``data`` frame into one.

    Every value is converted with ``str`` (so a missing float value becomes
    the text ``'nan'``) and the pieces are joined with ``'_'`` in the order
    given by ``columns``. The source columns are then removed from ``data``
    in place and the merged column is stored under the new label.

    Args:
        columns: Labels of the columns to merge, in concatenation order.
        name: Label of the merged column. When falsy, the source labels
            joined with ``','`` are used instead.

    Returns:
        None. ``data`` is modified in place.

    Raises:
        KeyError: If one of ``columns`` is not a column of ``data``.
    """
    columns = list(columns)
    if not columns:
        # Nothing to merge; leave ``data`` untouched.
        return

    new_column = name if name else ','.join(columns)

    # Build the merged values completely before touching the frame.
    merged = data[columns[0]].apply(str)
    for column in columns[1:]:
        merged = merged + '_' + data[column].apply(str)

    # Drop the sources before storing the result, so that a ``name`` equal to
    # one of the source labels is not deleted together with that source.
    data.drop(columns=columns, inplace=True)
    data[new_column] = merged
    
def drop_missing_values(key, d):
    """Return a copy of ``d`` without the rows whose ``key`` value is missing.

    Args:
        key: Label of the column to check for missing values.
        d: DataFrame to filter; it is not modified.

    Returns:
        A new DataFrame holding only the rows where ``d[key]`` is not null.
    """
    has_value = d[key].notnull()
    kept_rows = d[has_value]
    # Copy so that later in-place edits do not touch the caller's frame.
    return kept_rows.copy()

def impute_missing_values(key, d):
    """Fill the missing values of column ``key`` in ``d`` with its mode.

    When several values tie for most frequent, the first one returned by
    ``Series.mode`` is used. If the column has no non-missing values there is
    no mode, and ``d`` is left unchanged.

    Args:
        key: Label of the column to impute.
        d: DataFrame to update; its ``key`` column is replaced in place.

    Returns:
        None.
    """
    modes = d[key].mode()
    if modes.empty:
        # Every value is missing, so there is nothing to impute with.
        return

    # Assign the filled column back instead of calling ``fillna`` with
    # ``inplace=True`` on ``d[key]``: that chained in-place call acts on a
    # temporary object and does not update ``d`` under pandas Copy-on-Write.
    d[key] = d[key].fillna(modes.iloc[0])


# Columns that are not used in the analysis.
columns_to_drop = ['ind_id', 'ind_definition', 'county_fips', 'version']

# Remove the unused columns from the data set, one label at a time.
for column in columns_to_drop:
    data.drop(columns=column, inplace=True)

# Dimensions of the data set as (rows, columns).
print(data.shape)

# Count the missing values in each column.
data.isna().sum()

# Keep only the rows whose geotype value is present.
data = drop_missing_values(key='geotype', d=data)

# Re-count the missing values after dropping those rows.
data.isna().sum()

# Combine closely related columns into single columns.
merge_columns(columns=['race_eth_code', 'race_eth_name'], name='racecode_and_name')
merge_columns(
    columns=['geotype', 'geotypevalue', 'geoname'],
    name='geotype_value_and_name',
)
# merge_columns(['region_name', 'region_code'], 'regionname_and_code')


# Count the missing values once more after merging.
data.isna().sum()
# print(data.shape)

# Printing the Dataset
# data