This is a collection of common pandas code used while performing Data Analysis
import pandas as pd
import numpy as np- Create a DataFrame from a csv file
csvFilePath = '/path/to/file.csv'
with open (csvFilePath, 'rb') as file:
dataFrameName = pd.read_csv(file)- Create a DataFrame from a csv file, explicitly definining the file encoding and dtype for each column. note: check sparse and array dtypes not included here.
csvFilePath = '/path/to/file.csv'
with open (csvFilePath, 'rb') as file:
dataFrameName = pd.read_csv(file, encoding = 'UTF-8',
thousands = ',',
decimal = '.',
dtype = {
'column1name':'float64',
'column2name':'int64',
'column3name':'datetime64',
'column4name':'string',
'column5name':'boolean',
'column6name':'category'
}
)- Use convertions while importing data, ie: change a string column like '67%' to float
## define convertion function
def percentToFloat(percent_string):
return float(percent_string.strip('%'))/100
## pass converters parameters in dictionary form
csvFilePath = '/path/to/file.csv'
with open (csvFilePath, 'rb') as file:
dataFrameName = pd.read_csv(file, encoding = 'UTF-8',
thousands = ',',
decimal = '.',
converters = {
'column4':percentToFloat,
'column5':percentToFloat
}
dtype = {
'column1name':'float64',
'column2name':'int64',
'column3name':'datetime64',
'column6name':'category'
}
# columns 4 and 5 were no longer on the dtype dict
)- Export DataFrame to csv
csvFilePath = '/path/to/file.csv' #file.csv should't exist, otherwise you would overwrite
dataFrameName.to_csv(csvFilePath)- Empty DataFrame
dataFrameName = np.DataFrame()- From a list of dicts (example)
myFriends = pd.DataFrame([
{'name':'friendName1',
'age':21,
'straight':True,
'sex':'male'
},
{'name':'friendName2',
'age':20,
'straight':False,
'sex':'female'
},
{'name':'friendName3',
'age':21,
'straight':True,
'sex':'female'
}
])- Quick view at first elements (default 5 of the DataFrame)
pd.dataFrameName.head()
pd.dataFrameName.head(n=10)- Create a DataFrame describing the dtypes of each of the columns that are part of another dataframe
columnTypes= {}
for column in dataFrameName.columns:
columnTypes.update({dataFrameName[column].name : dataFrameName[column].dtype})
columnTypes = pd.DataFrame(columnTypes.items(), columns=['column','type'])- DataFrame shape, without considering column labels or row index
pd.dataFrameName.shape()- DataFrame elements quantity, without considering column labels or row index
pd.dataFrameName.size()- Create a new dataFrame a selection of another dataframe columns
- Summary stats of numeric columns in a DataFrame
dataFrameName.describe()- Summary stats of an specific column
dataFrameName['columnName'].describe()- Changing column names
dataFrameName.columns = ['column0newName','column1newName','column2newName','column3newName']- Adding columns to a DataFrame
### empty one:
dataFrameName[newColumnName]=''
### fixed value
dataFrameName[newColumnName2]='myValue'
### as the result of the operation of other columns
dataFrameName[newColumnName3]=(dataFrameName[column3]/dataFrameName[column5])**2- Create a new DataFrame as the selection of another DataFrame columns
newDataFrame = dataFrame[['column1','column2','columnn']]- Change a percent string to float
dataFrameName['columnName'] = dataFrameName['columnName'].str.rstrip('%').astype('float') / 100.0-
Filtering
- By a few elements
condition = dataFrameName['columnName'].isin(['element1', 'element2', 'elementn']) newDataFrameName = dataFrameName[condition]
- By elements
elementsToFilter = ['element1', 'element2', 'elementn'] condition = dataFrameName['columnName'].isin(elementsToFilter) newDataFrameName = dataFrameName[condition]
- By matching criteria
condition = dataFrameName['columnName'] == True newDataFrameName = dataFrameName[condition]
- By eliminating data below and above certain percentiles (ie, 5 and 95
columnNamep05 = np.percentile(dataFrameName['columnName'].to_numpy(),5) columnNamep95 = np.percentile(dataFrameName['columnName'].to_numpy(),95) condition = ((dataFrameName['columnName'] > columnNamep05) & (dataFrameName['columnName'] > columnNamep95)) newDataFrameName = dataFrameName[condition]