-
-
Save manojpandey/f0de8df3a0707cf44c5c887c07ee357c to your computer and use it in GitHub Desktop.
Useful Pandas Snippets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #List unique values in a DataFrame column | |
| pd.unique(df.column_name.ravel()) | |
| #Convert Series datatype to numeric, getting rid of any non-numeric values | |
| df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True) | |
| #Grab DataFrame rows where column has certain values | |
| valuelist = ['value1', 'value2', 'value3'] | |
| df = df[df.column.isin(value_list)] | |
| #Grab DataFrame rows where column doesn't have certain values | |
| valuelist = ['value1', 'value2', 'value3'] | |
| df = df[~df.column.isin(value_list)] | |
| #Select from DataFrame using criteria from multiple columns | |
| newdf = df[(df['column_one']>2004) & (df['column_two']==9)] | |
| #Rename several DataFrame columns | |
| df = df.rename(columns = { | |
| 'col1 old name':'col1 new name', | |
| 'col2 old name':'col2 new name', | |
| 'col3 old name':'col3 new name', | |
| }) | |
| #Loop through rows in a DataFrame | |
| #(if you must) | |
| for index, row in df: | |
| print index, row['some column'] | |
| #lower-case all DataFrame column names | |
| df.columns = map(str.lower, df.columns) | |
| #Lower-case everything in a DataFrame column | |
| df.column_name = df.column_name.str.lower() | |
| #Sort multi-index dataframe | |
| df = df.sort_indx(by=['indexpart1', 'indexpart2']) | |
| #Grab DataFrame rows where specific column is null/notnull | |
| newdf = df[df['column'].isnull()] | |
| len(newdf) | |
| #Change all NaNs to None (useful before | |
| #loading to a db) | |
| df = df.where((pd.notnull(df)), None) | |
| #Get quick count of rows in a DataFrame | |
| len(df.index) | |
| #Pivot data (with flexibility about what what | |
| #becomes a column and what stays a row). | |
| #Syntax works on Pandas >= .14 | |
| pd.pivot_table( | |
| df,values='cell_value', | |
| index=['col1', 'col2', 'col3'], #these stay as columns | |
| columns=['col4']) #data values in this column become their own column | |
| #change data type of DataFrame column | |
| df.column_name = df.column_name.astype(np.int64) | |
| # Get rid of non-numeric values throughout a DataFrame: | |
| for col in refunds.columns.values: | |
| refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True) | |
| #Clean up missing values in multiple DataFrame columns | |
| df = df.fillna({ | |
| 'col1': 'missing', | |
| 'col2': '99.999', | |
| 'col3': '999', | |
| 'col4': 'missing', | |
| 'col5': 'missing', | |
| 'col6': '99' | |
| }) | |
| #Doing calculations with DataFrame columns that have missing values | |
| #In example below, swap in 0 for df['col1'] cells that contain null | |
| df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2'] | |
| # Split delimited values in a DataFrame column into two new columns | |
| df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1))) | |
| #Convert Django queryset to DataFrame | |
| #convert queryset to dataframe | |
| qs = DjangoModelName.objects.all() | |
| q = qs.values() | |
| df = pd.DataFrame.from_records(q) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment