manojpandey · July 18, 2016 05:58 · Oct 12, 2015 · Sep 24, 2015 · Aug 28, 2015 · Jul 6, 2015
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -37,9 +37,18 @@
 for index, row in df.iterrows():
     print index, row['some column']  
 
+#Next few examples show how to work with text data in Pandas.
+#Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html
+
+#Slice values in a DataFrame column (aka Series)
+df.column.str[0:2]
+
 #Lower-case everything in a DataFrame column
 df.column_name = df.column_name.str.lower()
 
+#Get length of data in a DataFrame column
+df.column_name.str.len()
+
 #Sort dataframe by multiple columns
 df = df.sort(['col1','col2','col3'],ascending=[1,1,0])
 
@@ -49,7 +58,6 @@
 
 #Grab DataFrame rows where specific column is null/notnull
 newdf = df[df['column'].isnull()]
-len(newdf)
 
 #select from DataFrame using multiple keys of a hierarchical index
 df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
@@ -58,9 +66,6 @@
 #loading to a db)
 df = df.where((pd.notnull(df)), None)
 
-#Slice values in a DataFrame column (aka Series)
-df.column.str[0:2]
-
 #Get quick count of rows in a DataFrame
 len(df.index)
 

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -34,7 +34,7 @@
 
 #Loop through rows in a DataFrame
 #(if you must)
-for index, row in df:
+for index, row in df.iterrows():
     print index, row['some column']  
 
 #Lower-case everything in a DataFrame column

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -107,7 +107,9 @@
 df.columns = df.columns.get_level_values(0)
 
 #Convert Django queryset to DataFrame
-#convert queryset to dataframe
 qs = DjangoModelName.objects.all()
 q = qs.values()
-df = pd.DataFrame.from_records(q)
+df = pd.DataFrame.from_records(q)
+
+#Create a DataFrame from a Python dictionary
+df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -92,6 +92,10 @@
     'col6': '99'
 })
 
+#Concatenate two DataFrame columns into a new, single column
+#(useful when dealing with composite keys, for example)
+df['newcol'] = df['col1'].map(str) + df['col2'].map(str)
+
 #Doing calculations with DataFrame columns that have missing values
 #In example below, swap in 0 for df['col1'] cells that contain null
 df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -99,6 +99,9 @@
 # Split delimited values in a DataFrame column into two new columns
 df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
 
+# Collapse hierarchical column indexes
+df.columns = df.columns.get_level_values(0)
+
 #Convert Django queryset to DataFrame
 #convert queryset to dataframe
 qs = DjangoModelName.objects.all()

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -32,10 +32,6 @@
 #lower-case all DataFrame column names (for example)
 df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
 
-#set entire column values in a way that avoids the 
-#SettingWithCopy Warning
-df.loc[:, 'column_name'] = 'new value'
-
 #Loop through rows in a DataFrame
 #(if you must)
 for index, row in df:

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -34,7 +34,7 @@
 
 #set entire column values in a way that avoids the 
 #SettingWithCopy Warning
-df.loc[df.index, 'column_name'] = 'new value'
+df.loc[:, 'column_name'] = 'new value'
 
 #Loop through rows in a DataFrame
 #(if you must)

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -32,6 +32,10 @@
 #lower-case all DataFrame column names (for example)
 df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
 
+#set entire column values in a way that avoids the 
+#SettingWithCopy Warning
+df.loc[df.index, 'column_name'] = 'new value'
+
 #Loop through rows in a DataFrame
 #(if you must)
 for index, row in df:

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -12,6 +12,9 @@
 valuelist = ['value1', 'value2', 'value3']
 df = df[~df.column.isin(value_list)]
 
+#Delete column from DataFrame
+del df['column']
+
 #Select from DataFrame using criteria from multiple columns
 newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
 

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -55,6 +55,9 @@
 #loading to a db)
 df = df.where((pd.notnull(df)), None)
 
+#Slice values in a DataFrame column (aka Series)
+df.column.str[0:2]
+
 #Get quick count of rows in a DataFrame
 len(df.index)
 

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -22,14 +22,18 @@
     'col3 old name':'col3 new name',
 })
 
+#lower-case all DataFrame column names
+df.columns = map(str.lower, df.columns)
+
+#even more fancy DataFrame column re-naming
+#lower-case all DataFrame column names (for example)
+df.rename(columns=lambda x: x.split('.')[-1], inplace=True)
+
 #Loop through rows in a DataFrame
 #(if you must)
 for index, row in df:
     print index, row['some column']  
 
-#lower-case all DataFrame column names
-df.columns = map(str.lower, df.columns)
-
 #Lower-case everything in a DataFrame column
 df.column_name = df.column_name.str.lower()
 

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -69,6 +69,9 @@
 for col in refunds.columns.values:
   refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
 
+#Set DataFrame column values based on other column values
+df['column_to_change'][(df['column1'] == some_value) & (df['column2'] == some_other_value)] = new_value
+
 #Clean up missing values in multiple DataFrame columns
 df = df.fillna({
     'col1': 'missing',

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -44,6 +44,9 @@
 newdf = df[df['column'].isnull()]
 len(newdf)
 
+#select from DataFrame using multiple keys of a hierarchical index
+df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
+
 #Change all NaNs to None (useful before
 #loading to a db)
 df = df.where((pd.notnull(df)), None)

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -34,7 +34,11 @@
 df.column_name = df.column_name.str.lower()
 
 #Sort dataframe by multiple columns
-df = df.sort_indx(by=['col1', 'col2'])
+df = df.sort(['col1','col2','col3'],ascending=[1,1,0])
+
+#get top n for each group of columns in a sorted dataframe
+#(make sure dataframe is sorted first)
+top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)
 
 #Grab DataFrame rows where specific column is null/notnull
 newdf = df[df['column'].isnull()]

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -33,8 +33,8 @@
 #Lower-case everything in a DataFrame column
 df.column_name = df.column_name.str.lower()
 
-#Sort multi-index dataframe
-df = df.sort_indx(by=['indexpart1', 'indexpart2'])
+#Sort dataframe by multiple columns
+df = df.sort_indx(by=['col1', 'col2'])
 
 #Grab DataFrame rows where specific column is null/notnull
 newdf = df[df['column'].isnull()]

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -33,6 +33,9 @@
 #Lower-case everything in a DataFrame column
 df.column_name = df.column_name.str.lower()
 
+#Sort multi-index dataframe
+df = df.sort_indx(by=['indexpart1', 'indexpart2'])
+
 #Grab DataFrame rows where specific column is null/notnull
 newdf = df[df['column'].isnull()]
 len(newdf)

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -1,20 +1,20 @@
-#Convert Series datatype to numeric, getting rid of any values that are non-numeric
+#List unique values in a DataFrame column
+pd.unique(df.column_name.ravel())
+
+#Convert Series datatype to numeric, getting rid of any non-numeric values
 df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)
 
-#Grab DataFrame rows where specified column has certain values
+#Grab DataFrame rows where column has certain values
 valuelist = ['value1', 'value2', 'value3']
 df = df[df.column.isin(value_list)]
 
-#List unique values in a DataFrame column
-pd.unique(df.column_name.ravel())
+#Grab DataFrame rows where column doesn't have certain values
+valuelist = ['value1', 'value2', 'value3']
+df = df[~df.column.isin(value_list)]
 
 #Select from DataFrame using criteria from multiple columns
 newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
 
-#Grab DataFrame rows where specified column doesn't have certain values
-valuelist = ['value1', 'value2', 'value3']
-df = df[~df.column.isin(value_list)]
-
 #Rename several DataFrame columns
 df = df.rename(columns = {
     'col1 old name':'col1 new name',
@@ -42,7 +42,6 @@
 df = df.where((pd.notnull(df)), None)
 
 #Get quick count of rows in a DataFrame
-#(note that using df[0].count will exclude NaNs)
 len(df.index)
 
 #Pivot data (with flexibility about what what

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -1,27 +1,16 @@
-# Get rid of non-numeric values throughout a DataFrame:
-for col in refunds.columns.values:
-  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
-
-# Split delimited values in a DataFrame column into two new columns
-df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
-
 #Convert Series datatype to numeric, getting rid of any values that are non-numeric
 df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)
 
-#Clean up missing values in multiple DataFrame columns
-df = df.fillna({
-    'fyq': 'missing',
-    'cfda_program_num': '99.999',
-    'recipient_county_code': '999',
-    'recipient_country_code': 'missing',
-    'uri': 'missing',
-    'recipient_state_code': '99'
-})
-
 #Grab DataFrame rows where specified column has certain values
 valuelist = ['value1', 'value2', 'value3']
 df = df[df.column.isin(value_list)]
 
+#List unique values in a DataFrame column
+pd.unique(df.column_name.ravel())
+
+#Select from DataFrame using criteria from multiple columns
+newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
+
 #Grab DataFrame rows where specified column doesn't have certain values
 valuelist = ['value1', 'value2', 'value3']
 df = df[~df.column.isin(value_list)]
@@ -33,13 +22,8 @@
     'col3 old name':'col3 new name',
 })
 
-#Convert Django queryset to DataFrame
-#convert queryset to dataframe
-qs = DjangoModelName.objects.all()
-q = qs.values()
-df = pd.DataFrame.from_records(q)
-
 #Loop through rows in a DataFrame
+#(if you must)
 for index, row in df:
     print index, row['some column']  
 
@@ -57,12 +41,6 @@
 #loading to a db)
 df = df.where((pd.notnull(df)), None)
 
-#List unique values in a DataFrame column
-pd.unique(df.column_name.ravel())
-
-#Select from DataFrame using criteria from multiple columns
-newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
-
 #Get quick count of rows in a DataFrame
 #(note that using df[0].count will exclude NaNs)
 len(df.index)
@@ -75,9 +53,32 @@
   index=['col1', 'col2', 'col3'], #these stay as columns
   columns=['col4']) #data values in this column become their own column
 
+#change data type of DataFrame column
+df.column_name = df.column_name.astype(np.int64)
+
+# Get rid of non-numeric values throughout a DataFrame:
+for col in refunds.columns.values:
+  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
+
+#Clean up missing values in multiple DataFrame columns
+df = df.fillna({
+    'col1': 'missing',
+    'col2': '99.999',
+    'col3': '999',
+    'col4': 'missing',
+    'col5': 'missing',
+    'col6': '99'
+})
+
 #Doing calculations with DataFrame columns that have missing values
 #In example below, swap in 0 for df['col1'] cells that contain null
 df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
 
-#change data type of DataFrame column
-df.column_name = df.column_name.astype(np.int64)
+# Split delimited values in a DataFrame column into two new columns
+df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
+
+#Convert Django queryset to DataFrame
+#convert queryset to dataframe
+qs = DjangoModelName.objects.all()
+q = qs.values()
+df = pd.DataFrame.from_records(q)
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -77,4 +77,7 @@
 
 #Doing calculations with DataFrame columns that have missing values
 #In example below, swap in 0 for df['col1'] cells that contain null
-df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
+df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
+
+#change data type of DataFrame column
+df.column_name = df.column_name.astype(np.int64)
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -73,4 +73,8 @@
 pd.pivot_table(
   df,values='cell_value',
   index=['col1', 'col2', 'col3'], #these stay as columns
-  columns=['col4']) #data values in this column become their own column
+  columns=['col4']) #data values in this column become their own column
+
+#Doing calculations with DataFrame columns that have missing values
+#In example below, swap in 0 for df['col1'] cells that contain null
+df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -65,4 +65,12 @@
 
 #Get quick count of rows in a DataFrame
 #(note that using df[0].count will exclude NaNs)
-len(df.index)
+len(df.index)
+
+#Pivot data (with flexibility about what what
+#becomes a column and what stays a row).
+#Syntax works on Pandas >= .14
+pd.pivot_table(
+  df,values='cell_value',
+  index=['col1', 'col2', 'col3'], #these stay as columns
+  columns=['col4']) #data values in this column become their own column
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -1,6 +1,6 @@
 # Get rid of non-numeric values throughout a DataFrame:
 for col in refunds.columns.values:
-  refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True)
+  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
 
 # Split delimited values in a DataFrame column into two new columns
 df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -61,4 +61,8 @@
 pd.unique(df.column_name.ravel())
 
 #Select from DataFrame using criteria from multiple columns
-newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
+newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
+
+#Get quick count of rows in a DataFrame
+#(note that using df[0].count will exclude NaNs)
+len(df.index)
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -58,4 +58,7 @@
 df = df.where((pd.notnull(df)), None)
 
 #List unique values in a DataFrame column
-pd.unique(df.column_name.ravel())
+pd.unique(df.column_name.ravel())
+
+#Select from DataFrame using criteria from multiple columns
+newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -55,4 +55,7 @@
 
 #Change all NaNs to None (useful before
 #loading to a db)
-df = df.where((pd.notnull(df)), None)
+df = df.where((pd.notnull(df)), None)
+
+#List unique values in a DataFrame column
+pd.unique(df.column_name.ravel())
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -51,4 +51,8 @@
 
 #Grab DataFrame rows where specific column is null/notnull
 newdf = df[df['column'].isnull()]
-len(newdf)
+len(newdf)
+
+#Change all NaNs to None (useful before
+#loading to a db)
+df = df.where((pd.notnull(df)), None)
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -47,4 +47,8 @@
 df.columns = map(str.lower, df.columns)
 
 #Lower-case everything in a DataFrame column
-df.column_name = df.column_name.str.lower()
+df.column_name = df.column_name.str.lower()
+
+#Grab DataFrame rows where specific column is null/notnull
+newdf = df[df['column'].isnull()]
+len(newdf)
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -47,4 +47,4 @@
 df.columns = map(str.lower, df.columns)
 
 #Lower-case everything in a DataFrame column
-df = df.column_name.str.lower()
+df.column_name = df.column_name.str.lower()
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -42,3 +42,9 @@
 #Loop through rows in a DataFrame
 for index, row in df:
     print index, row['some column']  
+
+#lower-case all DataFrame column names
+df.columns = map(str.lower, df.columns)
+
+#Lower-case everything in a DataFrame column
+df = df.column_name.str.lower()
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
@@ -38,3 +38,7 @@
 qs = DjangoModelName.objects.all()
 q = qs.values()
 df = pd.DataFrame.from_records(q)
+
+#Loop through rows in a DataFrame
+for index, row in df:
+    print index, row['some column']