Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save manojpandey/f0de8df3a0707cf44c5c887c07ee357c to your computer and use it in GitHub Desktop.
Save manojpandey/f0de8df3a0707cf44c5c887c07ee357c to your computer and use it in GitHub Desktop.

Revisions

  1. @bsweger bsweger revised this gist Oct 12, 2015. 1 changed file with 9 additions and 4 deletions.
    13 changes: 9 additions & 4 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -37,9 +37,18 @@
    for index, row in df.iterrows():
    print index, row['some column']

    #Next few examples show how to work with text data in Pandas.
    #Full list of .str functions: http://pandas.pydata.org/pandas-docs/stable/text.html

    #Slice values in a DataFrame column (aka Series)
    df.column.str[0:2]

    #Lower-case everything in a DataFrame column
    df.column_name = df.column_name.str.lower()

    #Get length of data in a DataFrame column
    df.column_name.str.len()

    #Sort dataframe by multiple columns
    df = df.sort(['col1','col2','col3'],ascending=[1,1,0])

    @@ -49,7 +58,6 @@

    #Grab DataFrame rows where specific column is null/notnull
    newdf = df[df['column'].isnull()]
    len(newdf)

    #select from DataFrame using multiple keys of a hierarchical index
    df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))
    @@ -58,9 +66,6 @@
    #loading to a db)
    df = df.where((pd.notnull(df)), None)

    #Slice values in a DataFrame column (aka Series)
    df.column.str[0:2]

    #Get quick count of rows in a DataFrame
    len(df.index)

  2. @bsweger bsweger revised this gist Sep 24, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -34,7 +34,7 @@

    #Loop through rows in a DataFrame
    #(if you must)
    for index, row in df:
    for index, row in df.iterrows():
    print index, row['some column']

    #Lower-case everything in a DataFrame column
  3. @bsweger bsweger revised this gist Aug 28, 2015. 1 changed file with 4 additions and 2 deletions.
    6 changes: 4 additions & 2 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -107,7 +107,9 @@
    df.columns = df.columns.get_level_values(0)

    #Convert Django queryset to DataFrame
    #convert queryset to dataframe
    qs = DjangoModelName.objects.all()
    q = qs.values()
    df = pd.DataFrame.from_records(q)
    df = pd.DataFrame.from_records(q)

    #Create a DataFrame from a Python dictionary
    df = pd.DataFrame(list(a_dictionary.items()), columns = ['column1', 'column2'])
  4. @bsweger bsweger revised this gist Jul 6, 2015. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -92,6 +92,10 @@
    'col6': '99'
    })

    #Concatenate two DataFrame columns into a new, single column
    #(useful when dealing with composite keys, for example)
    df['newcol'] = df['col1'].map(str) + df['col2'].map(str)

    #Doing calculations with DataFrame columns that have missing values
    #In example below, swap in 0 for df['col1'] cells that contain null
    df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
  5. @bsweger bsweger revised this gist Jul 3, 2015. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -99,6 +99,9 @@
    # Split delimited values in a DataFrame column into two new columns
    df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

    # Collapse hierarchical column indexes
    df.columns = df.columns.get_level_values(0)

    #Convert Django queryset to DataFrame
    #convert queryset to dataframe
    qs = DjangoModelName.objects.all()
  6. @bsweger bsweger revised this gist Dec 31, 2014. 1 changed file with 0 additions and 4 deletions.
    4 changes: 0 additions & 4 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -32,10 +32,6 @@
    #lower-case all DataFrame column names (for example)
    df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

    #set entire column values in a way that avoids the
    #SettingWithCopy Warning
    df.loc[:, 'column_name'] = 'new value'

    #Loop through rows in a DataFrame
    #(if you must)
    for index, row in df:
  7. @bsweger bsweger revised this gist Dec 19, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -34,7 +34,7 @@

    #set entire column values in a way that avoids the
    #SettingWithCopy Warning
    df.loc[df.index, 'column_name'] = 'new value'
    df.loc[:, 'column_name'] = 'new value'

    #Loop through rows in a DataFrame
    #(if you must)
  8. @bsweger bsweger revised this gist Dec 19, 2014. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -32,6 +32,10 @@
    #lower-case all DataFrame column names (for example)
    df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

    #set entire column values in a way that avoids the
    #SettingWithCopy Warning
    df.loc[df.index, 'column_name'] = 'new value'

    #Loop through rows in a DataFrame
    #(if you must)
    for index, row in df:
  9. @bsweger bsweger revised this gist Dec 5, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -12,6 +12,9 @@
    valuelist = ['value1', 'value2', 'value3']
    df = df[~df.column.isin(value_list)]

    #Delete column from DataFrame
    del df['column']

    #Select from DataFrame using criteria from multiple columns
    newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

  10. @bsweger bsweger revised this gist Dec 4, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -55,6 +55,9 @@
    #loading to a db)
    df = df.where((pd.notnull(df)), None)

    #Slice values in a DataFrame column (aka Series)
    df.column.str[0:2]

    #Get quick count of rows in a DataFrame
    len(df.index)

  11. @bsweger bsweger revised this gist Aug 20, 2014. 1 changed file with 7 additions and 3 deletions.
    10 changes: 7 additions & 3 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -22,14 +22,18 @@
    'col3 old name':'col3 new name',
    })

    #lower-case all DataFrame column names
    df.columns = map(str.lower, df.columns)

    #even more fancy DataFrame column re-naming
    #lower-case all DataFrame column names (for example)
    df.rename(columns=lambda x: x.split('.')[-1], inplace=True)

    #Loop through rows in a DataFrame
    #(if you must)
    for index, row in df:
    print index, row['some column']

    #lower-case all DataFrame column names
    df.columns = map(str.lower, df.columns)

    #Lower-case everything in a DataFrame column
    df.column_name = df.column_name.str.lower()

  12. @bsweger bsweger revised this gist Aug 19, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -69,6 +69,9 @@
    for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

    #Set DataFrame column values based on other column values
    df['column_to_change'][(df['column1'] == some_value) & (df['column2'] == some_other_value)] = new_value

    #Clean up missing values in multiple DataFrame columns
    df = df.fillna({
    'col1': 'missing',
  13. @bsweger bsweger revised this gist Aug 19, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -44,6 +44,9 @@
    newdf = df[df['column'].isnull()]
    len(newdf)

    #select from DataFrame using multiple keys of a hierarchical index
    df.xs(('index level 1 value','index level 2 value'), level=('level 1','level 2'))

    #Change all NaNs to None (useful before
    #loading to a db)
    df = df.where((pd.notnull(df)), None)
  14. @bsweger bsweger revised this gist Aug 13, 2014. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -34,7 +34,11 @@
    df.column_name = df.column_name.str.lower()

    #Sort dataframe by multiple columns
    df = df.sort_indx(by=['col1', 'col2'])
    df = df.sort(['col1','col2','col3'],ascending=[1,1,0])

    #get top n for each group of columns in a sorted dataframe
    #(make sure dataframe is sorted first)
    top5 = df.groupby(['groupingcol1', 'groupingcol2']).head(5)

    #Grab DataFrame rows where specific column is null/notnull
    newdf = df[df['column'].isnull()]
  15. @bsweger bsweger revised this gist Aug 13, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -33,8 +33,8 @@
    #Lower-case everything in a DataFrame column
    df.column_name = df.column_name.str.lower()

    #Sort multi-index dataframe
    df = df.sort_indx(by=['indexpart1', 'indexpart2'])
    #Sort dataframe by multiple columns
    df = df.sort_indx(by=['col1', 'col2'])

    #Grab DataFrame rows where specific column is null/notnull
    newdf = df[df['column'].isnull()]
  16. @bsweger bsweger revised this gist Aug 13, 2014. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -33,6 +33,9 @@
    #Lower-case everything in a DataFrame column
    df.column_name = df.column_name.str.lower()

    #Sort multi-index dataframe
    df = df.sort_indx(by=['indexpart1', 'indexpart2'])

    #Grab DataFrame rows where specific column is null/notnull
    newdf = df[df['column'].isnull()]
    len(newdf)
  17. @bsweger bsweger revised this gist Aug 7, 2014. 1 changed file with 8 additions and 9 deletions.
    17 changes: 8 additions & 9 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -1,20 +1,20 @@
    #Convert Series datatype to numeric, getting rid of any values that are non-numeric
    #List unique values in a DataFrame column
    pd.unique(df.column_name.ravel())

    #Convert Series datatype to numeric, getting rid of any non-numeric values
    df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

    #Grab DataFrame rows where specified column has certain values
    #Grab DataFrame rows where column has certain values
    valuelist = ['value1', 'value2', 'value3']
    df = df[df.column.isin(value_list)]

    #List unique values in a DataFrame column
    pd.unique(df.column_name.ravel())
    #Grab DataFrame rows where column doesn't have certain values
    valuelist = ['value1', 'value2', 'value3']
    df = df[~df.column.isin(value_list)]

    #Select from DataFrame using criteria from multiple columns
    newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

    #Grab DataFrame rows where specified column doesn't have certain values
    valuelist = ['value1', 'value2', 'value3']
    df = df[~df.column.isin(value_list)]

    #Rename several DataFrame columns
    df = df.rename(columns = {
    'col1 old name':'col1 new name',
    @@ -42,7 +42,6 @@
    df = df.where((pd.notnull(df)), None)

    #Get quick count of rows in a DataFrame
    #(note that using df[0].count will exclude NaNs)
    len(df.index)

    #Pivot data (with flexibility about what what
  18. @bsweger bsweger revised this gist Aug 7, 2014. 1 changed file with 32 additions and 31 deletions.
    63 changes: 32 additions & 31 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -1,27 +1,16 @@
    # Get rid of non-numeric values throughout a DataFrame:
    for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

    # Split delimited values in a DataFrame column into two new columns
    df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

    #Convert Series datatype to numeric, getting rid of any values that are non-numeric
    df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

    #Clean up missing values in multiple DataFrame columns
    df = df.fillna({
    'fyq': 'missing',
    'cfda_program_num': '99.999',
    'recipient_county_code': '999',
    'recipient_country_code': 'missing',
    'uri': 'missing',
    'recipient_state_code': '99'
    })

    #Grab DataFrame rows where specified column has certain values
    valuelist = ['value1', 'value2', 'value3']
    df = df[df.column.isin(value_list)]

    #List unique values in a DataFrame column
    pd.unique(df.column_name.ravel())

    #Select from DataFrame using criteria from multiple columns
    newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

    #Grab DataFrame rows where specified column doesn't have certain values
    valuelist = ['value1', 'value2', 'value3']
    df = df[~df.column.isin(value_list)]
    @@ -33,13 +22,8 @@
    'col3 old name':'col3 new name',
    })

    #Convert Django queryset to DataFrame
    #convert queryset to dataframe
    qs = DjangoModelName.objects.all()
    q = qs.values()
    df = pd.DataFrame.from_records(q)

    #Loop through rows in a DataFrame
    #(if you must)
    for index, row in df:
    print index, row['some column']

    @@ -57,12 +41,6 @@
    #loading to a db)
    df = df.where((pd.notnull(df)), None)

    #List unique values in a DataFrame column
    pd.unique(df.column_name.ravel())

    #Select from DataFrame using criteria from multiple columns
    newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

    #Get quick count of rows in a DataFrame
    #(note that using df[0].count will exclude NaNs)
    len(df.index)
    @@ -75,9 +53,32 @@
    index=['col1', 'col2', 'col3'], #these stay as columns
    columns=['col4']) #data values in this column become their own column

    #change data type of DataFrame column
    df.column_name = df.column_name.astype(np.int64)

    # Get rid of non-numeric values throughout a DataFrame:
    for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

    #Clean up missing values in multiple DataFrame columns
    df = df.fillna({
    'col1': 'missing',
    'col2': '99.999',
    'col3': '999',
    'col4': 'missing',
    'col5': 'missing',
    'col6': '99'
    })

    #Doing calculations with DataFrame columns that have missing values
    #In example below, swap in 0 for df['col1'] cells that contain null
    df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

    #change data type of DataFrame column
    df.column_name = df.column_name.astype(np.int64)
    # Split delimited values in a DataFrame column into two new columns
    df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

    #Convert Django queryset to DataFrame
    #convert queryset to dataframe
    qs = DjangoModelName.objects.all()
    q = qs.values()
    df = pd.DataFrame.from_records(q)
  19. @bsweger bsweger revised this gist Aug 6, 2014. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -77,4 +77,7 @@

    #Doing calculations with DataFrame columns that have missing values
    #In example below, swap in 0 for df['col1'] cells that contain null
    df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
    df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']

    #change data type of DataFrame column
    df.column_name = df.column_name.astype(np.int64)
  20. @bsweger bsweger revised this gist Aug 6, 2014. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -73,4 +73,8 @@
    pd.pivot_table(
    df,values='cell_value',
    index=['col1', 'col2', 'col3'], #these stay as columns
    columns=['col4']) #data values in this column become their own column
    columns=['col4']) #data values in this column become their own column

    #Doing calculations with DataFrame columns that have missing values
    #In example below, swap in 0 for df['col1'] cells that contain null
    df['new_col'] = np.where(pd.isnull(df['col1']),0,df['col1']) + df['col2']
  21. @bsweger bsweger revised this gist Aug 6, 2014. 1 changed file with 9 additions and 1 deletion.
    10 changes: 9 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -65,4 +65,12 @@

    #Get quick count of rows in a DataFrame
    #(note that using df[0].count will exclude NaNs)
    len(df.index)
    len(df.index)

    #Pivot data (with flexibility about what what
    #becomes a column and what stays a row).
    #Syntax works on Pandas >= .14
    pd.pivot_table(
    df,values='cell_value',
    index=['col1', 'col2', 'col3'], #these stay as columns
    columns=['col4']) #data values in this column become their own column
  22. @bsweger bsweger revised this gist Aug 6, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    # Get rid of non-numeric values throughout a DataFrame:
    for col in refunds.columns.values:
    refunds[col] = refunds[col].replace('[^0-9]+', '', regex=True)
    refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

    # Split delimited values in a DataFrame column into two new columns
    df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))
  23. @bsweger bsweger revised this gist Aug 6, 2014. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -61,4 +61,8 @@
    pd.unique(df.column_name.ravel())

    #Select from DataFrame using criteria from multiple columns
    newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
    newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

    #Get quick count of rows in a DataFrame
    #(note that using df[0].count will exclude NaNs)
    len(df.index)
  24. @bsweger bsweger revised this gist Aug 5, 2014. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -58,4 +58,7 @@
    df = df.where((pd.notnull(df)), None)

    #List unique values in a DataFrame column
    pd.unique(df.column_name.ravel())
    pd.unique(df.column_name.ravel())

    #Select from DataFrame using criteria from multiple columns
    newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
  25. @bsweger bsweger revised this gist Aug 5, 2014. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -55,4 +55,7 @@

    #Change all NaNs to None (useful before
    #loading to a db)
    df = df.where((pd.notnull(df)), None)
    df = df.where((pd.notnull(df)), None)

    #List unique values in a DataFrame column
    pd.unique(df.column_name.ravel())
  26. @bsweger bsweger revised this gist Aug 1, 2014. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -51,4 +51,8 @@

    #Grab DataFrame rows where specific column is null/notnull
    newdf = df[df['column'].isnull()]
    len(newdf)
    len(newdf)

    #Change all NaNs to None (useful before
    #loading to a db)
    df = df.where((pd.notnull(df)), None)
  27. @bsweger bsweger revised this gist Aug 1, 2014. 1 changed file with 5 additions and 1 deletion.
    6 changes: 5 additions & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -47,4 +47,8 @@
    df.columns = map(str.lower, df.columns)

    #Lower-case everything in a DataFrame column
    df.column_name = df.column_name.str.lower()
    df.column_name = df.column_name.str.lower()

    #Grab DataFrame rows where specific column is null/notnull
    newdf = df[df['column'].isnull()]
    len(newdf)
  28. @bsweger bsweger revised this gist Aug 1, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -47,4 +47,4 @@
    df.columns = map(str.lower, df.columns)

    #Lower-case everything in a DataFrame column
    df = df.column_name.str.lower()
    df.column_name = df.column_name.str.lower()
  29. @bsweger bsweger revised this gist Aug 1, 2014. 1 changed file with 6 additions and 0 deletions.
    6 changes: 6 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -42,3 +42,9 @@
    #Loop through rows in a DataFrame
    for index, row in df:
    print index, row['some column']

    #lower-case all DataFrame column names
    df.columns = map(str.lower, df.columns)

    #Lower-case everything in a DataFrame column
    df = df.column_name.str.lower()
  30. @bsweger bsweger revised this gist Aug 1, 2014. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions useful_pandas_snippets.py
    Original file line number Diff line number Diff line change
    @@ -38,3 +38,7 @@
    qs = DjangoModelName.objects.all()
    q = qs.values()
    df = pd.DataFrame.from_records(q)

    #Loop through rows in a DataFrame
    for index, row in df:
    print index, row['some column']