manojpandey · July 18, 2016 05:58
diff --git a/useful_pandas_snippets.py b/useful_pandas_snippets.py
 # Get rid of non-numeric values throughout a DataFrame:
 for col in refunds.columns.values:
  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)
  
 # Split delimited values in a DataFrame column into two new columns
 df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

 #Convert Series datatype to numeric, getting rid of any values that are non-numeric
 df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

 #Clean up missing values in multiple DataFrame columns
 df = df.fillna({
    'fyq': 'missing',
    'cfda_program_num': '99.999',
    'recipient_county_code': '999',
    'recipient_country_code': 'missing',
    'uri': 'missing',
    'recipient_state_code': '99'
 })

 #Grab DataFrame rows where specified column has certain values
 valuelist = ['value1', 'value2', 'value3']
 df = df[df.column.isin(value_list)]

 #Grab DataFrame rows where specified column doesn't have certain values
 valuelist = ['value1', 'value2', 'value3']
 df = df[~df.column.isin(value_list)]

 #Rename several DataFrame columns
 df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
 })

 #Convert Django queryset to DataFrame
 #convert queryset to dataframe
 qs = DjangoModelName.objects.all()
 q = qs.values()
 df = pd.DataFrame.from_records(q)

 #Loop through rows in a DataFrame
 for index, row in df:
    print index, row['some column']  

 #lower-case all DataFrame column names
 df.columns = map(str.lower, df.columns)

 #Lower-case everything in a DataFrame column
 df.column_name = df.column_name.str.lower()

 #Grab DataFrame rows where specific column is null/notnull
 newdf = df[df['column'].isnull()]
 len(newdf)

 #Change all NaNs to None (useful before
 #loading to a db)
 df = df.where((pd.notnull(df)), None)

 #List unique values in a DataFrame column
 pd.unique(df.column_name.ravel())

 #Select from DataFrame using criteria from multiple columns
 newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

 #Get quick count of rows in a DataFrame
 #(note that using df[0].count will exclude NaNs)
 len(df.index)

 #Pivot data (with flexibility about what what
 #becomes a column and what stays a row).
 #Syntax works on Pandas >= .14
 pd.pivot_table(
  df,values='cell_value',
  index=['col1', 'col2', 'col3'], #these stay as columns
  columns=['col4']) #data values in this column become their own column
	# Get rid of non-numeric values throughout a DataFrame:
	for col in refunds.columns.values:
	refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

	# Split delimited values in a DataFrame column into two new columns
	df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

	#Convert Series datatype to numeric, getting rid of any values that are non-numeric
	df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

	#Clean up missing values in multiple DataFrame columns
	df = df.fillna({
	'fyq': 'missing',
	'cfda_program_num': '99.999',
	'recipient_county_code': '999',
	'recipient_country_code': 'missing',
	'uri': 'missing',
	'recipient_state_code': '99'
	})

	#Grab DataFrame rows where specified column has certain values
	valuelist = ['value1', 'value2', 'value3']
	df = df[df.column.isin(value_list)]

	#Grab DataFrame rows where specified column doesn't have certain values
	valuelist = ['value1', 'value2', 'value3']
	df = df[~df.column.isin(value_list)]

	#Rename several DataFrame columns
	df = df.rename(columns = {
	'col1 old name':'col1 new name',
	'col2 old name':'col2 new name',
	'col3 old name':'col3 new name',
	})

	#Convert Django queryset to DataFrame
	#convert queryset to dataframe
	qs = DjangoModelName.objects.all()
	q = qs.values()
	df = pd.DataFrame.from_records(q)

	#Loop through rows in a DataFrame
	for index, row in df:
	print index, row['some column']

	#lower-case all DataFrame column names
	df.columns = map(str.lower, df.columns)

	#Lower-case everything in a DataFrame column
	df.column_name = df.column_name.str.lower()

	#Grab DataFrame rows where specific column is null/notnull
	newdf = df[df['column'].isnull()]
	len(newdf)

	#Change all NaNs to None (useful before
	#loading to a db)
	df = df.where((pd.notnull(df)), None)

	#List unique values in a DataFrame column
	pd.unique(df.column_name.ravel())

	#Select from DataFrame using criteria from multiple columns
	newdf = df[(df['column_one']>2004) & (df['column_two']==9)]

	#Get quick count of rows in a DataFrame
	#(note that using df[0].count will exclude NaNs)
	len(df.index)

	#Pivot data (with flexibility about what what
	#becomes a column and what stays a row).
	#Syntax works on Pandas >= .14
	pd.pivot_table(
	df,values='cell_value',
	index=['col1', 'col2', 'col3'], #these stay as columns
	columns=['col4']) #data values in this column become their own column