# Get rid of non-numeric values throughout a DataFrame: for col in refunds.columns.values: refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True) # Split delimited values in a DataFrame column into two new columns df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1))) #Convert Series datatype to numeric, getting rid of any values that are non-numeric df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True) #Clean up missing values in multiple DataFrame columns df = df.fillna({ 'fyq': 'missing', 'cfda_program_num': '99.999', 'recipient_county_code': '999', 'recipient_country_code': 'missing', 'uri': 'missing', 'recipient_state_code': '99' }) #Grab DataFrame rows where specified column has certain values valuelist = ['value1', 'value2', 'value3'] df = df[df.column.isin(value_list)] #Grab DataFrame rows where specified column doesn't have certain values valuelist = ['value1', 'value2', 'value3'] df = df[~df.column.isin(value_list)] #Rename several DataFrame columns df = df.rename(columns = { 'col1 old name':'col1 new name', 'col2 old name':'col2 new name', 'col3 old name':'col3 new name', }) #Convert Django queryset to DataFrame #convert queryset to dataframe qs = DjangoModelName.objects.all() q = qs.values() df = pd.DataFrame.from_records(q) #Loop through rows in a DataFrame for index, row in df: print index, row['some column'] #lower-case all DataFrame column names df.columns = map(str.lower, df.columns) #Lower-case everything in a DataFrame column df.column_name = df.column_name.str.lower() #Grab DataFrame rows where specific column is null/notnull newdf = df[df['column'].isnull()] len(newdf) #Change all NaNs to None (useful before #loading to a db) df = df.where((pd.notnull(df)), None) #List unique values in a DataFrame column pd.unique(df.column_name.ravel()) #Select from DataFrame using criteria from multiple columns newdf = df[(df['column_one']>2004) & (df['column_two']==9)] #Get quick count of rows in a DataFrame #(note that using df[0].count will exclude NaNs) len(df.index) #Pivot data (with flexibility about what what #becomes a column and what stays a row). #Syntax works on Pandas >= .14 pd.pivot_table( df,values='cell_value', index=['col1', 'col2', 'col3'], #these stay as columns columns=['col4']) #data values in this column become their own column