# Demo: row filtering on a pandas DataFrame with boolean masks.
#
# Three filters are shown on a frame of uniform random numbers:
#   1. keep rows in which at least one feature exceeds a threshold;
#   2. drop rows whose ``feature1`` lies too far from that column's median;
#   3. drop rows in which any feature falls outside its column's
#      [lower_bound, upper_bound] quantile interval.
#
# It is highly recommended to use pandas for this kind of data processing.
import pandas as pd
import numpy as np

# Synthetic data: 5000 samples per feature, drawn uniformly from [0, 1).
# The generator is not seeded, so the printed output differs between runs.
dataset = {
    'feature1': np.random.rand(5000),
    'feature2': np.random.rand(5000),
    'feature3': np.random.rand(5000),
}

# DataFrame objects are more convenient than plain dicts for preprocessing.
dataframe = pd.DataFrame(dataset)

# --- Simple filtering -------------------------------------------------------
# Keep a row if at least one of its values is greater than 0.99;
# rows in which no value exceeds 0.99 are dropped.
print(dataframe[(dataframe > 0.99).any(axis=1)])

# --- Median-distance filtering ----------------------------------------------
# Flag rows whose ``feature1`` value lies more than 0.3 away from the median
# of the ``feature1`` column.
filtering_rule_1 = (dataframe.feature1.median() - dataframe.feature1).abs() > 0.3

# ``~`` is element-wise logical NOT: select the rows that are *not* flagged.
print(dataframe[~filtering_rule_1])

# --- Quantile filtering -----------------------------------------------------
lower_bound = .25
upper_bound = .75

# Auxiliary frame: one row per requested quantile, one column per feature.
quant_df = dataframe.quantile([lower_bound, upper_bound])

# Boolean frame marking outliers, i.e. values lying outside their own
# column's [lower_bound, upper_bound] quantile interval. Comparing against a
# quantile row with axis='columns' aligns each bound with its column, so no
# per-column ``apply`` is needed.
filtering_rule_2 = (
    dataframe.lt(quant_df.loc[lower_bound], axis='columns')
    | dataframe.gt(quant_df.loc[upper_bound], axis='columns')
)

# Row mask: True where the row contains at least one outlier. Computed once
# and reused for both the print and the assignment below.
outlier_rows = filtering_rule_2.any(axis=1)

# Print the filtered dataset: every row containing an outlier is dropped.
print(dataframe[~outlier_rows])

# ...or keep the result as a new DataFrame.
filtered_dataframe = dataframe[~outlier_rows]