sourangshupal · February 4, 2019 12:04 · Aug 26, 2017
diff --git a/drop_outliers.py b/drop_outliers.py
@@ -0,0 +1,40 @@
+
+# It is highly recommended to use Pandas for such data processing problems
+import pandas as pd
+import numpy as np
+
+dataset = {'feature1': np.random.rand(5000),  # synthetic data: three feature columns of 5000 random values each
+                 'feature2': np.random.rand(5000),
+                 'feature3': np.random.rand(5000)
+                 }
+
+# Pandas DataFrame objects are more convenient than plain Python dicts for data preprocessing operations
+dataframe = pd.DataFrame(dataset)
+
+
+# Simple filtering:
+# keep a row if at least one of its values is greater than 0.99; otherwise drop the row
+print(dataframe[(dataframe>0.99).any(axis=1)])
+
+
+# Drop every row of the dataframe whose value in the column <feature1>
+# lies too far (by more than 0.3) from the median computed for the column <feature1>
+filtering_rule_1  = (dataframe.feature1.median() - dataframe.feature1).abs( ) > 0.3 
+
+print(dataframe[~filtering_rule_1])  # ~ means the <NOT> operation, so only rows within 0.3 of the median are printed
+
+
+# Another filtering approach: using per-column quantiles as the bounds
+lower_bound = .25
+upper_bound = .75
+quant_df = dataframe.quantile([lower_bound, upper_bound]) # auxiliary dataframe; it consists of the quantiles computed for each column
+
+# select outliers, i.e. values that lie outside their column's [lower_bound, upper_bound] quantile interval
+filtering_rule_2 = dataframe.apply(lambda x: (x < quant_df.loc[lower_bound, x.name]) |  (x > quant_df.loc[upper_bound, x.name]), axis=0)
+
+
+# print the filtered dataset: a row is dropped if it includes an outlier; outlier = a value that lies outside its column's [lower_bound, upper_bound] quantile interval
+print(dataframe[~(filtering_rule_2).any(axis=1)])
+
+# or assign the filtered result to a new dataframe
+filtered_dataframe = dataframe[~(filtering_rule_2).any(axis=1)]