@Polaris000
Created November 4, 2022 16:49
baseline.py
import numpy as np
import pandas as pd


def auc_recall_at_k(y_true, y_conf):
    """
    Compute the AUC under the Recall@k curve.

    y_true: a numpy array of expected labels (1 = positive)
    y_conf: a numpy array of the model's confidence
            scores for each datapoint

    Returns: AUC-Recall@k (float)
    """

    # if there are no positive targets (good leads),
    # the metric is undefined
    n_positives = int(np.sum(y_true == 1))
    if n_positives == 0:
        return np.nan

    # rank datapoints by descending confidence
    conf_df = pd.DataFrame({"conf": y_conf, "expected": y_true})
    conf_df = conf_df.sort_values("conf", ascending=False)

    # recall@k: the fraction of all positives captured in the
    # top-k most confident predictions, for every k
    recall_at_k = np.cumsum(conf_df["expected"].to_numpy() == 1) / n_positives

    # ideal recall@k: a perfect ranking places every positive first,
    # so recall grows by 1/n_positives per step and caps at 1
    ideal_recall_at_k = np.minimum(
        np.ones(len(conf_df)),
        np.arange(1, len(conf_df) + 1) / n_positives,
    )

    # final metric: the ratio of the areas under the two curves
    return np.trapz(recall_at_k) / np.trapz(ideal_recall_at_k)
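
A minimal usage sketch (not part of the original gist; the toy labels and scores below are hypothetical): a model that ranks most positives near the top of the confidence ordering should score close to 1.0.

if __name__ == "__main__":
    # hypothetical toy data for illustration only
    y_true = np.array([0, 1, 0, 1, 1, 0, 0, 1])
    y_conf = np.array([0.1, 0.9, 0.3, 0.8, 0.4, 0.2, 0.5, 0.7])

    # three of the four positives sit in the top three ranks,
    # so the score lands near 1.0 (about 0.96 here)
    print(auc_recall_at_k(y_true, y_conf))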