haydenflinner · January 17, 2024 12:38 · Jan 17, 2024
diff --git a/stats.py b/stats.py
@@ -0,0 +1,75 @@
+import pandas as pd
+
+from dataclasses import dataclass
+
+@dataclass
+class PositionSample:  # One (lap, rider, position) observation; each instance becomes one DataFrame row below.
+    lap: int  # Lap number this sample belongs to.
+    rider: str  # Rider identifier (single letters in the toy data below).
+    pos: int  # Running position on that lap; the sort-and-shift logic below treats pos - 1 as the rider ahead.
+Sample = PositionSample  # Short alias so the literal table of samples below stays compact.
+
+# 5 riders, 4 lap race. Real data should reveal more noise / trends.
+df = pd.DataFrame([
+    Sample(1, 'a', 1),
+    Sample(1, 'b', 2),
+    Sample(1, 'c', 3),
+    Sample(1, 'd', 4),
+    Sample(1, 'e', 5),
+
+    Sample(2, 'a', 1),
+    Sample(2, 'b', 2),
+    Sample(2, 'c', 3),
+    Sample(2, 'e', 4),
+    Sample(2, 'd', 5),  # <-- d and e traded places
+
+    Sample(3, 'a', 1),
+    Sample(3, 'd', 2),
+    Sample(3, 'e', 3),
+    Sample(3, 'c', 4),  # b took out c and himself. or c took out b and himself.
+    Sample(3, 'b', 5),
+
+    # Finishing lap unchanged from prior lap.
+    Sample(4, 'a', 1),
+    Sample(4, 'd', 2),
+    Sample(4, 'e', 3),
+    Sample(4, 'c', 4),
+    Sample(4, 'b', 5),
+])
+
+# Our goal here is to find out whether there's a correlation between being near certain
+# riders and changes in position. For example, maybe a certain rider is known to ride
+# a wide bike, so being just behind him means you have a below-average
+# chance of passing. To really weigh that you'd need to correct for speed with
+# something like an Elo rating, or maybe just finishing position in the current race.
+# Another example would be that having Jett Lawrence behind you is a recipe
+# for losing one spot by the end of the lap.
+# This may also reveal riders who have a tendency
+# to put other riders far down the results sheet.
+
+# For each lap sample, this is important info.
+df = df.sort_values(by=["lap", "pos"])
+df["rider_ahead"] = df.groupby("lap")["rider"].shift()
+df["rider_behind"] = df.groupby("lap")["rider"].shift(-1)
+
+# Group by "rider" and then use shift to get the previous lap's "pos"
+df = df.sort_values(by=["rider", "lap"])
+df["prev_pos"] = df.groupby("rider")["pos"].shift()
+df["rider_ahead_last_lap"] = df.groupby("rider")["rider_ahead"].shift()
+
+# display(df.sort_values(by=["lap", "pos"]))
+# Drop rows where there is no previous lap. Not a big loss since first laps are especially hectic.
+df = df.dropna(subset=["prev_pos"])
+
+df["pos_change"] = df.pos - df.prev_pos
+
+# Resetting index if needed
+df = df.reset_index(drop=True)
+
+
+df.sort_values(by=["lap", "pos"])
+
+display(df.groupby("rider_ahead_last_lap")["pos_change"].max())
+
+import plotly.express as px
+px.box(df, x='rider_ahead_last_lap', y='pos_change')