hf_mfu.py · gist by @huseinzol05 · created July 23, 2025
    """
    https://www.oumi.ai/docs/en/latest/_modules/oumi/core/callbacks/hf_mfu_callback.html
    """

import time
from typing import Optional

import torch
import wandb
from transformers import TrainerCallback, TrainerState, TrainerControl

# Theoretical peak Tensor Core performance (dense BF16/FP16), in TFLOP/s.
DEVICES = {
    'NVIDIA GeForce RTX 3090': 142,
    'NVIDIA GeForce RTX 3090 Ti': 160,
    'NVIDIA H100 80GB HBM3': 990,
    'NVIDIA GH200 480GB': 990,
}
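# MFU = achieved FLOP/s divided by the theoretical peak above. As a worked
# example (illustrative numbers, not a measurement): a run sustaining
# 4.0e14 FLOP/s on one H100 gives 4.0e14 / (990 * 1e12) ~= 0.40, i.e. 40% MFU.
# Another GPU can be supported by adding its dense BF16/FP16 peak here, e.g.:
#
#     DEVICES['NVIDIA A100-SXM4-80GB'] = 312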

class WandbMFUCallback(TrainerCallback):
    def __init__(self, device_name=None):
        # Measurement is anchored at the *second* step so that first-step
        # warmup (compilation, cache setup) does not skew the MFU numbers.
        self._time_of_second_step: Optional[float] = None
        self._flops_at_second_step: Optional[float] = None
        self._time_for_train_steps = 0.0
        self._first_step_finished = False
        self._device_name = device_name
        if self._device_name is None:
            self._device_name = torch.cuda.get_device_name()
        if self._device_name not in DEVICES:
            raise ValueError(
                f'device name {self._device_name!r} is not recognized; '
                'add its peak TFLOP/s to DEVICES.'
            )
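    # The device name can also be pinned explicitly when auto-detection is not
    # wanted; the string must match a key in DEVICES, e.g.:
    #
    #     callback = WandbMFUCallback(device_name='NVIDIA H100 80GB HBM3')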

    def on_step_begin(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        self._step_start_time = time.time()
        if not self._first_step_finished:
            return

        # Anchor the wall clock and the cumulative FLOP counter once, at the
        # start of the second step.
        if self._time_of_second_step is None:
            self._time_of_second_step = self._step_start_time
            if state is not None:
                self._flops_at_second_step = state.total_flos

    def on_step_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        delta_time_seconds = time.time() - self._step_start_time
        if not self._first_step_finished:
            self._first_step_finished = True  # discard the first (warmup) step
            return
        self._time_for_train_steps += delta_time_seconds
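    # Two denominators feed the MFU numbers below: `_time_for_train_steps`
    # sums only the step bodies (on_step_begin -> on_step_end), while the wall
    # clock since the second step also covers evaluation, checkpointing, and
    # dataloader stalls, so in general train_step_mfu >= train_mfu.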

    def on_log(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        if self._time_of_second_step is None:
            return

        delta_time_seconds_train = time.time() - self._time_of_second_step
        delta_time_seconds_step = self._time_for_train_steps

        if self._flops_at_second_step is not None and (
            state is not None and state.total_flos > 0.0
        ):
            # `state.total_flos` is aggregated across all devices by the
            # Trainer, while DEVICES holds a single device's peak; for N-GPU
            # runs the peak should be multiplied by N.
            flops_since_second_step_on_all_devices = (
                state.total_flos - self._flops_at_second_step
            )
            flops_step = flops_since_second_step_on_all_devices / delta_time_seconds_step
            flops_train = flops_since_second_step_on_all_devices / delta_time_seconds_train
            device_flops_per_second = DEVICES[self._device_name] * 1e12  # TFLOP/s -> FLOP/s
            train_step_mfu = flops_step / device_flops_per_second
            train_mfu = flops_train / device_flops_per_second

            wandb.log({
                "train_step_mfu": train_step_mfu,
                "train_mfu": train_mfu,
            }, step=state.global_step)
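
if __name__ == '__main__':
    # Minimal usage sketch, not part of the original gist: 'gpt2' and the toy
    # dataset are placeholders, and a CUDA GPU listed in DEVICES is assumed.
    # The Trainer accumulates `state.total_flos` across steps and, with wandb
    # enabled, initializes the wandb run that the callback logs into.
    from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    model = AutoModelForCausalLM.from_pretrained('gpt2')
    enc = tokenizer(['hello world'] * 64, return_tensors='pt')
    # Trainer accepts any sized, indexable dataset of feature dicts.
    train_dataset = [{'input_ids': ids, 'labels': ids} for ids in enc['input_ids']]

    trainer = Trainer(
        model=model,
        args=TrainingArguments(output_dir='out', logging_steps=5, report_to=['wandb']),
        train_dataset=train_dataset,
        callbacks=[WandbMFUCallback()],  # device name auto-detected via torch.cuda
    )
    trainer.train()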