
@mchiang0610
Last active October 25, 2025 07:46

Revisions

  1. mchiang0610 revised this gist Oct 22, 2025. 2 changed files with 16261 additions and 1 deletion.
    16,260 changes: 16,260 additions & 0 deletions pg98.txt
    16,260 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
    2 changes: 1 addition & 1 deletion readme.md
    @@ -10,7 +10,7 @@ pip install ollama
    Running the benchmark:

    ```
    -echo <model> | python benchmark.py -k0.2 -c10 -n500 --temperature 0 -p "write me a short story" > dgx-model.bench
    +echo <model> | python benchmark.py -k0.2 -c10 -n500 --temperature 0 -p "write an in-depth summary of this story: $(head -n200 pg98.txt)" > dgx-model.bench
    ```

    To examine the benchmarks, use golang's benchstat:
  2. mchiang0610 created this gist Oct 17, 2025.
    80 changes: 80 additions & 0 deletions benchmark.py
    @@ -0,0 +1,80 @@
    # /// script
    # dependencies = [
    # "ollama",
    # ]
    # ///

    import os
    import sys
    import time
    from argparse import ArgumentParser, FileType

    import ollama


    def main():
        parser = ArgumentParser()
        parser.add_argument("-c", "--count", type=int, default=1)
        parser.add_argument("-n", "--max-new-tokens", type=int, default=100)
        parser.add_argument("-p", "--prompt", type=str, default="Write a long story.")
        parser.add_argument("-k", "--keep-alive", type=float, default=None)
        parser.add_argument("--temperature", type=float, default=None)
        parser.add_argument("input", type=FileType("r"), default=sys.stdin, nargs="?")
        args = parser.parse_args()

        uname = os.uname()
        # TODO: metadata should be retrieved from device under test
        print("sysname:", uname.sysname)
        print("machine:", uname.machine)

        # One model name per input line; each model is benchmarked --count times.
        for line in args.input:
            model = line.strip()
            for _ in range(args.count):
                response = ollama.chat(
                    model=model,
                    messages=[{"role": "user", "content": args.prompt}],
                    options=ollama.Options(
                        num_predict=args.max_new_tokens,
                        temperature=args.temperature,
                        seed=0 if args.temperature is not None else None,
                    ),
                    keep_alive=args.keep_alive,
                )

                # Emit one benchstat-compatible line each for the prefill and generate steps.
                for k, v in {
                    "prefill": {
                        "count": response.prompt_eval_count,
                        "duration": response.prompt_eval_duration,
                    },
                    "generate": {
                        "count": response.eval_count,
                        "duration": response.eval_duration,
                    },
                }.items():
                    count = v["count"]
                    duration = v["duration"]
                    print(
                        f"BenchmarkModel/name={model}/step={k}",
                        count,
                        duration / count,
                        "ns/token",
                        count / (duration + 1e-12) * 1e9,
                        "token/sec",
                    )

                # Whole-request and model-load timings, also in benchstat format.
                for k, v in {
                    "": response.total_duration,
                    "/step=load": response.load_duration,
                }.items():
                    print(
                        f"BenchmarkModel/name={model}{k}",
                        1,
                        v,
                        "ns/request",
                    )
                # Sleep past the keep-alive window before the next run.
                if args.keep_alive:
                    time.sleep(args.keep_alive + 0.2)


    if __name__ == "__main__":
        main()
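
    The script reads one model name per line from standard input, so several models can be benchmarked in a single run. A minimal sketch of such an invocation; the model names and output file below are placeholders:

    ```
    printf 'model-a\nmodel-b\n' | python benchmark.py -k0.2 -c10 -n500 --temperature 0 -p "write me a short story" > models.bench
    ```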
    20 changes: 20 additions & 0 deletions readme.md
    @@ -0,0 +1,20 @@
    Installation:

    ```
    go install golang.org/x/perf/cmd/benchstat@latest
    python3 -m venv venv
    source venv/bin/activate
    pip install ollama
    ```
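
    benchmark.py also carries PEP 723 inline script metadata (the `# /// script` block), so as an alternative to the venv above, a runner that understands that metadata, such as uv (assumed to be installed separately), can resolve the ollama dependency on the fly:

    ```
    uv run benchmark.py --help
    ```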

    Running the benchmark:

    ```
    echo <model> | python benchmark.py -k0.2 -c10 -n500 --temperature 0 -p "write me a short story" > dgx-model.bench
    ```

    To examine the benchmarks, use golang's benchstat:

    ```
    ~/go/bin/benchstat dgx-model.bench
    ```
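
    benchstat can also compare two result files side by side, for example the same models benchmarked on two machines; the second file name below is a placeholder:

    ```
    ~/go/bin/benchstat dgx-model.bench other-machine.bench
    ```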