@scmanjarrez
Created September 10, 2025 12:33

llm-benchmark.py

    import argparse
    from types import SimpleNamespace

    from ollama import ChatResponse, Client
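
# This script benchmarks the Ollama models installed on the target server:
# every model (minus any passed via --skip-models) is run against each prompt,
# and throughput is reported in tokens per second for prompt evaluation,
# response generation, and both combined, alongside load and evaluation times
# taken from the timing fields returned with each chat response.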


def run_benchmark(model_name: str, prompt: str, verbose: bool) -> ChatResponse | None:
    last_element = None

    if verbose:
        # Stream the response so tokens are printed as they arrive; the final
        # chunk carries the timing statistics used by inference_stats().
        stream = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            stream=True,
        )
        for chunk in stream:
            print(chunk["message"]["content"], end="", flush=True)
            last_element = chunk
    else:
        last_element = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )

    if not last_element:
        print("System Error: No response received from ollama")
        return None

    print("Last element: ", last_element)
    # with open("data/ollama/ollama_res.json", "w") as outfile:
    #     outfile.write(json.dumps(last_element, indent=4))

    return last_element


def nanosec_to_sec(nanosec):
    return nanosec / 1_000_000_000


def inference_stats(model_response: ChatResponse):
    # Throughput in tokens per second, derived from the token counts and
    # nanosecond durations Ollama reports with each response.
    prompt_ts = model_response.prompt_eval_count / (
        nanosec_to_sec(model_response.prompt_eval_duration)
    )
    response_ts = model_response.eval_count / (
        nanosec_to_sec(model_response.eval_duration)
    )
    total_ts = (
        model_response.prompt_eval_count + model_response.eval_count
    ) / (
        nanosec_to_sec(
            model_response.prompt_eval_duration + model_response.eval_duration
        )
    )

    print(
        f"""
----------------------------------------------------
{model_response.model}
\tPrompt eval: {prompt_ts:.2f} t/s
\tResponse: {response_ts:.2f} t/s
\tTotal: {total_ts:.2f} t/s
Stats:
\tPrompt tokens: {model_response.prompt_eval_count}
\tResponse tokens: {model_response.eval_count}
\tModel load time: {nanosec_to_sec(model_response.load_duration):.2f}s
\tPrompt eval time: {nanosec_to_sec(model_response.prompt_eval_duration):.2f}s
\tResponse time: {nanosec_to_sec(model_response.eval_duration):.2f}s
\tTotal time: {nanosec_to_sec(model_response.total_duration):.2f}s
----------------------------------------------------
"""
    )
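
# Illustrative output of inference_stats() (numbers are hypothetical and will
# vary with hardware and model):
# ----------------------------------------------------
# llama2:latest
#         Prompt eval: 118.18 t/s
#         Response: 34.71 t/s
#         Total: 36.72 t/s
# Stats:
#         Prompt tokens: 26
#         Response tokens: 310
#         Model load time: 1.24s
#         Prompt eval time: 0.22s
#         Response time: 8.93s
#         Total time: 10.41s
# ----------------------------------------------------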


def average_stats(responses: list[ChatResponse]):
    if len(responses) == 0:
        print("No stats to average")
        return

    res = SimpleNamespace(
        model=responses[-1].model,
        total_duration=sum(r.total_duration for r in responses),
        load_duration=sum(r.load_duration for r in responses),
        prompt_eval_count=sum(r.prompt_eval_count for r in responses),
        prompt_eval_duration=sum(r.prompt_eval_duration for r in responses),
        eval_count=sum(r.eval_count for r in responses),
        eval_duration=sum(r.eval_duration for r in responses),
    )
    inference_stats(res)
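
# Note: average_stats() sums token counts and durations across all responses,
# so the printed rates are token-weighted aggregates over the whole run rather
# than a simple mean of the per-prompt tokens/second values.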


def get_benchmark_models(skip_models: list[str] = []) -> list[str]:
    models = CLIENT.list().get("models", [])
    model_names = [model["model"] for model in models]
    if len(skip_models) > 0:
        model_names = [
            model for model in model_names if model not in skip_models
        ]
    print(f"Evaluating models: {model_names}\n")
    return model_names
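
# Model names returned by CLIENT.list() include their tag (e.g. "llama2:latest"),
# so entries passed via --skip-models must match the full name:tag form.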


def main():
    verbose = args.verbose
    skip_models = args.skip_models
    prompts = args.prompts
    print(
        f"\nVerbose: {verbose}\nSkip models: {skip_models}\nPrompts: {prompts}"
    )

    model_names = get_benchmark_models(skip_models)
    benchmarks = {}

    for model_name in model_names:
        responses: list[ChatResponse] = []
        for prompt in prompts:
            if verbose:
                print(f"\n\nBenchmarking: {model_name}\nPrompt: {prompt}")
            response = run_benchmark(model_name, prompt, verbose=verbose)
            if response is None:
                # Skip failed requests so the averages are not corrupted.
                continue
            responses.append(response)

            if verbose:
                print(f"Response: {response.message.content}")
                inference_stats(response)
        benchmarks[model_name] = responses

    for model_name, responses in benchmarks.items():
        average_stats(responses)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run benchmarks on your Ollama models."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase output verbosity",
        default=False,
    )
    parser.add_argument(
        "-s",
        "--skip-models",
        nargs="*",
        default=[],
        help="List of model names to skip. Separate multiple models with spaces.",
    )
    parser.add_argument(
        "-r",
        "--remote",
        default="localhost:11434",
        help="Address (host:port) of the Ollama server to benchmark. Defaults to localhost:11434.",
    )
    parser.add_argument(
        "-p",
        "--prompts",
        nargs="*",
        default=[
            "Why is the sky blue?",
            "Write a report on the financials of Apple Inc.",
        ],
        help="List of prompts to use for benchmarking. Separate multiple prompts with spaces.",
    )

    args = parser.parse_args()

    CLIENT = Client(
        host=args.remote,
    )

    main()
    # Example usage:
    # python llm-benchmark.py --verbose --skip-models aisherpa/mistral-7b-instruct-v02:Q5_K_M llama2:latest --prompts "What color is the sky" "Write a report on the financials of Microsoft"
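    #
    # Benchmarking a remote Ollama server (the address below is only an
    # illustration; point --remote at whatever host:port your server listens on):
    # python llm-benchmark.py --remote 192.168.1.10:11434 --verbose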