@scmanjarrez
Created September 10, 2025 12:33

llm-benchmark.py

    import argparse
    from types import SimpleNamespace

    from ollama import ChatResponse, Client
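
# This script benchmarks the Ollama models installed on the target server:
# every model (minus any passed via --skip-models) is run against each prompt,
# and throughput is reported in tokens per second for prompt evaluation,
# response generation, and both combined, alongside load and evaluation times
# taken from the timing fields returned with each chat response.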


def run_benchmark(model_name: str, prompt: str, verbose: bool) -> ChatResponse | None:
    last_element = None

    if verbose:
        # Stream the response so tokens are printed as they arrive; the final
        # chunk carries the timing statistics used by inference_stats().
        stream = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            stream=True,
        )
        for chunk in stream:
            print(chunk["message"]["content"], end="", flush=True)
            last_element = chunk
    else:
        last_element = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )

    if not last_element:
        print("System Error: No response received from ollama")
        return None

    print("Last element: ", last_element)
    # with open("data/ollama/ollama_res.json", "w") as outfile:
    #     outfile.write(json.dumps(last_element, indent=4))

    return last_element


def nanosec_to_sec(nanosec):
    return nanosec / 1_000_000_000


def inference_stats(model_response: ChatResponse):
    # Throughput in tokens per second, derived from the token counts and
    # nanosecond durations Ollama reports with each response.
    prompt_ts = model_response.prompt_eval_count / (
        nanosec_to_sec(model_response.prompt_eval_duration)
    )
    response_ts = model_response.eval_count / (
        nanosec_to_sec(model_response.eval_duration)
    )
    total_ts = (
        model_response.prompt_eval_count + model_response.eval_count
    ) / (
        nanosec_to_sec(
            model_response.prompt_eval_duration + model_response.eval_duration
        )
    )

    print(
        f"""
----------------------------------------------------
{model_response.model}
\tPrompt eval: {prompt_ts:.2f} t/s
\tResponse: {response_ts:.2f} t/s
\tTotal: {total_ts:.2f} t/s
Stats:
\tPrompt tokens: {model_response.prompt_eval_count}
\tResponse tokens: {model_response.eval_count}
\tModel load time: {nanosec_to_sec(model_response.load_duration):.2f}s
\tPrompt eval time: {nanosec_to_sec(model_response.prompt_eval_duration):.2f}s
\tResponse time: {nanosec_to_sec(model_response.eval_duration):.2f}s
\tTotal time: {nanosec_to_sec(model_response.total_duration):.2f}s
----------------------------------------------------
"""
    )
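
# Illustrative output of inference_stats() (numbers are hypothetical and will
# vary with hardware and model):
# ----------------------------------------------------
# llama2:latest
#         Prompt eval: 118.18 t/s
#         Response: 34.71 t/s
#         Total: 36.72 t/s
# Stats:
#         Prompt tokens: 26
#         Response tokens: 310
#         Model load time: 1.24s
#         Prompt eval time: 0.22s
#         Response time: 8.93s
#         Total time: 10.41s
# ----------------------------------------------------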


def average_stats(responses: list[ChatResponse]):
    if len(responses) == 0:
        print("No stats to average")
        return

    res = SimpleNamespace(
        model=responses[-1].model,
        total_duration=sum(r.total_duration for r in responses),
        load_duration=sum(r.load_duration for r in responses),
        prompt_eval_count=sum(r.prompt_eval_count for r in responses),
        prompt_eval_duration=sum(r.prompt_eval_duration for r in responses),
        eval_count=sum(r.eval_count for r in responses),
        eval_duration=sum(r.eval_duration for r in responses),
    )
    inference_stats(res)
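
# Note: average_stats() sums token counts and durations across all responses,
# so the printed rates are token-weighted aggregates over the whole run rather
# than a simple mean of the per-prompt tokens/second values.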


def get_benchmark_models(skip_models: list[str] = []) -> list[str]:
    models = CLIENT.list().get("models", [])
    model_names = [model["model"] for model in models]
    if len(skip_models) > 0:
        model_names = [
            model for model in model_names if model not in skip_models
        ]
    print(f"Evaluating models: {model_names}\n")
    return model_names
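
# Model names returned by CLIENT.list() include their tag (e.g. "llama2:latest"),
# so entries passed via --skip-models must match the full name:tag form.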


def main():
    verbose = args.verbose
    skip_models = args.skip_models
    prompts = args.prompts
    print(
        f"\nVerbose: {verbose}\nSkip models: {skip_models}\nPrompts: {prompts}"
    )

    model_names = get_benchmark_models(skip_models)
    benchmarks = {}

    for model_name in model_names:
        responses: list[ChatResponse] = []
        for prompt in prompts:
            if verbose:
                print(f"\n\nBenchmarking: {model_name}\nPrompt: {prompt}")
            response = run_benchmark(model_name, prompt, verbose=verbose)
            if response is None:
                # Skip failed requests so the averages are not corrupted.
                continue
            responses.append(response)

            if verbose:
                print(f"Response: {response.message.content}")
                inference_stats(response)
        benchmarks[model_name] = responses

    for model_name, responses in benchmarks.items():
        average_stats(responses)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run benchmarks on your Ollama models."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase output verbosity",
        default=False,
    )
    parser.add_argument(
        "-s",
        "--skip-models",
        nargs="*",
        default=[],
        help="List of model names to skip. Separate multiple models with spaces.",
    )
    parser.add_argument(
        "-r",
        "--remote",
        default="localhost:11434",
        help="Address (host:port) of the Ollama server to benchmark. Defaults to localhost:11434.",
    )
    parser.add_argument(
        "-p",
        "--prompts",
        nargs="*",
        default=[
            "Why is the sky blue?",
            "Write a report on the financials of Apple Inc.",
        ],
        help="List of prompts to use for benchmarking. Separate multiple prompts with spaces.",
    )

    args = parser.parse_args()

    CLIENT = Client(
        host=args.remote,
    )

    main()
    # Example usage:
    # python llm-benchmark.py --verbose --skip-models aisherpa/mistral-7b-instruct-v02:Q5_K_M llama2:latest --prompts "What color is the sky" "Write a report on the financials of Microsoft"
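    #
    # Benchmarking a remote Ollama server (the address below is only an
    # illustration; point --remote at whatever host:port your server listens on):
    # python llm-benchmark.py --remote 192.168.1.10:11434 --verbose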