"""Benchmark Ollama models: measure prompt/response token throughput for each installed model."""

import argparse
from types import SimpleNamespace

from ollama import ChatResponse, Client


def run_benchmark(model_name: str, prompt: str, verbose: bool) -> ChatResponse | None:
    last_element = None
    if verbose:
        # Stream the response so progress is visible; the final chunk carries the timing stats.
        stream = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
            stream=True,
        )
        for chunk in stream:
            print(chunk["message"]["content"], end="", flush=True)
            last_element = chunk
    else:
        last_element = CLIENT.chat(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                },
            ],
        )
    if not last_element:
        print("System Error: No response received from ollama")
        return None
    # Debug: dump the raw response object (includes token counts and durations).
    print("Last element: ", last_element)
    # with open("data/ollama/ollama_res.json", "w") as outfile:
    #     outfile.write(json.dumps(last_element, indent=4))
    return last_element


def nanosec_to_sec(nanosec):
    # Ollama reports durations in nanoseconds.
    return nanosec / 1000000000


def inference_stats(model_response: ChatResponse):
    # Tokens per second = token count / duration (converted to seconds).
    prompt_ts = model_response.prompt_eval_count / (
        nanosec_to_sec(model_response.prompt_eval_duration)
    )
    response_ts = model_response.eval_count / (
        nanosec_to_sec(model_response.eval_duration)
    )
    total_ts = (
        model_response.prompt_eval_count + model_response.eval_count
    ) / (
        nanosec_to_sec(
            model_response.prompt_eval_duration + model_response.eval_duration
        )
    )

    print(
        f"""
----------------------------------------------------
        {model_response.model}
        \tPrompt eval: {prompt_ts:.2f} t/s
        \tResponse: {response_ts:.2f} t/s
        \tTotal: {total_ts:.2f} t/s

        Stats:
        \tPrompt tokens: {model_response.prompt_eval_count}
        \tResponse tokens: {model_response.eval_count}
        \tModel load time: {nanosec_to_sec(model_response.load_duration):.2f}s
        \tPrompt eval time: {nanosec_to_sec(model_response.prompt_eval_duration):.2f}s
        \tResponse time: {nanosec_to_sec(model_response.eval_duration):.2f}s
        \tTotal time: {nanosec_to_sec(model_response.total_duration):.2f}s
----------------------------------------------------
        """
    )


def average_stats(responses: list[ChatResponse]):
    if len(responses) == 0:
        print("No stats to average")
        return

    # Aggregate token counts and durations across all prompts for one model.
    res = SimpleNamespace(
        model=responses[-1].model,
        total_duration=sum(r.total_duration for r in responses),
        load_duration=sum(r.load_duration for r in responses),
        prompt_eval_count=sum(r.prompt_eval_count for r in responses),
        prompt_eval_duration=sum(r.prompt_eval_duration for r in responses),
        eval_count=sum(r.eval_count for r in responses),
        eval_duration=sum(r.eval_duration for r in responses),
    )
    inference_stats(res)


def get_benchmark_models(skip_models: list[str] = []) -> list[str]:
    models = CLIENT.list().get("models", [])
    model_names = [model["model"] for model in models]
    if len(skip_models) > 0:
        model_names = [
            model for model in model_names if model not in skip_models
        ]
    print(f"Evaluating models: {model_names}\n")
    return model_names


def main():
    verbose = args.verbose
    skip_models = args.skip_models
    prompts = args.prompts
    print(
        f"\nVerbose: {verbose}\nSkip models: {skip_models}\nPrompts: {prompts}"
    )

    model_names = get_benchmark_models(skip_models)
    benchmarks = {}

    for model_name in model_names:
        responses: list[ChatResponse] = []
        for prompt in prompts:
            if verbose:
                print(f"\n\nBenchmarking: {model_name}\nPrompt: {prompt}")
            response = run_benchmark(model_name, prompt, verbose=verbose)
            if response is None:
                # Skip failed requests so the averages are not corrupted.
                continue
            responses.append(response)

            if verbose:
                print(f"Response: {response.message.content}")
                inference_stats(response)
        benchmarks[model_name] = responses

    for model_name, responses in benchmarks.items():
        average_stats(responses)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run benchmarks on your Ollama models."
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Increase output verbosity",
        default=False,
    )
    parser.add_argument(
        "-s",
        "--skip-models",
        nargs="*",
        default=[],
        help="List of model names to skip. Separate multiple models with spaces.",
    )
    parser.add_argument(
        "-r",
        "--remote",
        default="localhost:11434",
        help="Address (host:port) of the Ollama server to benchmark against.",
    )
    parser.add_argument(
        "-p",
        "--prompts",
        nargs="*",
        default=[
            "Why is the sky blue?",
            "Write a report on the financials of Apple Inc.",
        ],
        help="List of prompts to use for benchmarking. Separate multiple prompts with spaces.",
    )
    args = parser.parse_args()

    CLIENT = Client(
        host=args.remote,
    )

    main()

# Example usage:
# python benchmark.py --verbose --skip-models aisherpa/mistral-7b-instruct-v02:Q5_K_M llama2:latest --prompts "What color is the sky" "Write a report on the financials of Microsoft"
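
# Requirements (assumed, not stated in the gist itself): the `ollama` Python client,
# typically installed with `pip install ollama`, and an Ollama server reachable at the
# --remote address (default: localhost:11434).
#
# Example against a remote host (the IP below is purely illustrative):
# python benchmark.py --remote 192.168.1.10:11434 --prompts "Why is the sky blue?"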