## Setup
# conda create -n modular python=3.11
# uv pip install modular   --extra-index-url https://download.pytorch.org/whl/cpu   --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
# conda install -c conda-forge gcc=12.1.0

# Model identifier handed to PipelineConfig(model_path=...) in main().
model_path = "Qwen/Qwen2.5-0.5B"

import time
from max.entrypoints.llm import LLM
from max.pipelines import PipelineConfig


def main(max_new_tokens: int = 2048) -> None:
    """Generate text for one fixed prompt and print a rough throughput figure.

    Builds a pipeline for the module-level ``model_path``, times a single
    ``llm.generate`` call, and prints the first output, the elapsed seconds,
    the token count used for the estimate, and tokens per second.

    Args:
        max_new_tokens: Generation budget passed to ``llm.generate``. The same
            value is used as the token count in the throughput calculation,
            so the two can no longer drift apart.

    Note:
        The reported ``tok/sec`` divides the *requested* budget by the
        elapsed time; the number of tokens actually produced is not measured
        here. If generation ends before the budget is exhausted, the figure
        overstates the real throughput.
    """
    pipeline_config = PipelineConfig(model_path=model_path)
    llm = LLM(pipeline_config)

    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]

    # Only the generate() call is timed; pipeline construction above is
    # deliberately kept outside the measured window.
    start = time.perf_counter()
    outputs = llm.generate(prompts, max_new_tokens=max_new_tokens)
    stop = time.perf_counter()

    out_text = outputs[0]
    # Single source of truth: the requested budget, not a measured count.
    out_length = max_new_tokens
    out_seconds = stop - start

    print(out_text)
    print("out_seconds:", out_seconds)
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)


# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()