"""Time a single text-generation call through the MAX ``LLM`` entrypoint.

Running this file builds a pipeline for ``model_path``, generates a
continuation for one fixed prompt, and prints the generated text together
with the elapsed wall-clock time and a tokens-per-second estimate.
"""

## Setup
# conda create -n modular python=3.11
# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
# conda install -c conda-forge gcc=12.1.0

import time

from max.entrypoints.llm import LLM
from max.pipelines import PipelineConfig

# Model identifier handed to PipelineConfig.
model_path = 'Qwen/Qwen2.5-0.5B'

# Generation budget. The same value is passed to `generate` and used as the
# token count in the throughput estimate, so it is defined exactly once.
MAX_NEW_TOKENS = 2048


def main() -> None:
    """Generate text for one prompt and print timing statistics.

    Prints, in order: the first generation output, the elapsed seconds,
    the assumed output length in tokens, and the resulting tokens/second.
    """
    pipeline_config = PipelineConfig(model_path=model_path)
    llm = LLM(pipeline_config)

    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]

    # Only the generate call is timed; pipeline construction above is excluded.
    start = time.perf_counter()
    outputs = llm.generate(prompts, max_new_tokens=MAX_NEW_TOKENS)
    stop = time.perf_counter()

    out_text = outputs[0]
    # NOTE(review): this assumes the model produced the full MAX_NEW_TOKENS.
    # If generation stops early, the tok/sec figure below overstates the real
    # throughput -- TODO count the actual generated tokens.
    out_length = MAX_NEW_TOKENS
    out_seconds = stop - start

    print(out_text)
    print("out_seconds:", out_seconds)
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)


if __name__ == "__main__":
    main()