## setup
# uv pip install "sglang[all]==0.4.9.post2"
# conda install gxx_linux-64

import time

import sglang as sgl
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed when this is run from a notebook/REPL — confirm.
nest_asyncio.apply()

# Local Hugging Face cache snapshot of Qwen2.5-0.5B (machine-specific path).
path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/'


def main():
    """Generate one completion with the local model and print timing stats.

    Prints the generated text, the wall-clock time measured around
    ``llm.generate``, the ``e2e_latency`` value found in the output's
    ``meta_info``, the number of completion tokens, and the resulting
    tokens-per-second figure (completion tokens / measured seconds).
    """
    llm = sgl.Engine(model_path=path)
    try:
        prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
        sampling_params = {"temperature": 0.1, "max_new_tokens": 2048}

        # Time only the generate call; engine start-up is excluded.
        start = time.perf_counter()
        outputs = llm.generate(prompts, sampling_params)
        stop = time.perf_counter()
    finally:
        # Always release the engine, even if generation fails, so it does
        # not linger after the script is done with it.
        llm.shutdown()

    out_text = outputs[0]['text']
    out_length = outputs[0]['meta_info']['completion_tokens']
    out_seconds = stop - start

    print(out_text)
    print("out_seconds:", out_seconds)
    print("reported:", outputs[0]['meta_info']['e2e_latency'])
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)


if __name__ == '__main__':
    main()