Skip to content

Instantly share code, notes, and snippets.

@w32zhong
Last active September 28, 2025 17:03
Show Gist options
  • Save w32zhong/2c066a7f7ed0bdc9e31007bf0ea8c6d6 to your computer and use it in GitHub Desktop.
sglang
import time
from transformers import AutoTokenizer
import asyncio
import sglang as sgl
from sglang.utils import async_stream_and_merge, trim_overlap
async def main(llm, tokenizer, prompt):
    """Stream one completion for *prompt* from an sglang engine.

    Prints each decoded chunk to stdout as it arrives and returns the
    total number of generated tokens (used by the caller to compute
    end-to-end decode speed).

    Args:
        llm: an ``sgl.Engine`` instance (must support ``async_generate``).
        tokenizer: HF tokenizer used only to decode streamed token ids.
        prompt: fully templated prompt string.

    Returns:
        int: total count of output tokens received across all chunks.
    """
    sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 8000}
    final_text = ""
    generator = await llm.async_generate(prompt, sampling_params, stream=True)
    cnt_token = 0
    async for chunk in generator:
        chunk_text = chunk["text"]
        # Streamed chunks can repeat text already emitted; keep only the
        # new suffix so final_text stays a clean transcript.
        cleaned_chunk = trim_overlap(final_text, chunk_text)
        final_text += cleaned_chunk
        # flush=True so the token stream is visible in real time rather
        # than being held in stdout's buffer.
        print(tokenizer.decode(chunk['output_ids']), end="", flush=True)
        cnt_token += len(chunk['output_ids'])
    return cnt_token
if __name__ == '__main__':
    # Build the chat-templated prompt with the HF tokenizer; the engine
    # itself loads weights from a local snapshot path below.
    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-4B-Instruct-2507')
    question = "Thomas is very healthy, but he has to go to the hospital every day. What could be the reasons?"
    messages = [
        {"role": "user", "content": question},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)
    llm = sgl.Engine(
        model_path="/mnt/asus_card/hfdownloader/Qwen_Qwen3-4B-Instruct-2507",
        tp_size=4,
        cuda_graph_max_bs=16
    )
    begin = time.perf_counter()
    try:
        cnt_token = asyncio.run(main(llm, tokenizer, prompt))
        time_cost = time.perf_counter() - begin
    finally:
        # Always release the engine (and its GPU workers), even if
        # generation raised, so the process doesn't leak resources.
        llm.shutdown()
    print()
    print('tokens and time:', cnt_token, time_cost)
    # End-to-end throughput in tokens/second, including streaming overhead.
    print('e2e speed:', cnt_token / time_cost)
@w32zhong
Copy link
Author

sglang==0.5.1

@w32zhong
Copy link
Author

related PR: sgl-project/sglang#10846

# Launch an sglang server with EAGLE speculative decoding on GPU 0.
# NOTE(review): the target model is pulled from the hub while the draft
# model comes from a local SpecForge training output — confirm the two
# are a matched pair before benchmarking.
CUDA_VISIBLE_DEVICES=0 uv run python -m sglang.launch_server --speculative-algo EAGLE \
    --model Qwen/Qwen3-4B-Instruct-2507 \
    --speculative-draft-model-path /workspace/mnt/specforge_PoC/output/deft-bee-66/draft_model_sglang \
    --speculative-num-steps 6 \
    --speculative-eagle-topk 10 \
    --speculative-num-draft-tokens 60 \
    --cuda-graph-max-bs 2 \
    --mem-fraction-static 0.8 \
    --chunked-prefill-size 4
# Run the MT-Bench EAGLE benchmark client against the server above.
cd benchmark/mtbench
python3 bench_sglang_eagle.py --parallel 1 --num-questions 10
# or: python3 -m sglang.test.send_one --batch-size 2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment