Skip to content

Instantly share code, notes, and snippets.

@w32zhong
Last active September 28, 2025 17:03
Show Gist options
  • Save w32zhong/2c066a7f7ed0bdc9e31007bf0ea8c6d6 to your computer and use it in GitHub Desktop.
sglang
import time
from transformers import AutoTokenizer
import asyncio
import sglang as sgl
from sglang.utils import async_stream_and_merge, trim_overlap
async def main(llm, tokenizer, prompt):
    """Stream one completion for *prompt* from an sglang engine.

    Prints each decoded chunk to stdout as it arrives and returns the
    total number of generated tokens (used by the caller to compute
    end-to-end decode speed).

    Args:
        llm: an ``sgl.Engine`` instance (must support ``async_generate``).
        tokenizer: HF tokenizer used only to decode streamed token ids.
        prompt: fully templated prompt string.

    Returns:
        int: total count of output tokens received across all chunks.
    """
    sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 8000}
    final_text = ""
    generator = await llm.async_generate(prompt, sampling_params, stream=True)
    cnt_token = 0
    async for chunk in generator:
        chunk_text = chunk["text"]
        # Streamed chunks can repeat text already emitted; keep only the
        # new suffix so final_text stays a clean transcript.
        cleaned_chunk = trim_overlap(final_text, chunk_text)
        final_text += cleaned_chunk
        # flush=True so the token stream is visible in real time rather
        # than being held in stdout's buffer.
        print(tokenizer.decode(chunk['output_ids']), end="", flush=True)
        cnt_token += len(chunk['output_ids'])
    return cnt_token
if __name__ == '__main__':
    # Build the chat-templated prompt with the HF tokenizer; the engine
    # itself loads weights from a local snapshot path below.
    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-4B-Instruct-2507')
    question = "Thomas is very healthy, but he has to go to the hospital every day. What could be the reasons?"
    messages = [
        {"role": "user", "content": question},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)
    llm = sgl.Engine(
        model_path="/mnt/asus_card/hfdownloader/Qwen_Qwen3-4B-Instruct-2507",
        tp_size=4,
        cuda_graph_max_bs=16
    )
    begin = time.perf_counter()
    try:
        cnt_token = asyncio.run(main(llm, tokenizer, prompt))
        time_cost = time.perf_counter() - begin
    finally:
        # Always release the engine (and its GPU workers), even if
        # generation raised, so the process doesn't leak resources.
        llm.shutdown()
    print()
    print('tokens and time:', cnt_token, time_cost)
    # End-to-end throughput in tokens/second, including streaming overhead.
    print('e2e speed:', cnt_token / time_cost)
@w32zhong
Copy link
Author

sglang==0.5.1

@w32zhong
Copy link
Author

related PR: sgl-project/sglang#10846

# Launch an sglang server with EAGLE speculative decoding on GPU 0.
# NOTE(review): the target model is pulled from the hub while the draft
# model comes from a local SpecForge training output — confirm the two
# are a matched pair before benchmarking.
CUDA_VISIBLE_DEVICES=0 uv run python -m sglang.launch_server --speculative-algo EAGLE \
    --model Qwen/Qwen3-4B-Instruct-2507 \
    --speculative-draft-model-path /workspace/mnt/specforge_PoC/output/deft-bee-66/draft_model_sglang \
    --speculative-num-steps 6 \
    --speculative-eagle-topk 10 \
    --speculative-num-draft-tokens 60 \
    --cuda-graph-max-bs 2 \
    --mem-fraction-static 0.8 \
    --chunked-prefill-size 4
# Run the MT-Bench EAGLE benchmark client against the server above.
cd benchmark/mtbench
python3 bench_sglang_eagle.py --parallel 1 --num-questions 10
# or: python3 -m sglang.test.send_one --batch-size 2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment