Last active
September 28, 2025 17:03
-
-
Save w32zhong/2c066a7f7ed0bdc9e31007bf0ea8c6d6 to your computer and use it in GitHub Desktop.
sglang
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import time | |
| from transformers import AutoTokenizer | |
| import asyncio | |
| import sglang as sgl | |
| from sglang.utils import async_stream_and_merge, trim_overlap | |
| async def main(llm, tokenizer, prompt): | |
| sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 8000} | |
| final_text = "" | |
| generator = await llm.async_generate(prompt, sampling_params, stream=True) | |
| cnt_token = 0 | |
| async for chunk in generator: | |
| chunk_text = chunk["text"] | |
| cleaned_chunk = trim_overlap(final_text, chunk_text) | |
| final_text += cleaned_chunk | |
| print(tokenizer.decode(chunk['output_ids']), end="") | |
| cnt_token += len(chunk['output_ids']) | |
| return cnt_token | |
if __name__ == '__main__':
    # Build a chat-templated prompt for the instruct model.
    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-4B-Instruct-2507')
    question = "Thomas is very healthy, but he has to go to the hospital every day. What could be the reasons?"
    chat = [{"role": "user", "content": question}]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    print(prompt)

    # Spin up the engine from a local snapshot of the same model.
    engine = sgl.Engine(
        model_path="/mnt/asus_card/hfdownloader/Qwen_Qwen3-4B-Instruct-2507",
        tp_size=4,
        cuda_graph_max_bs=16,
    )

    # Time the full streamed generation end to end.
    t0 = time.perf_counter()
    n_tokens = asyncio.run(main(engine, tokenizer, prompt))
    elapsed = time.perf_counter() - t0
    engine.shutdown()

    print()
    print('tokens and time:', n_tokens, elapsed)
    print('e2e speed:', n_tokens / elapsed)
related PR: sgl-project/sglang#10846
# Launch the sglang server with EAGLE speculative decoding.
# Fix: the original paste fused the last option value with the next command
# ("--chunked-prefill-size 4cd benchmark/mtbench"), which is not a valid
# shell line; split into "--chunked-prefill-size 4" and "cd benchmark/mtbench".
CUDA_VISIBLE_DEVICES=0 uv run python -m sglang.launch_server --speculative-algo EAGLE \
    --model Qwen/Qwen3-4B-Instruct-2507 \
    --speculative-draft-model-path /workspace/mnt/specforge_PoC/output/deft-bee-66/draft_model_sglang \
    --speculative-num-steps 6 \
    --speculative-eagle-topk 10 \
    --speculative-num-draft-tokens 60 \
    --cuda-graph-max-bs 2 \
    --mem-fraction-static 0.8 \
    --chunked-prefill-size 4

# Run the MT-Bench EAGLE benchmark against the server.
cd benchmark/mtbench
python3 bench_sglang_eagle.py --parallel 1 --num-questions 10
# or: python3 -m sglang.test.send_one --batch-size 2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
sglang==0.5.1