@w32zhong · Last active July 14, 2025 17:35
vllm compared to nano-vllm
## Setup (Modular MAX)
# conda create -n modular python=3.11
# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
# conda install -c conda-forge gcc=12.1.0
model_path = 'Qwen/Qwen2.5-0.5B'
import time
from max.entrypoints.llm import LLM
from max.pipelines import PipelineConfig
def main() -> None:
    pipeline_config = PipelineConfig(model_path=model_path)
    llm = LLM(pipeline_config)
    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    start = time.perf_counter()
    outputs = llm.generate(prompts, max_new_tokens=2048)
    stop = time.perf_counter()
    out_text = outputs[0]
    out_length = 2048  # assumes the full max_new_tokens budget was generated
    out_seconds = stop - start
    print(out_text)
    print("out_seconds:", out_seconds)
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)

if __name__ == "__main__":
    main()
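# Not part of the original gist: a hedged sketch for measuring the actual
# completion length instead of hardcoding out_length = 2048. It assumes the
# Hugging Face `transformers` package is installed and that `model_path`
# resolves to a repo that ships a tokenizer.
def count_completion_tokens(text: str, tokenizer_path: str = model_path) -> int:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    # Count only the generated text, without special tokens.
    return len(tokenizer.encode(text, add_special_tokens=False))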
## Setup (SGLang)
# uv pip install "sglang[all]==0.4.9.post2"
# conda install gxx_linux-64
import time
import sglang as sgl
import nest_asyncio
nest_asyncio.apply()
path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/'
if __name__ == '__main__':
    llm = sgl.Engine(model_path=path)
    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    sampling_params = {"temperature": 0.1, "max_new_tokens": 2048}
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    stop = time.perf_counter()
    out_text = outputs[0]['text']
    out_length = outputs[0]['meta_info']['completion_tokens']
    out_seconds = stop - start
    print(out_text)
    print("out_seconds:", out_seconds)
    print("reported:", outputs[0]['meta_info']['e2e_latency'])
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)
## vLLM / nano-vllm
import time
#from nanovllm import LLM, SamplingParams
from vllm import LLM, SamplingParams
path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.1, max_tokens=512)
prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
stop = time.perf_counter()
print(outputs)
if hasattr(outputs[0], 'request_id'):
    # vLLM path: generate() returns RequestOutput objects
    token_ids = outputs[0].outputs[0].token_ids
    tokenizer = llm.get_tokenizer()
else:
    # nano-vllm path: generate() returns plain dicts
    token_ids = outputs[0]['token_ids']
    tokenizer = llm.tokenizer
out_text = tokenizer.decode(token_ids)
out_length = len(token_ids)
out_seconds = stop - start
print(out_text)
print("out_seconds:", out_seconds)
print("out_length:", out_length)
print("tok/sec:", out_length / out_seconds)
## On a single RTX 3060:
# nano-vllm (3 trials):
# tok/sec: 31.2
# tok/sec: 31.6
# tok/sec: 30.9
# vllm (3 trials):
# tok/sec: 72.5
# tok/sec: 73.1
# tok/sec: 72.2
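# Not part of the original gist: averaging the trials above and the resulting
# speedup, plain arithmetic on the reported numbers.
nano_vllm_avg = (31.2 + 31.6 + 30.9) / 3   # ~31.2 tok/sec
vllm_avg = (72.5 + 73.1 + 72.2) / 3        # ~72.6 tok/sec
print("vllm speedup over nano-vllm: %.2fx" % (vllm_avg / nano_vllm_avg))  # ~2.3x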