Last active
July 14, 2025 17:35
-
-
Save w32zhong/bc1a973ec91f967a63a9c540e5a7df8c to your computer and use it in GitHub Desktop.
vllm compared to nano-vllm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Setup
# conda create -n modular python=3.11
# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
# conda install -c conda-forge gcc=12.1.0
import time

from max.entrypoints.llm import LLM
from max.pipelines import PipelineConfig

# HuggingFace model identifier resolved by the MAX pipeline.
model_path = 'Qwen/Qwen2.5-0.5B'


def main() -> None:
    """Benchmark MAX (Modular) single-prompt generation throughput.

    Loads Qwen2.5-0.5B through a MAX pipeline, generates up to 2048 new
    tokens for one prompt, and prints wall-clock seconds and tokens/sec.
    """
    pipeline_config = PipelineConfig(model_path=model_path)
    llm = LLM(pipeline_config)

    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]

    # Time only the generate() call, not model load.
    start = time.perf_counter()
    outputs = llm.generate(prompts, max_new_tokens=2048)
    stop = time.perf_counter()

    out_text = outputs[0]
    # NOTE(review): assumes all 2048 requested tokens were produced; if the
    # model emits EOS early this overstates length and inflates tok/sec —
    # confirm against an actual completion-token count from the MAX API.
    out_length = 2048
    out_seconds = stop - start

    print(out_text)
    print("out_seconds:", out_seconds)
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## setup
# uv pip install "sglang[all]==0.4.9.post2"
# conda install gxx_linux-64
import time

import nest_asyncio
import sglang as sgl

# sglang runs an asyncio loop internally; nest_asyncio lets it coexist
# with any already-running loop (e.g. when pasted into a notebook).
nest_asyncio.apply()

# Local HF-cache snapshot of Qwen/Qwen2.5-0.5B.
path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/'

if __name__ == '__main__':
    # Benchmark sglang offline-engine throughput on a single prompt.
    llm = sgl.Engine(model_path=path)
    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    sampling_params = {"temperature": 0.1, "max_new_tokens": 2048}

    # Time only the generate() call, not engine startup.
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    stop = time.perf_counter()

    out_text = outputs[0]['text']
    # Use the engine-reported completion token count, not the requested max.
    out_length = outputs[0]['meta_info']['completion_tokens']
    out_seconds = stop - start

    print(out_text)
    print("out_seconds:", out_seconds)
    # Compare our wall-clock timing with the engine's own latency report.
    print("reported:", outputs[0]['meta_info']['e2e_latency'])
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)

    # Tear down the engine's background workers so the process exits cleanly.
    llm.shutdown()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time

# Swap these two imports to benchmark nano-vllm instead of vLLM; the code
# below handles either engine's output format.
#from nanovllm import LLM, SamplingParams
from vllm import LLM, SamplingParams

# Local HF-cache snapshot of Qwen/Qwen3-0.6B.
path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'

llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.1, max_tokens=512)
prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]

# Time only the generate() call, not model load.
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
stop = time.perf_counter()
print(outputs)

# vLLM returns RequestOutput objects (which carry .request_id); nano-vllm
# returns plain dicts — dispatch on that difference.
if hasattr(outputs[0], 'request_id'):
    token_ids = outputs[0].outputs[0].token_ids
    tokenizer = llm.get_tokenizer()
else:
    token_ids = outputs[0]['token_ids']
    tokenizer = llm.tokenizer

out_text = tokenizer.decode(token_ids)
# Measure the tokens actually generated, not the requested max.
out_length = len(token_ids)
out_seconds = stop - start

print(out_text)
print("out_seconds:", out_seconds)
print("out_length:", out_length)
print("tok/sec:", out_length / out_seconds)

## On a single RTX3060:
# nano-vllm (3 trials):
#   tok/sec: 31.2
#   tok/sec: 31.6
#   tok/sec: 30.9
# vllm (3 trials):
#   tok/sec: 72.5
#   tok/sec: 73.1
#   tok/sec: 72.2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment