Last active
July 14, 2025 17:35
-
-
Save w32zhong/bc1a973ec91f967a63a9c540e5a7df8c to your computer and use it in GitHub Desktop.
Revisions
-
w32zhong revised this gist
Jul 14, 2025. 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -21,7 +21,7 @@ def main() -> None: stop = time.perf_counter() out_text = outputs[0] out_length = 2048 out_seconds = stop - start print(out_text) -
w32zhong revised this gist
Jul 14, 2025 . 1 changed file with 34 additions and 0 deletions.There are no files selected for viewing
## Setup
# conda create -n modular python=3.11
# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
# conda install -c conda-forge gcc=12.1.0

model_path = 'Qwen/Qwen2.5-0.5B'

# Single source of truth for the generation length. The original script
# requested 2048 new tokens but hard-coded out_length = 512 in the
# throughput math, under-reporting tok/sec by 4x; sharing one constant
# keeps the two from drifting apart.
max_new_tokens = 2048

import time
from max.entrypoints.llm import LLM
from max.pipelines import PipelineConfig


def main() -> None:
    """Benchmark Modular MAX single-prompt generation throughput.

    Loads the model named by ``model_path``, generates ``max_new_tokens``
    tokens for one fixed prompt, and prints the generated text, wall-clock
    seconds, token count, and tokens/second.
    """
    pipeline_config = PipelineConfig(model_path=model_path)
    llm = LLM(pipeline_config)
    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    start = time.perf_counter()
    outputs = llm.generate(prompts, max_new_tokens=max_new_tokens)
    stop = time.perf_counter()
    out_text = outputs[0]
    # NOTE(review): assumes generation never stops early (e.g. at EOS);
    # if it can, the true completion length should be read from the output
    # instead of assumed — confirm against the MAX API.
    out_length = max_new_tokens
    out_seconds = stop - start
    print(out_text)
    print("out_seconds:", out_seconds)
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)


if __name__ == "__main__":
    main()
w32zhong renamed this gist
Jul 14, 2025 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
w32zhong revised this gist
Jul 14, 2025 . 1 changed file with 7 additions and 10 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,15 +1,19 @@ ## setup # uv pip install "sglang[all]==0.4.9.post2" # conda install gxx_linux-64 import time import sglang as sgl import nest_asyncio nest_asyncio.apply() path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/' if __name__ == '__main__': llm = sgl.Engine(model_path=path) prompts = ["Quebec pausing some applications for sponsorship of immigrants "] sampling_params = {"temperature": 0.1, "max_new_tokens": 2048} start = time.perf_counter() outputs = llm.generate(prompts, sampling_params) @@ -24,10 +28,3 @@ print("reported:", outputs[0]['meta_info']['e2e_latency']) print("out_length:", out_length) print("tok/sec:", out_length / out_seconds) -
w32zhong revised this gist
Jul 12, 2025 . 1 changed file with 33 additions and 0 deletions.There are no files selected for viewing
import time
import sglang as sgl
import nest_asyncio

# sglang drives an asyncio event loop internally; patch the running loop
# so the engine can be called from this plain synchronous script.
nest_asyncio.apply()

path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'

if __name__ == '__main__':
    # Benchmark: generate up to 512 tokens for one prompt, then report
    # wall-clock time, the engine's own latency figure, and tokens/second.
    engine = sgl.Engine(model_path=path)
    prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
    sampling_params = {"temperature": 0.1, "max_new_tokens": 512}

    t0 = time.perf_counter()
    results = engine.generate(prompts, sampling_params)
    t1 = time.perf_counter()

    first = results[0]
    meta = first['meta_info']
    elapsed = t1 - t0
    completion_tokens = meta['completion_tokens']

    print(first['text'])
    print("out_seconds:", elapsed)
    print("reported:", meta['e2e_latency'])
    print("out_length:", completion_tokens)
    print("tok/sec:", completion_tokens / elapsed)

## On a single RTX3060:
# 3 trials:
# tok/sec: 170.3
# tok/sec: 171.1
# tok/sec: 171.2
w32zhong created this gist
Jul 12, 2025. There are no files selected for viewing
import time
#from nanovllm import LLM, SamplingParams
from vllm import LLM, SamplingParams

path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'

# Single-prompt throughput benchmark; works against either vLLM or
# nano-vllm — swap the import above to switch engines.
llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.1, max_tokens=512)
prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]

t0 = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
t1 = time.perf_counter()

print(outputs)

# The two engines return differently shaped results: vLLM request objects
# expose a `request_id` attribute, while nano-vllm hands back plain dicts.
if hasattr(outputs[0], 'request_id'):
    token_ids = outputs[0].outputs[0].token_ids
    tokenizer = llm.get_tokenizer()
else:
    token_ids = outputs[0]['token_ids']
    tokenizer = llm.tokenizer

elapsed = t1 - t0
n_tokens = len(token_ids)

print(tokenizer.decode(token_ids))
print("out_seconds:", elapsed)
print("out_length:", n_tokens)
print("tok/sec:", n_tokens / elapsed)

## On a single RTX3060:
# nano-vlm (3 trials):
# tok/sec: 31.2
# tok/sec: 31.6
# tok/sec: 30.9
# vlm (3 trials):
# tok/sec: 72.5
# tok/sec: 73.1
# tok/sec: 72.2