@w32zhong
Last active July 14, 2025 17:35

Revisions

  1. w32zhong revised this gist Jul 14, 2025. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion modular-max.py
    @@ -21,7 +21,7 @@ def main() -> None:
         stop = time.perf_counter()

         out_text = outputs[0]
    -    out_length = 512
    +    out_length = 2048
         out_seconds = stop - start

         print(out_text)
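
    This revision bumps the hardcoded out_length from 512 to 2048 to match the new max_new_tokens, but generation can still stop early at an EOS token, in which case both values overcount. A minimal sketch of counting the tokens actually produced, assuming the `transformers` package is available alongside MAX and that generate() returns plain strings (as the script's own use of outputs[0] suggests):

        # Hypothetical replacement for the hardcoded out_length (not in the
        # gist); drop-in inside main(), after `outputs` and `out_seconds`.
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B')
        out_length = len(tokenizer.encode(outputs[0]))  # tokens actually generated
        print("tok/sec:", out_length / out_seconds)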
  2. w32zhong revised this gist Jul 14, 2025. 1 changed file with 34 additions and 0 deletions.
    34 changes: 34 additions & 0 deletions modular-max.py
    @@ -0,0 +1,34 @@
    +## Setup
    +# conda create -n modular python=3.11
    +# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
    +# conda install -c conda-forge gcc=12.1.0
    +
    +model_path = 'Qwen/Qwen2.5-0.5B'
    +
    +import time
    +from max.entrypoints.llm import LLM
    +from max.pipelines import PipelineConfig
    +
    +
    +def main() -> None:
    +    pipeline_config = PipelineConfig(model_path=model_path)
    +    llm = LLM(pipeline_config)
    +
    +    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    +
    +    start = time.perf_counter()
    +    outputs = llm.generate(prompts, max_new_tokens=2048)
    +    stop = time.perf_counter()
    +
    +    out_text = outputs[0]
    +    out_length = 512
    +    out_seconds = stop - start
    +
    +    print(out_text)
    +    print("out_seconds:", out_seconds)
    +    print("out_length:", out_length)
    +    print("tok/sec:", out_length / out_seconds)
    +
    +
    +if __name__ == "__main__":
    +    main()
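
    The sibling scripts below record three trials each as comments, while this one times a single generate() call. A small hypothetical harness (not part of the gist) that averages several runs, assuming main()'s timing block were factored into a run_once() helper returning (n_tokens, seconds):

        # Hypothetical multi-trial wrapper; run_once() is an assumed refactor
        # of the timing block inside main(), not an existing function.
        import statistics

        def bench(run_once, trials=3):
            rates = [n / s for n, s in (run_once() for _ in range(trials))]
            print("tok/sec (mean of %d trials):" % trials, statistics.mean(rates))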
  3. w32zhong renamed this gist Jul 14, 2025. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  4. w32zhong revised this gist Jul 14, 2025. 1 changed file with 7 additions and 10 deletions.
    17 changes: 7 additions & 10 deletions sglang.py
    @@ -1,15 +1,19 @@
    +## setup
    +# uv pip install "sglang[all]==0.4.9.post2"
    +# conda install gxx_linux-64
     import time
     import sglang as sgl

     import nest_asyncio
     nest_asyncio.apply()

    -path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
    +path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/'

     if __name__ == '__main__':
         llm = sgl.Engine(model_path=path)
    -    prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
    -    sampling_params = {"temperature": 0.1, "max_new_tokens": 512}
    +    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    +
    +    sampling_params = {"temperature": 0.1, "max_new_tokens": 2048}

         start = time.perf_counter()
         outputs = llm.generate(prompts, sampling_params)
    @@ -24,10 +28,3 @@
         print("reported:", outputs[0]['meta_info']['e2e_latency'])
         print("out_length:", out_length)
         print("tok/sec:", out_length / out_seconds)
    -
    -## On a single RTX3060:
    -
    -# 3 trials:
    -# tok/sec: 170.3
    -# tok/sec: 171.1
    -# tok/sec: 171.2
  5. w32zhong revised this gist Jul 12, 2025. 1 changed file with 33 additions and 0 deletions.
    33 changes: 33 additions & 0 deletions sglang.py
    @@ -0,0 +1,33 @@
    +import time
    +import sglang as sgl
    +
    +import nest_asyncio
    +nest_asyncio.apply()
    +
    +path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
    +
    +if __name__ == '__main__':
    +    llm = sgl.Engine(model_path=path)
    +    prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
    +    sampling_params = {"temperature": 0.1, "max_new_tokens": 512}
    +
    +    start = time.perf_counter()
    +    outputs = llm.generate(prompts, sampling_params)
    +    stop = time.perf_counter()
    +
    +    out_text = outputs[0]['text']
    +    out_length = outputs[0]['meta_info']['completion_tokens']
    +    out_seconds = stop - start
    +
    +    print(out_text)
    +    print("out_seconds:", out_seconds)
    +    print("reported:", outputs[0]['meta_info']['e2e_latency'])
    +    print("out_length:", out_length)
    +    print("tok/sec:", out_length / out_seconds)
    +
    +## On a single RTX3060:
    +
    +# 3 trials:
    +# tok/sec: 170.3
    +# tok/sec: 171.1
    +# tok/sec: 171.2
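
    This script prints both the wall-clock time measured around generate() and the engine-reported e2e_latency from meta_info; the wall-clock figure adds client-side overhead on top of the engine's own end-to-end latency, so tok/sec computed from it is the more conservative number. A short sketch comparing the two, using only keys the script already reads:

        # Sketch: throughput from both clocks; drop-in at the end of the
        # script above, after `outputs`, `out_length`, and `out_seconds`.
        reported = outputs[0]['meta_info']['e2e_latency']
        print("tok/sec (wall clock):", out_length / out_seconds)
        print("tok/sec (reported):", out_length / reported)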
  6. w32zhong created this gist Jul 12, 2025.
    40 changes: 40 additions & 0 deletions vllm-compare.py
    @@ -0,0 +1,40 @@
    +import time
    +#from nanovllm import LLM, SamplingParams
    +from vllm import LLM, SamplingParams
    +
    +path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
    +llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
    +sampling_params = SamplingParams(temperature=0.1, max_tokens=512)
    +prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
    +start = time.perf_counter()
    +outputs = llm.generate(prompts, sampling_params)
    +stop = time.perf_counter()
    +print(outputs)
    +
    +if hasattr(outputs[0], 'request_id'):
    +    token_ids = outputs[0].outputs[0].token_ids
    +    tokenizer = llm.get_tokenizer()
    +else:
    +    token_ids = outputs[0]['token_ids']
    +    tokenizer = llm.tokenizer
    +
    +out_text = tokenizer.decode(token_ids)
    +out_length = len(token_ids)
    +out_seconds = stop - start
    +
    +print(out_text)
    +print("out_seconds:", out_seconds)
    +print("out_length:", out_length)
    +print("tok/sec:", out_length / out_seconds)
    +
    +## On a single RTX3060:
    +
    +# nano-vllm (3 trials):
    +# tok/sec: 31.2
    +# tok/sec: 31.6
    +# tok/sec: 30.9
    +
    +# vllm (3 trials):
    +# tok/sec: 72.5
    +# tok/sec: 73.1
    +# tok/sec: 72.2
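
    A note on the listing above: the hasattr(outputs[0], 'request_id') branch is what lets the same script run against both backends, since vllm returns RequestOutput objects while nanovllm (per the else branch) returns plain dicts. The RTX3060 figures here and in the original sglang.py were recorded under the same settings (Qwen3-0.6B, the same prompt, 512 new tokens at temperature 0.1), so they compare directly: roughly 72 tok/sec for vllm versus 171 tok/sec for sglang; the later revisions switch sglang.py and modular-max.py to Qwen2.5-0.5B with 2048 new tokens. All three scripts also end with the same reporting tail, which could be factored into a shared helper; a minimal sketch:

        # Hypothetical shared helper (not in the gist); mirrors the print
        # sequence each of the three scripts already uses.
        def report(out_text, out_length, out_seconds):
            print(out_text)
            print("out_seconds:", out_seconds)
            print("out_length:", out_length)
            print("tok/sec:", out_length / out_seconds)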