@w32zhong · Last active July 14, 2025 17:35
vllm compared to nano-vllm
## Setup (Modular MAX)
# conda create -n modular python=3.11
# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
# conda install -c conda-forge gcc=12.1.0
model_path = 'Qwen/Qwen2.5-0.5B'
import time
from max.entrypoints.llm import LLM
from max.pipelines import PipelineConfig
def main() -> None:
    pipeline_config = PipelineConfig(model_path=model_path)
    llm = LLM(pipeline_config)
    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    start = time.perf_counter()
    outputs = llm.generate(prompts, max_new_tokens=2048)
    stop = time.perf_counter()
    out_text = outputs[0]
    out_length = 2048  # assumes the full max_new_tokens budget was generated
    out_seconds = stop - start
    print(out_text)
    print("out_seconds:", out_seconds)
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)

if __name__ == "__main__":
    main()
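# Not part of the original gist: a hedged sketch for measuring the actual
# completion length instead of hardcoding out_length = 2048. It assumes the
# Hugging Face `transformers` package is installed and that `model_path`
# resolves to a repo that ships a tokenizer.
def count_completion_tokens(text: str, tokenizer_path: str = model_path) -> int:
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    # Count only the generated text, without special tokens.
    return len(tokenizer.encode(text, add_special_tokens=False))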
## Setup (SGLang)
# uv pip install "sglang[all]==0.4.9.post2"
# conda install gxx_linux-64
import time
import sglang as sgl
import nest_asyncio
nest_asyncio.apply()
path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/'
if __name__ == '__main__':
    llm = sgl.Engine(model_path=path)
    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    sampling_params = {"temperature": 0.1, "max_new_tokens": 2048}
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params)
    stop = time.perf_counter()
    out_text = outputs[0]['text']
    out_length = outputs[0]['meta_info']['completion_tokens']
    out_seconds = stop - start
    print(out_text)
    print("out_seconds:", out_seconds)
    print("reported:", outputs[0]['meta_info']['e2e_latency'])
    print("out_length:", out_length)
    print("tok/sec:", out_length / out_seconds)
## vLLM / nano-vllm
import time
#from nanovllm import LLM, SamplingParams
from vllm import LLM, SamplingParams
path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
sampling_params = SamplingParams(temperature=0.1, max_tokens=512)
prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params)
stop = time.perf_counter()
print(outputs)
if hasattr(outputs[0], 'request_id'):
    # vLLM path: generate() returns RequestOutput objects
    token_ids = outputs[0].outputs[0].token_ids
    tokenizer = llm.get_tokenizer()
else:
    # nano-vllm path: generate() returns plain dicts
    token_ids = outputs[0]['token_ids']
    tokenizer = llm.tokenizer
out_text = tokenizer.decode(token_ids)
out_length = len(token_ids)
out_seconds = stop - start
print(out_text)
print("out_seconds:", out_seconds)
print("out_length:", out_length)
print("tok/sec:", out_length / out_seconds)
## On a single RTX 3060:
# nano-vllm (3 trials):
# tok/sec: 31.2
# tok/sec: 31.6
# tok/sec: 30.9
# vllm (3 trials):
# tok/sec: 72.5
# tok/sec: 73.1
# tok/sec: 72.2
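# Not part of the original gist: averaging the trials above and the resulting
# speedup, plain arithmetic on the reported numbers.
nano_vllm_avg = (31.2 + 31.6 + 30.9) / 3   # ~31.2 tok/sec
vllm_avg = (72.5 + 73.1 + 72.2) / 3        # ~72.6 tok/sec
print("vllm speedup over nano-vllm: %.2fx" % (vllm_avg / nano_vllm_avg))  # ~2.3x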