@w32zhong
Last active July 14, 2025 17:35

Revisions

  1. w32zhong revised this gist Jul 14, 2025. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion modular-max.py
    @@ -21,7 +21,7 @@ def main() -> None:
         stop = time.perf_counter()

         out_text = outputs[0]
    -    out_length = 512
    +    out_length = 2048
         out_seconds = stop - start

         print(out_text)
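
    This revision bumps the hardcoded out_length from 512 to 2048 to match the new max_new_tokens, but generation can still stop early at an EOS token, in which case both values overcount. A minimal sketch of counting the tokens actually produced, assuming the `transformers` package is available alongside MAX and that generate() returns plain strings (as the script's own use of outputs[0] suggests):

        # Hypothetical replacement for the hardcoded out_length (not in the
        # gist); drop-in inside main(), after `outputs` and `out_seconds`.
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B')
        out_length = len(tokenizer.encode(outputs[0]))  # tokens actually generated
        print("tok/sec:", out_length / out_seconds)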
  2. w32zhong revised this gist Jul 14, 2025. 1 changed file with 34 additions and 0 deletions.
    34 changes: 34 additions & 0 deletions modular-max.py
    @@ -0,0 +1,34 @@
    +## Setup
    +# conda create -n modular python=3.11
    +# uv pip install modular --extra-index-url https://download.pytorch.org/whl/cpu --index-url https://dl.modular.com/public/nightly/python/simple/ --index-strategy unsafe-best-match --prerelease allow
    +# conda install -c conda-forge gcc=12.1.0
    +
    +model_path = 'Qwen/Qwen2.5-0.5B'
    +
    +import time
    +from max.entrypoints.llm import LLM
    +from max.pipelines import PipelineConfig
    +
    +
    +def main() -> None:
    +    pipeline_config = PipelineConfig(model_path=model_path)
    +    llm = LLM(pipeline_config)
    +
    +    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    +
    +    start = time.perf_counter()
    +    outputs = llm.generate(prompts, max_new_tokens=2048)
    +    stop = time.perf_counter()
    +
    +    out_text = outputs[0]
    +    out_length = 512
    +    out_seconds = stop - start
    +
    +    print(out_text)
    +    print("out_seconds:", out_seconds)
    +    print("out_length:", out_length)
    +    print("tok/sec:", out_length / out_seconds)
    +
    +
    +if __name__ == "__main__":
    +    main()
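
    The sibling scripts below record three trials each as comments, while this one times a single generate() call. A small hypothetical harness (not part of the gist) that averages several runs, assuming main()'s timing block were factored into a run_once() helper returning (n_tokens, seconds):

        # Hypothetical multi-trial wrapper; run_once() is an assumed refactor
        # of the timing block inside main(), not an existing function.
        import statistics

        def bench(run_once, trials=3):
            rates = [n / s for n, s in (run_once() for _ in range(trials))]
            print("tok/sec (mean of %d trials):" % trials, statistics.mean(rates))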
  3. w32zhong renamed this gist Jul 14, 2025. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  4. w32zhong revised this gist Jul 14, 2025. 1 changed file with 7 additions and 10 deletions.
    17 changes: 7 additions & 10 deletions sglang.py
    @@ -1,15 +1,19 @@
    +## setup
    +# uv pip install "sglang[all]==0.4.9.post2"
    +# conda install gxx_linux-64
     import time
     import sglang as sgl

     import nest_asyncio
     nest_asyncio.apply()

    -path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
    +path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/'

     if __name__ == '__main__':
         llm = sgl.Engine(model_path=path)
    -    prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
    -    sampling_params = {"temperature": 0.1, "max_new_tokens": 512}
    +    prompts = ["Quebec pausing some applications for sponsorship of immigrants "]
    +
    +    sampling_params = {"temperature": 0.1, "max_new_tokens": 2048}

         start = time.perf_counter()
         outputs = llm.generate(prompts, sampling_params)
    @@ -24,10 +28,3 @@
         print("reported:", outputs[0]['meta_info']['e2e_latency'])
         print("out_length:", out_length)
         print("tok/sec:", out_length / out_seconds)
    -
    -## On a single RTX3060:
    -
    -# 3 trials:
    -# tok/sec: 170.3
    -# tok/sec: 171.1
    -# tok/sec: 171.2
  5. w32zhong revised this gist Jul 12, 2025. 1 changed file with 33 additions and 0 deletions.
    33 changes: 33 additions & 0 deletions sglang.py
    @@ -0,0 +1,33 @@
    +import time
    +import sglang as sgl
    +
    +import nest_asyncio
    +nest_asyncio.apply()
    +
    +path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
    +
    +if __name__ == '__main__':
    +    llm = sgl.Engine(model_path=path)
    +    prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
    +    sampling_params = {"temperature": 0.1, "max_new_tokens": 512}
    +
    +    start = time.perf_counter()
    +    outputs = llm.generate(prompts, sampling_params)
    +    stop = time.perf_counter()
    +
    +    out_text = outputs[0]['text']
    +    out_length = outputs[0]['meta_info']['completion_tokens']
    +    out_seconds = stop - start
    +
    +    print(out_text)
    +    print("out_seconds:", out_seconds)
    +    print("reported:", outputs[0]['meta_info']['e2e_latency'])
    +    print("out_length:", out_length)
    +    print("tok/sec:", out_length / out_seconds)
    +
    +## On a single RTX3060:
    +
    +# 3 trials:
    +# tok/sec: 170.3
    +# tok/sec: 171.1
    +# tok/sec: 171.2
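
    This script prints both the wall-clock time measured around generate() and the engine-reported e2e_latency from meta_info; the wall-clock figure adds client-side overhead on top of the engine's own end-to-end latency, so tok/sec computed from it is the more conservative number. A short sketch comparing the two, using only keys the script already reads:

        # Sketch: throughput from both clocks; drop-in at the end of the
        # script above, after `outputs`, `out_length`, and `out_seconds`.
        reported = outputs[0]['meta_info']['e2e_latency']
        print("tok/sec (wall clock):", out_length / out_seconds)
        print("tok/sec (reported):", out_length / reported)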
  6. w32zhong created this gist Jul 12, 2025.
    40 changes: 40 additions & 0 deletions vllm-compare.py
    @@ -0,0 +1,40 @@
    +import time
    +#from nanovllm import LLM, SamplingParams
    +from vllm import LLM, SamplingParams
    +
    +path = '/home/tk/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/e6de91484c29aa9480d55605af694f39b081c455'
    +llm = LLM(path, enforce_eager=True, tensor_parallel_size=1)
    +sampling_params = SamplingParams(temperature=0.1, max_tokens=512)
    +prompts = ["The problem was caused by not setting the CUDA_VISIBLE_DEVICES variable within the shell correctly. To specify CUDA device 1 for example, "]
    +start = time.perf_counter()
    +outputs = llm.generate(prompts, sampling_params)
    +stop = time.perf_counter()
    +print(outputs)
    +
    +if hasattr(outputs[0], 'request_id'):
    +    token_ids = outputs[0].outputs[0].token_ids
    +    tokenizer = llm.get_tokenizer()
    +else:
    +    token_ids = outputs[0]['token_ids']
    +    tokenizer = llm.tokenizer
    +
    +out_text = tokenizer.decode(token_ids)
    +out_length = len(token_ids)
    +out_seconds = stop - start
    +
    +print(out_text)
    +print("out_seconds:", out_seconds)
    +print("out_length:", out_length)
    +print("tok/sec:", out_length / out_seconds)
    +
    +## On a single RTX3060:
    +
    +# nano-vllm (3 trials):
    +# tok/sec: 31.2
    +# tok/sec: 31.6
    +# tok/sec: 30.9
    +
    +# vllm (3 trials):
    +# tok/sec: 72.5
    +# tok/sec: 73.1
    +# tok/sec: 72.2
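
    A note on the listing above: the hasattr(outputs[0], 'request_id') branch is what lets the same script run against both backends, since vllm returns RequestOutput objects while nanovllm (per the else branch) returns plain dicts. The RTX3060 figures here and in the original sglang.py were recorded under the same settings (Qwen3-0.6B, the same prompt, 512 new tokens at temperature 0.1), so they compare directly: roughly 72 tok/sec for vllm versus 171 tok/sec for sglang; the later revisions switch sglang.py and modular-max.py to Qwen2.5-0.5B with 2048 new tokens. All three scripts also end with the same reporting tail, which could be factored into a shared helper; a minimal sketch:

        # Hypothetical shared helper (not in the gist); mirrors the print
        # sequence each of the three scripts already uses.
        def report(out_text, out_length, out_seconds):
            print(out_text)
            print("out_seconds:", out_seconds)
            print("out_length:", out_length)
            print("tok/sec:", out_length / out_seconds)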