Skip to content

Instantly share code, notes, and snippets.

@zainhas
Created February 8, 2025 17:36
Show Gist options
  • Save zainhas/2c21ee14873fa4e12a0128da8bc1da98 to your computer and use it in GitHub Desktop.

Revisions

  1. zainhas created this gist Feb 8, 2025.
    38 changes: 38 additions & 0 deletions API_prefill_decode_speed.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,38 @@
    import time
    from together import Together

    client = Together(api_key = "---")

    prompt = "How many r's in the word strawberry?"

    prefill_tokens_len = len(tokenizer.encode(prompt))

    decode_text = ""
    decode_started = False

    start_time = time.time()

    completion = completion = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1",
    messages=[{"role": "user", "content": prompt}],
    stream=True,
    )

    for chunk in completion:
    if chunk.choices:
    decode_text += chunk.choices[0].delta.content
    if not decode_started:
    decode_started_time = time.time()
    prefill_time = decode_started_time - start_time
    decode_started = True

    end_time = time.time()

    decode_time = end_time - decode_started_time

    decode_tokens = tokenizer.encode(decode_text)
    decode_tokens_len = len(decode_tokens)

    # tokens/per sec
    prefill_throughput = prefill_tokens_len / prefill_time
    decode_throughput = decode_tokens_len / decode_time