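Standalone Python script that profiles a single vLLM generate() call for meta-llama/Meta-Llama-3.1-8B-Instruct (fp8 quantization, tensor parallel size 1, chunked prefill) with the PyTorch profiler and exports a Chrome trace.
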
    import os
    import time

    import torch
    import transformers
    from torch.profiler import ProfilerActivity, profile, record_function

    from vllm import LLM, SamplingParams

    os.environ["HOST_IP"] = "10.42.10.16"
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False"

    if __name__ == "__main__":
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    tensor_parallel_size = 1
    # Build the vLLM engine with fp8 quantization and chunked prefill enabled.
    llm = LLM(
        model=model_id,
        tensor_parallel_size=tensor_parallel_size,
        quantization="fp8",
        # kv_cache_dtype="fp8",
        enforce_eager=False,
        enable_chunked_prefill=True,
        max_num_batched_tokens=2048,
        gpu_memory_utilization=0.90,
    )

    # model = llm.llm_engine.model_executor.driver_worker.model_runner.model
    # print(model)
    batch_size = 1
    prompts = [
        # "Hello, my name is",
        "The president of the United States is",
        # "The capital of France is",
        # "The future of AI is",
    ] * batch_size

    # Greedy decoding, 5 new tokens per prompt.
    sampling_params = SamplingParams(temperature=0, top_p=1.0, top_k=-1, max_tokens=5)

    # Warm-up run so one-time initialization is excluded from the profiled trace.
    outputs = llm.generate(prompts, sampling_params)

    # Profile a second generate() call on CPU and CUDA, then export a Chrome trace.
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True
    ) as prof:
        outputs = llm.generate(prompts, sampling_params)
    timestamp = int(time.time())
    prof.export_chrome_trace(f"trace_8b_tp1_bs{batch_size}_{timestamp}.json")

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")