Forked from charlesfrye/api.py
api.py
import modal

vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    [
        "vllm==0.5.3post1",  # LLM serving
        "huggingface_hub==0.24.1",  # download models from the Hugging Face Hub
        "hf-transfer==0.1.8",  # download models faster
    ]
)

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"

MINUTES = 60
HOURS = 60 * MINUTES

app = modal.App("vllm-openai-compatible")

N_GPU = 8  # tip: for best results, first upgrade to A100s or H100s, and only then increase GPU count
TOKEN = (
    "super-secret-token"  # auth token. for production use, replace with a modal.Secret
)

volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="80GB"),
    container_idle_timeout=20 * MINUTES,
    timeout=1 * HOURS,
    allow_concurrent_inputs=100,
    volumes={MODEL_DIR: volume},
)
@modal.asgi_app()
def serve():
    import asyncio

    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.entrypoints.logger import RequestLogger
    from vllm.usage.usage_lib import UsageContext

    volume.reload()

    # create a FastAPI app that uses vLLM's OpenAI-compatible router
    app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: bearer scheme so clients and the docs UI know to send a token
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token", description="See code for authentication details."
    )
    # security: CORS middleware for external requests
    app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    router.include_router(api_server.router)
    app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODEL_DIR,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=1024 + 128,
        enforce_eager=True,
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    try:  # copied from vLLM -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # if the current process was launched by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # when using a single vLLM instance without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return app
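
Deploying this file with modal deploy api.py should expose the server at a URL following the same pattern client.py constructs below. As a quick smoke test, here is a minimal sketch that lists the served models directly over HTTP; it assumes the requests package is installed, and the workspace name and token below are placeholders to substitute with your own:

import requests

WORKSPACE = "your-workspace"  # placeholder: your Modal workspace name
BASE_URL = f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1"

# list the served models, sending the bearer token the server checks for
response = requests.get(
    f"{BASE_URL}/models",
    headers={"Authorization": "Bearer super-secret-token"},
)
response.raise_for_status()
print(response.json())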
client.py
    """This simple script shows how to interact with an OpenAI-compatible server from a client."""
    import modal
    from openai import OpenAI


    class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


    client = OpenAI(api_key="super-secret-token")

    WORKSPACE = modal.config._profile

    client.base_url = f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1"

    print(
    Colors.GREEN,
    Colors.BOLD,
    f"🧠: Looking up available models on server at {client.base_url}. This may trigger a boot!",
    Colors.END,
    sep="",
    )
    model = client.models.list().data[0]

    print(
    Colors.GREEN,
    Colors.BOLD,
    f"🧠: Requesting completion from model {model.id}",
    Colors.END,
    sep="",
    )

    messages = [
    {
    "role": "system",
    "content": "You are a poetic assistant, skilled in writing satirical doggerel with creative flair.",
    },
    {
    "role": "user",
    "content": "Compose a limerick about baboons and racoons.",
    },
    ]

    for message in messages:
    color = Colors.GRAY
    emoji = "👉"
    if message["role"] == "user":
    color = Colors.GREEN
    emoji = "👤"
    elif message["role"] == "assistant":
    color = Colors.BLUE
    emoji = "🤖"
    print(
    color,
    f"{emoji}: {message['content']}",
    Colors.END,
    sep="",
    )

    stream = client.chat.completions.create(
    model=model.id,
    messages=messages,
    stream=True,
    )
    print(Colors.BLUE, "🤖:", sep="", end="")
    for chunk in stream:
    if chunk.choices[0].delta.content is not None:
    print(chunk.choices[0].delta.content, end="")
    print(Colors.END)
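
Instead of mutating client.base_url after construction, the OpenAI client also accepts the base URL as a constructor argument. A small sketch, assuming the same placeholder token and URL pattern as above:

import modal
from openai import OpenAI

WORKSPACE = modal.config._profile  # the active Modal profile, as in the script above

client = OpenAI(
    api_key="super-secret-token",  # must match TOKEN in api.py
    base_url=f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1",
)
print(client.models.list().data[0].id)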
download.py
import modal

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"

volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)

image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "vllm==0.5.3post1",  # LLM serving
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


MINUTES = 60
HOURS = 60 * MINUTES


app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


# should take about 30 minutes
@app.function(volumes={MODEL_DIR: volume}, timeout=4 * HOURS)
def download_model(model_dir, model_name, model_revision):
    import os

    from huggingface_hub import snapshot_download

    os.makedirs(model_dir, exist_ok=True)

    # ALSO NEED TOKENIZER?
    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin", "*.pth"],  # Ensure safetensors
        revision=model_revision,
    )


@app.local_entrypoint()
def main():
    download_model.remote(MODEL_DIR, MODEL_NAME, MODEL_REVISION)
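
Running modal run download.py populates the llama3-405b-fp8 volume; api.py expects the weights to be there before it is deployed. Below is a small sketch for checking what the download left in the volume afterwards; the app name and the list_weights function are illustrative additions, not part of the gist:

import modal

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_DIR = f"/models/{MODEL_NAME}"

volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=False)

app = modal.App("check-llama3-405b-download")


@app.function(volumes={MODEL_DIR: volume})
def list_weights():
    import os

    # print whatever the download step left at the volume mount point
    for name in sorted(os.listdir(MODEL_DIR)):
        print(name)


@app.local_entrypoint()
def main():
    list_weights.remote()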