Forked from charlesfrye/api.py
api.py
import modal

vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    [
        "vllm==0.5.3post1",  # LLM serving
        "huggingface_hub==0.24.1",  # download models from the Hugging Face Hub
        "hf-transfer==0.1.8",  # download models faster
    ]
)

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"

MINUTES = 60
HOURS = 60 * MINUTES

app = modal.App("vllm-openai-compatible")

N_GPU = 8  # tip: for best results, first upgrade to A100s or H100s, and only then increase GPU count
TOKEN = (
    "super-secret-token"  # auth token. for production use, replace with a modal.Secret
)

volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="80GB"),
    container_idle_timeout=20 * MINUTES,
    timeout=1 * HOURS,
    allow_concurrent_inputs=100,
    volumes={MODEL_DIR: volume},
)
@modal.asgi_app()
def serve():
    import asyncio

    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.entrypoints.logger import RequestLogger
    from vllm.usage.usage_lib import UsageContext

    volume.reload()

    # create a FastAPI app that uses vLLM's OpenAI-compatible router
    app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: bearer scheme so clients and the docs UI know to send a token
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token", description="See code for authentication details."
    )
    # security: CORS middleware for external requests
    app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    router.include_router(api_server.router)
    app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODEL_DIR,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=1024 + 128,
        enforce_eager=True,
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    try:  # copied from vLLM -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # if the current process was launched by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # when using a single vLLM instance without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return app
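
Deploying this file with modal deploy api.py should expose the server at a URL following the same pattern client.py constructs below. As a quick smoke test, here is a minimal sketch that lists the served models directly over HTTP; it assumes the requests package is installed, and the workspace name and token below are placeholders to substitute with your own:

import requests

WORKSPACE = "your-workspace"  # placeholder: your Modal workspace name
BASE_URL = f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1"

# list the served models, sending the bearer token the server checks for
response = requests.get(
    f"{BASE_URL}/models",
    headers={"Authorization": "Bearer super-secret-token"},
)
response.raise_for_status()
print(response.json())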
client.py
    """This simple script shows how to interact with an OpenAI-compatible server from a client."""
    import modal
    from openai import OpenAI


    class Colors:
    """ANSI color codes"""

    GREEN = "\033[0;32m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


    client = OpenAI(api_key="super-secret-token")

    WORKSPACE = modal.config._profile

    client.base_url = f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1"

    print(
    Colors.GREEN,
    Colors.BOLD,
    f"🧠: Looking up available models on server at {client.base_url}. This may trigger a boot!",
    Colors.END,
    sep="",
    )
    model = client.models.list().data[0]

    print(
    Colors.GREEN,
    Colors.BOLD,
    f"🧠: Requesting completion from model {model.id}",
    Colors.END,
    sep="",
    )

    messages = [
    {
    "role": "system",
    "content": "You are a poetic assistant, skilled in writing satirical doggerel with creative flair.",
    },
    {
    "role": "user",
    "content": "Compose a limerick about baboons and racoons.",
    },
    ]

    for message in messages:
    color = Colors.GRAY
    emoji = "👉"
    if message["role"] == "user":
    color = Colors.GREEN
    emoji = "👤"
    elif message["role"] == "assistant":
    color = Colors.BLUE
    emoji = "🤖"
    print(
    color,
    f"{emoji}: {message['content']}",
    Colors.END,
    sep="",
    )

    stream = client.chat.completions.create(
    model=model.id,
    messages=messages,
    stream=True,
    )
    print(Colors.BLUE, "🤖:", sep="", end="")
    for chunk in stream:
    if chunk.choices[0].delta.content is not None:
    print(chunk.choices[0].delta.content, end="")
    print(Colors.END)
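
Instead of mutating client.base_url after construction, the OpenAI client also accepts the base URL as a constructor argument. A small sketch, assuming the same placeholder token and URL pattern as above:

import modal
from openai import OpenAI

WORKSPACE = modal.config._profile  # the active Modal profile, as in the script above

client = OpenAI(
    api_key="super-secret-token",  # must match TOKEN in api.py
    base_url=f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1",
)
print(client.models.list().data[0].id)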
download.py
import modal

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"

volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)

image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "vllm==0.5.3post1",  # LLM serving
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)


MINUTES = 60
HOURS = 60 * MINUTES


app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


# should take about 30 minutes
@app.function(volumes={MODEL_DIR: volume}, timeout=4 * HOURS)
def download_model(model_dir, model_name, model_revision):
    import os

    from huggingface_hub import snapshot_download

    os.makedirs(model_dir, exist_ok=True)

    # ALSO NEED TOKENIZER?
    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin", "*.pth"],  # Ensure safetensors
        revision=model_revision,
    )


@app.local_entrypoint()
def main():
    download_model.remote(MODEL_DIR, MODEL_NAME, MODEL_REVISION)
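
Running modal run download.py populates the llama3-405b-fp8 volume; api.py expects the weights to be there before it is deployed. Below is a small sketch for checking what the download left in the volume afterwards; the app name and the list_weights function are illustrative additions, not part of the gist:

import modal

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_DIR = f"/models/{MODEL_NAME}"

volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=False)

app = modal.App("check-llama3-405b-download")


@app.function(volumes={MODEL_DIR: volume})
def list_weights():
    import os

    # print whatever the download step left at the volume mount point
    for name in sorted(os.listdir(MODEL_DIR)):
        print(name)


@app.local_entrypoint()
def main():
    list_weights.remote()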