@sutyum (forked from charlesfrye/api.py, created July 24, 2024)
LLaMA 3.1 405B Instruct FP8 - vLLM - OpenAI-compatible server
import modal
vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    [
        "vllm==0.5.3post1",  # LLM serving
        "huggingface_hub==0.24.1",  # download models from the Hugging Face Hub
        "hf-transfer==0.1.8",  # download models faster
    ]
)
MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"
MINUTES = 60
HOURS = 60 * MINUTES
app = modal.App("vllm-openai-compatible")
N_GPU = 8 # tip: for best results, first upgrade to A100s or H100s, and only then increase GPU count
TOKEN = (
"super-secret-token" # auth token. for production use, replace with a modal.Secret
)
volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)
@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="80GB"),
    container_idle_timeout=20 * MINUTES,
    timeout=1 * HOURS,
    allow_concurrent_inputs=100,
    volumes={MODEL_DIR: volume},
)
@modal.asgi_app()
def serve():
    import asyncio

    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.entrypoints.logger import RequestLogger
    from vllm.usage.usage_lib import UsageContext

    volume.reload()
    # create a FastAPI app that uses vLLM's OpenAI-compatible router
    app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )
    # security: CORS middleware for external requests
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token", description="See code for authentication details."
    )
    app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    # wrap vLLM's OpenAI-compatible router so every route requires the bearer token
    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])
    router.include_router(api_server.router)
    app.include_router(router)
    # keep the context window small and skip CUDA graph capture (enforce_eager)
    # to reduce engine startup time and memory overhead
    engine_args = AsyncEngineArgs(
        model=MODEL_DIR,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=1024 + 128,
        enforce_eager=True,
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )
    try:  # copied from vLLM -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # if the current process was started by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # when running a single vLLM engine without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())
    request_logger = RequestLogger(max_log_len=2048)

    # vLLM's OpenAI-compatible route handlers read these module-level globals
    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return app
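
To try the server end to end, deploy it with the Modal CLI and hit its OpenAI-compatible routes. A minimal sketch, assuming the code above is saved as api.py (the filename in the original gist) and that <your-workspace> is replaced with your Modal workspace name:

# deploy the ASGI app; Modal prints the public URL of the serve endpoint
modal deploy api.py

# list the served models; the Bearer token must match TOKEN in api.py
curl -H "Authorization: Bearer super-secret-token" \
  "https://<your-workspace>--vllm-openai-compatible-serve.modal.run/v1/models"

Interactive API docs are served at /docs on the same host, as configured in the FastAPI app above.
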
"""This simple script shows how to interact with an OpenAI-compatible server from a client."""
import modal
from openai import OpenAI
class Colors:
"""ANSI color codes"""
GREEN = "\033[0;32m"
BLUE = "\033[0;34m"
GRAY = "\033[0;90m"
BOLD = "\033[1m"
END = "\033[0m"
client = OpenAI(api_key="super-secret-token")
WORKSPACE = modal.config._profile
client.base_url = f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1"
print(
    Colors.GREEN,
    Colors.BOLD,
    f"🧠: Looking up available models on server at {client.base_url}. This may trigger a boot!",
    Colors.END,
    sep="",
)
model = client.models.list().data[0]
print(
    Colors.GREEN,
    Colors.BOLD,
    f"🧠: Requesting completion from model {model.id}",
    Colors.END,
    sep="",
)
messages = [
    {
        "role": "system",
        "content": "You are a poetic assistant, skilled in writing satirical doggerel with creative flair.",
    },
    {
        "role": "user",
        "content": "Compose a limerick about baboons and raccoons.",
    },
]
for message in messages:
    color = Colors.GRAY
    emoji = "👉"
    if message["role"] == "user":
        color = Colors.GREEN
        emoji = "👀"
    elif message["role"] == "assistant":
        color = Colors.BLUE
        emoji = "🤖"
    print(
        color,
        f"{emoji}: {message['content']}",
        Colors.END,
        sep="",
    )
stream = client.chat.completions.create(
    model=model.id,
    messages=messages,
    stream=True,
)
print(Colors.BLUE, "🤖:", sep="", end="")
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
print(Colors.END)
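
To exercise the deployed server, run this client locally. A minimal sketch, assuming the snippet above is saved as client.py (a hypothetical name) and that the openai and modal packages are installed:

pip install openai modal
python client.py

The script derives the server URL from your active Modal profile, so it targets the same deployment produced by modal deploy above.
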
import modal
MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"
volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "vllm==0.5.3post1",  # LLM serving
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)
MINUTES = 60
HOURS = 60 * MINUTES
app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])
# should take about 30 minutes
@app.function(volumes={MODEL_DIR: volume}, timeout=4 * HOURS)
def download_model(model_dir, model_name, model_revision):
    import os

    from huggingface_hub import snapshot_download

    os.makedirs(model_dir, exist_ok=True)

    # snapshot_download pulls the full repo, including tokenizer and config files
    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin", "*.pth"],  # download only safetensors weights
        revision=model_revision,
    )
@app.local_entrypoint()
def main():
    download_model.remote(MODEL_DIR, MODEL_NAME, MODEL_REVISION)
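
This last script populates the llama3-405b-fp8 volume that the server mounts at MODEL_DIR. A minimal sketch of invoking its local entrypoint, assuming the script is saved as download_llama.py (a hypothetical name) and that a Modal secret named huggingface with a valid Hugging Face access token already exists, since the Llama 3.1 weights are gated:

# run the local entrypoint; the download should take about 30 minutes
modal run download_llama.py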