LLaMA 3.1 405B Instruct FP8 - vLLM - OpenAI-compatible server

This gist contains three files: a Modal app that serves the model behind vLLM's OpenAI-compatible API, a client script that talks to the server with the OpenAI SDK, and a script that downloads the model weights into a Modal Volume.
  
        
  
    
    
  
  
    
import modal

vllm_image = modal.Image.debian_slim(python_version="3.10").pip_install(
    [
        "vllm==0.5.3post1",  # LLM serving
        "huggingface_hub==0.24.1",  # download models from the Hugging Face Hub
        "hf-transfer==0.1.8",  # download models faster
    ]
)

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"

MINUTES = 60
HOURS = 60 * MINUTES

app = modal.App("vllm-openai-compatible")

N_GPU = 8  # tip: for best results, first upgrade to A100s or H100s, and only then increase GPU count
TOKEN = (
    "super-secret-token"  # auth token. for production use, replace with a modal.Secret
)
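
# a hedged sketch, not part of the original gist: in production, the token could
# come from a Modal Secret instead of being hardcoded, e.g.
#
#   app = modal.App("vllm-openai-compatible", secrets=[modal.Secret.from_name("vllm-auth")])
#   TOKEN = os.environ["VLLM_API_TOKEN"]  # only available inside the running container
#
# where "vllm-auth" and "VLLM_API_TOKEN" are hypothetical names used for illustration.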
volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)


@app.function(
    image=vllm_image,
    gpu=modal.gpu.A100(count=N_GPU, size="80GB"),
    container_idle_timeout=20 * MINUTES,
    timeout=1 * HOURS,
    allow_concurrent_inputs=100,
    volumes={MODEL_DIR: volume},
)
@modal.asgi_app()
def serve():
    import asyncio

    import fastapi
    import vllm.entrypoints.openai.api_server as api_server
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.entrypoints.logger import RequestLogger
    from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
    from vllm.entrypoints.openai.serving_completion import (
        OpenAIServingCompletion,
    )
    from vllm.usage.usage_lib import UsageContext

    volume.reload()  # ensure we see the latest weights on the Volume

    # create a fastAPI app that uses vLLM's OpenAI-compatible router
    app = fastapi.FastAPI(
        title=f"OpenAI-compatible {MODEL_NAME} server",
        description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
        version="0.0.1",
        docs_url="/docs",
    )

    # security: Bearer-token scheme for authentication
    http_bearer = fastapi.security.HTTPBearer(
        scheme_name="Bearer Token",
        description="See code for authentication details.",
    )
    # security: CORS middleware for external requests
    app.add_middleware(
        fastapi.middleware.cors.CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # security: inject dependency on authed routes
    async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
        if api_key.credentials != TOKEN:
            raise fastapi.HTTPException(
                status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
                detail="Invalid authentication credentials",
            )
        return {"username": "authenticated_user"}

    router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])

    # wrap vLLM's OpenAI-compatible router in the auth router and mount it
    router.include_router(api_server.router)
    app.include_router(router)

    engine_args = AsyncEngineArgs(
        model=MODEL_DIR,
        tensor_parallel_size=N_GPU,
        gpu_memory_utilization=0.90,
        max_model_len=1024 + 128,  # maximum sequence length (prompt + generation)
        enforce_eager=True,  # skip CUDA graph capture for faster boot, at some cost to throughput
    )

    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER
    )

    try:  # copied from vLLM -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
        event_loop = asyncio.get_running_loop()
    except RuntimeError:
        event_loop = None

    if event_loop is not None and event_loop.is_running():
        # if the current process was started by Ray Serve,
        # there is already a running event loop
        model_config = event_loop.run_until_complete(engine.get_model_config())
    else:
        # when using a single vLLM instance without engine_use_ray
        model_config = asyncio.run(engine.get_model_config())

    request_logger = RequestLogger(max_log_len=2048)

    api_server.openai_serving_chat = OpenAIServingChat(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        chat_template=None,
        response_role="assistant",
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )
    api_server.openai_serving_completion = OpenAIServingCompletion(
        engine,
        model_config=model_config,
        served_model_names=[MODEL_DIR],
        lora_modules=[],
        prompt_adapters=[],
        request_logger=request_logger,
    )

    return app
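
Deploying this file with modal deploy exposes the server at a workspace-scoped URL of the form https://<workspace>--vllm-openai-compatible-serve.modal.run (the client script below reconstructs it from the active Modal profile). As a minimal smoke test, assuming that URL pattern and the requests library are available locally (the placeholder workspace name is illustrative, not part of the gist), you could list the served models:

import requests

WORKSPACE = "your-workspace"  # hypothetical placeholder: your Modal workspace name

# the OpenAI-compatible router serves GET /v1/models behind the Bearer-token check
resp = requests.get(
    f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1/models",
    headers={"Authorization": "Bearer super-secret-token"},  # must match TOKEN above
)
resp.raise_for_status()
print(resp.json())  # the model is listed under its on-disk path, /models/meta-llama/...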
  
    
A client script then exercises the deployed server through the OpenAI SDK:
    
  
  
    
  | """This simple script shows how to interact with an OpenAI-compatible server from a client.""" | |
| import modal | |
| from openai import OpenAI | |
| class Colors: | |
| """ANSI color codes""" | |
| GREEN = "\033[0;32m" | |
| BLUE = "\033[0;34m" | |
| GRAY = "\033[0;90m" | |
| BOLD = "\033[1m" | |
| END = "\033[0m" | |
| client = OpenAI(api_key="super-secret-token") | |
| WORKSPACE = modal.config._profile | |
| client.base_url = f"https://{WORKSPACE}--vllm-openai-compatible-serve.modal.run/v1" | |
| print( | |
| Colors.GREEN, | |
| Colors.BOLD, | |
| f"π§ : Looking up available models on server at {client.base_url}. This may trigger a boot!", | |
| Colors.END, | |
| sep="", | |
| ) | |
| model = client.models.list().data[0] | |
| print( | |
| Colors.GREEN, | |
| Colors.BOLD, | |
| f"π§ : Requesting completion from model {model.id}", | |
| Colors.END, | |
| sep="", | |
| ) | |
| messages = [ | |
| { | |
| "role": "system", | |
| "content": "You are a poetic assistant, skilled in writing satirical doggerel with creative flair.", | |
| }, | |
| { | |
| "role": "user", | |
| "content": "Compose a limerick about baboons and racoons.", | |
| }, | |
| ] | |
| for message in messages: | |
| color = Colors.GRAY | |
| emoji = "π" | |
| if message["role"] == "user": | |
| color = Colors.GREEN | |
| emoji = "π€" | |
| elif message["role"] == "assistant": | |
| color = Colors.BLUE | |
| emoji = "π€" | |
| print( | |
| color, | |
| f"{emoji}: {message['content']}", | |
| Colors.END, | |
| sep="", | |
| ) | |
| stream = client.chat.completions.create( | |
| model=model.id, | |
| messages=messages, | |
| stream=True, | |
| ) | |
| print(Colors.BLUE, "π€:", sep="", end="") | |
| for chunk in stream: | |
| if chunk.choices[0].delta.content is not None: | |
| print(chunk.choices[0].delta.content, end="") | |
| print(Colors.END) | 
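
With the server deployed, running this client locally (e.g. python client.py, under whatever filename you saved it) prints the conversation and streams the model's limerick back token by token. The api_key passed to the OpenAI constructor is sent as the Authorization: Bearer header, so it must match TOKEN in the server file.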
  
    
Finally, a separate script downloads the model weights into the shared Volume ahead of time:
    
  
  
    
import modal

MODEL_NAME = "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8"
MODEL_REVISION = "d8e5bf570eac69f7dfc596cfaaebe6acbf95ca2e"
MODEL_DIR = f"/models/{MODEL_NAME}"

volume = modal.Volume.from_name("llama3-405b-fp8", create_if_missing=True)

image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        [
            "vllm==0.5.3post1",  # LLM serving
            "huggingface_hub",  # download models from the Hugging Face Hub
            "hf-transfer",  # download models faster
        ]
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

MINUTES = 60
HOURS = 60 * MINUTES

app = modal.App(image=image, secrets=[modal.Secret.from_name("huggingface")])


# should take about 30 minutes
@app.function(volumes={MODEL_DIR: volume}, timeout=4 * HOURS)
def download_model(model_dir, model_name, model_revision):
    import os

    from huggingface_hub import snapshot_download

    os.makedirs(model_dir, exist_ok=True)

    snapshot_download(
        model_name,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin", "*.pth"],  # ensure safetensors; the tokenizer and config files still come along
        revision=model_revision,
    )


@app.local_entrypoint()
def main():
    download_model.remote(MODEL_DIR, MODEL_NAME, MODEL_REVISION)
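
This script is meant to run once, before the server is deployed, e.g. with modal run, which invokes the @app.local_entrypoint. It assumes a Modal Secret named "huggingface" carrying a Hugging Face access token, since the Llama 3.1 weights are gated behind a license agreement on the Hub.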
  