Revisions

  1. lukestanley created this gist Jan 17, 2025.

llama_cpp_server_model_swapping_proxy_middleware.py (233 additions)
# Minimalist OpenAI-API-compatible llama.cpp server proxy/middleware with dynamic model switching.
# It manages model loading and auto-shutdown: seamless model hot-swapping and idle shutdown,
# while exposing llama.cpp's advanced features such as speculative decoding.
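# Usage (a sketch; assumes aiohttp is installed and the llama-server binary and GGUF
# paths configured below exist on your machine):
#   python llama_cpp_server_model_swapping_proxy_middleware.py
# Then point any OpenAI-compatible client at http://127.0.0.1:8000 (PROXY_PORT);
# requests are forwarded to llama.cpp, starting or hot-swapping the model as needed.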
import asyncio
import json
from datetime import datetime, timedelta
import subprocess
import aiohttp
from aiohttp import web

# Constants
PROXY_PORT = 8000  # Intended as an external port
SERVER_PORT = 8312  # Intended as an internal port
IDLE_TIMEOUT_SECONDS = 60 * 100  # 100 minutes
SERVER_READY_TIMEOUT_SECONDS = 90
DEFAULT_MODEL = "/fast/Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf"
DEFAULT_DRAFT_MODEL = "/fast/Llama-3.2-1B-Instruct-IQ4_XS.gguf"
CHUNK_SIZE = 4096
MAX_GPU_LAYERS = 99
DEFAULT_CTX_SIZE = 4096
DEFAULT_THREADS = 8

# -fa enables flash attention; -ngl and --gpu-layers-draft control GPU offload
# for the main and draft (speculative decoding) models.
LLAMA_SERVER_CMD = """
/fast/llama_gpu/bin/llama-server
--model {model}
--model-draft {draft_model}
--ctx-size {ctx_size}
--threads {threads}
--port {port}
-fa
-ngl {gpu_layers}
--gpu-layers-draft {gpu_layers}
"""

KEY_MODEL = "model"
KEY_DRAFT_MODEL = "draft_model"
KEY_PROCESS = "process"
KEY_LAST_REQUEST = "last_request"

# Shared State
state = {
    KEY_PROCESS: None,
    KEY_LAST_REQUEST: None,
    KEY_MODEL: DEFAULT_MODEL,
    KEY_DRAFT_MODEL: DEFAULT_DRAFT_MODEL,
}


async def is_server_ready():
    """Check if the Llama server is ready."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"http://127.0.0.1:{SERVER_PORT}/health") as resp:
                return resp.status == 200
    except aiohttp.ClientConnectorError:
        return False


async def wait_for_server_ready():
    """Wait for the Llama server to be ready."""
    start_time = datetime.now()
    while not await is_server_ready():
        if datetime.now() - start_time > timedelta(
            seconds=SERVER_READY_TIMEOUT_SECONDS
        ):
            raise Exception("Server failed to become ready")
        await asyncio.sleep(1)


async def start_server(
    model=None, draft_model=None, ctx_size=DEFAULT_CTX_SIZE, threads=DEFAULT_THREADS
):
    """Start the Llama server."""
    model = model or state[KEY_MODEL]
    draft_model = draft_model or state[KEY_DRAFT_MODEL]

    if state[KEY_PROCESS] and (
        state[KEY_MODEL] != model or state[KEY_DRAFT_MODEL] != draft_model
    ):
        print("Stopping server to switch models...")
        await stop_server()

    if state[KEY_PROCESS] is None:
        cmd = LLAMA_SERVER_CMD.format(
            model=model,
            draft_model=draft_model,
            ctx_size=ctx_size,
            threads=threads,
            port=SERVER_PORT,
            gpu_layers=MAX_GPU_LAYERS,
        ).split()

        state[KEY_PROCESS] = await asyncio.create_subprocess_exec(
            *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        asyncio.create_task(stream_subprocess_output(state[KEY_PROCESS]))

        state[KEY_MODEL] = model
        state[KEY_DRAFT_MODEL] = draft_model
        state[KEY_LAST_REQUEST] = datetime.now()

        print(
            f"Llama server started on port {SERVER_PORT} with models: {model}, {draft_model}"
        )
        await wait_for_server_ready()


async def stop_server():
    """Stop the Llama server."""
    if state[KEY_PROCESS]:
        print("Terminating Llama server process...")
        state[KEY_PROCESS].terminate()
        await state[KEY_PROCESS].wait()
        print("Llama server process terminated.")
        state[KEY_PROCESS] = None


async def stream_subprocess_output(process):
    """Stream subprocess output."""

    async def stream_pipe(pipe):
        while line := await pipe.readline():
            print(line.decode(), end="")

    await asyncio.gather(stream_pipe(process.stdout), stream_pipe(process.stderr))


async def monitor_idle_timeout():
    """Monitor server idle timeout."""
    while True:
        await asyncio.sleep(5)
        if state[KEY_LAST_REQUEST] and datetime.now() - state[
            KEY_LAST_REQUEST
        ] > timedelta(seconds=IDLE_TIMEOUT_SECONDS):
            print("Stopping server due to inactivity...")
            await stop_server()
            state[KEY_LAST_REQUEST] = None


def adapt_request_for_llama(headers, body):
    """Adapt request for Llama.cpp server."""
    try:
        request_json = json.loads(body)
        model = request_json.pop("model", None)
        draft_model = request_json.pop("draft_model", None)
        body = json.dumps(request_json).encode()
        # Drop any stale Content-Length from the client before setting it for the re-encoded body.
        headers.pop("Content-Length", None)
        headers.pop("content-length", None)
        headers["content-type"] = "application/json"
        headers["content-length"] = str(len(body))
        return headers, body, model, draft_model
    except json.JSONDecodeError:
        return headers, body, None, None


async def proxy_request(request):
    """Handle incoming requests using aiohttp.web."""
    try:
        # Read request body
        body = await request.read()
        headers = dict(request.headers)

        # Adapt request for Llama
        headers, body, model, draft_model = adapt_request_for_llama(headers, body)

        # Start the Llama server (if needed)
        await start_server(model, draft_model)
        state[KEY_LAST_REQUEST] = datetime.now()

        # Proxy to Llama.cpp server (path_qs keeps any query string intact)
        async with aiohttp.ClientSession() as session:
            async with session.request(
                method=request.method,
                url=f"http://127.0.0.1:{SERVER_PORT}{request.path_qs}",
                headers=headers,
                data=body,
            ) as llama_response:
                # Stream response from Llama server
                response = web.StreamResponse(
                    status=llama_response.status,
                    headers=llama_response.headers,
                )
                await response.prepare(request)

                async for chunk in llama_response.content.iter_any():
                    await response.write(chunk)

                await response.write_eof()
                return response

    except Exception as e:
        print(f"An error occurred: {e}")
        return web.Response(status=500, text="Internal Server Error")

async def main():
    """Main function."""
    await start_server()
    asyncio.create_task(monitor_idle_timeout())

    app = web.Application()
    app.router.add_route('*', '/{path:.*}', proxy_request)

    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, '127.0.0.1', PROXY_PORT)

    print(f"Serving on http://127.0.0.1:{PROXY_PORT}")
    await site.start()

    try:
        await asyncio.Future()  # run forever
    finally:
        await runner.cleanup()


asyncio.run(main())

    """
    Why not just use the Llama server directly?
    -That's fine until you want to switch models, or save resources when not in use.
    -It's bleeding edge and supports the latest models with speculative decoding to speed up completions.
    -It also can support JSON schema enforcement and other great features.
    Ollama is a similar project but it is more complicated and lags behind llama.cpp.
    Python-llama-cpp is a Python wrapper around llama.cpp but it is not as feature rich as llama.cpp.
    This is a minimalist server that is easy to understand and modify.
    The standard OpenAI API compatibility is implemented by Llama.cpp Server.
    The OpenAI style API is even offered by Google's Gemini API.
    """

    """
    This works but how could it be more minimalist while preserving the primary functionality?
    httpx may help. FastAPI and Flask would not.
    This is middleware.
    FastAPI is a nice wrapper around Starlette, which is a nice wrapper around anyio IIRC.
    config managment is not a big chunk of the code. Celery would be overkill.
    We don't need logging libraries.
    I want to keep everything in one file.
    """