import subprocess
import tempfile
from time import sleep

import torch
from openai import OpenAI


def start_model_server(model_name: str, client: OpenAI):
    def get_model():
        ## query the OpenAI-compatible endpoint for the id of the served model
        models = client.models.list()
        model = models.data[0].id
        return model

    num_gpus = torch.cuda.device_count()
    command_args = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--tensor-parallel-size", f"{num_gpus}",
        "--max-model-len", "4096",
        "--host", "localhost",
        "--port", "8000",
        "--enforce-eager",
    ]

    ## redirect the server's stdout/stderr to temp files so the logs survive the run
    stdout_tempfile = tempfile.NamedTemporaryFile("w", delete=False)
    stderr_tempfile = tempfile.NamedTemporaryFile("w", delete=False)
    print(f"Logging model outputs at {stdout_tempfile.name} and {stderr_tempfile.name}")

    process = subprocess.Popen(command_args, stdout=stdout_tempfile, stderr=stderr_tempfile)

    def wait_for_server():
        ## poll the /v1/models endpoint until the server answers, retrying every 10 s
        try:
            model = get_model()
            assert model
            print("Model server started successfully!")
            return True
        except Exception:
            sleep(10)
            return wait_for_server()

    wait_for_server()
    sleep(10)
    return process


## once the model server starts
client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1")

## use it as an openai server (code reuse!)
response = client.completions.create(
    model=args.model,
    prompt=prompt,
    echo=False,
    n=args.n,
    max_tokens=args.max_tokens,
    temperature=args.temperature,
    top_p=args.top_p,
    frequency_penalty=0,
    presence_penalty=0,
    stream=False,
)
outputs = [c.text for c in response.choices]
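Since start_model_server returns the subprocess.Popen handle rather than managing the server's lifetime itself, the caller is responsible for shutting the vLLM process down once all completions have been collected. A minimal sketch of that cleanup, assuming `process` holds the handle returned above (the timeout and force-kill fallback are an assumption, not part of the original script):

## clean up the vLLM server once generation is finished
## assumes: process = start_model_server(args.model, client)
process.terminate()                # send SIGTERM to the api_server process
try:
    process.wait(timeout=30)       # give it a moment to exit cleanly
except subprocess.TimeoutExpired:
    process.kill()                 # force-kill if it hangs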