@darrenangle
Last active March 20, 2024 16:56

vllm.py
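A small watchdog script: it launches the vLLM OpenAI-compatible API server as a subprocess, tails its log output, and restarts the server whenever an AsyncEngineDeadError appears.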
import subprocess
import time
import re
import signal
import sys
import select
import os


def start_server():
    command = [
        "/usr/bin/python3", "-m", "vllm.entrypoints.openai.api_server",
        "--model", "hf-models/NousResearch-Hermes-2-Pro-Mistral-7B",
        "--max-model-len", "8192",
        "--enforce-eager"
    ]
    # Run the server in its own process group so the whole tree can be signalled later.
    return subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            text=True, preexec_fn=os.setsid)


def read_output(process):
    # The log line that indicates the async engine has died and the server needs a restart.
    error_pattern = re.compile(r"AsyncEngineDeadError: Task finished unexpectedly\.")
    while True:
        ready, _, _ = select.select([process.stdout, sys.stdin], [], [])
        if process.stdout in ready:
            output = process.stdout.readline()
            if output == '' and process.poll() is not None:
                break
            if output:
                print(output.strip())
                if error_pattern.search(output):
                    print("Error detected. Restarting the server...")
                    terminate_process(process)
                    return True
        if sys.stdin in ready:
            # Any line typed on stdin shuts the server down cleanly.
            input()  # Consume the input to prevent blocking
            print("Keyboard interrupt received. Terminating the server...")
            terminate_process(process)
            return False
    return False


def terminate_process(process):
    # Signal the whole process group, escalating to SIGKILL if it does not exit.
    os.killpg(os.getpgid(process.pid), signal.SIGTERM)
    time.sleep(5)  # Wait for a short duration to allow the process to terminate
    if process.poll() is None:
        os.killpg(os.getpgid(process.pid), signal.SIGKILL)


def main():
    while True:
        process = start_server()
        restart = read_output(process)
        if not restart:
            break
        time.sleep(5)  # Wait for 5 seconds before restarting


if __name__ == "__main__":
    main()
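Run this script instead of starting the API server by hand (e.g. python3 vllm.py); it keeps relaunching the server after engine crashes, and pressing Enter in the terminal shuts everything down.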
vllm.ts
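A thin TypeScript client for the server above: it posts chat messages to the OpenAI-compatible /v1/chat/completions endpoint and retries with exponential backoff (up to 10 attempts), which smooths over the brief outages while the watchdog restarts the server.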
import axios from "axios";

export class vllm {
  private static readonly API_URL = "http://localhost:8000/v1/chat/completions";

  public async getCompletion(
    messages: Array<{ content: string; role: string }>,
    config: {
      model?: string;
      max_tokens?: number;
      temperature?: number;
      stop?: string[];
    } = {}
  ): Promise<any> {
    const data = {
      model: config.model || "hf-models/NousResearch-Hermes-2-Pro-Mistral-7B",
      messages: [...messages],
      max_tokens: config.max_tokens || 3072,
      temperature: config.temperature || 0.5,
      stop: config.stop || ["<|im_start|>", "<|im_end|>"],
      stream: false,
    };

    const headers = {
      headers: {
        "Content-Type": "application/json",
      },
    };

    // Retry with exponential backoff so transient failures (e.g. while the
    // watchdog restarts the server) do not surface as hard errors.
    const makeRequest: any = async (retryCount = 10) => {
      try {
        const response = await axios.post(vllm.API_URL, data, headers);
        console.log(
          response.status,
          ": ",
          response.data.choices[0].message.content.trim()
        );
        // Adjusting to OpenAI's API response structure
        return response.data.choices[0].message.content.trim();
      } catch (error) {
        console.error(error);
        if (retryCount > 0) {
          const delay = Math.pow(2, 10 - retryCount) * 100; // Exponential backoff formula
          console.log(`Retrying after ${delay}ms...`);
          await new Promise((resolve) => setTimeout(resolve, delay));
          return makeRequest(retryCount - 1);
        } else {
          throw new Error("Max retries reached. Request failed.");
        }
      }
    };

    return makeRequest();
  }
}
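A minimal usage sketch, assuming the class above is saved as vllm.ts next to the caller and the server from vllm.py is listening on localhost:8000 (the prompt text and generation settings here are illustrative, not part of the original gist):

import { vllm } from "./vllm";

async function demo() {
  const client = new vllm();
  // Resolves to the assistant message text once the request (with retries) succeeds.
  const reply = await client.getCompletion(
    [{ role: "user", content: "Say hello in one sentence." }],
    { max_tokens: 64, temperature: 0.2 }
  );
  console.log(reply);
}

demo().catch(console.error);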