@darrenangle · Last active March 20, 2024 16:56
vLLM server wrapper with auto restart, and a vLLM Node script with exponential backoff for requests.

The first file is a Python supervisor that launches vLLM's OpenAI-compatible API server, watches its log output for the fatal AsyncEngineDeadError, and restarts the server when it appears. The second file is a TypeScript client that calls the server's chat completions endpoint and retries failed requests with exponentially increasing delays.

The server wrapper (Python):
import os
import re
import select
import signal
import subprocess
import sys
import time


def start_server():
    """Launch the vLLM OpenAI-compatible API server in its own process group."""
    command = [
        "/usr/bin/python3", "-m", "vllm.entrypoints.openai.api_server",
        "--model", "hf-models/NousResearch-Hermes-2-Pro-Mistral-7B",
        "--max-model-len", "8192",
        "--enforce-eager",
    ]
    # preexec_fn=os.setsid starts the server in its own session, so the whole
    # process group can be signalled at once when shutting it down.
    return subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        preexec_fn=os.setsid,
    )


def read_output(process):
    """Stream server output; return True if the server should be restarted."""
    error_pattern = re.compile(r"AsyncEngineDeadError: Task finished unexpectedly\.")
    while True:
        # Watch both the server's stdout and our own stdin.
        ready, _, _ = select.select([process.stdout, sys.stdin], [], [])
        if process.stdout in ready:
            output = process.stdout.readline()
            if output == "" and process.poll() is not None:
                break  # Server exited and its output is drained.
            if output:
                print(output.strip())
                if error_pattern.search(output):
                    print("Error detected. Restarting the server...")
                    terminate_process(process)
                    return True
        if sys.stdin in ready:
            input()  # Consume the pending line so stdin does not stay readable.
            print("Input received. Terminating the server...")
            terminate_process(process)
            return False
    return False


def terminate_process(process):
    """SIGTERM the server's process group, escalating to SIGKILL if needed."""
    os.killpg(os.getpgid(process.pid), signal.SIGTERM)
    time.sleep(5)  # Give the process a short window to exit cleanly.
    if process.poll() is None:
        os.killpg(os.getpgid(process.pid), signal.SIGKILL)


def main():
    while True:
        process = start_server()
        restart = read_output(process)
        if not restart:
            break
        time.sleep(5)  # Brief pause before restarting.


if __name__ == "__main__":
    main()
The companion Node script (TypeScript client with exponential backoff):

import axios from "axios";

export class vllm {
  private static readonly API_URL = "http://localhost:8000/v1/chat/completions";

  public async getCompletion(
    messages: Array<{ content: string; role: string }>,
    config: {
      model?: string;
      max_tokens?: number;
      temperature?: number;
      stop?: string[];
    } = {}
  ): Promise<string> {
    const data = {
      model: config.model ?? "hf-models/NousResearch-Hermes-2-Pro-Mistral-7B",
      messages: [...messages],
      max_tokens: config.max_tokens ?? 3072,
      // ?? rather than || so an explicit temperature of 0 is respected.
      temperature: config.temperature ?? 0.5,
      stop: config.stop ?? ["<|im_start|>", "<|im_end|>"],
      stream: false,
    };
    const requestConfig = {
      headers: {
        "Content-Type": "application/json",
      },
    };
    const makeRequest = async (retryCount = 10): Promise<string> => {
      try {
        const response = await axios.post(vllm.API_URL, data, requestConfig);
        console.log(
          response.status,
          ": ",
          response.data.choices[0].message.content.trim()
        );
        // Matches OpenAI's chat completion response structure.
        return response.data.choices[0].message.content.trim();
      } catch (error) {
        console.error(error);
        if (retryCount > 0) {
          // Exponential backoff: 100ms, 200ms, 400ms, ... up to ~51s on the last retry.
          const delay = Math.pow(2, 10 - retryCount) * 100;
          console.log(`Retrying after ${delay}ms...`);
          await new Promise((resolve) => setTimeout(resolve, delay));
          return makeRequest(retryCount - 1);
        } else {
          throw new Error("Max retries reached. Request failed.");
        }
      }
    };
    return makeRequest();
  }
}
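For completeness, a minimal usage sketch of the client above (not part of the original gist). The "./vllm" import path and the prompt are assumptions; the class and its options are used exactly as defined.

// Minimal usage sketch. The "./vllm" path is an assumption about where
// the class above is saved; adjust it to your project layout.
import { vllm } from "./vllm";

async function main() {
  const client = new vllm();
  const reply = await client.getCompletion(
    [{ role: "user", content: "Say hello in one sentence." }],
    { temperature: 0.2, max_tokens: 64 }
  );
  console.log(reply);
}

main().catch(console.error);

With the default budget of 10 retries, the backoff delays sum to roughly 102 seconds (100 ms + 200 ms + ... + 51.2 s), long enough in practice to span the wrapper's 5-second restart pause, though a slow model reload could still exhaust it.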