### Make a new Python 3.11 environment via pyenv or conda or whatever.

# Install vLLM 0.4.2 built against CUDA 11.8.
export CUDA_VER="cu118"
export VLLM_VERSION=0.4.2
export PYTHON_VERSION=311
pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+${CUDA_VER}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/${CUDA_VER}

# Install flash-attn from a prebuilt wheel (skip the slow CUDA source build).
export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
pip install wheel
pip install --no-cache-dir flash-attn --no-build-isolation

# Launch the OpenAI-compatible server in the background, serving Llama 3 8B
# Instruct under the alias "gpt-3.5-turbo".
python -m vllm.entrypoints.openai.api_server \
    -pp 1 -tp 1 \
    --dtype float16 \
    --max-model-len 4096 \
    --enable-prefix-caching \
    --device cuda \
    --max-log-len 25 \
    --max-logprobs 10 \
    --enforce-eager \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --served-model-name gpt-3.5-turbo \
    --api-key=sk-1234 \
    --disable-custom-all-reduce \
    --disable-log-requests \
    --gpu-memory-utilization 1.0 \
    --uvicorn-log-level critical &

### Tested on vLLM engine (v0.4.2)
# The server was launched with --api-key=sk-1234, so the client needs the
# same key; the openai CLI reads it from OPENAI_API_KEY.
export OPENAI_API_KEY="sk-1234"
openai -b http://0.0.0.0:8000/v1/ api completions.create -M 2048 -n 1 -m gpt-3.5-turbo -t 0.6 -P 0.9 --prompt "Hi. My name is..."
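
# The server is backgrounded with "&" and spends a while downloading and
# loading weights, so the first request can arrive before it is ready. A small
# wait-loop sketch (assumes the /v1/models endpoint answers with the same key
# once the engine is up, which holds for vLLM's OpenAI-compatible server):
until curl -sf -o /dev/null -H "Authorization: Bearer sk-1234" http://0.0.0.0:8000/v1/models; do
    sleep 5   # connection refused (server still loading) keeps us looping
done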
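
# The same test without the openai CLI: a plain-curl sketch of the identical
# completions request. Field names follow the standard OpenAI completions API;
# the model name and key mirror --served-model-name and --api-key above.
curl http://0.0.0.0:8000/v1/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer sk-1234" \
    -d '{
          "model": "gpt-3.5-turbo",
          "prompt": "Hi. My name is...",
          "max_tokens": 2048,
          "n": 1,
          "temperature": 0.6,
          "top_p": 0.9
        }'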
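
# If requests still fail, sanity-check the install itself. A minimal check,
# assuming the wheel and its pinned torch landed correctly in this environment:
python -c "import vllm, torch; print(vllm.__version__, torch.version.cuda, torch.cuda.is_available())"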