vLLM + Granite Docling model: offline install, Docker image, and in-process inference
# 1) On a machine with internet access, download vLLM and its dependencies as wheels
#    (use the same Python version and platform as the offline target)
mkdir vllm_wheels
pip download vllm torch torchvision torchaudio --only-binary :all: -d vllm_wheels
# pip download also resolves and fetches the dependencies declared in vLLM's
# setup.py / pyproject.toml, so the wheel directory ends up self-contained

# 2) On the offline host, install from the local wheel directory only
cd vllm_wheels
pip install --no-index --find-links . vllm

# 3) Serve the model with the OpenAI-compatible API server
# Assuming model weights are at /opt/app/model_weights
python -m vllm.entrypoints.openai.api_server --model /opt/app/model_weights --host 0.0.0.0 --port 8000
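Once the server is up, any OpenAI-compatible client can query it. A minimal sketch with curl, assuming the defaults above (port 8000) and no --served-model-name override, in which case the model id exposed by the API is the path passed to --model:

# Query the OpenAI-compatible completions endpoint served by vLLM
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "/opt/app/model_weights", "prompt": "What is the capital of France?", "max_tokens": 100}'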
# Use an AWS Deep Learning Container (DLC) as a base, or a vLLM-specific image.
# Ensure the base image has the necessary CUDA drivers and PyTorch.
# Prefer a pinned tag that matches your CUDA version over :latest.
FROM vllm/vllm-openai:latest

# Copy the pre-downloaded model weights into the container image.
# Note: COPY sources are resolved inside the Docker build context, so the weights
# directory must be reachable under the context root when you run `docker build`.
COPY /mnt/models/granite-docling-258M /app/local_model

WORKDIR /app

# The vllm/vllm-openai base image already defines the api_server as its ENTRYPOINT,
# so override it here (rather than appending a CMD) and point --model at the local directory
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--model", "/app/local_model"]
# Offline (in-process) inference with vLLM, pointing at a local model directory
from vllm import LLM, SamplingParams

# Point to the local directory path
model_path = "/path/to/your/local/model/directory"

# Initialize the LLM engine
llm = LLM(model=model_path, trust_remote_code=True)  # trust_remote_code might be needed

# Define sampling parameters
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

# Generate predictions
prompts = ["What is the capital of France?", "The sky is what color?"]
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")