Deploy with `kubectl apply -f deepseek.yaml` and watch the logs on the pod. Once you see vLLM listening on port 8000, set up a port forward so you can try running `python query.py`.
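To follow the startup logs, you can tail the Deployment directly (a sketch using standard kubectl; the Deployment name matches the manifest below):

```
kubectl logs -f deployment/deepseek-server
```

When the server is up, forward the Service port: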
```
kubectl port-forward svc/deepseek-r1-server 8000:8000
```
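With the forward active, a quick sanity check against vLLM's OpenAI-compatible API (standard `/v1/models` endpoint, nothing custom):

```
curl http://127.0.0.1:8000/v1/models
```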
The full `deepseek.yaml` manifest:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deepseek-server
  annotations:
    description: "Deployment for DeepSeek server"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: deepseek-server
  template:
    metadata:
      labels:
        app: deepseek-server
    spec:
      containers:
      - name: vllm-server
        image: vllm/vllm-openai:latest
        env:
        - name: HF_HUB_ENABLE_HF_TRANSFER
          value: "1"
        - name: HF_HOME
          value: "/local/huggingface"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-token-secret
              key: token
        - name: MODEL_REPO
          value: "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
        command: ["/bin/bash"]
        args:
        - "-c"
        - >
          vllm serve ${MODEL_REPO}
          --host 0.0.0.0
          --port 8000
          --trust-remote-code
        resources:
          limits:
            cpu: "32"
            memory: 100G
            nvidia.com/gpu: "1"
          requests:
            cpu: "16"
            memory: 30G
            nvidia.com/gpu: "1"
        securityContext:
          privileged: true
        ports:
        - containerPort: 8000
        startupProbe:
          periodSeconds: 10
          failureThreshold: 720
          httpGet:
            path: /health
            port: 8000
        volumeMounts:
        - name: local-storage
          mountPath: /local
        - name: shm
          mountPath: /dev/shm
      volumes:
      - name: local-storage
        hostPath:
          path: /root/local
          type: DirectoryOrCreate
      - name: shm
        emptyDir:
          medium: Memory
          sizeLimit: "2Gi"
---
apiVersion: v1
kind: Service
metadata:
  name: deepseek-r1-server
spec:
  selector:
    app: deepseek-server
  type: ClusterIP
  ports:
  - name: port-8000
    port: 8000
    targetPort: 8000
---
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
data:
  token: "<YOUR-HF-TOKEN-GOES-HERE>"  # must be base64-encoded; see below
```
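Note that values under a Secret's `data` field must be base64 encoded (use `stringData` instead if you prefer to paste the raw token). One way to encode it, with `hf_...` standing in for your actual Hugging Face token:

```
echo -n "hf_..." | base64
```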
And `query.py`, which lists the served models and sends a chat completion:

```python
#!/usr/bin/env python
import openai

# Point the OpenAI client at the local port forward; vLLM ignores the API key.
client = openai.Client(
    base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")

# Get the Models
models = client.models.list()
print(models)

# Chat completion
response = client.chat.completions.create(
    model=models.data[0].id,
    messages=[
        {
            "role": "user",
            "content": "What is Kubernetes?"
        },
    ]
)
print(response)
```
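If you would rather see tokens as they are generated instead of one response object at the end, the same client supports streaming; a minimal sketch reusing the model id from above:

```python
stream = client.chat.completions.create(
    model=models.data[0].id,
    messages=[{"role": "user", "content": "What is Kubernetes?"}],
    stream=True,
)
for chunk in stream:
    # Each chunk carries an incremental delta; print it as it arrives.
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```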