---
# Deployment for the stack-2.9 vLLM-style model server (1 GPU replica).
# NOTE(review): "stack-2.9" contains a dot. Kubernetes namespace names and
# container names must be DNS-1123 labels (lowercase alphanumerics and "-"
# only), so the apiserver will reject `namespace:` and `containers[].name`
# below as written. Consider renaming to "stack-2-9" everywhere — but the
# Secret (stack-2.9-secrets) and PVC (stack-2.9-model-cache) are defined
# outside this file, so rename them together.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: stack-2.9
  namespace: stack-2.9
  labels:
    app: stack-2.9
    version: "2.9"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: stack-2.9
  template:
    metadata:
      labels:
        app: stack-2.9
        version: "2.9"
    spec:
      containers:
        - name: stack-2.9
          # NOTE(review): ":latest" with IfNotPresent means nodes that already
          # have a cached image never pick up new pushes. Pin an immutable tag
          # (or digest), or switch to imagePullPolicy: Always.
          image: your-registry/stack-2.9:latest
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          env:
            # Model selection and auth.
            - name: MODEL_ID
              value: "TheBloke/Llama-2-7B-Chat-AWQ"
            - name: HUGGING_FACE_TOKEN
              valueFrom:
                secretKeyRef:
                  name: stack-2.9-secrets
                  key: huggingface-token
            # Inference-engine tuning. All values are quoted strings because
            # env vars are strings; the server parses them.
            - name: QUANTIZATION
              value: "awq"
            - name: TENSOR_PARALLEL_SIZE
              value: "1"
            - name: GPU_MEMORY_UTILIZATION
              value: "0.9"
            - name: MAX_MODEL_LEN
              value: "4096"
            - name: MAX_NUM_SEQS
              value: "64"
            - name: MAX_NUM_BATCHED_TOKENS
              value: "4096"
            - name: ENFORCE_EAGER
              value: "false"
            - name: DISABLE_LOG_STATS
              value: "false"
            # Server bind address and model-cache location (mounted below).
            - name: HOST
              value: "0.0.0.0"
            - name: PORT
              value: "8000"
            - name: MODEL_CACHE_DIR
              value: "/models"
            - name: OMP_NUM_THREADS
              value: "4"
          resources:
            limits:
              nvidia.com/gpu: 1
              memory: "16Gi"
              cpu: "4"
            requests:
              nvidia.com/gpu: 1
              memory: "8Gi"
              cpu: "2"
          volumeMounts:
            - name: model-cache
              mountPath: /models
          # Generous initial delay: model load on first start is slow.
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            capabilities:
              drop:
                - ALL
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: stack-2.9-model-cache
      nodeSelector:
        # Uncomment to schedule on GPU nodes only
        # nvidia.com/gpu.product: A100-80GB
        accelerator: nvidia-tesla
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      # Soft anti-affinity: prefer spreading replicas across nodes.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - stack-2.9
                topologyKey: kubernetes.io/hostname