---
# Deployment: single-replica GPU inference server ("stack-2.9") exposing an
# HTTP API on port 8000, loading an AWQ-quantized Llama-2 model from a
# PVC-backed cache. Scheduling targets nvidia-tesla nodes and tolerates the
# standard nvidia.com/gpu taint; a soft pod anti-affinity spreads replicas
# across hosts if replicas is ever scaled above 1.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: stack-2.9
  # NOTE(review): namespace names must be DNS-1123 labels (lowercase
  # alphanumerics and '-', no dots). "stack-2.9" will be rejected by the API
  # server — confirm the intended namespace (e.g. "stack-2-9").
  namespace: stack-2.9
  labels:
    app: stack-2.9
    version: "2.9"  # quoted: bare 2.9 would parse as a float
spec:
  replicas: 1
  selector:
    matchLabels:
      app: stack-2.9
  template:
    metadata:
      labels:
        app: stack-2.9
        version: "2.9"
    spec:
      containers:
        - name: stack-2.9
          image: your-registry/stack-2.9:latest
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 8000
              name: http
              protocol: TCP
          # All env values are quoted strings — the container runtime expects
          # strings, and several ("0.9", "4096", "false") would otherwise be
          # retyped by YAML parsers.
          env:
            - name: MODEL_ID
              value: "TheBloke/Llama-2-7B-Chat-AWQ"
            - name: HUGGING_FACE_TOKEN
              valueFrom:
                secretKeyRef:
                  name: stack-2.9-secrets
                  key: huggingface-token
            - name: QUANTIZATION
              value: "awq"
            - name: TENSOR_PARALLEL_SIZE
              value: "1"
            - name: GPU_MEMORY_UTILIZATION
              value: "0.9"
            - name: MAX_MODEL_LEN
              value: "4096"
            - name: MAX_NUM_SEQS
              value: "64"
            - name: MAX_NUM_BATCHED_TOKENS
              value: "4096"
            - name: ENFORCE_EAGER
              value: "false"
            - name: DISABLE_LOG_STATS
              value: "false"
            - name: HOST
              value: "0.0.0.0"
            - name: PORT
              value: "8000"
            - name: MODEL_CACHE_DIR
              value: "/models"
            - name: OMP_NUM_THREADS
              value: "4"
          resources:
            limits:
              nvidia.com/gpu: 1
              memory: "16Gi"
              cpu: "4"
            requests:
              nvidia.com/gpu: 1
              memory: "8Gi"
              cpu: "2"
          volumeMounts:
            - name: model-cache
              mountPath: /models
          # Both probes hit the same /health endpoint; liveness waits longer
          # before the first check to allow for model load time.
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            runAsUser: 1000
            capabilities:
              drop:
                - ALL
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: stack-2.9-model-cache
      nodeSelector:
        # Uncomment to schedule on GPU nodes only
        # nvidia.com/gpu.product: A100-80GB
        accelerator: nvidia-tesla
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      affinity:
        # Soft anti-affinity: prefer not to co-locate stack-2.9 pods on the
        # same node (relevant only if replicas > 1).
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchExpressions:
                    - key: app
                      operator: In
                      values:
                        - stack-2.9
                topologyKey: kubernetes.io/hostname