---
# Deployment for the "stack-2.9" model-serving workload.
# NOTE(review): the env vars (MODEL_ID, QUANTIZATION=awq, TENSOR_PARALLEL_SIZE,
# GPU_MEMORY_UTILIZATION, MAX_MODEL_LEN, ...) look like a vLLM-style OpenAI
# server configuration — confirm against the image's entrypoint.
apiVersion: apps/v1
kind: Deployment
metadata:
  # Deployment names are DNS-1123 *subdomains*, so a dot is legal here.
  name: stack-2.9
  # Namespace names are DNS-1123 *labels* (lowercase alphanumerics and '-'
  # only) — "stack-2.9" was invalid; dots are not permitted in namespaces.
  namespace: stack-2-9
  labels:
    app: stack-2.9
    version: "2.9"
spec:
  replicas: 1
  selector:
    # Immutable after creation; must match .spec.template.metadata.labels.
    matchLabels:
      app: stack-2.9
  template:
    metadata:
      labels:
        app: stack-2.9
        version: "2.9"
    spec:
      containers:
      # Container names are DNS-1123 labels — "stack-2.9" was invalid
      # (no dots allowed), renamed to "stack-2-9".
      - name: stack-2-9
        image: your-registry/stack-2.9:latest
        # NOTE(review): IfNotPresent combined with a ":latest" tag means nodes
        # keep serving whatever image they cached first. Pin a version tag or
        # digest, or switch to imagePullPolicy: Always.
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 8000
          name: http
          protocol: TCP
        env:
        - name: MODEL_ID
          value: "TheBloke/Llama-2-7B-Chat-AWQ"
        # Token is sourced from a Secret — never inline credentials here.
        - name: HUGGING_FACE_TOKEN
          valueFrom:
            secretKeyRef:
              name: stack-2.9-secrets
              key: huggingface-token
        - name: QUANTIZATION
          value: "awq"
        - name: TENSOR_PARALLEL_SIZE
          value: "1"
        - name: GPU_MEMORY_UTILIZATION
          value: "0.9"
        - name: MAX_MODEL_LEN
          value: "4096"
        - name: MAX_NUM_SEQS
          value: "64"
        - name: MAX_NUM_BATCHED_TOKENS
          value: "4096"
        - name: ENFORCE_EAGER
          value: "false"
        - name: DISABLE_LOG_STATS
          value: "false"
        - name: HOST
          value: "0.0.0.0"
        - name: PORT
          value: "8000"
        - name: MODEL_CACHE_DIR
          value: "/models"
        - name: OMP_NUM_THREADS
          value: "4"
        resources:
          # GPU requests and limits must be equal (GPUs are not overcommittable).
          limits:
            nvidia.com/gpu: 1
            memory: "16Gi"
            cpu: "4"
          requests:
            nvidia.com/gpu: 1
            memory: "8Gi"
            cpu: "2"
        volumeMounts:
        - name: model-cache
          mountPath: /models
        # Generous initial delay: model download/load can take a while.
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 60
          periodSeconds: 30
          timeoutSeconds: 10
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
        securityContext:
          allowPrivilegeEscalation: false
          runAsNonRoot: true
          runAsUser: 1000
          capabilities:
            drop:
            - ALL
      volumes:
      - name: model-cache
        persistentVolumeClaim:
          claimName: stack-2.9-model-cache
      nodeSelector:
        # To pin to a specific GPU model instead, replace the selector below
        # with e.g.: nvidia.com/gpu.product: A100-80GB
        accelerator: nvidia-tesla
      # Tolerate the standard GPU-node taint so the pod can schedule there.
      tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
      # Soft anti-affinity: prefer spreading replicas across nodes.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - stack-2.9
              topologyKey: kubernetes.io/hostname