AlexandreScriptsMT committed on
Commit
64daee6
·
verified ·
1 Parent(s): 6ca36c7

Enable q4 KV cache and 128K context defaults

Browse files
Files changed (1) hide show
  1. entrypoint.sh +5 -1
entrypoint.sh CHANGED
@@ -4,10 +4,12 @@ set -eu
4
  MODEL_SPEC="${MODEL_SPEC:-unsloth/gemma-4-E2B-it-GGUF:Q4_0}"
5
  HOST="${HOST:-0.0.0.0}"
6
  PORT="${PORT:-7860}"
7
- CTX_SIZE="${CTX_SIZE:-4096}"
8
  THREADS="${THREADS:-2}"
9
  PARALLEL="${PARALLEL:-1}"
10
  REASONING_MODE="${REASONING_MODE:-off}"
 
 
11
 
12
  exec /app/llama-server \
13
  -hf "$MODEL_SPEC" \
@@ -16,4 +18,6 @@ exec /app/llama-server \
16
  --ctx-size "$CTX_SIZE" \
17
  --threads "$THREADS" \
18
  --parallel "$PARALLEL" \
 
 
19
  --reasoning "$REASONING_MODE"
 
4
  MODEL_SPEC="${MODEL_SPEC:-unsloth/gemma-4-E2B-it-GGUF:Q4_0}"
5
  HOST="${HOST:-0.0.0.0}"
6
  PORT="${PORT:-7860}"
7
+ CTX_SIZE="${CTX_SIZE:-131072}"
8
  THREADS="${THREADS:-2}"
9
  PARALLEL="${PARALLEL:-1}"
10
  REASONING_MODE="${REASONING_MODE:-off}"
11
+ CACHE_TYPE_K="${CACHE_TYPE_K:-q4_0}"
12
+ CACHE_TYPE_V="${CACHE_TYPE_V:-q4_0}"
13
 
14
  exec /app/llama-server \
15
  -hf "$MODEL_SPEC" \
 
18
  --ctx-size "$CTX_SIZE" \
19
  --threads "$THREADS" \
20
  --parallel "$PARALLEL" \
21
+ --cache-type-k "$CACHE_TYPE_K" \
22
+ --cache-type-v "$CACHE_TYPE_V" \
23
  --reasoning "$REASONING_MODE"