Enable q4 KV cache and 128K context defaults
Browse files
- entrypoint.sh +5 -1
entrypoint.sh
CHANGED
|
@@ -4,10 +4,12 @@ set -eu
|
|
| 4 |
MODEL_SPEC="${MODEL_SPEC:-unsloth/gemma-4-E2B-it-GGUF:Q4_0}"
|
| 5 |
HOST="${HOST:-0.0.0.0}"
|
| 6 |
PORT="${PORT:-7860}"
|
| 7 |
-
CTX_SIZE="${CTX_SIZE:-
|
| 8 |
THREADS="${THREADS:-2}"
|
| 9 |
PARALLEL="${PARALLEL:-1}"
|
| 10 |
REASONING_MODE="${REASONING_MODE:-off}"
|
|
|
|
|
|
|
| 11 |
|
| 12 |
exec /app/llama-server \
|
| 13 |
-hf "$MODEL_SPEC" \
|
|
@@ -16,4 +18,6 @@ exec /app/llama-server \
|
|
| 16 |
--ctx-size "$CTX_SIZE" \
|
| 17 |
--threads "$THREADS" \
|
| 18 |
--parallel "$PARALLEL" \
|
|
|
|
|
|
|
| 19 |
--reasoning "$REASONING_MODE"
|
|
|
|
| 4 |
MODEL_SPEC="${MODEL_SPEC:-unsloth/gemma-4-E2B-it-GGUF:Q4_0}"
|
| 5 |
HOST="${HOST:-0.0.0.0}"
|
| 6 |
PORT="${PORT:-7860}"
|
| 7 |
+
CTX_SIZE="${CTX_SIZE:-131072}"
|
| 8 |
THREADS="${THREADS:-2}"
|
| 9 |
PARALLEL="${PARALLEL:-1}"
|
| 10 |
REASONING_MODE="${REASONING_MODE:-off}"
|
| 11 |
+
CACHE_TYPE_K="${CACHE_TYPE_K:-q4_0}"
|
| 12 |
+
CACHE_TYPE_V="${CACHE_TYPE_V:-q4_0}"
|
| 13 |
|
| 14 |
exec /app/llama-server \
|
| 15 |
-hf "$MODEL_SPEC" \
|
|
|
|
| 18 |
--ctx-size "$CTX_SIZE" \
|
| 19 |
--threads "$THREADS" \
|
| 20 |
--parallel "$PARALLEL" \
|
| 21 |
+
--cache-type-k "$CACHE_TYPE_K" \
|
| 22 |
+
--cache-type-v "$CACHE_TYPE_V" \
|
| 23 |
--reasoning "$REASONING_MODE"
|