Spaces:

Kbrian1237
/

kitnbot-api

Running

isaachaaseinstitutemail-spec

fix: resolve python file truncation bug in patching script

d794d9b 21 days ago

1.61 kB

	# Base image: the slim Python 3.9 variant; the build toolchain is added in a later step.
	FROM python:3.9-slim

	# Install compilation tools and curl.
	# - --no-install-recommends skips optional packages that are not required.
	# - ca-certificates is listed explicitly so HTTPS keeps working without recommends.
	# - The apt package lists are removed in the same layer so they never reach the image.
	RUN apt-get update \
	    && apt-get install -y --no-install-recommends \
	        build-essential \
	        ca-certificates \
	        curl \
	    && rm -rf /var/lib/apt/lists/*

	# Keep Uvicorn below 0.30.0 to avoid strict HTTP header parsing issues behind the HF Envoy proxy.
	# --no-cache-dir stops pip from storing its download cache inside the image layer.
	RUN pip install --no-cache-dir "uvicorn<0.30.0"

	# Install llama-cpp-python with its server extra, plus huggingface-hub for the model download.
	# The extra index serves pre-compiled CPU wheels, which skips the long C++ build.
	# --no-cache-dir stops pip from storing its download cache inside the image layer.
	RUN pip install --no-cache-dir \
	    --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu \
	    "llama-cpp-python[server]" \
	    huggingface-hub

	# CRITICAL FIX: the Hugging Face proxy injects a non-UUID 'X-Request-ID' header (e.g. 'qKsNoh').
	# starlette_context's RequestIdPlugin validates this header as UUIDv4 by default and answers
	# with a 400 Bad Request and an empty body, so llama_cpp's server module is patched in place
	# to construct the plugin with validate=False.
	#
	# The source is read completely before it is written back, so the file is never truncated.
	# The assert makes the build fail loudly if the installed llama_cpp no longer contains the
	# expected call (and is not already patched), instead of silently shipping an unpatched server.
	RUN python3 -c "import pathlib; import llama_cpp.server.app as app; \
	    target = pathlib.Path(app.__file__); \
	    old = 'RequestIdPlugin()'; \
	    new = 'RequestIdPlugin(validate=False)'; \
	    source = target.read_text(encoding='utf-8'); \
	    assert old in source or new in source, 'RequestIdPlugin() call not found in ' + str(target); \
	    target.write_text(source.replace(old, new), encoding='utf-8')"

	# Fetch the compressed Qwen2.5 1.5B Instruct GGUF weights into /model at build time
	# (a small, fast model that suits free CPU hardware).
	RUN python3 -c "import huggingface_hub as hub; \
	    hub.hf_hub_download( \
	        repo_id='Qwen/Qwen2.5-1.5B-Instruct-GGUF', \
	        filename='qwen2.5-1.5b-instruct-q4_k_m.gguf', \
	        local_dir='/model')"

	# Port the API server listens on (matches --port in CMD). EXPOSE only documents the port;
	# it does not publish it.
	EXPOSE 7860

	# Launch the OpenAI-compatible API server with explicit limits for free CPU instances:
	# - n_threads / n_threads_batch: 2 (matches the 2 vCPU limit)
	# - n_ctx: 2048 (limits context memory usage)
	CMD [ \
	    "python3", "-m", "llama_cpp.server", \
	    "--model", "/model/qwen2.5-1.5b-instruct-q4_k_m.gguf", \
	    "--chat_format", "chatml", \
	    "--host", "0.0.0.0", \
	    "--port", "7860", \
	    "--n_ctx", "2048", \
	    "--n_threads", "2", \
	    "--n_threads_batch", "2" \
	]