Spaces:
Running
Running
# Image for an OpenAI-compatible llama.cpp API server (llama-cpp-python) that
# serves a Qwen2.5 1.5B GGUF model on port 7860.
FROM python:3.9-slim

# Install compilation tools.
# --no-install-recommends plus removing the apt lists in the same layer keeps
# the image small (a cleanup in a later layer would not shrink it).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install llama-cpp-python with its server extra (using pre-compiled CPU wheels
# to skip the long C++ build!) together with huggingface-hub.
# Uvicorn is held below 0.30.0 to avoid strict HTTP header parsing issues behind
# the HF Envoy proxy. It is resolved in the same pip invocation as the server so
# that installing the server cannot upgrade Uvicorn past the pin.
# NOTE(review): llama-cpp-python itself is unpinned; the source patch below
# depends on its exact text, so consider pinning a known-good version.
RUN pip install --no-cache-dir \
        "uvicorn<0.30.0" \
        "llama-cpp-python[server]" \
        huggingface-hub \
        --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

# CRITICAL FIX: Hugging Face proxy injects a non-UUID 'X-Request-ID' header (e.g. 'qKsNoh').
# starlette_context's RequestIdPlugin strictly validates this as UUIDv4 by default
# and throws a 400 Bad Request with an empty body.
# We must patch llama_cpp to disable this validation.
# The assert makes the build fail loudly if the installed llama_cpp no longer
# contains the expected call (and is not already patched), instead of silently
# shipping an unpatched server.
RUN python3 -c "import pathlib, llama_cpp.server.app as app; \
    path = pathlib.Path(app.__file__); \
    old = 'RequestIdPlugin()'; \
    new = 'RequestIdPlugin(validate=False)'; \
    src = path.read_text(); \
    assert old in src or new in src, 'patch target ' + old + ' not found in ' + str(path); \
    path.write_text(src.replace(old, new))"

# Download a fast, compressed model (Qwen 1.5B is perfect for free CPUs).
RUN python3 -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='Qwen/Qwen2.5-1.5B-Instruct-GGUF', filename='qwen2.5-1.5b-instruct-q4_k_m.gguf', local_dir='/model')"

# Drop root for runtime. Created after all steps that need root, so the
# installed packages and /model stay root-owned and are only read by the server.
# NOTE(review): UID 1000 follows the Hugging Face Docker Spaces convention —
# confirm it matches the target runtime.
RUN useradd --create-home --uid 1000 user
USER user

EXPOSE 7860

# Start the OpenAI-compatible API server with explicit limits for free CPU instances
# - n_threads: 2 (matches 2 vCPU limit)
# - n_ctx: 2048 (limits context memory usage)
CMD ["python3", "-m", "llama_cpp.server", "--model", "/model/qwen2.5-1.5b-instruct-q4_k_m.gguf", "--chat_format", "chatml", "--host", "0.0.0.0", "--port", "7860", "--n_ctx", "2048", "--n_threads", "2", "--n_threads_batch", "2"]