import os
import json

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import InferenceClient

app = FastAPI()

# Get your token from Hugging Face Secrets (Settings > Secrets)
HF_TOKEN = os.getenv("HF_TOKEN")

# Model choice (e.g., "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")
MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct"

client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)


@app.get("/")
def health_check():
    return {"status": "Agent Active", "model": MODEL_ID}


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    body = await request.json()
    messages = body.get("messages", [])
    stream = body.get("stream", False)

    if stream:
        return StreamingResponse(
            stream_generator(messages),
            media_type="text/event-stream",
        )

    # Standard non-streaming response
    response = client.chat_completion(
        messages=messages,
        max_tokens=body.get("max_tokens", 1024),
        temperature=body.get("temperature", 0.7),
    )
    return response


async def stream_generator(messages):
    """Generates an OpenAI-compatible SSE stream."""
    # Note: client.chat_completion(stream=True) yields chunks synchronously,
    # which is fine for a demo but blocks the event loop under load.
    for chunk in client.chat_completion(
        messages=messages,
        max_tokens=2048,
        stream=True,
    ):
        # Format the chunk to look like OpenAI's wire format.
        # The final chunk may carry only a finish_reason and no content,
        # so fall back to an empty string.
        data = {
            "id": "chatcmpl-custom",
            "object": "chat.completion.chunk",
            "choices": [{
                "delta": {"content": chunk.choices[0].delta.content or ""},
                "finish_reason": chunk.choices[0].finish_reason,
                "index": 0,
            }],
        }
        yield f"data: {json.dumps(data)}\n\n"
    yield "data: [DONE]\n\n"


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
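
# --- Usage sketch (illustrative, not part of the app) ---
# Assuming the server runs locally on port 7860 as configured above
# (substitute your Space URL when deployed), the endpoint can be
# exercised with curl. Non-streaming:
#
#   curl -s http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}]}'
#
# Streaming (-N disables curl's buffering so SSE chunks print as they arrive):
#
#   curl -sN http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}], "stream": true}'
#
# Because the wire format mimics OpenAI's, an OpenAI-compatible client
# pointed at base_url http://localhost:7860/v1 should also work, though
# that pairing is an assumption and worth verifying against your client.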