import threading import time from typing import Any, Dict, Generator, List, Optional from huggingface_hub import hf_hub_download from llama_cpp import Llama from src.core.config import settings class ModelEngine: def __init__(self): self.llm = None self.lock = threading.Lock() self._load_model() def _load_model(self): try: print(f"Downloading/Loading model: {settings.REPO_ID}...") model_path = hf_hub_download( repo_id=settings.REPO_ID, filename=settings.FILENAME ) self.llm = Llama( model_path=model_path, n_ctx=settings.CONTEXT_SIZE, n_threads=settings.N_THREADS, n_gpu_layers=settings.N_GPU_LAYERS, chat_format="chatml", verbose=True, ) print("Model loaded successfully!") except Exception as e: print(f"CRITICAL ERROR loading model: {e}") def generate( self, messages: List[Dict[str, Any]], abort_event: Optional[threading.Event] = None, **kwargs, ): if not self.llm: raise RuntimeError("Model not loaded") stream_mode = kwargs.get("stream", True) # Подготавливаем аргументы для llama_cpp_python llama_kwargs = { "messages": messages, "max_tokens": int(kwargs.get("max_tokens", settings.DEFAULT_MAX_TOKENS)), "temperature": float(kwargs.get("temperature", settings.DEFAULT_TEMP)), "top_p": float(kwargs.get("top_p", 0.95)), "repeat_penalty": float(kwargs.get("repeat_penalty", 1.15)), "stop": kwargs.get("stop", []), "stream": stream_mode, } # Прокидываем инструменты (Tool Calling) if kwargs.get("tools"): llama_kwargs["tools"] = kwargs["tools"] if kwargs.get("tool_choice"): llama_kwargs["tool_choice"] = kwargs["tool_choice"] # Если включен стриминг, выносим yield во внутреннюю функцию-генератор if stream_mode: def stream_generator(): acquired = False while not acquired: if abort_event and abort_event.is_set(): print("Request aborted while waiting in queue.") return acquired = self.lock.acquire(timeout=0.5) try: if abort_event and abort_event.is_set(): return response = self.llm.create_chat_completion(**llama_kwargs) for chunk in response: if abort_event and abort_event.is_set(): print("Request aborted during generation.") break yield chunk finally: self.lock.release() return stream_generator() # Если стриминг выключен (режим Агента), возвращаем обычный словарь else: acquired = False while not acquired: if abort_event and abort_event.is_set(): print("Request aborted while waiting in queue.") return acquired = self.lock.acquire(timeout=0.5) try: if abort_event and abort_event.is_set(): return response = self.llm.create_chat_completion(**llama_kwargs) return response finally: self.lock.release() engine = ModelEngine()