from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
import traceback


class EndpointHandler:
    def __init__(self, path=""):
        base_model_id = "microsoft/Phi-4-mini-instruct"
        # The LoRA adapter is deployed alongside this handler; `path` points
        # at the repository root on the endpoint.
        adapter_path = path

        try:
            print(f"Initializing handler: loading base model {base_model_id}")

            # Prefer bfloat16 on GPUs that support it; fall back to float16.
            dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else torch.float16
            self.base_model = AutoModelForCausalLM.from_pretrained(
                base_model_id,
                torch_dtype=dtype,
                device_map="auto",  # place weights on the GPU when one is available
                trust_remote_code=True,
            )

            print(f"Loading tokenizer from {base_model_id}")
            self.tokenizer = AutoTokenizer.from_pretrained(
                base_model_id,
                trust_remote_code=True,
            )
            # Keep the pad token distinct from eos, otherwise generation can
            # stop early or emit warnings.
            if self.tokenizer.pad_token is None or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
                self.tokenizer.pad_token = self.tokenizer.unk_token
                print("Set tokenizer.pad_token = tokenizer.unk_token")

            print(f"Loading LoRA adapter from {adapter_path}")
            self.model = PeftModel.from_pretrained(self.base_model, adapter_path)
            self.model.eval()
            print("LoRA adapter loaded.")

            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
            )
            print("text-generation pipeline created. Handler ready.")

        except Exception as e:
            print(f"FATAL ERROR during handler __init__: {e}")
            print(traceback.format_exc())
            raise  # re-raise without losing the original traceback

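    # Expected request shape (the Hugging Face Inference Endpoints custom
    # handler convention): {"inputs": <prompt string or chat message list>,
    # "parameters": {<generation kwargs, e.g. max_new_tokens>}}. The key
    # names follow the HF convention; concrete values are illustrative.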
    def __call__(self, data):
        try:
            inputs = data.pop("inputs", data)
            parameters = data.pop("parameters", None) or {}

            print(f"Handler __call__ received inputs: {inputs}")
            print(f"Handler __call__ received parameters: {parameters}")

            # If the input is a chat-style message list, render it with the
            # model's chat template; otherwise treat it as a raw prompt string.
            prompt_text = inputs
            if isinstance(inputs, list) and len(inputs) > 0 and isinstance(inputs[0], dict) and "role" in inputs[0]:
                print("Applying chat template...")
                prompt_text = self.tokenizer.apply_chat_template(
                    inputs, tokenize=False, add_generation_prompt=True
                )

            print(f"Prompt text passed to the pipeline: {prompt_text}")

            outputs = self.pipeline(prompt_text, **parameters)

            print(f"Handler __call__ produced outputs: {outputs}")
            return outputs

        except Exception as e:
            print(f"ERROR during handler __call__: {e}")
            print(traceback.format_exc())
            # Return the error in-band so the endpoint responds with details
            # instead of an opaque 500.
            return [{"error": str(e), "traceback": traceback.format_exc()}]