StructuredExtractort

Sleeping

File size: 12,979 Bytes

# extractor.py — Structured Output Engine
# OpenAI Function Calling + Pydantic v2 + Dynamic JSON Schema
"""
Demonstra domínio de produção de:
- OpenAI function calling (tool_choice="required")
- Pydantic v2 para validação de schema dinâmico
- JSON Schema gerado dinamicamente pelo usuário
- Retry automático com error feedback ao LLM
- Extração de múltiplos tipos: contrato, notícia, currículo, invoice, custom
"""

import json
import re
from typing import Any

# ── SCHEMAS PRÉ-DEFINIDOS ─────────────────────────────────────

# Small builders for JSON-Schema nodes. Every call returns a brand-new dict so
# no two entries of PRESET_SCHEMAS ever share a mutable node, and keyword
# arguments are emitted in call order so the key order of each node is stable.

def _node(kind, **extra):
    """Return ``{"type": kind, ...extra}`` as a fresh dict."""
    return {"type": kind, **extra}


def _string(**extra):
    """Return a string node, optionally carrying description/enum/default."""
    return _node("string", **extra)


def _number(**extra):
    """Return a number node, optionally carrying a description."""
    return _node("number", **extra)


def _object(properties, required=None):
    """Return an object node; ``required`` is only emitted when provided."""
    node = _node("object", properties=properties)
    if required is not None:
        node["required"] = required
    return node


def _string_list(**extra):
    """Return an array-of-strings node."""
    return _node("array", items=_string(), **extra)


def _object_list(properties, required=None):
    """Return an array node whose items are objects with the given shape."""
    return _node("array", items=_object(properties, required))


def _company():
    """Return the name/CNPJ/address object used for invoice issuer and recipient."""
    return _object({
        "nome": _string(),
        "cnpj": _string(),
        "endereco": _string(),
    })


def _preset(description, properties, required):
    """Bundle a human-readable description with its root object schema."""
    return {"description": description, "schema": _object(properties, required)}


# Ready-made extraction schemas, keyed by the title shown to the user.
PRESET_SCHEMAS = {
    # Legal contract: parties, subject, amount, dates and obligations.
    "Contrato Legal": _preset(
        "Extrai partes, objeto, valor, prazo e obrigações de contratos.",
        {
            "partes": _object_list(
                {
                    "nome": _string(),
                    "papel": _string(enum=["contratante", "contratado", "fiador", "outro"]),
                },
                required=["nome", "papel"],
            ),
            "objeto": _string(description="O que é contratado"),
            "valor_total": _number(description="Valor em reais"),
            "moeda": _string(default="BRL"),
            "data_inicio": _string(description="YYYY-MM-DD ou descrição"),
            "data_fim": _string(description="YYYY-MM-DD ou descrição"),
            "obrigacoes_principais": _string_list(),
            "clausulas_especiais": _string_list(),
            "jurisdicao": _string(),
            "assinado": _node("boolean"),
        },
        ["partes", "objeto"],
    ),
    # News story / article: entities, key facts and metadata.
    "Notícia / Artigo": _preset(
        "Extrai entidades, fatos e metadados de textos jornalísticos.",
        {
            "titulo": _string(),
            "data": _string(),
            "autor": _string(),
            "resumo": _string(description="1-2 frases"),
            "pessoas": _string_list(),
            "organizacoes": _string_list(),
            "locais": _string_list(),
            "fatos_chave": _string_list(),
            "sentimento": _string(enum=["positivo", "negativo", "neutro", "misto"]),
            "categorias": _node(
                "array",
                items=_string(enum=["política", "economia", "tecnologia", "saúde",
                                    "esporte", "cultura", "outro"]),
            ),
            "dados_numericos": _string_list(
                description="Números, percentuais, valores mencionados",
            ),
        },
        ["titulo", "resumo", "fatos_chave"],
    ),
    # Résumé / CV: profile, work history, education and skills.
    "Currículo / CV": _preset(
        "Extrai perfil profissional, experiências e habilidades.",
        {
            "nome": _string(),
            "email": _string(),
            "telefone": _string(),
            "cargo_atual": _string(),
            "resumo_profissional": _string(),
            "experiencias": _object_list(
                {
                    "empresa": _string(),
                    "cargo": _string(),
                    "periodo": _string(),
                    "descricao": _string(),
                },
                required=["empresa", "cargo"],
            ),
            "formacao": _object_list({
                "instituicao": _string(),
                "curso": _string(),
                "ano": _string(),
            }),
            "habilidades_tecnicas": _string_list(),
            "idiomas": _string_list(),
            "anos_experiencia": _node("integer"),
        },
        ["nome", "experiencias"],
    ),
    # Invoice: issuer, recipient, line items and totals.
    "Invoice / Nota Fiscal": _preset(
        "Extrai dados financeiros e itens de notas fiscais e invoices.",
        {
            "numero_documento": _string(),
            "data_emissao": _string(),
            "data_vencimento": _string(),
            "emitente": _company(),
            "destinatario": _company(),
            "itens": _object_list(
                {
                    "descricao": _string(),
                    "quantidade": _number(),
                    "valor_unit": _number(),
                    "valor_total": _number(),
                },
                required=["descricao", "valor_total"],
            ),
            "subtotal": _number(),
            "impostos": _number(),
            "total": _number(),
            "moeda": _string(default="BRL"),
            "forma_pagamento": _string(),
            "observacoes": _string(),
        },
        ["itens", "total"],
    ),
    # Scientific paper: metadata, methodology and reported results.
    "Artigo Científico": _preset(
        "Extrai metadados, metodologia e resultados de papers.",
        {
            "titulo": _string(),
            "autores": _string_list(),
            "venue": _string(description="Conferência ou journal"),
            "ano": _node("integer"),
            "abstract": _string(),
            "problema": _string(description="Problema que o paper resolve"),
            "metodologia": _string(),
            "modelo_proposto": _string(),
            "datasets": _string_list(),
            "metricas": _object_list({
                "nome": _string(),
                "valor": _string(),
                "dataset": _string(),
            }),
            "contribuicoes": _string_list(),
            "limitacoes": _string_list(),
            "palavras_chave": _string_list(),
        },
        ["titulo", "autores", "problema"],
    ),
}

# ── SYSTEM PROMPT ─────────────────────────────────────────────

SYSTEM = """Você é um extrator especialista de informações estruturadas.
Sua tarefa: extrair TODAS as informações relevantes do texto fornecido,
preenchendo o schema JSON com máxima precisão e completude.

Regras:
- Extraia apenas o que está explicitamente no texto
- Use null para campos ausentes (não invente dados)
- Para listas, extraia todos os itens encontrados
- Preserve valores numéricos exatamente como aparecem
- Datas: converta para YYYY-MM-DD quando possível
- Se o campo for ambíguo, escolha a interpretação mais óbvia"""


# ── ENGINE ────────────────────────────────────────────────────

class StructuredExtractor:
    """Extract schema-shaped data from free text via OpenAI function calling.

    Attributes:
        client: OpenAI SDK client created from the key passed to ``__init__``.
        model: Name of the chat model used for every extraction request.
    """

    # The OpenAI API only accepts function names made of a-z, A-Z, 0-9,
    # underscores and dashes, with a maximum length of 64 characters.
    _MAX_TOOL_NAME_LEN = 64
    _FALLBACK_TOOL_NAME = "extracted_data"

    def __init__(self, openai_api_key: str):
        """Build the OpenAI client.

        The SDK is imported here, inside the constructor, rather than at
        module level.
        """
        from openai import OpenAI
        self.client = OpenAI(api_key=openai_api_key)
        self.model  = "gpt-4o-mini"

    @staticmethod
    def _tool_name(schema_name: str) -> str:
        """Turn a free-form schema title into a valid OpenAI function name.

        Accents are stripped, the text is lower-cased, and every run of
        characters outside ``[a-z0-9_-]`` collapses into a single underscore
        (so "Notícia / Artigo" becomes "noticia_artigo"). The result is capped
        at 64 characters; if nothing usable remains, a fixed fallback name is
        returned.
        """
        import unicodedata

        ascii_only = (
            unicodedata.normalize("NFKD", schema_name)
            .encode("ascii", "ignore")
            .decode("ascii")
        )
        slug = re.sub(r"[^a-z0-9_-]+", "_", ascii_only.lower()).strip("_")
        slug = slug[:StructuredExtractor._MAX_TOOL_NAME_LEN]
        return slug or StructuredExtractor._FALLBACK_TOOL_NAME

    def extract(self, text: str, schema: dict,
                schema_name: str = "extracted_data",
                max_retries: int = 2) -> dict:
        """Extract structured data from ``text`` using OpenAI function calling.

        Args:
            text: Source text to extract from.
            schema: JSON Schema (object) used as the function's parameters.
            schema_name: Human-readable schema title; it is used verbatim in
                the tool description and sanitized to form the function name.
            max_retries: Number of extra attempts after the first one.

        Returns:
            A dict with the keys ``data`` (parsed arguments), ``tokens``
            (total tokens reported for the successful call), ``attempts``
            (1-based attempt number that succeeded), ``method``
            (always ``"function_calling"``), ``validation`` and ``raw_json``
            (the unparsed argument string).

        Raises:
            RuntimeError: If the final attempt still returned invalid JSON.
            Exception: Any other error raised on the final attempt is
                re-raised unchanged.
        """
        tool_name = self._tool_name(schema_name)

        tool = {
            "type": "function",
            "function": {
                "name":        tool_name,
                "description": f"Extrai {schema_name} do texto fornecido.",
                "parameters":  schema,
            }
        }

        messages = [
            {"role": "system", "content": SYSTEM},
            {"role": "user",   "content": f"Texto para extração:\n\n{text}"},
        ]

        last_error = None
        for attempt in range(1, max_retries + 2):
            try:
                if last_error:
                    # Feed the previous failure back so the model can correct itself.
                    messages.append({
                        "role": "user",
                        "content": f"Erro na tentativa anterior: {last_error}. "
                                   f"Corrija e tente novamente respeitando o schema."
                    })

                resp = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    tools=[tool],
                    # Force the model to call exactly this function.
                    tool_choice={"type": "function",
                                 "function": {"name": tool_name}},
                    temperature=0.0,
                    max_tokens=1500,
                )

                tool_call = resp.choices[0].message.tool_calls[0]
                raw_json  = tool_call.function.arguments
                data      = json.loads(raw_json)

                # NOTE: this only records whether Pydantic can be imported;
                # the extracted payload is not validated against the schema.
                try:
                    from pydantic import create_model, ValidationError  # noqa: F401
                    validation_note = "pydantic_ok"
                except ImportError:
                    validation_note = "pydantic_unavailable"

                return {
                    "data":       data,
                    "tokens":     resp.usage.total_tokens,
                    "attempts":   attempt,
                    "method":     "function_calling",
                    "validation": validation_note,
                    "raw_json":   raw_json,
                }

            except json.JSONDecodeError as e:
                # Malformed arguments: retry; after the last attempt the loop
                # ends and the RuntimeError below is raised.
                last_error = f"JSON inválido: {e}"
            except Exception as e:
                last_error = str(e)
                if attempt > max_retries:
                    # Out of attempts: surface the original error to the caller.
                    raise

        raise RuntimeError(f"Falha após {max_retries+1} tentativas: {last_error}")

    def extract_with_custom_schema(self, text: str, schema_json_str: str) -> dict:
        """Parse a user-supplied JSON Schema string and run the extraction.

        Args:
            text: Source text to extract from.
            schema_json_str: JSON text that must decode to a JSON object.

        Returns:
            The result dict produced by ``extract``.

        Raises:
            ValueError: If the string is not valid JSON or does not decode to
                a JSON object.
        """
        try:
            schema = json.loads(schema_json_str)
        except json.JSONDecodeError as e:
            raise ValueError(f"Schema JSON inválido: {e}") from e
        if not isinstance(schema, dict):
            # Lists, strings, numbers and null are valid JSON but cannot be
            # used as function parameters; reject them before calling the API.
            raise ValueError(
                "Schema JSON inválido: o schema deve ser um objeto JSON."
            )
        return self.extract(text, schema, schema_name="custom_extraction")