# app.py — Structured Output Extractor | Function Calling + Pydantic
#
# NOTE(review): this copy of the file had all of its HTML markup stripped (the
# custom CSS block and the tag wrappers inside every st.markdown call), which
# left many string literals broken across lines and the module syntactically
# invalid.  The code below restores a valid, runnable script that preserves all
# visible text, widget keys, and control flow.  The stripped markup itself
# (terminal-style classes such as "field-key", "field-val-bool-true",
# "json-bool") could not be recovered and should be re-added from the original
# app if the styled look is wanted back.
import streamlit as st
import json
import html
import os

st.set_page_config(
    page_title="Structured Extractor · Daniel Fonseca",
    page_icon="⬡",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ── CSS: TERMINAL HACKER ──────────────────────────────────────
# NOTE(review): the original <style> payload was lost in extraction; the call
# is kept so the place to restore the CSS stays obvious.
st.markdown(""" """, unsafe_allow_html=True)

# ── SESSION STATE ─────────────────────────────────────────────
# Seed each key exactly once; Streamlit reruns keep existing values.
for k, v in {
    'openai_key': '',
    'history': [],
    'active_schema': 'Contrato Legal',
    'custom_schema': '',
}.items():
    if k not in st.session_state:
        st.session_state[k] = v


# ── HELPERS ───────────────────────────────────────────────────
def get_key():
    """Return the OpenAI API key.

    Resolution order: ``st.secrets`` → ``OPENAI_API_KEY`` env var → the key
    typed into the sidebar (stored in session state).
    """
    try:
        # st.secrets can raise when no secrets.toml exists — treat as unset.
        if 'OPENAI_API_KEY' in st.secrets:
            return st.secrets['OPENAI_API_KEY']
    except Exception:
        pass
    return os.getenv('OPENAI_API_KEY', st.session_state.openai_key)


def _esc(text: str) -> str:
    """HTML-escape ``&``, ``<`` and ``>``.

    Output is rendered through st.markdown with unsafe_allow_html=True, so raw
    angle brackets in extracted data must be neutralised.
    """
    # quote=False matches the original intent: only & < > were escaped.
    return html.escape(text, quote=False)


def syntax_highlight_json(obj, indent=0) -> str:
    """Recursively render *obj* as indented, HTML-escaped JSON text.

    NOTE(review): the original wrapped each token in styled ``<span>``s (class
    names like "json-bool"); that markup was stripped from this copy, so the
    function now emits the same text content without the spans.

    Args:
        obj: any JSON-serialisable value (dict/list/str/bool/int/float/None).
        indent: current nesting depth; each level indents by 3 spaces.
    Returns:
        A multi-line JSON-like string safe to embed in HTML.
    """
    pad = " " * (indent * 3)
    pad2 = " " * ((indent + 1) * 3)
    if isinstance(obj, dict):
        if not obj:
            return '{}'
        lines = ['{']
        items = list(obj.items())
        for i, (k, v) in enumerate(items):
            comma = "," if i < len(items) - 1 else ""
            val_html = syntax_highlight_json(v, indent + 1)
            lines.append(f'{pad2}"{k}": {val_html}{comma}')
        lines.append(f'{pad}}}')
        return "\n".join(lines)
    elif isinstance(obj, list):
        if not obj:
            return '[]'
        lines = ['[']
        for i, item in enumerate(obj):
            comma = "," if i < len(obj) - 1 else ""
            val_html = syntax_highlight_json(item, indent + 1)
            lines.append(f'{pad2}{val_html}{comma}')
        lines.append(f'{pad}]')
        return "\n".join(lines)
    elif isinstance(obj, bool):
        # bool must be tested before int: bool is an int subclass.
        return 'true' if obj else 'false'
    elif isinstance(obj, str):
        # FIX(review): the escape chain had degenerated into no-op replaces
        # ('&' -> '&', '<' -> '<', ...); restore real HTML-entity escaping.
        return f'"{_esc(obj)}"'
    elif obj is None:
        return 'null'
    elif isinstance(obj, (int, float)):
        return f'{obj}'
    else:
        # Fallback: stringify unknown types as a JSON string.
        return f'"{obj}"'


def render_flat_fields(data: dict) -> str:
    """Render the flat (non-nested) fields of *data* as HTML "cards".

    dict/list values are skipped — the caller renders those separately.
    Returns "" when there is nothing flat to show.

    NOTE(review): the card markup (tags + CSS classes, e.g.
    "field-val-bool-true"/"field-val-bool-false") was stripped from this copy;
    the function now emits the same visible text, one key/value per line.
    """
    cards = []
    for k, v in data.items():
        if isinstance(v, (dict, list)):
            continue
        if v is None:
            val_html = 'null'
        elif isinstance(v, bool):
            # Originally styled via "field-val-bool-true"/"field-val-bool-false".
            val_html = 'true' if v else 'false'
        else:
            # FIX(review): restore real HTML escaping (was a no-op replace chain).
            val_html = _esc(str(v))
        cards.append(f'{k}: {val_html}')
    if not cards:
        return ""
    return "\n\n".join(cards)


# ── SIDEBAR ───────────────────────────────────────────────────
with st.sidebar:
    st.markdown("""
STRUCT//EXTRACT
v1.0 · Function Calling Engine
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("**🔑 OpenAI API Key**")
    k_in = st.text_input(
        "",
        type="password",
        value=st.session_state.openai_key,
        placeholder="sk-...",
        label_visibility="collapsed",
    )
    if k_in:
        st.session_state.openai_key = k_in
    if get_key():
        st.markdown('✓ KEY LOADED', unsafe_allow_html=True)
    else:
        st.markdown('✗ KEY MISSING', unsafe_allow_html=True)
    st.divider()
    st.markdown("""
// PIPELINE
01. Text input
02. Schema selection
03. Tool definition (OpenAI)
04. Function calling
05. JSON parse + validate
06. Retry on error
07. Render + export
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("""
model: gpt-4o-mini
tool_choice: required
temperature: 0.0
max_retries: 2
validation: pydantic v2
""", unsafe_allow_html=True)
    st.divider()
    if st.button("⬡ Limpar histórico", use_container_width=True):
        st.session_state.history = []
        st.rerun()

# ── HEADER ────────────────────────────────────────────────────
st.markdown("""
⬡ STRUCTURED OUTPUT EXTRACTOR _
OpenAI Function Calling · Pydantic v2 · Dynamic JSON Schema · Auto-Retry
""", unsafe_allow_html=True)

# ── TABS ──────────────────────────────────────────────────────
tab_extract, tab_custom, tab_history = st.tabs([
    "⬡ Extrair",
    "⬢ Schema Customizado",
    "⬣ Histórico",
])

# ════════════════════════════════════════════════════════════════
# EXAMPLES
# ════════════════════════════════════════════════════════════════
# NOTE(review): internal line breaks of these sample texts were collapsed to
# single spaces by the extraction; the wording is preserved verbatim.
EXAMPLES = {
    "Contrato Legal": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS DE CONSULTORIA EM INTELIGÊNCIA ARTIFICIAL Entre as partes: CONTRATANTE: TechCorp Brasil Ltda., CNPJ 12.345.678/0001-99, com sede em São Paulo/SP. CONTRATADO: Daniel Fonseca - ML Engineer, CPF 123.456.789-00, residente no Rio de Janeiro/RJ. CLÁUSULA 1 - OBJETO O CONTRATADO prestará serviços de consultoria em Graph Neural Networks e sistemas de detecção de fraude com IA Generativa, incluindo desenvolvimento de modelos, treinamento de equipes e documentação técnica. CLÁUSULA 2 - VALOR O valor total dos serviços é de R$ 48.000,00 (quarenta e oito mil reais), pagos em 4 parcelas mensais de R$ 12.000,00. CLÁUSULA 3 - PRAZO Vigência de 4 (quatro) meses, iniciando em 01/04/2025 e encerrando em 31/07/2025. CLÁUSULA 4 - OBRIGAÇÕES DO CONTRATADO - Entregar relatórios mensais de progresso - Participar de reuniões semanais remotas - Manter confidencialidade sobre os dados da empresa CLÁUSULA 5 - FORO Fica eleito o foro da Comarca de São Paulo/SP para dirimir quaisquer controvérsias. Assinado digitalmente em 28/03/2025.""",
    "Notícia / Artigo": """Meta anuncia novo modelo de linguagem open-source com 405 bilhões de parâmetros SAN FRANCISCO, 15 de março de 2025 — A Meta Platforms anunciou nesta quinta-feira o lançamento do Llama 4, seu mais novo modelo de linguagem de grande escala com 405 bilhões de parâmetros, disponível gratuitamente para pesquisadores e empresas sob licença open-source. O CEO Mark Zuckerberg afirmou que o modelo supera o GPT-4o em 73% dos benchmarks testados internamente, incluindo MMLU, HumanEval e MT-Bench.
A vice-presidente de IA da empresa, Yann LeCun, destacou que o modelo foi treinado em 30 trilhões de tokens de dados multimodais. O lançamento acontece em meio à crescente disputa entre Meta, OpenAI, Google e Anthropic pelo mercado de IA generativa, avaliado em US$ 2,4 trilhões até 2030 segundo a consultoria Goldman Sachs. Especialistas do MIT e Stanford avaliam que a decisão de tornar o modelo open-source pode democratizar o acesso à IA avançada, embora levante preocupações sobre uso malicioso. O governo americano já sinalizou que pode regulamentar o setor ainda em 2025.""",
    "Artigo Científico": """GraphSAGE: Inductive Representation Learning on Large Graphs Autores: William L. Hamilton, Rex Ying, Jure Leskovec Venue: NeurIPS 2017, Long Beach, CA Abstract: Low-dimensional embeddings of nodes in large graphs have proved extremely useful in a variety of prediction tasks. However, most existing approaches require that all nodes in the graph are present during training of the embeddings; these previous approaches are inherently transductive and do not naturally generalize to unseen nodes. Problema resolvido: A maioria dos métodos de embedding para grafos é transductive — só funciona para nós vistos durante o treino. Em aplicações reais como redes sociais e sistemas de recomendação, novos nós aparecem constantemente. Metodologia: O GraphSAGE aprende funções de agregação (mean, LSTM, pooling) que generalizam para nós não vistos, combinando features do nó com as de sua vizinhança amostrada. Resultados: - Dataset Citation (Cora): F1 = 0.935 - Dataset Reddit: F1 = 0.950 - Dataset PPI (Protein-Protein Interaction): F1 = 0.612 (vs 0.421 baseline) Contribuições principais: 1. Framework inductive para grafos de larga escala 2. Três agregadores comparados: mean, LSTM, max-pooling 3. Mini-batch training para escalabilidade 4.
Open-source no repositório snap-stanford/GraphSAGE""",
}

# ════════════════════════════════════════════════════════════════
# TAB 1 — EXTRAIR
# ════════════════════════════════════════════════════════════════
with tab_extract:
    from extractor import PRESET_SCHEMAS

    # Schema selector: one button per preset; clicking re-runs with it active.
    st.markdown("""
user@extractor:~$ select --schema
""", unsafe_allow_html=True)
    schema_cols = st.columns(len(PRESET_SCHEMAS))
    for i, (name, _) in enumerate(PRESET_SCHEMAS.items()):
        with schema_cols[i]:
            if st.button(name, key=f"sc_{i}", use_container_width=True):
                st.session_state.active_schema = name
                st.rerun()
    active_schema = PRESET_SCHEMAS[st.session_state.active_schema]
    st.markdown(f"""
// {st.session_state.active_schema} — {active_schema['description']}
""", unsafe_allow_html=True)

    # Quick example loader: stash the text and pick it up as the default value.
    col_ex, _ = st.columns([2, 3])
    with col_ex:
        if st.button(f"⬡ Carregar exemplo: {st.session_state.active_schema}",
                     use_container_width=True):
            ex_text = EXAMPLES.get(st.session_state.active_schema, "")
            if ex_text:
                st.session_state["load_example"] = ex_text
    # pop() so the example is consumed once and not re-applied on later reruns.
    default_text = st.session_state.pop("load_example", "")
    st.markdown("""
user@extractor:~$ paste --input
""", unsafe_allow_html=True)
    text_input = st.text_area(
        "",
        value=default_text,
        height=220,
        placeholder="Cole qualquer texto aqui: contrato, notícia, currículo, invoice, artigo...",
        label_visibility="collapsed",
        key="main_text",
    )

    run_col, _ = st.columns([1, 3])
    with run_col:
        run_btn = st.button("⬡ EXTRAIR DADOS", use_container_width=True, type="primary")

    if run_btn:
        # Guard clauses: key and text are both required before calling the API.
        if not get_key():
            st.markdown('✗ API Key não configurada', unsafe_allow_html=True)
            st.stop()
        if not text_input.strip():
            st.markdown('⚠ Cole um texto para extrair', unsafe_allow_html=True)
            st.stop()

        from extractor import StructuredExtractor

        # Progress "terminal": a placeholder overwritten when the call returns.
        prog_ph = st.empty()
        prog_ph.markdown("""
extraction.log
→ Inicializando engine...
→ Tool definition criada
→ Chamando gpt-4o-mini com tool_choice=required...
⟳ Aguardando resposta_
""", unsafe_allow_html=True)
        try:
            engine = StructuredExtractor(get_key())
            result = engine.extract(
                text=text_input,
                schema=active_schema["schema"],
                schema_name=st.session_state.active_schema,
            )
            prog_ph.markdown(f"""
extraction.log
✓ Engine inicializado
✓ Tool definition: {st.session_state.active_schema}
✓ Function call executado com sucesso
✓ JSON parseado e validado
✓ EXTRAÇÃO COMPLETA em {result['attempts']} tentativa(s) · {result['tokens']} tokens
""", unsafe_allow_html=True)

            # Save to history (rendered in tab 3).
            st.session_state.history.append({
                "schema": st.session_state.active_schema,
                "text_preview": text_input[:120] + "...",
                "result": result,
            })

            # ── OUTPUT ──────────────────────────────────────────
            out_col, raw_col = st.columns([3, 2], gap="large")
            with out_col:
                st.markdown("""
user@extractor:~$ render --view=structured
""", unsafe_allow_html=True)
                # Flat fields as cards.
                flat_html = render_flat_fields(result["data"])
                if flat_html:
                    st.markdown(flat_html, unsafe_allow_html=True)
                # Complex fields (lists/objects), one section per key.
                for k, v in result["data"].items():
                    if not isinstance(v, (dict, list)):
                        continue
                    st.markdown(f"""
// {k}
""", unsafe_allow_html=True)
                    if isinstance(v, list):
                        for item in v:
                            if isinstance(item, dict):
                                st.markdown(f"""
{syntax_highlight_json(item, 0)}
""", unsafe_allow_html=True)
                            else:
                                # FIX(review): was a no-op '<'->'<' replace.
                                st.markdown(_esc(str(item)), unsafe_allow_html=True)
                    elif isinstance(v, dict):
                        st.markdown(f"""
{syntax_highlight_json(v, 0)}
""", unsafe_allow_html=True)
                # Stats footer.
                st.markdown(f"""
schema: {st.session_state.active_schema}
fields: {len(result['data'])}
tokens: {result['tokens']}
attempts: {result['attempts']}
method: {result['method']}
""", unsafe_allow_html=True)

            with raw_col:
                st.markdown("""
user@extractor:~$ cat output.json
""", unsafe_allow_html=True)
                json_html = syntax_highlight_json(result["data"])
                st.markdown(f'{json_html}', unsafe_allow_html=True)
                # Download of the raw extraction.
                st.download_button(
                    "⬡ Download JSON",
                    data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                    file_name=f"extracted_{st.session_state.active_schema.lower().replace(' ', '_')}.json",
                    mime="application/json",
                    use_container_width=True,
                )
        except Exception as e:
            # Surface any extraction failure in the progress "terminal".
            prog_ph.markdown(f"""
error.log
✗ ERRO: {e}
""", unsafe_allow_html=True)

# ════════════════════════════════════════════════════════════════
# TAB 2 — SCHEMA CUSTOMIZADO
# ════════════════════════════════════════════════════════════════
with tab_custom:
    st.markdown("""
user@extractor:~$ define --schema=custom
// Defina seu próprio JSON Schema e extraia qualquer estrutura de qualquer texto
""", unsafe_allow_html=True)

    # Default JSON Schema shown in the editor (valid JSON; user-editable).
    DEFAULT_CUSTOM = '''{
  "type": "object",
  "properties": {
    "nome_produto": {"type": "string"},
    "preco": {"type": "number"},
    "categorias": {"type": "array", "items": {"type": "string"}},
    "disponivel": {"type": "boolean"},
    "especificacoes": {
      "type": "object",
      "properties": {
        "peso": {"type": "string"},
        "cor": {"type": "string"}
      }
    }
  },
  "required": ["nome_produto"]
}'''

    c_left, c_right = st.columns(2, gap="large")
    with c_left:
        st.markdown('$ vim schema.json', unsafe_allow_html=True)
        custom_schema = st.text_area(
            "",
            value=st.session_state.custom_schema or DEFAULT_CUSTOM,
            height=280,
            label_visibility="collapsed",
            key="custom_schema_input",
        )
    with c_right:
        st.markdown('$ cat input.txt', unsafe_allow_html=True)
        custom_text = st.text_area(
            "",
            height=280,
            label_visibility="collapsed",
            key="custom_text",
            placeholder="Cole o texto para extrair...",
        )

    run_custom = st.button("⬡ EXTRAIR COM SCHEMA CUSTOMIZADO", use_container_width=True)
    if run_custom:
        if not get_key():
            st.error("Configure a API Key na sidebar.")
            st.stop()
        if not custom_text.strip() or not custom_schema.strip():
            st.warning("Preencha o schema e o texto.")
            st.stop()

        from extractor import StructuredExtractor

        with st.spinner("Extraindo..."):
            try:
                engine = StructuredExtractor(get_key())
                result = engine.extract_with_custom_schema(custom_text, custom_schema)
                st.markdown('$ cat output.json', unsafe_allow_html=True)
                json_html = syntax_highlight_json(result["data"])
                st.markdown(f'{json_html}', unsafe_allow_html=True)
                st.markdown(f"""
tokens: {result['tokens']}
attempts: {result['attempts']}
""", unsafe_allow_html=True)
                st.download_button(
                    "⬡ Download JSON",
                    data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                    file_name="custom_extraction.json",
                    mime="application/json",
                )
                st.session_state.history.append({
                    "schema": "Custom",
                    "text_preview": custom_text[:120] + "...",
                    "result": result,
                })
            except ValueError as e:
                # extractor raises ValueError for an invalid user schema.
                st.error(f"Schema inválido: {e}")
            except Exception as e:
                st.error(f"Erro: {e}")

# ════════════════════════════════════════════════════════════════
# TAB 3 — HISTÓRICO
# ════════════════════════════════════════════════════════════════
with tab_history:
    if not st.session_state.history:
        st.markdown("""
// nenhuma extração executada ainda
""", unsafe_allow_html=True)
    else:
        # Newest first; only the most recent entry starts expanded.
        for i, h in enumerate(reversed(st.session_state.history)):
            r = h["result"]
            with st.expander(
                f"#{len(st.session_state.history) - i} · {h['schema']} · {r['tokens']} tokens",
                expanded=(i == 0),
            ):
                st.markdown(f"""
// {h['text_preview']}
""", unsafe_allow_html=True)
                json_html = syntax_highlight_json(r["data"])
                st.markdown(f'{json_html}', unsafe_allow_html=True)
                st.download_button(
                    "⬡ Download",
                    data=json.dumps(r["data"], ensure_ascii=False, indent=2),
                    file_name=f"extract_{i}.json",
                    mime="application/json",
                    key=f"dl_{i}",
                )