# app.py — Structured Output Extractor | Function Calling + Pydantic
import streamlit as st
import json
import os
# Configure the Streamlit page; set_page_config must be the first st.* call.
st.set_page_config(
page_title="Structured Extractor · Daniel Fonseca",
page_icon="⬡",
layout="wide",
initial_sidebar_state="expanded",
)
# ── CSS: TERMINAL HACKER ──────────────────────────────────────
# NOTE(review): the CSS payload appears to have been stripped from this
# markdown call — the triple-quoted string is empty. Restore the original
# <style> block here.
st.markdown("""
""", unsafe_allow_html=True)
# ── SESSION STATE ──────────────────────────────────────────────
# Seed per-session defaults; keys already present are left untouched so
# values survive Streamlit reruns.
_SESSION_DEFAULTS = {
    'openai_key': '',
    'history': [],
    'active_schema': 'Contrato Legal',
    'custom_schema': '',
}
for state_key, default in _SESSION_DEFAULTS.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = default
# ── HELPERS ───────────────────────────────────────────────────
def get_key():
    """Return the OpenAI API key.

    Resolution order: st.secrets, then the OPENAI_API_KEY environment
    variable, then the key typed into the sidebar (session state).
    """
    sentinel = object()
    found = sentinel
    try:
        # st.secrets raises when no secrets backend is configured;
        # swallow that and fall through to the environment/session.
        if 'OPENAI_API_KEY' in st.secrets:
            found = st.secrets['OPENAI_API_KEY']
    except Exception:
        pass
    if found is not sentinel:
        return found
    return os.getenv('OPENAI_API_KEY', st.session_state.openai_key)
def syntax_highlight_json(obj, indent: int = 0) -> str:
    """Render *obj* as pretty-printed, HTML-safe JSON text.

    Produces a JSON-like string with 3-space indentation per nesting
    level. String values are HTML-escaped so the output can be embedded
    in markup rendered with ``unsafe_allow_html=True``.

    Args:
        obj: A JSON-serializable value (dict, list, str, bool, None,
            int, float). Anything else is stringified and quoted.
        indent: Current nesting depth (used by the recursion).

    Returns:
        The formatted JSON representation.
    """
    pad = " " * (indent * 3)
    pad2 = " " * ((indent + 1) * 3)
    if isinstance(obj, dict):
        if not obj:
            return '{}'
        lines = ['{']
        items = list(obj.items())
        for i, (k, v) in enumerate(items):
            comma = "," if i < len(items) - 1 else ""
            val_html = syntax_highlight_json(v, indent + 1)
            # NOTE(review): keys are emitted unescaped — assumed to come
            # from trusted schema definitions; confirm against callers.
            lines.append(f'{pad2}"{k}": {val_html}{comma}')
        lines.append(f'{pad}}}')
        return "\n".join(lines)
    elif isinstance(obj, list):
        if not obj:
            return '[]'
        lines = ['[']
        for i, item in enumerate(obj):
            comma = "," if i < len(obj) - 1 else ""
            val_html = syntax_highlight_json(item, indent + 1)
            lines.append(f'{pad2}{val_html}{comma}')
        lines.append(f'{pad}]')
        return "\n".join(lines)
    elif isinstance(obj, str):
        # Fix: the previous replacements were no-ops ("&" -> "&").
        # Escape '&' first so it does not double-escape the entities
        # introduced by the '<' and '>' replacements.
        escaped = obj.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        return f'"{escaped}"'
    elif isinstance(obj, bool):
        # bool is checked before int/float: bool is a subclass of int.
        return "true" if obj else "false"
    elif obj is None:
        return 'null'
    elif isinstance(obj, (int, float)):
        return str(obj)
    else:
        # Fallback: stringify unknown types as a quoted value.
        return f'"{obj}"'
def render_flat_fields(data: dict) -> str:
    """Render the flat (non-nested) fields of *data* as HTML cards.

    Dict and list values are skipped — the caller renders those
    separately as JSON blocks. Returns "" when nothing flat remains.

    NOTE(review): the original markup in this function was corrupted
    (unterminated f-string literals); the card structure below is a
    minimal reconstruction using the CSS class names that survived
    ("field-val-bool-true" / "field-val-bool-false").
    """
    cards = []
    for key, value in data.items():
        if isinstance(value, (dict, list)):
            continue  # nested structures handled by the caller
        key_html = f'<div class="field-key">{key}</div>'
        if value is None:
            val_html = '<div class="field-val-null">null</div>'
        elif isinstance(value, bool):
            cls = "field-val-bool-true" if value else "field-val-bool-false"
            val_html = f'<div class="{cls}">{"true" if value else "false"}</div>'
        else:
            # Fix: the previous replacements were no-ops ("<" -> "<").
            # Escape user text so it cannot inject markup.
            escaped = str(value).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
            val_html = f'<div class="field-val">{escaped}</div>'
        cards.append(f'<div class="field-card">{key_html}{val_html}</div>')
    if not cards:
        return ""
    return f'<div class="field-grid">{"".join(cards)}</div>'
# ── SIDEBAR ───────────────────────────────────────────────────
# API key entry, pipeline description, engine settings, history reset.
# NOTE(review): several markdown payloads here were corrupted
# (unterminated single-quoted strings); the visible text is preserved
# and the string literals repaired.
with st.sidebar:
    st.markdown("""
STRUCT//EXTRACT
v1.0 · Function Calling Engine
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("**🔑 OpenAI API Key**")
    k_in = st.text_input("", type="password", value=st.session_state.openai_key,
                         placeholder="sk-...", label_visibility="collapsed")
    if k_in:
        st.session_state.openai_key = k_in
    # Status badge reflecting whichever key source get_key() resolved.
    if get_key():
        st.markdown('✓ KEY LOADED', unsafe_allow_html=True)
    else:
        st.markdown('✗ KEY MISSING', unsafe_allow_html=True)
    st.divider()
    st.markdown("""
// PIPELINE
01. Text input
02. Schema selection
03. Tool definition (OpenAI)
04. Function calling
05. JSON parse + validate
06. Retry on error
07. Render + export
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("""
model: gpt-4o-mini
tool_choice: required
temperature: 0.0
max_retries: 2
validation: pydantic v2
""", unsafe_allow_html=True)
    st.divider()
    # Clear the stored extractions for this session.
    if st.button("⬡ Limpar histórico", use_container_width=True):
        st.session_state.history = []
        st.rerun()
# ── HEADER ────────────────────────────────────────────────────
# NOTE(review): the header HTML appears to have been stripped from this
# markdown call — the triple-quoted string is empty.
st.markdown("""
""", unsafe_allow_html=True)
# ── TABS ──────────────────────────────────────────────────────
# Three top-level views: preset extraction, custom schema, run history.
tab_extract, tab_custom, tab_history = st.tabs([
"⬡ Extrair",
"⬢ Schema Customizado",
"⬣ Histórico",
])
# ════════════════════════════════════════════════════════════════
# EXEMPLOS
# ════════════════════════════════════════════════════════════════
# Sample input texts keyed by preset-schema name; loaded into the text
# area by the "Carregar exemplo" button in the extract tab.
EXAMPLES = {
"Contrato Legal": """CONTRATO DE PRESTAÇÃO DE SERVIÇOS DE CONSULTORIA EM INTELIGÊNCIA ARTIFICIAL
Entre as partes:
CONTRATANTE: TechCorp Brasil Ltda., CNPJ 12.345.678/0001-99, com sede em São Paulo/SP.
CONTRATADO: Daniel Fonseca - ML Engineer, CPF 123.456.789-00, residente no Rio de Janeiro/RJ.
CLÁUSULA 1 - OBJETO
O CONTRATADO prestará serviços de consultoria em Graph Neural Networks e sistemas de detecção de fraude com IA Generativa, incluindo desenvolvimento de modelos, treinamento de equipes e documentação técnica.
CLÁUSULA 2 - VALOR
O valor total dos serviços é de R$ 48.000,00 (quarenta e oito mil reais), pagos em 4 parcelas mensais de R$ 12.000,00.
CLÁUSULA 3 - PRAZO
Vigência de 4 (quatro) meses, iniciando em 01/04/2025 e encerrando em 31/07/2025.
CLÁUSULA 4 - OBRIGAÇÕES DO CONTRATADO
- Entregar relatórios mensais de progresso
- Participar de reuniões semanais remotas
- Manter confidencialidade sobre os dados da empresa
CLÁUSULA 5 - FORO
Fica eleito o foro da Comarca de São Paulo/SP para dirimir quaisquer controvérsias.
Assinado digitalmente em 28/03/2025.""",
"Notícia / Artigo": """Meta anuncia novo modelo de linguagem open-source com 405 bilhões de parâmetros
SAN FRANCISCO, 15 de março de 2025 — A Meta Platforms anunciou nesta quinta-feira o lançamento do Llama 4, seu mais novo modelo de linguagem de grande escala com 405 bilhões de parâmetros, disponível gratuitamente para pesquisadores e empresas sob licença open-source.
O CEO Mark Zuckerberg afirmou que o modelo supera o GPT-4o em 73% dos benchmarks testados internamente, incluindo MMLU, HumanEval e MT-Bench. A vice-presidente de IA da empresa, Yann LeCun, destacou que o modelo foi treinado em 30 trilhões de tokens de dados multimodais.
O lançamento acontece em meio à crescente disputa entre Meta, OpenAI, Google e Anthropic pelo mercado de IA generativa, avaliado em US$ 2,4 trilhões até 2030 segundo a consultoria Goldman Sachs.
Especialistas do MIT e Stanford avaliam que a decisão de tornar o modelo open-source pode democratizar o acesso à IA avançada, embora levante preocupações sobre uso malicioso. O governo americano já sinalizou que pode regulamentar o setor ainda em 2025.""",
"Artigo Científico": """GraphSAGE: Inductive Representation Learning on Large Graphs
Autores: William L. Hamilton, Rex Ying, Jure Leskovec
Venue: NeurIPS 2017, Long Beach, CA
Abstract:
Low-dimensional embeddings of nodes in large graphs have proved extremely useful in a variety of prediction tasks. However, most existing approaches require that all nodes in the graph are present during training of the embeddings; these previous approaches are inherently transductive and do not naturally generalize to unseen nodes.
Problema resolvido:
A maioria dos métodos de embedding para grafos é transductive — só funciona para nós vistos durante o treino. Em aplicações reais como redes sociais e sistemas de recomendação, novos nós aparecem constantemente.
Metodologia:
O GraphSAGE aprende funções de agregação (mean, LSTM, pooling) que generalizam para nós não vistos, combinando features do nó com as de sua vizinhança amostrada.
Resultados:
- Dataset Citation (Cora): F1 = 0.935
- Dataset Reddit: F1 = 0.950
- Dataset PPI (Protein-Protein Interaction): F1 = 0.612 (vs 0.421 baseline)
Contribuições principais:
1. Framework inductive para grafos de larga escala
2. Três agregadores comparados: mean, LSTM, max-pooling
3. Mini-batch training para escalabilidade
4. Open-source no repositório snap-stanford/GraphSAGE""",
}
# ════════════════════════════════════════════════════════════════
# TAB 1 — EXTRAIR
# ════════════════════════════════════════════════════════════════
# NOTE(review): several string literals in this block were corrupted
# (unterminated quotes); they are repaired below with the visible text
# preserved. Stripped HTML wrappers are not reconstructed.
with tab_extract:
    from extractor import PRESET_SCHEMAS

    # Schema selector — one button per preset schema.
    st.markdown("""
user@extractor:~$ select --schema
""", unsafe_allow_html=True)
    schema_cols = st.columns(len(PRESET_SCHEMAS))
    for i, (name, _) in enumerate(PRESET_SCHEMAS.items()):
        with schema_cols[i]:
            # (The original computed an unused 'active' flag, presumably
            # for button styling that was stripped with the markup.)
            if st.button(name, key=f"sc_{i}", use_container_width=True):
                st.session_state.active_schema = name
                st.rerun()
    active_schema = PRESET_SCHEMAS[st.session_state.active_schema]
    st.markdown(f"""
// {st.session_state.active_schema} — {active_schema['description']}
""", unsafe_allow_html=True)

    # Quick-load the example text matching the active schema.
    col_ex, _ = st.columns([2, 3])
    with col_ex:
        if st.button(f"⬡ Carregar exemplo: {st.session_state.active_schema}",
                     use_container_width=True):
            ex_text = EXAMPLES.get(st.session_state.active_schema, "")
            if ex_text:
                st.session_state["load_example"] = ex_text
    # pop(): the example is injected into the text area exactly once.
    default_text = st.session_state.pop("load_example", "")

    st.markdown("""
user@extractor:~$ paste --input
""", unsafe_allow_html=True)
    text_input = st.text_area(
        "", value=default_text, height=220,
        placeholder="Cole qualquer texto aqui: contrato, notícia, currículo, invoice, artigo...",
        label_visibility="collapsed", key="main_text"
    )
    run_col, _ = st.columns([1, 3])
    with run_col:
        run_btn = st.button("⬡ EXTRAIR DADOS", use_container_width=True, type="primary")

    if run_btn:
        # Guard clauses: both a key and some input text are required.
        if not get_key():
            st.markdown('✗ API Key não configurada', unsafe_allow_html=True)
            st.stop()
        if not text_input.strip():
            st.markdown('⚠ Cole um texto para extrair', unsafe_allow_html=True)
            st.stop()
        from extractor import StructuredExtractor

        # Terminal-style progress log, replaced in place on completion.
        prog_ph = st.empty()
        prog_ph.markdown("""
extraction.log
→ Inicializando engine...
→ Tool definition criada
→ Chamando gpt-4o-mini com tool_choice=required...
⟳ Aguardando resposta_
""", unsafe_allow_html=True)
        try:
            engine = StructuredExtractor(get_key())
            result = engine.extract(
                text=text_input,
                schema=active_schema["schema"],
                schema_name=st.session_state.active_schema,
            )
            prog_ph.markdown(f"""
extraction.log
✓ Engine inicializado
✓ Tool definition: {st.session_state.active_schema}
✓ Function call executado com sucesso
✓ JSON parseado e validado
✓ EXTRAÇÃO COMPLETA em {result['attempts']} tentativa(s) · {result['tokens']} tokens
""", unsafe_allow_html=True)
            # Persist this run for the history tab.
            st.session_state.history.append({
                "schema": st.session_state.active_schema,
                "text_preview": text_input[:120] + "...",
                "result": result,
            })
            # ── OUTPUT ──────────────────────────────────────────
            out_col, raw_col = st.columns([3, 2], gap="large")
            with out_col:
                st.markdown("""
user@extractor:~$ render --view=structured
""", unsafe_allow_html=True)
                # Flat (scalar) fields rendered as cards.
                flat_html = render_flat_fields(result["data"])
                if flat_html:
                    st.markdown(flat_html, unsafe_allow_html=True)
                # Complex fields (lists / nested objects) as JSON blocks.
                for k, v in result["data"].items():
                    if not isinstance(v, (dict, list)):
                        continue
                    st.markdown(f"""
// {k}
""", unsafe_allow_html=True)
                    if isinstance(v, list):
                        for item in v:
                            if isinstance(item, dict):
                                st.markdown(f"""
{syntax_highlight_json(item, 0)}
""", unsafe_allow_html=True)
                            else:
                                # Fix: was a no-op replace feeding an empty
                                # markdown call; escape and render the item.
                                esc = str(item).replace("&", "&amp;").replace("<", "&lt;")
                                st.markdown(esc, unsafe_allow_html=True)
                    elif isinstance(v, dict):
                        st.markdown(f"""
{syntax_highlight_json(v, 0)}
""", unsafe_allow_html=True)
                # Run statistics.
                st.markdown(f"""
schema: {st.session_state.active_schema}
fields: {len(result['data'])}
tokens: {result['tokens']}
attempts: {result['attempts']}
method: {result['method']}
""", unsafe_allow_html=True)
            with raw_col:
                st.markdown("""
user@extractor:~$ cat output.json
""", unsafe_allow_html=True)
                json_html = syntax_highlight_json(result["data"])
                st.markdown(json_html, unsafe_allow_html=True)
                # Export the raw extraction as a JSON download.
                st.download_button(
                    "⬡ Download JSON",
                    data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                    file_name=f"extracted_{st.session_state.active_schema.lower().replace(' ','_')}.json",
                    mime="application/json",
                    use_container_width=True,
                )
        except Exception as e:
            # Fix: the original rendered an empty block and discarded the
            # exception — surface the failure in the progress placeholder.
            prog_ph.markdown(f"""
extraction.log
✗ ERRO: {e}
""", unsafe_allow_html=True)
# ════════════════════════════════════════════════════════════════
# TAB 2 — SCHEMA CUSTOMIZADO
# ════════════════════════════════════════════════════════════════
# NOTE(review): several single-quoted markdown strings in this block were
# corrupted (unterminated quotes); they are repaired below with the
# visible text preserved.
with tab_custom:
    st.markdown("""
user@extractor:~$ define --schema=custom
// Defina seu próprio JSON Schema e extraia qualquer estrutura de qualquer texto
""", unsafe_allow_html=True)
    # Starter schema shown when the user has not saved a custom one yet.
    DEFAULT_CUSTOM = '''{
  "type": "object",
  "properties": {
    "nome_produto": {"type": "string"},
    "preco": {"type": "number"},
    "categorias": {"type": "array", "items": {"type": "string"}},
    "disponivel": {"type": "boolean"},
    "especificacoes": {
      "type": "object",
      "properties": {
        "peso": {"type": "string"},
        "cor": {"type": "string"}
      }
    }
  },
  "required": ["nome_produto"]
}'''
    c_left, c_right = st.columns(2, gap="large")
    with c_left:
        st.markdown('$ vim schema.json', unsafe_allow_html=True)
        custom_schema = st.text_area(
            "", value=st.session_state.custom_schema or DEFAULT_CUSTOM,
            height=280, label_visibility="collapsed", key="custom_schema_input"
        )
    with c_right:
        st.markdown('$ cat input.txt', unsafe_allow_html=True)
        custom_text = st.text_area(
            "", height=280, label_visibility="collapsed", key="custom_text",
            placeholder="Cole o texto para extrair..."
        )
    run_custom = st.button("⬡ EXTRAIR COM SCHEMA CUSTOMIZADO", use_container_width=True)
    if run_custom:
        # Guard clauses: key, schema and text are all required.
        if not get_key():
            st.error("Configure a API Key na sidebar.")
            st.stop()
        if not custom_text.strip() or not custom_schema.strip():
            st.warning("Preencha o schema e o texto.")
            st.stop()
        from extractor import StructuredExtractor
        with st.spinner("Extraindo..."):
            try:
                engine = StructuredExtractor(get_key())
                result = engine.extract_with_custom_schema(custom_text, custom_schema)
                st.markdown('$ cat output.json', unsafe_allow_html=True)
                json_html = syntax_highlight_json(result["data"])
                st.markdown(json_html, unsafe_allow_html=True)
                st.markdown(f"""
tokens: {result['tokens']}
attempts: {result['attempts']}
""", unsafe_allow_html=True)
                st.download_button(
                    "⬡ Download JSON",
                    data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                    file_name="custom_extraction.json",
                    mime="application/json",
                )
                # Persist this run for the history tab.
                st.session_state.history.append({
                    "schema": "Custom",
                    "text_preview": custom_text[:120] + "...",
                    "result": result,
                })
            except ValueError as e:
                # Raised by the extractor for malformed JSON Schemas.
                st.error(f"Schema inválido: {e}")
            except Exception as e:
                st.error(f"Erro: {e}")
# ════════════════════════════════════════════════════════════════
# TAB 3 — HISTÓRICO
# ════════════════════════════════════════════════════════════════
# NOTE(review): the markdown string at the JSON render was corrupted
# (unterminated quote); repaired below.
with tab_history:
    if not st.session_state.history:
        st.markdown("""
// nenhuma extração executada ainda
""", unsafe_allow_html=True)
    else:
        # Newest first; only the most recent entry starts expanded.
        for i, h in enumerate(reversed(st.session_state.history)):
            r = h["result"]
            with st.expander(
                f"#{len(st.session_state.history)-i} · {h['schema']} · {r['tokens']} tokens",
                expanded=(i == 0)
            ):
                st.markdown(f"""
// {h['text_preview']}
""", unsafe_allow_html=True)
                json_html = syntax_highlight_json(r["data"])
                st.markdown(json_html, unsafe_allow_html=True)
                st.download_button(
                    "⬡ Download",
                    data=json.dumps(r["data"], ensure_ascii=False, indent=2),
                    file_name=f"extract_{i}.json",
                    mime="application/json",
                    key=f"dl_{i}",  # unique key per expander instance
                )