# Danielfonseca1212's picture
# Create app.py
# 5a9f09e verified
# app.py — Structured Output Extractor | Function Calling + Pydantic
import html
import json
import os

import streamlit as st
# Browser-tab title and icon, wide layout, sidebar expanded on first load.
_PAGE_SETTINGS = dict(
    page_title="Structured Extractor Β· Daniel Fonseca",
    page_icon="⬑",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.set_page_config(**_PAGE_SETTINGS)
# ── CSS: TERMINAL HACKER ──────────────────────────────────────
# Inject the global stylesheet as raw HTML. The <style> block below:
#   - imports the web fonts and declares the colour palette as CSS variables,
#   - draws a fixed, click-through scanline overlay on top of the page,
#   - hides Streamlit's #MainMenu, footer and header,
#   - styles the custom classes used by the HTML snippets emitted later in
#     this file (term-*, prompt-line, json-*, field-*, stats-bar, ...),
#   - overrides the look of Streamlit text areas, select boxes, buttons,
#     text inputs, tabs and the sidebar.
# unsafe_allow_html=True is required for the <style> tag to be emitted as HTML.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Share+Tech+Mono&family=Orbitron:wght@400;700;900&family=VT323&display=swap');
:root {
--bg: #060810;
--bg2: #0a0d18;
--bg3: #0e1220;
--green: #00ff88;
--green2: #00cc66;
--green3: #009944;
--amber: #ffb700;
--cyan: #00d4ff;
--red: #ff3355;
--dim: #1a2a1a;
--grid: #0d1a0d;
--border: #0a3a0a;
--border2: #1a4a1a;
--text: #c8ffc8;
--text2: #88cc88;
--text3: #446644;
}
html, body, [class*="css"] {
background: var(--bg) !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
}
/* CRT scanlines overlay */
body::before {
content: '';
position: fixed;
top: 0; left: 0; right: 0; bottom: 0;
background: repeating-linear-gradient(
0deg,
transparent,
transparent 2px,
rgba(0,255,136,0.015) 2px,
rgba(0,255,136,0.015) 4px
);
pointer-events: none;
z-index: 9999;
}
#MainMenu, footer, header { visibility: hidden; }
.block-container { padding-top: 1rem; max-width: 1300px; }
/* ── HEADER ── */
.term-header {
border-bottom: 1px solid var(--green3);
padding-bottom: 0.8rem;
margin-bottom: 1.2rem;
}
.term-title {
font-family: 'Orbitron', monospace;
font-weight: 900;
font-size: 1.8rem;
color: var(--green);
letter-spacing: 0.08em;
text-shadow: 0 0 20px rgba(0,255,136,0.5);
line-height: 1;
}
.term-sub {
font-family: 'Share Tech Mono', monospace;
font-size: 0.7rem;
color: var(--green3);
letter-spacing: 0.2em;
margin-top: 0.3rem;
}
.blink {
animation: blink 1s step-end infinite;
color: var(--green);
}
@keyframes blink { 50% { opacity: 0; } }
/* ── TERMINAL WINDOW ── */
.term-window {
background: var(--bg2);
border: 1px solid var(--border2);
border-radius: 4px;
overflow: hidden;
margin-bottom: 1rem;
}
.term-titlebar {
background: var(--bg3);
border-bottom: 1px solid var(--border);
padding: 0.4rem 0.8rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.term-dot {
width: 8px; height: 8px;
border-radius: 50%;
display: inline-block;
}
.dot-r { background: #ff3355; }
.dot-y { background: #ffb700; }
.dot-g { background: #00ff88; }
.term-wintitle {
font-size: 0.65rem;
color: var(--text3);
letter-spacing: 0.15em;
text-transform: uppercase;
margin-left: 0.5rem;
}
.term-body { padding: 1rem 1.2rem; }
/* ── PROMPT LINE ── */
.prompt-line {
font-size: 0.8rem;
color: var(--green3);
margin-bottom: 0.4rem;
}
.prompt-line span { color: var(--green); }
/* ── JSON RENDERER ── */
.json-output {
background: #040608;
border: 1px solid var(--border);
border-radius: 3px;
padding: 1.2rem;
font-family: 'Share Tech Mono', monospace;
font-size: 0.8rem;
line-height: 1.7;
overflow-x: auto;
position: relative;
}
.json-key { color: var(--cyan); }
.json-str { color: var(--amber); }
.json-num { color: #ff88aa; }
.json-bool { color: var(--green); font-weight: bold; }
.json-null { color: var(--text3); font-style: italic; }
.json-bracket { color: var(--text2); }
/* ── FIELD CARDS ── */
.field-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
gap: 0.6rem;
margin-top: 0.8rem;
}
.field-card {
background: #040a08;
border: 1px solid var(--border);
border-left: 2px solid var(--green3);
border-radius: 3px;
padding: 0.6rem 0.8rem;
transition: border-color 0.2s;
}
.field-card:hover { border-left-color: var(--green); }
.field-key {
font-size: 0.65rem;
color: var(--cyan);
text-transform: uppercase;
letter-spacing: 0.12em;
margin-bottom: 0.2rem;
}
.field-val {
font-size: 0.82rem;
color: var(--amber);
word-break: break-word;
}
.field-val-null { color: var(--text3); font-style: italic; }
.field-val-bool-true { color: var(--green); }
.field-val-bool-false { color: var(--red); }
/* ── STATS BAR ── */
.stats-bar {
display: flex;
gap: 1.5rem;
padding: 0.5rem 0;
border-top: 1px solid var(--border);
margin-top: 0.8rem;
flex-wrap: wrap;
}
.stat-item {
font-size: 0.68rem;
color: var(--text3);
}
.stat-item span { color: var(--green2); }
/* ── SCHEMA SELECTOR ── */
.schema-btn-active {
background: var(--dim) !important;
border: 1px solid var(--green) !important;
color: var(--green) !important;
}
/* ── SIDEBAR ── */
section[data-testid="stSidebar"] {
background: var(--bg2) !important;
border-right: 1px solid var(--border) !important;
}
section[data-testid="stSidebar"] * { color: var(--text2) !important; }
/* ── STREAMLIT OVERRIDES ── */
.stTextArea textarea {
background: #040608 !important;
border: 1px solid var(--border2) !important;
border-radius: 3px !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
font-size: 0.8rem !important;
line-height: 1.6 !important;
}
.stTextArea textarea:focus {
border-color: var(--green) !important;
box-shadow: 0 0 8px rgba(0,255,136,0.2) !important;
}
.stSelectbox select, .stSelectbox > div {
background: var(--bg2) !important;
border-color: var(--border2) !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
}
.stButton button {
background: transparent !important;
border: 1px solid var(--green3) !important;
color: var(--green) !important;
border-radius: 3px !important;
font-family: 'Orbitron', monospace !important;
font-size: 0.68rem !important;
letter-spacing: 0.1em !important;
text-transform: uppercase !important;
transition: all 0.2s !important;
}
.stButton button:hover {
background: var(--dim) !important;
border-color: var(--green) !important;
box-shadow: 0 0 12px rgba(0,255,136,0.3) !important;
}
.stTextInput input {
background: #040608 !important;
border: 1px solid var(--border2) !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
font-size: 0.8rem !important;
}
div[data-testid="stTabs"] button {
font-family: 'Orbitron', monospace !important;
font-size: 0.62rem !important;
letter-spacing: 0.08em !important;
color: var(--text3) !important;
}
div[data-testid="stTabs"] button[aria-selected="true"] {
color: var(--green) !important;
border-bottom-color: var(--green) !important;
}
hr { border-color: var(--border) !important; }
</style>
""", unsafe_allow_html=True)
# ── SESSION STATE ──────────────────────────────────────────────
# Seed st.session_state with a default for every key the app reads,
# leaving any value stored by a previous run untouched.
_SESSION_DEFAULTS = {
    'openai_key': '',
    'history': [],
    'active_schema': 'Contrato Legal',
    'custom_schema': '',
}
for _state_key in _SESSION_DEFAULTS:
    if _state_key in st.session_state:
        continue
    st.session_state[_state_key] = _SESSION_DEFAULTS[_state_key]
# ── HELPERS ───────────────────────────────────────────────────
def get_key() -> str:
    """Return the OpenAI API key from the first source holding a non-empty value.

    Lookup order:
        1. ``st.secrets['OPENAI_API_KEY']``
        2. the ``OPENAI_API_KEY`` environment variable
        3. the key typed in the sidebar (``st.session_state.openai_key``)

    Empty values are skipped, so an environment variable that is set but blank
    does not hide a key entered in the sidebar.

    Returns:
        The key string, or whatever ``st.session_state.openai_key`` holds
        (the empty string by default) when no source provides one.
    """
    try:
        secret = st.secrets['OPENAI_API_KEY'] if 'OPENAI_API_KEY' in st.secrets else ''
    except Exception:
        # Deliberate best effort: reading st.secrets may raise (presumably when
        # no secrets are configured); in that case fall through to the other sources.
        secret = ''
    if secret:
        return secret
    # `or` (rather than getenv's default argument) also skips a blank variable.
    return os.getenv('OPENAI_API_KEY') or st.session_state.openai_key
def syntax_highlight_json(obj, indent=0) -> str:
    """Render a JSON-like Python value as syntax-highlighted HTML.

    Args:
        obj: A dict, list, str, bool, None, int or float. Any other object is
            rendered as a quoted ``str(obj)``.
        indent: Current nesting depth; every level is padded with three
            ``&nbsp;`` entities.

    Returns:
        An HTML fragment using the ``json-*`` span classes. Multi-line output
        (non-empty dicts and lists) is joined with newline characters.

    Dict keys, string values and fallback values are HTML-escaped (``&``, ``<``,
    ``>``): callers render the fragment with ``unsafe_allow_html=True`` and the
    data originates from extracted text, so it must not be able to inject markup.
    """
    pad = "&nbsp;" * (indent * 3)
    pad2 = "&nbsp;" * ((indent + 1) * 3)

    if isinstance(obj, dict):
        if not obj:
            return '<span class="json-bracket">{}</span>'
        lines = ['<span class="json-bracket">{</span>']
        last = len(obj) - 1
        for i, (k, v) in enumerate(obj.items()):
            comma = "," if i < last else ""
            key_html = html.escape(str(k), quote=False)
            val_html = syntax_highlight_json(v, indent + 1)
            lines.append(f'{pad2}<span class="json-key">"{key_html}"</span>: {val_html}{comma}')
        lines.append(f'{pad}<span class="json-bracket">}}</span>')
        return "\n".join(lines)

    if isinstance(obj, list):
        if not obj:
            return '<span class="json-bracket">[]</span>'
        lines = ['<span class="json-bracket">[</span>']
        last = len(obj) - 1
        for i, item in enumerate(obj):
            comma = "," if i < last else ""
            val_html = syntax_highlight_json(item, indent + 1)
            lines.append(f'{pad2}{val_html}{comma}')
        lines.append(f'{pad}<span class="json-bracket">]</span>')
        return "\n".join(lines)

    if isinstance(obj, str):
        escaped = html.escape(obj, quote=False)
        return f'<span class="json-str">"{escaped}"</span>'

    # bool must be tested before int/float: bool is a subclass of int.
    if isinstance(obj, bool):
        return f'<span class="json-bool">{"true" if obj else "false"}</span>'

    if obj is None:
        return '<span class="json-null">null</span>'

    if isinstance(obj, (int, float)):
        return f'<span class="json-num">{obj}</span>'

    # Unknown type: show its string form, escaped like any other string.
    fallback = html.escape(str(obj), quote=False)
    return f'<span class="json-str">"{fallback}"</span>'
def render_flat_fields(data: dict) -> str:
    """Render the flat (non-nested) fields of ``data`` as HTML cards.

    Args:
        data: Mapping of field name to value. Entries whose value is a dict or
            a list are skipped.

    Returns:
        A ``<div class="field-grid">`` wrapping one ``field-card`` per scalar
        field, or the empty string when there is no scalar field.

    Field names and values are HTML-escaped (``&``, ``<``, ``>``) because the
    caller renders the result with ``unsafe_allow_html=True``.
    """
    cards = []
    for k, v in data.items():
        if isinstance(v, (dict, list)):
            continue
        key_html = f'<div class="field-key">{html.escape(str(k), quote=False)}</div>'
        if v is None:
            val_html = '<div class="field-val field-val-null">null</div>'
        elif isinstance(v, bool):
            cls = "field-val-bool-true" if v else "field-val-bool-false"
            val_html = f'<div class="field-val {cls}">{"true" if v else "false"}</div>'
        else:
            escaped = html.escape(str(v), quote=False)
            val_html = f'<div class="field-val">{escaped}</div>'
        cards.append(f'<div class="field-card">{key_html}{val_html}</div>')
    if not cards:
        return ""
    return f'<div class="field-grid">{"".join(cards)}</div>'
# ── SIDEBAR ───────────────────────────────────────────────────
with st.sidebar:
    # Branding block.
    st.markdown("""
<div style='font-family:Orbitron,monospace;font-weight:900;
font-size:1rem;color:#00ff88;text-shadow:0 0 10px rgba(0,255,136,0.4);
letter-spacing:0.1em'>STRUCT//EXTRACT</div>
<div style='font-family:Share Tech Mono,monospace;font-size:0.6rem;
color:#446644;letter-spacing:0.2em;text-transform:uppercase;margin-top:0.2rem'>
v1.0 Β· Function Calling Engine
</div>
""", unsafe_allow_html=True)
    st.divider()

    # API key entry: a non-empty input is stored in session state.
    st.markdown("**πŸ”‘ OpenAI API Key**")
    typed_key = st.text_input(
        "",
        type="password",
        value=st.session_state.openai_key,
        placeholder="sk-...",
        label_visibility="collapsed",
    )
    if typed_key:
        st.session_state.openai_key = typed_key

    # Status badge reflecting whether get_key() currently yields a key.
    key_badge = (
        '<div style="color:#00ff88;font-size:0.75rem">βœ“ KEY LOADED</div>'
        if get_key()
        else '<div style="color:#ff3355;font-size:0.75rem">βœ— KEY MISSING</div>'
    )
    st.markdown(key_badge, unsafe_allow_html=True)
    st.divider()

    # Static description of the pipeline steps.
    st.markdown("""
<div style='font-family:Share Tech Mono,monospace;font-size:0.72rem;
color:#446644;line-height:1.8'>
<div style='color:#00cc66;margin-bottom:0.4rem'>// PIPELINE</div>
01. Text input<br>
02. Schema selection<br>
03. Tool definition (OpenAI)<br>
04. Function calling<br>
05. JSON parse + validate<br>
06. Retry on error<br>
07. Render + export
</div>
""", unsafe_allow_html=True)
    st.divider()

    # Static summary of the engine settings shown to the user.
    st.markdown("""
<div style='font-family:Share Tech Mono,monospace;font-size:0.65rem;color:#2a4a2a'>
model: gpt-4o-mini<br>
tool_choice: required<br>
temperature: 0.0<br>
max_retries: 2<br>
validation: pydantic v2
</div>
""", unsafe_allow_html=True)
    st.divider()

    # Clearing the history empties the list and reruns the script at once.
    clear_clicked = st.button("⬑ Limpar histórico", use_container_width=True)
    if clear_clicked:
        st.session_state.history = []
        st.rerun()
# ── HEADER ────────────────────────────────────────────────────
# Page title and subtitle, emitted as raw HTML styled by the term-* classes.
_HEADER_HTML = """
<div class="term-header">
<div class="term-title">⬑ STRUCTURED OUTPUT EXTRACTOR <span class="blink">_</span></div>
<div class="term-sub">OpenAI Function Calling Β· Pydantic v2 Β· Dynamic JSON Schema Β· Auto-Retry</div>
</div>
"""
st.markdown(_HEADER_HTML, unsafe_allow_html=True)
# ── TABS ──────────────────────────────────────────────────────
# The three top-level tabs: preset extraction, custom schema, history.
_TAB_LABELS = [
    "⬑ Extrair",
    "β¬’ Schema Customizado",
    "⬣ Histórico",
]
tab_extract, tab_custom, tab_history = st.tabs(_TAB_LABELS)
# ════════════════════════════════════════════════════════════════
# EXAMPLES
# ════════════════════════════════════════════════════════════════
# Sample input texts offered by the "load example" button in the first tab.
# Each entry is looked up with the name of the active schema, so the keys are
# expected to match the preset schema names (presumably the keys of
# PRESET_SCHEMAS in the extractor module — verify there).
EXAMPLES = {
    # Sample service contract.
    "Contrato Legal": """CONTRATO DE PRESTAÇÃO DE SERVIΓ‡OS DE CONSULTORIA EM INTELIGÊNCIA ARTIFICIAL
Entre as partes:
CONTRATANTE: TechCorp Brasil Ltda., CNPJ 12.345.678/0001-99, com sede em SΓ£o Paulo/SP.
CONTRATADO: Daniel Fonseca - ML Engineer, CPF 123.456.789-00, residente no Rio de Janeiro/RJ.
CLÁUSULA 1 - OBJETO
O CONTRATADO prestarΓ‘ serviΓ§os de consultoria em Graph Neural Networks e sistemas de detecΓ§Γ£o de fraude com IA Generativa, incluindo desenvolvimento de modelos, treinamento de equipes e documentaΓ§Γ£o tΓ©cnica.
CLÁUSULA 2 - VALOR
O valor total dos serviΓ§os Γ© de R$ 48.000,00 (quarenta e oito mil reais), pagos em 4 parcelas mensais de R$ 12.000,00.
CLÁUSULA 3 - PRAZO
VigΓͺncia de 4 (quatro) meses, iniciando em 01/04/2025 e encerrando em 31/07/2025.
CLÁUSULA 4 - OBRIGAÇÕES DO CONTRATADO
- Entregar relatΓ³rios mensais de progresso
- Participar de reuniΓ΅es semanais remotas
- Manter confidencialidade sobre os dados da empresa
CLÁUSULA 5 - FORO
Fica eleito o foro da Comarca de SΓ£o Paulo/SP para dirimir quaisquer controvΓ©rsias.
Assinado digitalmente em 28/03/2025.""",
    # Sample news article.
    "NotΓ­cia / Artigo": """Meta anuncia novo modelo de linguagem open-source com 405 bilhΓ΅es de parΓ’metros
SAN FRANCISCO, 15 de marΓ§o de 2025 β€” A Meta Platforms anunciou nesta quinta-feira o lanΓ§amento do Llama 4, seu mais novo modelo de linguagem de grande escala com 405 bilhΓ΅es de parΓ’metros, disponΓ­vel gratuitamente para pesquisadores e empresas sob licenΓ§a open-source.
O CEO Mark Zuckerberg afirmou que o modelo supera o GPT-4o em 73% dos benchmarks testados internamente, incluindo MMLU, HumanEval e MT-Bench. A vice-presidente de IA da empresa, Yann LeCun, destacou que o modelo foi treinado em 30 trilhΓ΅es de tokens de dados multimodais.
O lanΓ§amento acontece em meio Γ  crescente disputa entre Meta, OpenAI, Google e Anthropic pelo mercado de IA generativa, avaliado em US$ 2,4 trilhΓ΅es atΓ© 2030 segundo a consultoria Goldman Sachs.
Especialistas do MIT e Stanford avaliam que a decisΓ£o de tornar o modelo open-source pode democratizar o acesso Γ  IA avanΓ§ada, embora levante preocupaΓ§Γ΅es sobre uso malicioso. O governo americano jΓ‘ sinalizou que pode regulamentar o setor ainda em 2025.""",
    # Sample scientific-paper summary.
    "Artigo CientΓ­fico": """GraphSAGE: Inductive Representation Learning on Large Graphs
Autores: William L. Hamilton, Rex Ying, Jure Leskovec
Venue: NeurIPS 2017, Long Beach, CA
Abstract:
Low-dimensional embeddings of nodes in large graphs have proved extremely useful in a variety of prediction tasks. However, most existing approaches require that all nodes in the graph are present during training of the embeddings; these previous approaches are inherently transductive and do not naturally generalize to unseen nodes.
Problema resolvido:
A maioria dos mΓ©todos de embedding para grafos Γ© transductive β€” sΓ³ funciona para nΓ³s vistos durante o treino. Em aplicaΓ§Γ΅es reais como redes sociais e sistemas de recomendaΓ§Γ£o, novos nΓ³s aparecem constantemente.
Metodologia:
O GraphSAGE aprende funΓ§Γ΅es de agregaΓ§Γ£o (mean, LSTM, pooling) que generalizam para nΓ³s nΓ£o vistos, combinando features do nΓ³ com as de sua vizinhanΓ§a amostrada.
Resultados:
- Dataset Citation (Cora): F1 = 0.935
- Dataset Reddit: F1 = 0.950
- Dataset PPI (Protein-Protein Interaction): F1 = 0.612 (vs 0.421 baseline)
ContribuiΓ§Γ΅es principais:
1. Framework inductive para grafos de larga escala
2. TrΓͺs agregadores comparados: mean, LSTM, max-pooling
3. Mini-batch training para escalabilidade
4. Open-source no repositΓ³rio snap-stanford/GraphSAGE""",
}
# ════════════════════════════════════════════════════════════════
# TAB 1 — EXTRACT
# ════════════════════════════════════════════════════════════════
with tab_extract:
    # Preset-schema tab: choose a schema, paste (or load) a text, run the
    # extractor and show the structured result next to the raw JSON.
    from extractor import PRESET_SCHEMAS

    # Schema selector: one button per preset; a click stores the choice and
    # reruns so the rest of the page reflects it.
    st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>select --schema</span></div>
""", unsafe_allow_html=True)
    schema_cols = st.columns(len(PRESET_SCHEMAS))
    for i, name in enumerate(PRESET_SCHEMAS):
        with schema_cols[i]:
            if st.button(name, key=f"sc_{i}", use_container_width=True):
                st.session_state.active_schema = name
                st.rerun()
    active_schema = PRESET_SCHEMAS[st.session_state.active_schema]
    st.markdown(f"""
<div style='font-family:Share Tech Mono,monospace;font-size:0.68rem;
color:#446644;margin:0.4rem 0 0.8rem;padding:0.4rem 0.8rem;
border-left:2px solid #0a3a0a;background:#040a04'>
// {st.session_state.active_schema} β€” {active_schema['description']}
</div>
""", unsafe_allow_html=True)

    # Quick example loader.
    col_ex, _ = st.columns([2, 3])
    with col_ex:
        if st.button(f"⬑ Carregar exemplo: {st.session_state.active_schema}",
                     use_container_width=True):
            ex_text = EXAMPLES.get(st.session_state.active_schema, "")
            if ex_text:
                # Write the example into the text area's own session-state key
                # BEFORE the widget is created in this run. Feeding it through
                # a changing `value=` argument is unreliable for a keyed widget:
                # the content does not survive the following rerun.
                st.session_state["main_text"] = ex_text

    st.markdown("""
<div class="prompt-line" style="margin-top:0.8rem">
user@extractor:~$ <span>paste --input</span></div>
""", unsafe_allow_html=True)
    # No `value=`: the content is owned by st.session_state["main_text"].
    text_input = st.text_area(
        "", height=220,
        placeholder="Cole qualquer texto aqui: contrato, notΓ­cia, currΓ­culo, invoice, artigo...",
        label_visibility="collapsed", key="main_text"
    )
    run_col, _ = st.columns([1, 3])
    with run_col:
        run_btn = st.button("⬑ EXTRAIR DADOS", use_container_width=True, type="primary")

    # Validation uses if/elif instead of st.stop(): st.stop() would end the
    # whole script run and leave the tabs defined further down empty.
    if run_btn and not get_key():
        st.markdown('<div style="color:#ff3355;font-size:0.8rem">βœ— API Key nΓ£o configurada</div>',
                    unsafe_allow_html=True)
    elif run_btn and not text_input.strip():
        st.markdown('<div style="color:#ffb700;font-size:0.8rem">⚠ Cole um texto para extrair</div>',
                    unsafe_allow_html=True)
    elif run_btn:
        from extractor import StructuredExtractor

        # Progress "terminal", replaced by the outcome once the call returns.
        prog_ph = st.empty()
        prog_ph.markdown("""
<div class="term-window">
<div class="term-titlebar">
<span class="term-dot dot-r"></span>
<span class="term-dot dot-y"></span>
<span class="term-dot dot-g"></span>
<span class="term-wintitle">extraction.log</span>
</div>
<div class="term-body" style="font-size:0.75rem;color:#446644;line-height:2">
<div>β†’ Inicializando engine...</div>
<div>β†’ Tool definition criada</div>
<div>β†’ Chamando gpt-4o-mini com tool_choice=required...</div>
<div style="color:#ffb700">⟳ Aguardando resposta<span class="blink">_</span></div>
</div>
</div>
""", unsafe_allow_html=True)
        try:
            engine = StructuredExtractor(get_key())
            result = engine.extract(
                text=text_input,
                schema=active_schema["schema"],
                schema_name=st.session_state.active_schema,
            )
            prog_ph.markdown(f"""
<div class="term-window">
<div class="term-titlebar">
<span class="term-dot dot-r"></span>
<span class="term-dot dot-y"></span>
<span class="term-dot dot-g"></span>
<span class="term-wintitle">extraction.log</span>
</div>
<div class="term-body" style="font-size:0.75rem;color:#446644;line-height:2">
<div>βœ“ Engine inicializado</div>
<div>βœ“ Tool definition: <span style="color:#00d4ff">{st.session_state.active_schema}</span></div>
<div>βœ“ Function call executado com sucesso</div>
<div>βœ“ JSON parseado e validado</div>
<div style="color:#00ff88">βœ“ EXTRAÇÃO COMPLETA em {result['attempts']} tentativa(s) Β· {result['tokens']} tokens</div>
</div>
</div>
""", unsafe_allow_html=True)

            # Save to history; the ellipsis is added only when the text was cut.
            preview = text_input[:120] + ("..." if len(text_input) > 120 else "")
            st.session_state.history.append({
                "schema": st.session_state.active_schema,
                "text_preview": preview,
                "result": result,
            })

            # ── OUTPUT ──
            out_col, raw_col = st.columns([3, 2], gap="large")
            with out_col:
                st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>render --view=structured</span></div>
""", unsafe_allow_html=True)
                # Cards for the flat (scalar) fields.
                flat_html = render_flat_fields(result["data"])
                if flat_html:
                    st.markdown(flat_html, unsafe_allow_html=True)
                # Complex fields (lists / objects).
                for k, v in result["data"].items():
                    if not isinstance(v, (dict, list)):
                        continue
                    # Field names come from extracted data: escape before
                    # embedding them in raw HTML.
                    st.markdown(f"""
<div style='font-family:Share Tech Mono,monospace;font-size:0.65rem;
color:#00d4ff;text-transform:uppercase;letter-spacing:0.1em;
margin:0.8rem 0 0.3rem'>// {html.escape(str(k), quote=False)}</div>
""", unsafe_allow_html=True)
                    if isinstance(v, list):
                        for item in v:
                            if isinstance(item, dict):
                                st.markdown(f"""
<div class="json-output" style="font-size:0.75rem;margin-bottom:0.4rem">
{syntax_highlight_json(item, 0)}
</div>
""", unsafe_allow_html=True)
                            else:
                                esc = html.escape(str(item), quote=False)
                                st.markdown(f'<div class="field-card"><div class="field-val">{esc}</div></div>',
                                            unsafe_allow_html=True)
                    elif isinstance(v, dict):
                        st.markdown(f"""
<div class="json-output" style="font-size:0.75rem">
{syntax_highlight_json(v, 0)}
</div>
""", unsafe_allow_html=True)
                # Stats
                st.markdown(f"""
<div class="stats-bar">
<div class="stat-item">schema: <span>{st.session_state.active_schema}</span></div>
<div class="stat-item">fields: <span>{len(result['data'])}</span></div>
<div class="stat-item">tokens: <span>{result['tokens']}</span></div>
<div class="stat-item">attempts: <span>{result['attempts']}</span></div>
<div class="stat-item">method: <span>{result['method']}</span></div>
</div>
""", unsafe_allow_html=True)
            with raw_col:
                st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>cat output.json</span></div>
""", unsafe_allow_html=True)
                json_html = syntax_highlight_json(result["data"])
                st.markdown(f'<div class="json-output">{json_html}</div>',
                            unsafe_allow_html=True)
                # Download
                # NOTE(review): clicking this button presumably triggers a rerun
                # in which run_btn is False, so the rendered result disappears;
                # the extraction stays available in the history tab.
                st.download_button(
                    "⬑ Download JSON",
                    data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                    file_name=f"extracted_{st.session_state.active_schema.lower().replace(' ','_')}.json",
                    mime="application/json",
                    use_container_width=True,
                )
        except Exception as e:
            # Top-level UI boundary: show any failure in the log window. The
            # message is escaped because it is embedded in raw HTML.
            prog_ph.markdown(f"""
<div class="term-window">
<div class="term-titlebar">
<span class="term-dot dot-r"></span><span class="term-wintitle">error.log</span>
</div>
<div class="term-body" style="color:#ff3355;font-size:0.8rem">
βœ— ERRO: {html.escape(str(e), quote=False)}
</div>
</div>
""", unsafe_allow_html=True)
# ════════════════════════════════════════════════════════════════
# TAB 2 — CUSTOM SCHEMA
# ════════════════════════════════════════════════════════════════
with tab_custom:
    # Custom-schema tab: the user supplies a JSON Schema and a text; the result
    # of extract_with_custom_schema() is rendered, offered for download and
    # appended to the history.
    st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>define --schema=custom</span></div>
<div style='font-family:Share Tech Mono,monospace;font-size:0.7rem;
color:#446644;margin:0.3rem 0 0.8rem'>
// Defina seu prΓ³prio JSON Schema e extraia qualquer estrutura de qualquer texto
</div>
""", unsafe_allow_html=True)
    # Schema shown in the editor while no custom schema is stored in session state.
    DEFAULT_CUSTOM = '''{
"type": "object",
"properties": {
"nome_produto": {"type": "string"},
"preco": {"type": "number"},
"categorias": {"type": "array", "items": {"type": "string"}},
"disponivel": {"type": "boolean"},
"especificacoes": {
"type": "object",
"properties": {
"peso": {"type": "string"},
"cor": {"type": "string"}
}
}
},
"required": ["nome_produto"]
}'''
    c_left, c_right = st.columns(2, gap="large")
    with c_left:
        st.markdown('<div class="prompt-line">$ <span>vim schema.json</span></div>',
                    unsafe_allow_html=True)
        custom_schema = st.text_area(
            "", value=st.session_state.custom_schema or DEFAULT_CUSTOM,
            height=280, label_visibility="collapsed", key="custom_schema_input"
        )
    with c_right:
        st.markdown('<div class="prompt-line">$ <span>cat input.txt</span></div>',
                    unsafe_allow_html=True)
        custom_text = st.text_area(
            "", height=280, label_visibility="collapsed", key="custom_text",
            placeholder="Cole o texto para extrair..."
        )
    run_custom = st.button("⬑ EXTRAIR COM SCHEMA CUSTOMIZADO", use_container_width=True)

    # Validation uses if/elif instead of st.stop(): st.stop() would end the
    # whole script run and leave the history tab (defined below) empty.
    if run_custom and not get_key():
        st.error("Configure a API Key na sidebar.")
    elif run_custom and (not custom_text.strip() or not custom_schema.strip()):
        st.warning("Preencha o schema e o texto.")
    elif run_custom:
        from extractor import StructuredExtractor
        with st.spinner("Extraindo..."):
            try:
                engine = StructuredExtractor(get_key())
                result = engine.extract_with_custom_schema(custom_text, custom_schema)
                st.markdown('<div class="prompt-line">$ <span>cat output.json</span></div>',
                            unsafe_allow_html=True)
                json_html = syntax_highlight_json(result["data"])
                st.markdown(f'<div class="json-output">{json_html}</div>',
                            unsafe_allow_html=True)
                st.markdown(f"""
<div class="stats-bar">
<div class="stat-item">tokens: <span>{result['tokens']}</span></div>
<div class="stat-item">attempts: <span>{result['attempts']}</span></div>
</div>
""", unsafe_allow_html=True)
                st.download_button(
                    "⬑ Download JSON",
                    data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                    file_name="custom_extraction.json",
                    mime="application/json",
                )
                # The ellipsis is added only when the text was actually cut.
                preview = custom_text[:120] + ("..." if len(custom_text) > 120 else "")
                st.session_state.history.append({
                    "schema": "Custom",
                    "text_preview": preview,
                    "result": result,
                })
            except ValueError as e:
                # Reported as a schema problem; presumably raised by the
                # extractor when the schema text cannot be parsed — confirm
                # against extractor.py.
                st.error(f"Schema invΓ‘lido: {e}")
            except Exception as e:
                # Top-level UI boundary: surface any other failure to the user.
                st.error(f"Erro: {e}")
# ════════════════════════════════════════════════════════════════
# TAB 3 — HISTORY
# ════════════════════════════════════════════════════════════════
with tab_history:
    # History tab: one expander per stored extraction, newest first.
    if not st.session_state.history:
        st.markdown("""
<div style='font-family:Share Tech Mono,monospace;font-size:0.8rem;
color:#2a4a2a;text-align:center;padding:3rem'>
// nenhuma extraΓ§Γ£o executada ainda
</div>
""", unsafe_allow_html=True)
    else:
        total = len(st.session_state.history)
        for i, h in enumerate(reversed(st.session_state.history)):
            r = h["result"]
            # Entry number as shown in the expander title (1 = oldest).
            entry_no = total - i
            with st.expander(
                f"#{entry_no} Β· {h['schema']} Β· {r['tokens']} tokens",
                expanded=(i == 0)
            ):
                # The preview is raw user text: escape it before embedding it
                # in HTML rendered with unsafe_allow_html=True.
                preview_html = html.escape(str(h['text_preview']), quote=False)
                st.markdown(f"""
<div style='font-family:Share Tech Mono,monospace;font-size:0.7rem;
color:#446644;margin-bottom:0.5rem'>// {preview_html}</div>
""", unsafe_allow_html=True)
                json_html = syntax_highlight_json(r["data"])
                st.markdown(f'<div class="json-output" style="font-size:0.75rem">{json_html}</div>',
                            unsafe_allow_html=True)
                # The file name carries the same number as the expander title,
                # so a downloaded file can be matched to its entry.
                st.download_button(
                    "⬑ Download",
                    data=json.dumps(r["data"], ensure_ascii=False, indent=2),
                    file_name=f"extract_{entry_no}.json",
                    mime="application/json",
                    key=f"dl_{i}",
                )