Spaces:
Sleeping
Sleeping
| # app.py — Structured Output Extractor | Function Calling + Pydantic | |
import html
import json
import os

import streamlit as st
# Configure the browser tab title/icon and the page layout. This runs before
# any other Streamlit call in this script.
st.set_page_config(
    page_title="Structured Extractor · Daniel Fonseca",
    page_icon="⬡",
    layout="wide",
    initial_sidebar_state="expanded",
)
| # ── CSS: TERMINAL HACKER ────────────────────────────────────── | |
| st.markdown(""" | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Share+Tech+Mono&family=Orbitron:wght@400;700;900&family=VT323&display=swap'); | |
| :root { | |
| --bg: #060810; | |
| --bg2: #0a0d18; | |
| --bg3: #0e1220; | |
| --green: #00ff88; | |
| --green2: #00cc66; | |
| --green3: #009944; | |
| --amber: #ffb700; | |
| --cyan: #00d4ff; | |
| --red: #ff3355; | |
| --dim: #1a2a1a; | |
| --grid: #0d1a0d; | |
| --border: #0a3a0a; | |
| --border2: #1a4a1a; | |
| --text: #c8ffc8; | |
| --text2: #88cc88; | |
| --text3: #446644; | |
| } | |
| html, body, [class*="css"] { | |
| background: var(--bg) !important; | |
| color: var(--text) !important; | |
| font-family: 'Share Tech Mono', monospace !important; | |
| } | |
| /* CRT scanlines overlay */ | |
| body::before { | |
| content: ''; | |
| position: fixed; | |
| top: 0; left: 0; right: 0; bottom: 0; | |
| background: repeating-linear-gradient( | |
| 0deg, | |
| transparent, | |
| transparent 2px, | |
| rgba(0,255,136,0.015) 2px, | |
| rgba(0,255,136,0.015) 4px | |
| ); | |
| pointer-events: none; | |
| z-index: 9999; | |
| } | |
| #MainMenu, footer, header { visibility: hidden; } | |
| .block-container { padding-top: 1rem; max-width: 1300px; } | |
| /* ββ HEADER ββ */ | |
| .term-header { | |
| border-bottom: 1px solid var(--green3); | |
| padding-bottom: 0.8rem; | |
| margin-bottom: 1.2rem; | |
| } | |
| .term-title { | |
| font-family: 'Orbitron', monospace; | |
| font-weight: 900; | |
| font-size: 1.8rem; | |
| color: var(--green); | |
| letter-spacing: 0.08em; | |
| text-shadow: 0 0 20px rgba(0,255,136,0.5); | |
| line-height: 1; | |
| } | |
| .term-sub { | |
| font-family: 'Share Tech Mono', monospace; | |
| font-size: 0.7rem; | |
| color: var(--green3); | |
| letter-spacing: 0.2em; | |
| margin-top: 0.3rem; | |
| } | |
| .blink { | |
| animation: blink 1s step-end infinite; | |
| color: var(--green); | |
| } | |
| @keyframes blink { 50% { opacity: 0; } } | |
| /* ββ TERMINAL WINDOW ββ */ | |
| .term-window { | |
| background: var(--bg2); | |
| border: 1px solid var(--border2); | |
| border-radius: 4px; | |
| overflow: hidden; | |
| margin-bottom: 1rem; | |
| } | |
| .term-titlebar { | |
| background: var(--bg3); | |
| border-bottom: 1px solid var(--border); | |
| padding: 0.4rem 0.8rem; | |
| display: flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| } | |
| .term-dot { | |
| width: 8px; height: 8px; | |
| border-radius: 50%; | |
| display: inline-block; | |
| } | |
| .dot-r { background: #ff3355; } | |
| .dot-y { background: #ffb700; } | |
| .dot-g { background: #00ff88; } | |
| .term-wintitle { | |
| font-size: 0.65rem; | |
| color: var(--text3); | |
| letter-spacing: 0.15em; | |
| text-transform: uppercase; | |
| margin-left: 0.5rem; | |
| } | |
| .term-body { padding: 1rem 1.2rem; } | |
| /* ββ PROMPT LINE ββ */ | |
| .prompt-line { | |
| font-size: 0.8rem; | |
| color: var(--green3); | |
| margin-bottom: 0.4rem; | |
| } | |
| .prompt-line span { color: var(--green); } | |
| /* ββ JSON RENDERER ββ */ | |
| .json-output { | |
| background: #040608; | |
| border: 1px solid var(--border); | |
| border-radius: 3px; | |
| padding: 1.2rem; | |
| font-family: 'Share Tech Mono', monospace; | |
| font-size: 0.8rem; | |
| line-height: 1.7; | |
| overflow-x: auto; | |
| position: relative; | |
| } | |
| .json-key { color: var(--cyan); } | |
| .json-str { color: var(--amber); } | |
| .json-num { color: #ff88aa; } | |
| .json-bool { color: var(--green); font-weight: bold; } | |
| .json-null { color: var(--text3); font-style: italic; } | |
| .json-bracket { color: var(--text2); } | |
| /* ββ FIELD CARDS ββ */ | |
| .field-grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fill, minmax(220px, 1fr)); | |
| gap: 0.6rem; | |
| margin-top: 0.8rem; | |
| } | |
| .field-card { | |
| background: #040a08; | |
| border: 1px solid var(--border); | |
| border-left: 2px solid var(--green3); | |
| border-radius: 3px; | |
| padding: 0.6rem 0.8rem; | |
| transition: border-color 0.2s; | |
| } | |
| .field-card:hover { border-left-color: var(--green); } | |
| .field-key { | |
| font-size: 0.65rem; | |
| color: var(--cyan); | |
| text-transform: uppercase; | |
| letter-spacing: 0.12em; | |
| margin-bottom: 0.2rem; | |
| } | |
| .field-val { | |
| font-size: 0.82rem; | |
| color: var(--amber); | |
| word-break: break-word; | |
| } | |
| .field-val-null { color: var(--text3); font-style: italic; } | |
| .field-val-bool-true { color: var(--green); } | |
| .field-val-bool-false { color: var(--red); } | |
| /* ββ STATS BAR ββ */ | |
| .stats-bar { | |
| display: flex; | |
| gap: 1.5rem; | |
| padding: 0.5rem 0; | |
| border-top: 1px solid var(--border); | |
| margin-top: 0.8rem; | |
| flex-wrap: wrap; | |
| } | |
| .stat-item { | |
| font-size: 0.68rem; | |
| color: var(--text3); | |
| } | |
| .stat-item span { color: var(--green2); } | |
| /* ββ SCHEMA SELECTOR ββ */ | |
| .schema-btn-active { | |
| background: var(--dim) !important; | |
| border: 1px solid var(--green) !important; | |
| color: var(--green) !important; | |
| } | |
| /* ββ SIDEBAR ββ */ | |
| section[data-testid="stSidebar"] { | |
| background: var(--bg2) !important; | |
| border-right: 1px solid var(--border) !important; | |
| } | |
| section[data-testid="stSidebar"] * { color: var(--text2) !important; } | |
| /* ββ STREAMLIT OVERRIDES ββ */ | |
| .stTextArea textarea { | |
| background: #040608 !important; | |
| border: 1px solid var(--border2) !important; | |
| border-radius: 3px !important; | |
| color: var(--text) !important; | |
| font-family: 'Share Tech Mono', monospace !important; | |
| font-size: 0.8rem !important; | |
| line-height: 1.6 !important; | |
| } | |
| .stTextArea textarea:focus { | |
| border-color: var(--green) !important; | |
| box-shadow: 0 0 8px rgba(0,255,136,0.2) !important; | |
| } | |
| .stSelectbox select, .stSelectbox > div { | |
| background: var(--bg2) !important; | |
| border-color: var(--border2) !important; | |
| color: var(--text) !important; | |
| font-family: 'Share Tech Mono', monospace !important; | |
| } | |
| .stButton button { | |
| background: transparent !important; | |
| border: 1px solid var(--green3) !important; | |
| color: var(--green) !important; | |
| border-radius: 3px !important; | |
| font-family: 'Orbitron', monospace !important; | |
| font-size: 0.68rem !important; | |
| letter-spacing: 0.1em !important; | |
| text-transform: uppercase !important; | |
| transition: all 0.2s !important; | |
| } | |
| .stButton button:hover { | |
| background: var(--dim) !important; | |
| border-color: var(--green) !important; | |
| box-shadow: 0 0 12px rgba(0,255,136,0.3) !important; | |
| } | |
| .stTextInput input { | |
| background: #040608 !important; | |
| border: 1px solid var(--border2) !important; | |
| color: var(--text) !important; | |
| font-family: 'Share Tech Mono', monospace !important; | |
| font-size: 0.8rem !important; | |
| } | |
| div[data-testid="stTabs"] button { | |
| font-family: 'Orbitron', monospace !important; | |
| font-size: 0.62rem !important; | |
| letter-spacing: 0.08em !important; | |
| color: var(--text3) !important; | |
| } | |
| div[data-testid="stTabs"] button[aria-selected="true"] { | |
| color: var(--green) !important; | |
| border-bottom-color: var(--green) !important; | |
| } | |
| hr { border-color: var(--border) !important; } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| # ── SESSION STATE ────────────────────────────────────────────── | |
# Seed the per-session defaults. Keys that already exist in the session
# state are left untouched, so only missing entries are created here.
_SESSION_DEFAULTS = {
    "openai_key": "",
    "history": [],
    "active_schema": "Contrato Legal",
    "custom_schema": "",
}
for state_key, default_value in _SESSION_DEFAULTS.items():
    if state_key in st.session_state:
        continue
    st.session_state[state_key] = default_value
| # ── HELPERS ─────────────────────────────────────────────────── | |
def get_key():
    """Return the OpenAI API key to use, or an empty value when none is set.

    Lookup order:
        1. ``st.secrets['OPENAI_API_KEY']``
        2. the ``OPENAI_API_KEY`` environment variable
        3. the key stored in ``st.session_state.openai_key``

    An empty environment variable is treated as "not set", so it no longer
    hides a key stored in the session state.
    """
    try:
        if "OPENAI_API_KEY" in st.secrets:
            return st.secrets["OPENAI_API_KEY"]
    except Exception:
        # Deliberate best-effort: any failure while reading the secrets is
        # ignored so the lookup can fall through to the other sources.
        pass
    return os.getenv("OPENAI_API_KEY") or st.session_state.openai_key
def syntax_highlight_json(obj, indent=0) -> str:
    """Render a JSON-compatible value as HTML with syntax-highlighting spans.

    Dicts and lists are rendered recursively with one entry per line; scalars
    become a single ``<span>`` whose CSS class names the JSON type. Every piece
    of text taken from ``obj`` (keys, strings, fallback values) is HTML-escaped
    so markup inside the data is shown literally instead of being injected
    into the page.

    Args:
        obj: Value to render: dict, list, str, bool, None, int or float. Any
            other object is rendered through ``str()`` as a quoted string.
        indent: Nesting depth of ``obj``; each level indents by three spaces.

    Returns:
        The HTML fragment, with lines joined by newline characters.
    """
    # NOTE(review): ordinary spaces and newlines collapse in HTML unless the
    # container preserves whitespace -- confirm the intended indent character.
    pad = " " * (indent * 3)
    pad2 = " " * ((indent + 1) * 3)

    if isinstance(obj, dict):
        if not obj:
            return '<span class="json-bracket">{}</span>'
        lines = ['<span class="json-bracket">{</span>']
        last = len(obj) - 1
        for i, (k, v) in enumerate(obj.items()):
            comma = "," if i < last else ""
            key_html = html.escape(str(k), quote=False)
            val_html = syntax_highlight_json(v, indent + 1)
            lines.append(
                f'{pad2}<span class="json-key">"{key_html}"</span>: {val_html}{comma}'
            )
        lines.append(f'{pad}<span class="json-bracket">}}</span>')
        return "\n".join(lines)

    if isinstance(obj, list):
        if not obj:
            return '<span class="json-bracket">[]</span>'
        lines = ['<span class="json-bracket">[</span>']
        last = len(obj) - 1
        for i, item in enumerate(obj):
            comma = "," if i < last else ""
            val_html = syntax_highlight_json(item, indent + 1)
            lines.append(f"{pad2}{val_html}{comma}")
        lines.append(f'{pad}<span class="json-bracket">]</span>')
        return "\n".join(lines)

    if isinstance(obj, str):
        escaped = html.escape(obj, quote=False)
        return f'<span class="json-str">"{escaped}"</span>'

    # bool must be tested before int: bool is a subclass of int.
    if isinstance(obj, bool):
        return f'<span class="json-bool">{"true" if obj else "false"}</span>'

    if obj is None:
        return '<span class="json-null">null</span>'

    if isinstance(obj, (int, float)):
        return f'<span class="json-num">{obj}</span>'

    # Unknown types are shown as a quoted, escaped string.
    escaped = html.escape(str(obj), quote=False)
    return f'<span class="json-str">"{escaped}"</span>'
def render_flat_fields(data: dict) -> str:
    """Render the flat (non-nested) fields of ``data`` as HTML cards.

    Values that are dicts or lists are skipped. Keys and values are
    HTML-escaped so text coming from the data cannot inject markup.

    Args:
        data: Mapping of field name to value.

    Returns:
        A ``<div class="field-grid">`` wrapping one card per flat field, or an
        empty string when there is no flat field to show.
    """
    cards = []
    for k, v in data.items():
        if isinstance(v, (dict, list)):
            continue
        key_html = f'<div class="field-key">{html.escape(str(k), quote=False)}</div>'
        if v is None:
            val_html = '<div class="field-val field-val-null">null</div>'
        elif isinstance(v, bool):
            cls = "field-val-bool-true" if v else "field-val-bool-false"
            val_html = f'<div class="field-val {cls}">{"true" if v else "false"}</div>'
        else:
            escaped = html.escape(str(v), quote=False)
            val_html = f'<div class="field-val">{escaped}</div>'
        cards.append(f'<div class="field-card">{key_html}{val_html}</div>')
    if not cards:
        return ""
    return f'<div class="field-grid">{"".join(cards)}</div>'
| # ── SIDEBAR ─────────────────────────────────────────────────── | |
| with st.sidebar: | |
| st.markdown(""" | |
| <div style='font-family:Orbitron,monospace;font-weight:900; | |
| font-size:1rem;color:#00ff88;text-shadow:0 0 10px rgba(0,255,136,0.4); | |
| letter-spacing:0.1em'>STRUCT//EXTRACT</div> | |
| <div style='font-family:Share Tech Mono,monospace;font-size:0.6rem; | |
| color:#446644;letter-spacing:0.2em;text-transform:uppercase;margin-top:0.2rem'> | |
| v1.0 Β· Function Calling Engine | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.divider() | |
| st.markdown("**π OpenAI API Key**") | |
| k_in = st.text_input("", type="password", value=st.session_state.openai_key, | |
| placeholder="sk-...", label_visibility="collapsed") | |
| if k_in: | |
| st.session_state.openai_key = k_in | |
| if get_key(): | |
| st.markdown('<div style="color:#00ff88;font-size:0.75rem">β KEY LOADED</div>', | |
| unsafe_allow_html=True) | |
| else: | |
| st.markdown('<div style="color:#ff3355;font-size:0.75rem">β KEY MISSING</div>', | |
| unsafe_allow_html=True) | |
| st.divider() | |
| st.markdown(""" | |
| <div style='font-family:Share Tech Mono,monospace;font-size:0.72rem; | |
| color:#446644;line-height:1.8'> | |
| <div style='color:#00cc66;margin-bottom:0.4rem'>// PIPELINE</div> | |
| 01. Text input<br> | |
| 02. Schema selection<br> | |
| 03. Tool definition (OpenAI)<br> | |
| 04. Function calling<br> | |
| 05. JSON parse + validate<br> | |
| 06. Retry on error<br> | |
| 07. Render + export | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.divider() | |
| st.markdown(""" | |
| <div style='font-family:Share Tech Mono,monospace;font-size:0.65rem;color:#2a4a2a'> | |
| model: gpt-4o-mini<br> | |
| tool_choice: required<br> | |
| temperature: 0.0<br> | |
| max_retries: 2<br> | |
| validation: pydantic v2 | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.divider() | |
| if st.button("⬑ Limpar histórico", use_container_width=True): | |
| st.session_state.history = [] | |
| st.rerun() | |
| # ── HEADER ──────────────────────────────────────────────────── | |
# Page header: title with a blinking cursor plus a one-line feature summary.
st.markdown("""
<div class="term-header">
    <div class="term-title">⬡ STRUCTURED OUTPUT EXTRACTOR <span class="blink">_</span></div>
    <div class="term-sub">OpenAI Function Calling · Pydantic v2 · Dynamic JSON Schema · Auto-Retry</div>
</div>
""", unsafe_allow_html=True)
| # ── TABS ────────────────────────────────────────────────────── | |
# The three top-level tabs: preset extraction, custom-schema extraction, history.
tab_extract, tab_custom, tab_history = st.tabs([
    "⬡ Extrair",
    "⬢ Schema Customizado",
    "⬣ Histórico",
])
| # ════════════════════════════════════════════════════════════════ | |
| # EXEMPLOS | |
| # ════════════════════════════════════════════════════════════════ | |
| EXAMPLES = { | |
| "Contrato Legal": """CONTRATO DE PRESTAΓΓO DE SERVIΓOS DE CONSULTORIA EM INTELIGΓNCIA ARTIFICIAL | |
| Entre as partes: | |
| CONTRATANTE: TechCorp Brasil Ltda., CNPJ 12.345.678/0001-99, com sede em SΓ£o Paulo/SP. | |
| CONTRATADO: Daniel Fonseca - ML Engineer, CPF 123.456.789-00, residente no Rio de Janeiro/RJ. | |
| CLΓUSULA 1 - OBJETO | |
| O CONTRATADO prestarΓ‘ serviΓ§os de consultoria em Graph Neural Networks e sistemas de detecΓ§Γ£o de fraude com IA Generativa, incluindo desenvolvimento de modelos, treinamento de equipes e documentaΓ§Γ£o tΓ©cnica. | |
| CLΓUSULA 2 - VALOR | |
| O valor total dos serviΓ§os Γ© de R$ 48.000,00 (quarenta e oito mil reais), pagos em 4 parcelas mensais de R$ 12.000,00. | |
| CLΓUSULA 3 - PRAZO | |
| VigΓͺncia de 4 (quatro) meses, iniciando em 01/04/2025 e encerrando em 31/07/2025. | |
| CLΓUSULA 4 - OBRIGAΓΓES DO CONTRATADO | |
| - Entregar relatΓ³rios mensais de progresso | |
| - Participar de reuniΓ΅es semanais remotas | |
| - Manter confidencialidade sobre os dados da empresa | |
| CLΓUSULA 5 - FORO | |
| Fica eleito o foro da Comarca de SΓ£o Paulo/SP para dirimir quaisquer controvΓ©rsias. | |
| Assinado digitalmente em 28/03/2025.""", | |
| "NotΓcia / Artigo": """Meta anuncia novo modelo de linguagem open-source com 405 bilhΓ΅es de parΓ’metros | |
| SAN FRANCISCO, 15 de marΓ§o de 2025 β A Meta Platforms anunciou nesta quinta-feira o lanΓ§amento do Llama 4, seu mais novo modelo de linguagem de grande escala com 405 bilhΓ΅es de parΓ’metros, disponΓvel gratuitamente para pesquisadores e empresas sob licenΓ§a open-source. | |
| O CEO Mark Zuckerberg afirmou que o modelo supera o GPT-4o em 73% dos benchmarks testados internamente, incluindo MMLU, HumanEval e MT-Bench. A vice-presidente de IA da empresa, Yann LeCun, destacou que o modelo foi treinado em 30 trilhΓ΅es de tokens de dados multimodais. | |
| O lanΓ§amento acontece em meio Γ crescente disputa entre Meta, OpenAI, Google e Anthropic pelo mercado de IA generativa, avaliado em US$ 2,4 trilhΓ΅es atΓ© 2030 segundo a consultoria Goldman Sachs. | |
| Especialistas do MIT e Stanford avaliam que a decisΓ£o de tornar o modelo open-source pode democratizar o acesso Γ IA avanΓ§ada, embora levante preocupaΓ§Γ΅es sobre uso malicioso. O governo americano jΓ‘ sinalizou que pode regulamentar o setor ainda em 2025.""", | |
| "Artigo CientΓfico": """GraphSAGE: Inductive Representation Learning on Large Graphs | |
| Autores: William L. Hamilton, Rex Ying, Jure Leskovec | |
| Venue: NeurIPS 2017, Long Beach, CA | |
| Abstract: | |
| Low-dimensional embeddings of nodes in large graphs have proved extremely useful in a variety of prediction tasks. However, most existing approaches require that all nodes in the graph are present during training of the embeddings; these previous approaches are inherently transductive and do not naturally generalize to unseen nodes. | |
| Problema resolvido: | |
| A maioria dos mΓ©todos de embedding para grafos Γ© transductive β sΓ³ funciona para nΓ³s vistos durante o treino. Em aplicaΓ§Γ΅es reais como redes sociais e sistemas de recomendaΓ§Γ£o, novos nΓ³s aparecem constantemente. | |
| Metodologia: | |
| O GraphSAGE aprende funΓ§Γ΅es de agregaΓ§Γ£o (mean, LSTM, pooling) que generalizam para nΓ³s nΓ£o vistos, combinando features do nΓ³ com as de sua vizinhanΓ§a amostrada. | |
| Resultados: | |
| - Dataset Citation (Cora): F1 = 0.935 | |
| - Dataset Reddit: F1 = 0.950 | |
| - Dataset PPI (Protein-Protein Interaction): F1 = 0.612 (vs 0.421 baseline) | |
| ContribuiΓ§Γ΅es principais: | |
| 1. Framework inductive para grafos de larga escala | |
| 2. TrΓͺs agregadores comparados: mean, LSTM, max-pooling | |
| 3. Mini-batch training para escalabilidade | |
| 4. Open-source no repositΓ³rio snap-stanford/GraphSAGE""", | |
| } | |
| # ════════════════════════════════════════════════════════════════ | |
| # TAB 1 — EXTRAIR | |
| # ════════════════════════════════════════════════════════════════ | |
| with tab_extract: | |
| from extractor import PRESET_SCHEMAS | |
| # Schema selector | |
| st.markdown(""" | |
| <div class="prompt-line">user@extractor:~$ <span>select --schema</span></div> | |
| """, unsafe_allow_html=True) | |
| schema_cols = st.columns(len(PRESET_SCHEMAS)) | |
| for i, (name, _) in enumerate(PRESET_SCHEMAS.items()): | |
| with schema_cols[i]: | |
| active = st.session_state.active_schema == name | |
| if st.button(name, key=f"sc_{i}", use_container_width=True): | |
| st.session_state.active_schema = name | |
| st.rerun() | |
| active_schema = PRESET_SCHEMAS[st.session_state.active_schema] | |
| st.markdown(f""" | |
| <div style='font-family:Share Tech Mono,monospace;font-size:0.68rem; | |
| color:#446644;margin:0.4rem 0 0.8rem;padding:0.4rem 0.8rem; | |
| border-left:2px solid #0a3a0a;background:#040a04'> | |
| // {st.session_state.active_schema} β {active_schema['description']} | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Exemplo rápido | |
| col_ex, _ = st.columns([2, 3]) | |
| with col_ex: | |
| if st.button(f"⬑ Carregar exemplo: {st.session_state.active_schema}", | |
| use_container_width=True): | |
| ex_text = EXAMPLES.get(st.session_state.active_schema, "") | |
| if ex_text: | |
| st.session_state["load_example"] = ex_text | |
| default_text = st.session_state.pop("load_example", "") | |
| st.markdown(""" | |
| <div class="prompt-line" style="margin-top:0.8rem"> | |
| user@extractor:~$ <span>paste --input</span></div> | |
| """, unsafe_allow_html=True) | |
| text_input = st.text_area( | |
| "", value=default_text, height=220, | |
| placeholder="Cole qualquer texto aqui: contrato, notΓcia, currΓculo, invoice, artigo...", | |
| label_visibility="collapsed", key="main_text" | |
| ) | |
| run_col, _ = st.columns([1, 3]) | |
| with run_col: | |
| run_btn = st.button("⬑ EXTRAIR DADOS", use_container_width=True, type="primary") | |
| if run_btn: | |
| if not get_key(): | |
| st.markdown('<div style="color:#ff3355;font-size:0.8rem">β API Key nΓ£o configurada</div>', | |
| unsafe_allow_html=True) | |
| st.stop() | |
| if not text_input.strip(): | |
| st.markdown('<div style="color:#ffb700;font-size:0.8rem">β Cole um texto para extrair</div>', | |
| unsafe_allow_html=True) | |
| st.stop() | |
| from extractor import StructuredExtractor | |
| # Terminal de progresso | |
| prog_ph = st.empty() | |
| prog_ph.markdown(""" | |
| <div class="term-window"> | |
| <div class="term-titlebar"> | |
| <span class="term-dot dot-r"></span> | |
| <span class="term-dot dot-y"></span> | |
| <span class="term-dot dot-g"></span> | |
| <span class="term-wintitle">extraction.log</span> | |
| </div> | |
| <div class="term-body" style="font-size:0.75rem;color:#446644;line-height:2"> | |
| <div>β Inicializando engine...</div> | |
| <div>β Tool definition criada</div> | |
| <div>β Chamando gpt-4o-mini com tool_choice=required...</div> | |
| <div style="color:#ffb700">β³ Aguardando resposta<span class="blink">_</span></div> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| try: | |
| engine = StructuredExtractor(get_key()) | |
| result = engine.extract( | |
| text=text_input, | |
| schema=active_schema["schema"], | |
| schema_name=st.session_state.active_schema, | |
| ) | |
| prog_ph.markdown(f""" | |
| <div class="term-window"> | |
| <div class="term-titlebar"> | |
| <span class="term-dot dot-r"></span> | |
| <span class="term-dot dot-y"></span> | |
| <span class="term-dot dot-g"></span> | |
| <span class="term-wintitle">extraction.log</span> | |
| </div> | |
| <div class="term-body" style="font-size:0.75rem;color:#446644;line-height:2"> | |
| <div>β Engine inicializado</div> | |
| <div>β Tool definition: <span style="color:#00d4ff">{st.session_state.active_schema}</span></div> | |
| <div>β Function call executado com sucesso</div> | |
| <div>β JSON parseado e validado</div> | |
| <div style="color:#00ff88">β EXTRAΓΓO COMPLETA em {result['attempts']} tentativa(s) Β· {result['tokens']} tokens</div> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Salva no histórico | |
| st.session_state.history.append({ | |
| "schema": st.session_state.active_schema, | |
| "text_preview": text_input[:120] + "...", | |
| "result": result, | |
| }) | |
| # ββ OUTPUT ββββββββββββββββββββββββββββββββββββββββββ | |
| out_col, raw_col = st.columns([3, 2], gap="large") | |
| with out_col: | |
| st.markdown(""" | |
| <div class="prompt-line">user@extractor:~$ <span>render --view=structured</span></div> | |
| """, unsafe_allow_html=True) | |
| # Cards de campos flat | |
| flat_html = render_flat_fields(result["data"]) | |
| if flat_html: | |
| st.markdown(flat_html, unsafe_allow_html=True) | |
| # Campos complexos (listas/objetos) | |
| for k, v in result["data"].items(): | |
| if not isinstance(v, (dict, list)): | |
| continue | |
| st.markdown(f""" | |
| <div style='font-family:Share Tech Mono,monospace;font-size:0.65rem; | |
| color:#00d4ff;text-transform:uppercase;letter-spacing:0.1em; | |
| margin:0.8rem 0 0.3rem'>// {k}</div> | |
| """, unsafe_allow_html=True) | |
| if isinstance(v, list): | |
| for item in v: | |
| if isinstance(item, dict): | |
| st.markdown(f""" | |
| <div class="json-output" style="font-size:0.75rem;margin-bottom:0.4rem"> | |
| {syntax_highlight_json(item, 0)} | |
| </div> | |
| """, unsafe_allow_html=True) | |
| else: | |
| esc = str(item).replace("<","<") | |
| st.markdown(f'<div class="field-card"><div class="field-val">{esc}</div></div>', | |
| unsafe_allow_html=True) | |
| elif isinstance(v, dict): | |
| st.markdown(f""" | |
| <div class="json-output" style="font-size:0.75rem"> | |
| {syntax_highlight_json(v, 0)} | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # Stats | |
| st.markdown(f""" | |
| <div class="stats-bar"> | |
| <div class="stat-item">schema: <span>{st.session_state.active_schema}</span></div> | |
| <div class="stat-item">fields: <span>{len(result['data'])}</span></div> | |
| <div class="stat-item">tokens: <span>{result['tokens']}</span></div> | |
| <div class="stat-item">attempts: <span>{result['attempts']}</span></div> | |
| <div class="stat-item">method: <span>{result['method']}</span></div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with raw_col: | |
| st.markdown(""" | |
| <div class="prompt-line">user@extractor:~$ <span>cat output.json</span></div> | |
| """, unsafe_allow_html=True) | |
| json_html = syntax_highlight_json(result["data"]) | |
| st.markdown(f'<div class="json-output">{json_html}</div>', | |
| unsafe_allow_html=True) | |
| # Download | |
| st.download_button( | |
| "⬑ Download JSON", | |
| data=json.dumps(result["data"], ensure_ascii=False, indent=2), | |
| file_name=f"extracted_{st.session_state.active_schema.lower().replace(' ','_')}.json", | |
| mime="application/json", | |
| use_container_width=True, | |
| ) | |
| except Exception as e: | |
| prog_ph.markdown(f""" | |
| <div class="term-window"> | |
| <div class="term-titlebar"> | |
| <span class="term-dot dot-r"></span><span class="term-wintitle">error.log</span> | |
| </div> | |
| <div class="term-body" style="color:#ff3355;font-size:0.8rem"> | |
| β ERRO: {e} | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| # ════════════════════════════════════════════════════════════════ | |
| # TAB 2 — SCHEMA CUSTOMIZADO | |
| # ════════════════════════════════════════════════════════════════ | |
with tab_custom:
    # Custom-schema tab: the user supplies a JSON Schema and a text, and the
    # extractor is asked to return data shaped by that schema.
    st.markdown("""
    <div class="prompt-line">user@extractor:~$ <span>define --schema=custom</span></div>
    <div style='font-family:Share Tech Mono,monospace;font-size:0.7rem;
    color:#446644;margin:0.3rem 0 0.8rem'>
    // Defina seu próprio JSON Schema e extraia qualquer estrutura de qualquer texto
    </div>
    """, unsafe_allow_html=True)

    # Schema shown in the editor until the user types their own.
    DEFAULT_CUSTOM = '''{
  "type": "object",
  "properties": {
    "nome_produto": {"type": "string"},
    "preco": {"type": "number"},
    "categorias": {"type": "array", "items": {"type": "string"}},
    "disponivel": {"type": "boolean"},
    "especificacoes": {
      "type": "object",
      "properties": {
        "peso": {"type": "string"},
        "cor": {"type": "string"}
      }
    }
  },
  "required": ["nome_produto"]
}'''

    c_left, c_right = st.columns(2, gap="large")
    with c_left:
        st.markdown('<div class="prompt-line">$ <span>vim schema.json</span></div>',
                    unsafe_allow_html=True)
        custom_schema = st.text_area(
            "", value=st.session_state.custom_schema or DEFAULT_CUSTOM,
            height=280, label_visibility="collapsed", key="custom_schema_input"
        )
    with c_right:
        st.markdown('<div class="prompt-line">$ <span>cat input.txt</span></div>',
                    unsafe_allow_html=True)
        custom_text = st.text_area(
            "", height=280, label_visibility="collapsed", key="custom_text",
            placeholder="Cole o texto para extrair..."
        )

    run_custom = st.button("⬡ EXTRAIR COM SCHEMA CUSTOMIZADO", use_container_width=True)
    if run_custom:
        # Validation failures only skip the extraction. They must not halt the
        # script (st.stop), otherwise the history tab below is never rendered.
        if not get_key():
            st.error("Configure a API Key na sidebar.")
        elif not custom_text.strip() or not custom_schema.strip():
            st.warning("Preencha o schema e o texto.")
        else:
            from extractor import StructuredExtractor
            with st.spinner("Extraindo..."):
                try:
                    engine = StructuredExtractor(get_key())
                    result = engine.extract_with_custom_schema(custom_text, custom_schema)
                    st.markdown('<div class="prompt-line">$ <span>cat output.json</span></div>',
                                unsafe_allow_html=True)
                    json_html = syntax_highlight_json(result["data"])
                    st.markdown(f'<div class="json-output">{json_html}</div>',
                                unsafe_allow_html=True)
                    st.markdown(f"""
                    <div class="stats-bar">
                    <div class="stat-item">tokens: <span>{result['tokens']}</span></div>
                    <div class="stat-item">attempts: <span>{result['attempts']}</span></div>
                    </div>
                    """, unsafe_allow_html=True)
                    st.download_button(
                        "⬡ Download JSON",
                        data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                        file_name="custom_extraction.json",
                        mime="application/json",
                    )
                    # Append the ellipsis only when the preview was truncated.
                    preview = custom_text[:120]
                    if len(custom_text) > 120:
                        preview += "..."
                    st.session_state.history.append({
                        "schema": "Custom",
                        "text_preview": preview,
                        "result": result,
                    })
                except ValueError as e:
                    st.error(f"Schema inválido: {e}")
                except Exception as e:
                    st.error(f"Erro: {e}")
| # ════════════════════════════════════════════════════════════════ | |
| # TAB 3 — HISTÓRICO | |
| # ════════════════════════════════════════════════════════════════ | |
with tab_history:
    # History tab: lists the extractions stored in the session, newest first,
    # each with its highlighted JSON and a download button.
    if not st.session_state.history:
        st.markdown("""
        <div style='font-family:Share Tech Mono,monospace;font-size:0.8rem;
        color:#2a4a2a;text-align:center;padding:3rem'>
        // nenhuma extração executada ainda
        </div>
        """, unsafe_allow_html=True)
    else:
        total = len(st.session_state.history)
        for i, h in enumerate(reversed(st.session_state.history)):
            r = h["result"]
            # 1-based position in insertion order; the newest entry has the
            # highest number.
            number = total - i
            with st.expander(
                f"#{number} · {h['schema']} · {r['tokens']} tokens",
                expanded=(i == 0)
            ):
                # The preview is raw user text: collapse it onto one line and
                # escape it so it cannot break or inject into the HTML block.
                preview = html.escape(" ".join(h["text_preview"].split()), quote=False)
                st.markdown(f"""
                <div style='font-family:Share Tech Mono,monospace;font-size:0.7rem;
                color:#446644;margin-bottom:0.5rem'>// {preview}</div>
                """, unsafe_allow_html=True)
                json_html = syntax_highlight_json(r["data"])
                st.markdown(f'<div class="json-output" style="font-size:0.75rem">{json_html}</div>',
                            unsafe_allow_html=True)
                st.download_button(
                    "⬡ Download",
                    data=json.dumps(r["data"], ensure_ascii=False, indent=2),
                    # Use the number shown in the expander label as the file name.
                    file_name=f"extract_{number}.json",
                    mime="application/json",
                    key=f"dl_{i}",
                )