import html
import json
import os

import streamlit as st


# Page-level settings: browser-tab title and icon, wide layout, and a sidebar
# that starts expanded.
st.set_page_config(
    page_title="Structured Extractor Β· Daniel Fonseca",
    page_icon="⬑",
    layout="wide",
    initial_sidebar_state="expanded",
)


# Inject the app-wide stylesheet: a dark "terminal" theme (colour variables,
# scanline overlay, terminal-window chrome, JSON and field-card styling, stats
# bar) plus overrides for Streamlit's own widgets. unsafe_allow_html=True is
# what lets the <style> element through as raw HTML.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Share+Tech+Mono&family=Orbitron:wght@400;700;900&family=VT323&display=swap');

:root {
--bg: #060810;
--bg2: #0a0d18;
--bg3: #0e1220;
--green: #00ff88;
--green2: #00cc66;
--green3: #009944;
--amber: #ffb700;
--cyan: #00d4ff;
--red: #ff3355;
--dim: #1a2a1a;
--grid: #0d1a0d;
--border: #0a3a0a;
--border2: #1a4a1a;
--text: #c8ffc8;
--text2: #88cc88;
--text3: #446644;
}

html, body, [class*="css"] {
background: var(--bg) !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
}

/* CRT scanlines overlay */
body::before {
content: '';
position: fixed;
top: 0; left: 0; right: 0; bottom: 0;
background: repeating-linear-gradient(
0deg,
transparent,
transparent 2px,
rgba(0,255,136,0.015) 2px,
rgba(0,255,136,0.015) 4px
);
pointer-events: none;
z-index: 9999;
}

#MainMenu, footer, header { visibility: hidden; }
.block-container { padding-top: 1rem; max-width: 1300px; }

/* ββ HEADER ββ */
.term-header {
border-bottom: 1px solid var(--green3);
padding-bottom: 0.8rem;
margin-bottom: 1.2rem;
}
.term-title {
font-family: 'Orbitron', monospace;
font-weight: 900;
font-size: 1.8rem;
color: var(--green);
letter-spacing: 0.08em;
text-shadow: 0 0 20px rgba(0,255,136,0.5);
line-height: 1;
}
.term-sub {
font-family: 'Share Tech Mono', monospace;
font-size: 0.7rem;
color: var(--green3);
letter-spacing: 0.2em;
margin-top: 0.3rem;
}
.blink {
animation: blink 1s step-end infinite;
color: var(--green);
}
@keyframes blink { 50% { opacity: 0; } }

/* ββ TERMINAL WINDOW ββ */
.term-window {
background: var(--bg2);
border: 1px solid var(--border2);
border-radius: 4px;
overflow: hidden;
margin-bottom: 1rem;
}
.term-titlebar {
background: var(--bg3);
border-bottom: 1px solid var(--border);
padding: 0.4rem 0.8rem;
display: flex;
align-items: center;
gap: 0.5rem;
}
.term-dot {
width: 8px; height: 8px;
border-radius: 50%;
display: inline-block;
}
.dot-r { background: #ff3355; }
.dot-y { background: #ffb700; }
.dot-g { background: #00ff88; }
.term-wintitle {
font-size: 0.65rem;
color: var(--text3);
letter-spacing: 0.15em;
text-transform: uppercase;
margin-left: 0.5rem;
}
.term-body { padding: 1rem 1.2rem; }

/* ββ PROMPT LINE ββ */
.prompt-line {
font-size: 0.8rem;
color: var(--green3);
margin-bottom: 0.4rem;
}
.prompt-line span { color: var(--green); }

/* ββ JSON RENDERER ββ */
.json-output {
background: #040608;
border: 1px solid var(--border);
border-radius: 3px;
padding: 1.2rem;
font-family: 'Share Tech Mono', monospace;
font-size: 0.8rem;
line-height: 1.7;
overflow-x: auto;
position: relative;
}
.json-key { color: var(--cyan); }
.json-str { color: var(--amber); }
.json-num { color: #ff88aa; }
.json-bool { color: var(--green); font-weight: bold; }
.json-null { color: var(--text3); font-style: italic; }
.json-bracket { color: var(--text2); }

/* ββ FIELD CARDS ββ */
.field-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
gap: 0.6rem;
margin-top: 0.8rem;
}
.field-card {
background: #040a08;
border: 1px solid var(--border);
border-left: 2px solid var(--green3);
border-radius: 3px;
padding: 0.6rem 0.8rem;
transition: border-color 0.2s;
}
.field-card:hover { border-left-color: var(--green); }
.field-key {
font-size: 0.65rem;
color: var(--cyan);
text-transform: uppercase;
letter-spacing: 0.12em;
margin-bottom: 0.2rem;
}
.field-val {
font-size: 0.82rem;
color: var(--amber);
word-break: break-word;
}
.field-val-null { color: var(--text3); font-style: italic; }
.field-val-bool-true { color: var(--green); }
.field-val-bool-false { color: var(--red); }

/* ββ STATS BAR ββ */
.stats-bar {
display: flex;
gap: 1.5rem;
padding: 0.5rem 0;
border-top: 1px solid var(--border);
margin-top: 0.8rem;
flex-wrap: wrap;
}
.stat-item {
font-size: 0.68rem;
color: var(--text3);
}
.stat-item span { color: var(--green2); }

/* ββ SCHEMA SELECTOR ββ */
.schema-btn-active {
background: var(--dim) !important;
border: 1px solid var(--green) !important;
color: var(--green) !important;
}

/* ββ SIDEBAR ββ */
section[data-testid="stSidebar"] {
background: var(--bg2) !important;
border-right: 1px solid var(--border) !important;
}
section[data-testid="stSidebar"] * { color: var(--text2) !important; }

/* ββ STREAMLIT OVERRIDES ββ */
.stTextArea textarea {
background: #040608 !important;
border: 1px solid var(--border2) !important;
border-radius: 3px !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
font-size: 0.8rem !important;
line-height: 1.6 !important;
}
.stTextArea textarea:focus {
border-color: var(--green) !important;
box-shadow: 0 0 8px rgba(0,255,136,0.2) !important;
}
.stSelectbox select, .stSelectbox > div {
background: var(--bg2) !important;
border-color: var(--border2) !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
}
.stButton button {
background: transparent !important;
border: 1px solid var(--green3) !important;
color: var(--green) !important;
border-radius: 3px !important;
font-family: 'Orbitron', monospace !important;
font-size: 0.68rem !important;
letter-spacing: 0.1em !important;
text-transform: uppercase !important;
transition: all 0.2s !important;
}
.stButton button:hover {
background: var(--dim) !important;
border-color: var(--green) !important;
box-shadow: 0 0 12px rgba(0,255,136,0.3) !important;
}
.stTextInput input {
background: #040608 !important;
border: 1px solid var(--border2) !important;
color: var(--text) !important;
font-family: 'Share Tech Mono', monospace !important;
font-size: 0.8rem !important;
}
div[data-testid="stTabs"] button {
font-family: 'Orbitron', monospace !important;
font-size: 0.62rem !important;
letter-spacing: 0.08em !important;
color: var(--text3) !important;
}
div[data-testid="stTabs"] button[aria-selected="true"] {
color: var(--green) !important;
border-bottom-color: var(--green) !important;
}
hr { border-color: var(--border) !important; }
</style>
""", unsafe_allow_html=True)


# Seed st.session_state with a default for every key the app reads; entries
# that already exist (set during an earlier rerun) are left untouched.
_SESSION_DEFAULTS = {
    'openai_key': '',
    'history': [],
    'active_schema': 'Contrato Legal',
    'custom_schema': '',
}
for state_key, default_value in _SESSION_DEFAULTS.items():
    if state_key in st.session_state:
        continue
    st.session_state[state_key] = default_value


def get_key():
    """Return the OpenAI API key to use, or an empty string when none is set.

    Sources are tried in this order and the first non-empty value wins:

    1. the ``OPENAI_API_KEY`` entry in ``st.secrets``;
    2. the ``OPENAI_API_KEY`` environment variable;
    3. the key typed into the sidebar (``st.session_state.openai_key``).

    Blank values are skipped, so an empty secret or an empty environment
    variable no longer hides a key entered in the sidebar.
    """
    secret_key = ''
    try:
        if 'OPENAI_API_KEY' in st.secrets:
            secret_key = st.secrets['OPENAI_API_KEY']
    except Exception:
        # Best effort, as before: any failure while reading st.secrets
        # (presumably no secrets file is configured — confirm) is treated as
        # "no secret available" and the next source is tried.
        secret_key = ''
    return secret_key or os.getenv('OPENAI_API_KEY') or st.session_state.openai_key


def syntax_highlight_json(obj, indent=0) -> str:
    """Render a JSON-like value as syntax-highlighted HTML.

    Dicts and lists are rendered recursively, one member per line, with each
    nesting level indented by three spaces. Every token is wrapped in a
    ``<span>`` carrying one of the ``json-*`` CSS classes.

    Keys, strings and the fallback branch are serialised with ``json.dumps``
    (so embedded quotes, backslashes and newlines are shown as JSON escapes)
    and then HTML-escaped, because the result is injected into the page as
    raw HTML and the values come from extracted text.

    Args:
        obj: The value to render (dict, list, str, bool, None, int, float;
            anything else is rendered as a string via ``str``).
        indent: Current nesting depth; callers normally leave it at 0.

    Returns:
        The HTML fragment as a string, lines separated by ``"\\n"``.
    """
    pad = " " * (indent * 3)
    pad2 = " " * ((indent + 1) * 3)

    if isinstance(obj, dict):
        if not obj:
            return '<span class="json-bracket">{}</span>'
        lines = ['<span class="json-bracket">{</span>']
        last = len(obj) - 1
        for i, (k, v) in enumerate(obj.items()):
            comma = "," if i < last else ""
            # quote=False keeps the surrounding JSON quotes literal while
            # still neutralising <, > and & inside the key.
            key_text = html.escape(json.dumps(str(k), ensure_ascii=False), quote=False)
            val_html = syntax_highlight_json(v, indent + 1)
            lines.append(f'{pad2}<span class="json-key">{key_text}</span>: {val_html}{comma}')
        lines.append(f'{pad}<span class="json-bracket">}}</span>')
        return "\n".join(lines)

    if isinstance(obj, list):
        if not obj:
            return '<span class="json-bracket">[]</span>'
        lines = ['<span class="json-bracket">[</span>']
        last = len(obj) - 1
        for i, item in enumerate(obj):
            comma = "," if i < last else ""
            val_html = syntax_highlight_json(item, indent + 1)
            lines.append(f'{pad2}{val_html}{comma}')
        lines.append(f'{pad}<span class="json-bracket">]</span>')
        return "\n".join(lines)

    if isinstance(obj, str):
        text = html.escape(json.dumps(obj, ensure_ascii=False), quote=False)
        return f'<span class="json-str">{text}</span>'

    # bool must be tested before int/float: bool is a subclass of int.
    if isinstance(obj, bool):
        return f'<span class="json-bool">{"true" if obj else "false"}</span>'

    if obj is None:
        return '<span class="json-null">null</span>'

    if isinstance(obj, (int, float)):
        return f'<span class="json-num">{obj}</span>'

    # Unknown types are shown as their string form, quoted and escaped.
    text = html.escape(json.dumps(str(obj), ensure_ascii=False), quote=False)
    return f'<span class="json-str">{text}</span>'


def render_flat_fields(data: dict) -> str:
    """Render the flat (non-nested) fields of ``data`` as HTML cards.

    Entries whose value is a dict or a list are skipped. ``None`` and booleans
    get dedicated CSS classes; every other value is shown via ``str``. Keys
    and values are HTML-escaped because the result is injected into the page
    as raw HTML.

    Args:
        data: Mapping of field name to extracted value.

    Returns:
        A ``<div class="field-grid">`` wrapping one card per flat field, or
        an empty string when there is no flat field to show.
    """
    cards = []
    for k, v in data.items():
        if isinstance(v, (dict, list)):
            continue
        key_html = f'<div class="field-key">{html.escape(str(k))}</div>'
        if v is None:
            val_html = '<div class="field-val field-val-null">null</div>'
        elif isinstance(v, bool):
            cls = "field-val-bool-true" if v else "field-val-bool-false"
            val_html = f'<div class="field-val {cls}">{"true" if v else "false"}</div>'
        else:
            val_html = f'<div class="field-val">{html.escape(str(v))}</div>'
        cards.append(f'<div class="field-card">{key_html}{val_html}</div>')
    if not cards:
        return ""
    return f'<div class="field-grid">{"".join(cards)}</div>'


# Sidebar: branding, API-key entry with a loaded/missing indicator, a static
# description of the pipeline and model settings, and a "clear history" button.
with st.sidebar:
    st.markdown("""
<div style='font-family:Orbitron,monospace;font-weight:900;
font-size:1rem;color:#00ff88;text-shadow:0 0 10px rgba(0,255,136,0.4);
letter-spacing:0.1em'>STRUCT//EXTRACT</div>
<div style='font-family:Share Tech Mono,monospace;font-size:0.6rem;
color:#446644;letter-spacing:0.2em;text-transform:uppercase;margin-top:0.2rem'>
v1.0 Β· Function Calling Engine
</div>
""", unsafe_allow_html=True)
    st.divider()

    st.markdown("**π OpenAI API Key**")
    # The widget gets a real label (kept hidden via label_visibility): an
    # empty label is discouraged by Streamlit for accessibility reasons.
    k_in = st.text_input("OpenAI API Key", type="password", value=st.session_state.openai_key,
                         placeholder="sk-...", label_visibility="collapsed")
    if k_in:
        st.session_state.openai_key = k_in
    # get_key() also considers secrets and the environment, so the indicator
    # can read "loaded" even when nothing was typed here.
    if get_key():
        st.markdown('<div style="color:#00ff88;font-size:0.75rem">β KEY LOADED</div>',
                    unsafe_allow_html=True)
    else:
        st.markdown('<div style="color:#ff3355;font-size:0.75rem">β KEY MISSING</div>',
                    unsafe_allow_html=True)

    st.divider()
    st.markdown("""
<div style='font-family:Share Tech Mono,monospace;font-size:0.72rem;
color:#446644;line-height:1.8'>
<div style='color:#00cc66;margin-bottom:0.4rem'>// PIPELINE</div>
01. Text input<br>
02. Schema selection<br>
03. Tool definition (OpenAI)<br>
04. Function calling<br>
05. JSON parse + validate<br>
06. Retry on error<br>
07. Render + export
</div>
""", unsafe_allow_html=True)
    st.divider()
    st.markdown("""
<div style='font-family:Share Tech Mono,monospace;font-size:0.65rem;color:#2a4a2a'>
model: gpt-4o-mini<br>
tool_choice: required<br>
temperature: 0.0<br>
max_retries: 2<br>
validation: pydantic v2
</div>
""", unsafe_allow_html=True)
    st.divider()
    if st.button("⬑ Limpar histórico", use_container_width=True):
        st.session_state.history = []
        st.rerun()


# Main page banner: app title with a blinking cursor and a one-line subtitle,
# styled by the .term-header / .term-title / .term-sub / .blink CSS classes.
st.markdown("""
<div class="term-header">
<div class="term-title">⬑ STRUCTURED OUTPUT EXTRACTOR <span class="blink">_</span></div>
<div class="term-sub">OpenAI Function Calling Β· Pydantic v2 Β· Dynamic JSON Schema Β· Auto-Retry</div>
</div>
""", unsafe_allow_html=True)


# Create the three top-level tabs (extract, custom schema, history); each
# handle is used further down as a context manager to fill that tab.
tab_extract, tab_custom, tab_history = st.tabs([
    "⬑ Extrair",
    "β¬’ Schema Customizado",
    "⬣ Histórico",
])


# Sample input texts for the "load example" button in the extraction tab.
# Entries are looked up with EXAMPLES.get(<active schema name>, ""), so a key
# only takes effect when it equals the name of the active preset schema.
# NOTE(review): the accented characters in the keys and texts below look
# mis-encoded (e.g. "SΓ£o" where "São" is expected) — confirm the file's
# encoding and that the keys still match the preset schema names.
EXAMPLES = {
    "Contrato Legal": """CONTRATO DE PRESTAΓΓO DE SERVIΓOS DE CONSULTORIA EM INTELIGΓNCIA ARTIFICIAL

Entre as partes:
CONTRATANTE: TechCorp Brasil Ltda., CNPJ 12.345.678/0001-99, com sede em SΓ£o Paulo/SP.
CONTRATADO: Daniel Fonseca - ML Engineer, CPF 123.456.789-00, residente no Rio de Janeiro/RJ.

CLΓUSULA 1 - OBJETO
O CONTRATADO prestarΓ‘ serviΓ§os de consultoria em Graph Neural Networks e sistemas de detecΓ§Γ£o de fraude com IA Generativa, incluindo desenvolvimento de modelos, treinamento de equipes e documentaΓ§Γ£o tΓ©cnica.

CLΓUSULA 2 - VALOR
O valor total dos serviΓ§os Γ© de R$ 48.000,00 (quarenta e oito mil reais), pagos em 4 parcelas mensais de R$ 12.000,00.

CLΓUSULA 3 - PRAZO
VigΓͺncia de 4 (quatro) meses, iniciando em 01/04/2025 e encerrando em 31/07/2025.

CLΓUSULA 4 - OBRIGAΓΓES DO CONTRATADO
- Entregar relatΓ³rios mensais de progresso
- Participar de reuniΓ΅es semanais remotas
- Manter confidencialidade sobre os dados da empresa

CLΓUSULA 5 - FORO
Fica eleito o foro da Comarca de SΓ£o Paulo/SP para dirimir quaisquer controvΓ©rsias.

Assinado digitalmente em 28/03/2025.""",


    "NotΓcia / Artigo": """Meta anuncia novo modelo de linguagem open-source com 405 bilhΓ΅es de parΓ’metros

SAN FRANCISCO, 15 de marΓ§o de 2025 β A Meta Platforms anunciou nesta quinta-feira o lanΓ§amento do Llama 4, seu mais novo modelo de linguagem de grande escala com 405 bilhΓ΅es de parΓ’metros, disponΓvel gratuitamente para pesquisadores e empresas sob licenΓ§a open-source.

O CEO Mark Zuckerberg afirmou que o modelo supera o GPT-4o em 73% dos benchmarks testados internamente, incluindo MMLU, HumanEval e MT-Bench. A vice-presidente de IA da empresa, Yann LeCun, destacou que o modelo foi treinado em 30 trilhΓ΅es de tokens de dados multimodais.

O lanΓ§amento acontece em meio Γ crescente disputa entre Meta, OpenAI, Google e Anthropic pelo mercado de IA generativa, avaliado em US$ 2,4 trilhΓ΅es atΓ© 2030 segundo a consultoria Goldman Sachs.

Especialistas do MIT e Stanford avaliam que a decisΓ£o de tornar o modelo open-source pode democratizar o acesso Γ IA avanΓ§ada, embora levante preocupaΓ§Γ΅es sobre uso malicioso. O governo americano jΓ‘ sinalizou que pode regulamentar o setor ainda em 2025.""",


    "Artigo CientΓfico": """GraphSAGE: Inductive Representation Learning on Large Graphs

Autores: William L. Hamilton, Rex Ying, Jure Leskovec
Venue: NeurIPS 2017, Long Beach, CA

Abstract:
Low-dimensional embeddings of nodes in large graphs have proved extremely useful in a variety of prediction tasks. However, most existing approaches require that all nodes in the graph are present during training of the embeddings; these previous approaches are inherently transductive and do not naturally generalize to unseen nodes.

Problema resolvido:
A maioria dos mΓ©todos de embedding para grafos Γ© transductive β sΓ³ funciona para nΓ³s vistos durante o treino. Em aplicaΓ§Γ΅es reais como redes sociais e sistemas de recomendaΓ§Γ£o, novos nΓ³s aparecem constantemente.

Metodologia:
O GraphSAGE aprende funΓ§Γ΅es de agregaΓ§Γ£o (mean, LSTM, pooling) que generalizam para nΓ³s nΓ£o vistos, combinando features do nΓ³ com as de sua vizinhanΓ§a amostrada.

Resultados:
- Dataset Citation (Cora): F1 = 0.935
- Dataset Reddit: F1 = 0.950
- Dataset PPI (Protein-Protein Interaction): F1 = 0.612 (vs 0.421 baseline)

ContribuiΓ§Γ΅es principais:
1. Framework inductive para grafos de larga escala
2. TrΓͺs agregadores comparados: mean, LSTM, max-pooling
3. Mini-batch training para escalabilidade
4. Open-source no repositΓ³rio snap-stanford/GraphSAGE""",
}


# Extraction tab: pick a preset schema, paste (or load) a text, run the
# extraction and show the result as cards, highlighted JSON and a download.
with tab_extract:
    from extractor import PRESET_SCHEMAS

    st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>select --schema</span></div>
""", unsafe_allow_html=True)

    # One button per preset schema; clicking one makes it the active schema
    # and reruns so the rest of the page reflects the choice.
    schema_cols = st.columns(len(PRESET_SCHEMAS))
    for i, name in enumerate(PRESET_SCHEMAS):
        with schema_cols[i]:
            if st.button(name, key=f"sc_{i}", use_container_width=True):
                st.session_state.active_schema = name
                st.rerun()

    active_schema = PRESET_SCHEMAS[st.session_state.active_schema]
    st.markdown(f"""
<div style='font-family:Share Tech Mono,monospace;font-size:0.68rem;
color:#446644;margin:0.4rem 0 0.8rem;padding:0.4rem 0.8rem;
border-left:2px solid #0a3a0a;background:#040a04'>
// {st.session_state.active_schema} β {active_schema['description']}
</div>
""", unsafe_allow_html=True)

    col_ex, _ = st.columns([2, 3])
    with col_ex:
        if st.button(f"⬑ Carregar exemplo: {st.session_state.active_schema}",
                     use_container_width=True):
            ex_text = EXAMPLES.get(st.session_state.active_schema, "")
            if ex_text:
                # Write the example straight into the text area's session-state
                # slot. This must happen before the widget below is created in
                # this run; passing a changing `value=` to a keyed widget does
                # not reliably persist across reruns.
                st.session_state["main_text"] = ex_text

    st.markdown("""
<div class="prompt-line" style="margin-top:0.8rem">
user@extractor:~$ <span>paste --input</span></div>
""", unsafe_allow_html=True)

    # The label is non-empty (Streamlit discourages empty labels) but hidden.
    text_input = st.text_area(
        "Texto de entrada", height=220,
        placeholder="Cole qualquer texto aqui: contrato, notΓcia, currΓculo, invoice, artigo...",
        label_visibility="collapsed", key="main_text"
    )

    run_col, _ = st.columns([1, 3])
    with run_col:
        run_btn = st.button("⬑ EXTRAIR DADOS", use_container_width=True, type="primary")

    if run_btn:
        # Validation failures are reported with if/elif rather than st.stop():
        # st.stop() halts the whole script, which would leave the tabs that
        # are rendered after this one empty for the current run.
        if not get_key():
            st.markdown('<div style="color:#ff3355;font-size:0.8rem">β API Key nΓ£o configurada</div>',
                        unsafe_allow_html=True)
        elif not text_input.strip():
            st.markdown('<div style="color:#ffb700;font-size:0.8rem">β Cole um texto para extrair</div>',
                        unsafe_allow_html=True)
        else:
            from extractor import StructuredExtractor

            # Placeholder that first shows the "in progress" log and is then
            # overwritten with either the success log or the error window.
            prog_ph = st.empty()
            prog_ph.markdown("""
<div class="term-window">
<div class="term-titlebar">
<span class="term-dot dot-r"></span>
<span class="term-dot dot-y"></span>
<span class="term-dot dot-g"></span>
<span class="term-wintitle">extraction.log</span>
</div>
<div class="term-body" style="font-size:0.75rem;color:#446644;line-height:2">
<div>β Inicializando engine...</div>
<div>β Tool definition criada</div>
<div>β Chamando gpt-4o-mini com tool_choice=required...</div>
<div style="color:#ffb700">β³ Aguardando resposta<span class="blink">_</span></div>
</div>
</div>
""", unsafe_allow_html=True)

            try:
                engine = StructuredExtractor(get_key())
                result = engine.extract(
                    text=text_input,
                    schema=active_schema["schema"],
                    schema_name=st.session_state.active_schema,
                )

                prog_ph.markdown(f"""
<div class="term-window">
<div class="term-titlebar">
<span class="term-dot dot-r"></span>
<span class="term-dot dot-y"></span>
<span class="term-dot dot-g"></span>
<span class="term-wintitle">extraction.log</span>
</div>
<div class="term-body" style="font-size:0.75rem;color:#446644;line-height:2">
<div>β Engine inicializado</div>
<div>β Tool definition: <span style="color:#00d4ff">{st.session_state.active_schema}</span></div>
<div>β Function call executado com sucesso</div>
<div>β JSON parseado e validado</div>
<div style="color:#00ff88">β EXTRAΓΓO COMPLETA em {result['attempts']} tentativa(s) Β· {result['tokens']} tokens</div>
</div>
</div>
""", unsafe_allow_html=True)

                # Only mark the preview as truncated when it actually is.
                preview = text_input[:120] + ("..." if len(text_input) > 120 else "")
                st.session_state.history.append({
                    "schema": st.session_state.active_schema,
                    "text_preview": preview,
                    "result": result,
                })

                out_col, raw_col = st.columns([3, 2], gap="large")

                with out_col:
                    st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>render --view=structured</span></div>
""", unsafe_allow_html=True)

                    # Flat fields are shown as cards ...
                    flat_html = render_flat_fields(result["data"])
                    if flat_html:
                        st.markdown(flat_html, unsafe_allow_html=True)

                    # ... nested fields (dicts / lists) each get their own
                    # titled section. Keys and items come from the extraction
                    # result, so they are escaped before going into raw HTML.
                    for k, v in result["data"].items():
                        if not isinstance(v, (dict, list)):
                            continue
                        st.markdown(f"""
<div style='font-family:Share Tech Mono,monospace;font-size:0.65rem;
color:#00d4ff;text-transform:uppercase;letter-spacing:0.1em;
margin:0.8rem 0 0.3rem'>// {html.escape(str(k))}</div>
""", unsafe_allow_html=True)

                        if isinstance(v, list):
                            for item in v:
                                if isinstance(item, dict):
                                    st.markdown(f"""
<div class="json-output" style="font-size:0.75rem;margin-bottom:0.4rem">
{syntax_highlight_json(item, 0)}
</div>
""", unsafe_allow_html=True)
                                else:
                                    esc = html.escape(str(item))
                                    st.markdown(f'<div class="field-card"><div class="field-val">{esc}</div></div>',
                                                unsafe_allow_html=True)
                        else:
                            st.markdown(f"""
<div class="json-output" style="font-size:0.75rem">
{syntax_highlight_json(v, 0)}
</div>
""", unsafe_allow_html=True)

                    st.markdown(f"""
<div class="stats-bar">
<div class="stat-item">schema: <span>{st.session_state.active_schema}</span></div>
<div class="stat-item">fields: <span>{len(result['data'])}</span></div>
<div class="stat-item">tokens: <span>{result['tokens']}</span></div>
<div class="stat-item">attempts: <span>{result['attempts']}</span></div>
<div class="stat-item">method: <span>{html.escape(str(result['method']))}</span></div>
</div>
""", unsafe_allow_html=True)

                with raw_col:
                    st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>cat output.json</span></div>
""", unsafe_allow_html=True)
                    json_html = syntax_highlight_json(result["data"])
                    st.markdown(f'<div class="json-output">{json_html}</div>',
                                unsafe_allow_html=True)

                    st.download_button(
                        "⬑ Download JSON",
                        data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                        file_name=f"extracted_{st.session_state.active_schema.lower().replace(' ','_')}.json",
                        mime="application/json",
                        use_container_width=True,
                    )

            except Exception as e:
                # Top-level UI boundary: show any failure in the log window.
                # The message is escaped because it is rendered as raw HTML.
                prog_ph.markdown(f"""
<div class="term-window">
<div class="term-titlebar">
<span class="term-dot dot-r"></span><span class="term-wintitle">error.log</span>
</div>
<div class="term-body" style="color:#ff3355;font-size:0.8rem">
β ERRO: {html.escape(str(e))}
</div>
</div>
""", unsafe_allow_html=True)


# Custom-schema tab: the user supplies a JSON Schema and a text, and the
# extraction result is shown as highlighted JSON with a download button.
with tab_custom:
    st.markdown("""
<div class="prompt-line">user@extractor:~$ <span>define --schema=custom</span></div>
<div style='font-family:Share Tech Mono,monospace;font-size:0.7rem;
color:#446644;margin:0.3rem 0 0.8rem'>
// Defina seu prΓ³prio JSON Schema e extraia qualquer estrutura de qualquer texto
</div>
""", unsafe_allow_html=True)

    # Schema pre-filled in the editor when no custom schema is stored.
    DEFAULT_CUSTOM = '''{
"type": "object",
"properties": {
"nome_produto": {"type": "string"},
"preco": {"type": "number"},
"categorias": {"type": "array", "items": {"type": "string"}},
"disponivel": {"type": "boolean"},
"especificacoes": {
"type": "object",
"properties": {
"peso": {"type": "string"},
"cor": {"type": "string"}
}
}
},
"required": ["nome_produto"]
}'''

    c_left, c_right = st.columns(2, gap="large")

    # Both text areas get a real (hidden) label: Streamlit discourages empty
    # labels for accessibility reasons.
    with c_left:
        st.markdown('<div class="prompt-line">$ <span>vim schema.json</span></div>',
                    unsafe_allow_html=True)
        custom_schema = st.text_area(
            "JSON Schema", value=st.session_state.custom_schema or DEFAULT_CUSTOM,
            height=280, label_visibility="collapsed", key="custom_schema_input"
        )

    with c_right:
        st.markdown('<div class="prompt-line">$ <span>cat input.txt</span></div>',
                    unsafe_allow_html=True)
        custom_text = st.text_area(
            "Texto de entrada", height=280, label_visibility="collapsed", key="custom_text",
            placeholder="Cole o texto para extrair..."
        )

    run_custom = st.button("⬑ EXTRAIR COM SCHEMA CUSTOMIZADO", use_container_width=True)

    if run_custom:
        # if/elif instead of st.stop(): st.stop() halts the whole script and
        # would leave the history tab (rendered after this one) empty.
        if not get_key():
            st.error("Configure a API Key na sidebar.")
        elif not custom_text.strip() or not custom_schema.strip():
            st.warning("Preencha o schema e o texto.")
        else:
            from extractor import StructuredExtractor
            with st.spinner("Extraindo..."):
                try:
                    engine = StructuredExtractor(get_key())
                    result = engine.extract_with_custom_schema(custom_text, custom_schema)

                    st.markdown('<div class="prompt-line">$ <span>cat output.json</span></div>',
                                unsafe_allow_html=True)
                    json_html = syntax_highlight_json(result["data"])
                    st.markdown(f'<div class="json-output">{json_html}</div>',
                                unsafe_allow_html=True)

                    st.markdown(f"""
<div class="stats-bar">
<div class="stat-item">tokens: <span>{result['tokens']}</span></div>
<div class="stat-item">attempts: <span>{result['attempts']}</span></div>
</div>
""", unsafe_allow_html=True)

                    st.download_button(
                        "⬑ Download JSON",
                        data=json.dumps(result["data"], ensure_ascii=False, indent=2),
                        file_name="custom_extraction.json",
                        mime="application/json",
                    )
                    # Only mark the preview as truncated when it actually is.
                    preview = custom_text[:120] + ("..." if len(custom_text) > 120 else "")
                    st.session_state.history.append({
                        "schema": "Custom",
                        "text_preview": preview,
                        "result": result,
                    })
                except ValueError as e:
                    # Reported to the user as an invalid-schema error.
                    st.error(f"Schema invΓ‘lido: {e}")
                except Exception as e:
                    st.error(f"Erro: {e}")


# History tab: one expander per past extraction, newest first, each with the
# input preview, the highlighted JSON and a download button.
with tab_history:
    if not st.session_state.history:
        st.markdown("""
<div style='font-family:Share Tech Mono,monospace;font-size:0.8rem;
color:#2a4a2a;text-align:center;padding:3rem'>
// nenhuma extraΓ§Γ£o executada ainda
</div>
""", unsafe_allow_html=True)
    else:
        total = len(st.session_state.history)
        for i, h in enumerate(reversed(st.session_state.history)):
            r = h["result"]
            # Number entries in insertion order (oldest is #1); the same
            # number is used for the download file name so the two agree.
            entry_no = total - i
            with st.expander(
                f"#{entry_no} Β· {h['schema']} Β· {r['tokens']} tokens",
                expanded=(i == 0)
            ):
                # The preview is raw user input, so escape it before it is
                # rendered as HTML.
                st.markdown(f"""
<div style='font-family:Share Tech Mono,monospace;font-size:0.7rem;
color:#446644;margin-bottom:0.5rem'>// {html.escape(str(h['text_preview']))}</div>
""", unsafe_allow_html=True)
                json_html = syntax_highlight_json(r["data"])
                st.markdown(f'<div class="json-output" style="font-size:0.75rem">{json_html}</div>',
                            unsafe_allow_html=True)
                st.download_button(
                    "⬑ Download",
                    data=json.dumps(r["data"], ensure_ascii=False, indent=2),
                    file_name=f"extract_{entry_no}.json",
                    mime="application/json",
                    key=f"dl_{i}",
                )