Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """ | |
| UltraData Math Parser - Hugging Face Space Demo | |
| A unified HTML parser optimized for extracting mathematical content. | |
| """ | |
| import gradio as gr | |
| import requests | |
| from ultradata_math_parser import GeneralParser | |
def fetch_url_content(url: str) -> tuple:
    """Fetch the HTML body of a web page.

    A URL without an ``http://`` or ``https://`` scheme is fetched over
    ``https://``.

    Args:
        url: Address to fetch; surrounding whitespace is ignored.

    Returns:
        ``(html, url)`` on success, where ``url`` is the normalized address
        that was requested. ``("", message)`` on failure, where ``message``
        describes what went wrong. Callers detect failure by the empty first
        item, so a success never returns an empty body.
    """
    if not url or not url.strip():
        return "", "Please enter a URL"

    url = url.strip()
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        return "", f"Request timed out for {url}"
    except requests.exceptions.RequestException as e:
        return "", f"Failed to fetch URL: {str(e)}"

    # Without a declared charset, requests decodes text/* as ISO-8859-1,
    # which garbles UTF-8 pages (math symbols in particular). Use the
    # encoding detected from the body instead.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    html = response.text
    if not html.strip():
        # An empty body must be reported as a failure message; returning
        # ("", url) would make the caller display the URL as the error text.
        return "", f"No content returned from {url}"
    return html, url
def fetch_and_parse(url: str, process_math: bool, include_tables: bool, enable_forum: bool, html_type: str) -> tuple:
    """Fetch a URL and parse the downloaded HTML in one step.

    Args:
        url: Address of the page to fetch.
        process_math: Forwarded to ``parse_html`` as ``process_math``.
        include_tables: Forwarded to ``parse_html`` as ``include_tables``.
        enable_forum: Forwarded to ``parse_html`` as ``enable_forum_assembly``.
        html_type: Forwarded to ``parse_html`` as ``html_type``.

    Returns:
        A 6-tuple ``(html_content, base_url, title, html_output, text_output,
        markdown_output)``. When the fetch fails, ``html_content`` and
        ``base_url`` are empty strings and the error is reported through the
        title and Markdown items.
    """
    html_content, base_url = fetch_url_content(url)
    if not html_content:
        # On failure the second item is an error message, not a URL. Keep it
        # out of the base-URL slot so it is never shown or reused as one.
        error_msg = base_url
        return "", "", f"❌ {error_msg}", "", "", f"**Error:** {error_msg}"

    result = parse_html(
        html_content=html_content,
        base_url=base_url,
        process_math=process_math,
        include_tables=include_tables,
        enable_forum_assembly=enable_forum,
        html_type=html_type,
    )
    title, html_output, text_output, markdown_output = format_output(result)
    return html_content, base_url, title, html_output, text_output, markdown_output
def parse_html(
    html_content: str,
    base_url: str = "",
    process_math: bool = True,
    include_tables: bool = True,
    enable_forum_assembly: bool = True,
    html_type: str = "unified",
) -> dict:
    """Run ``GeneralParser.extract`` on an HTML string and normalize the result.

    Args:
        html_content: Raw HTML string to parse.
        base_url: Base URL passed through to the parser.
        process_math: Passed through to the parser as ``process_math``.
        include_tables: Passed through to the parser as ``include_tables``.
        enable_forum_assembly: Passed through to the parser as
            ``enable_forum_assembly``.
        html_type: Parser type (unified/article/forum).

    Returns:
        A dict with the keys ``title``, ``html``, ``text``, ``text_length``,
        ``xp_num``, ``fallback_strategy``, ``forum_assembled`` and ``error``.
        ``error`` is ``None`` on success. For blank input or a parser
        exception it holds a message and every other field has its default.
    """
    # Default value for each result field; used both as the fallback when the
    # parser omits a key and as the payload of every error result.
    blank_fields = {
        "title": "",
        "html": "",
        "text": "",
        "text_length": 0,
        "xp_num": "",
        "fallback_strategy": "",
        "forum_assembled": False,
    }

    if not (html_content and html_content.strip()):
        return {**blank_fields, "error": "Please provide HTML content to parse."}

    parser = GeneralParser()
    try:
        extracted = parser.extract(
            html=html_content,
            base_url=base_url,
            process_math=process_math,
            include_tables=include_tables,
            enable_forum_assembly=enable_forum_assembly,
            html_type=html_type,
        )
        parsed = {
            field: extracted.get(field, fallback)
            for field, fallback in blank_fields.items()
        }
        parsed["error"] = None
        return parsed
    except Exception as exc:
        # Any parser failure is reported in-band so the UI can display it.
        return {**blank_fields, "error": str(exc)}
def format_output(result: dict) -> tuple:
    """Convert a parse result dict into the values shown in the Gradio UI.

    Args:
        result: Dict as produced by ``parse_html``; the keys ``error``,
            ``title``, ``html`` and ``text`` are read, all optional.

    Returns:
        A 4-tuple ``(title, html, text, markdown)``. If ``result["error"]`` is
        truthy, the title and Markdown items carry the error message and the
        other two are empty strings.
    """
    error = result.get("error")
    if error:
        return (
            f"❌ Error: {error}",
            "",
            "",
            f"**Error:** {error}",
        )

    text_content = result.get("text", "")
    # The Markdown pane renders the extracted text as-is; show a placeholder
    # when nothing was extracted.
    markdown_content = text_content or "_No content extracted_"
    return (
        result.get("title", ""),
        result.get("html", ""),
        text_content,
        markdown_content,
    )
def process_input(html_content, base_url, process_math, include_tables, enable_forum, html_type):
    """Parse the given HTML and return the values to display.

    The arguments are forwarded to ``parse_html`` (``enable_forum`` as
    ``enable_forum_assembly``); its result is passed to ``format_output``,
    whose return value is returned unchanged.
    """
    parser_options = {
        "base_url": base_url,
        "process_math": process_math,
        "include_tables": include_tables,
        "enable_forum_assembly": enable_forum,
        "html_type": html_type,
    }
    parsed = parse_html(html_content=html_content, **parser_options)
    return format_output(parsed)
# Sample page used as the initial value of the HTML input box: a short
# article on the quadratic formula containing a MathML expression plus
# footer/nav boilerplate around the article body.
EXAMPLE_HTML = """<!DOCTYPE html>
<html>
<head>
<title>Quadratic Formula Example</title>
</head>
<body>
<article class="post-content">
<h1>Understanding the Quadratic Formula</h1>
<p>The quadratic formula is used to solve equations of the form ax² + bx + c = 0.</p>
<p>The solution is given by:</p>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<mi>x</mi>
<mo>=</mo>
<mfrac>
<mrow>
<mo>-</mo>
<mi>b</mi>
<mo>±</mo>
<msqrt>
<mrow>
<msup><mi>b</mi><mn>2</mn></msup>
<mo>-</mo>
<mn>4</mn>
<mi>a</mi>
<mi>c</mi>
</mrow>
</msqrt>
</mrow>
<mrow>
<mn>2</mn>
<mi>a</mi>
</mrow>
</mfrac>
</math>
<p>Where a, b, and c are coefficients of the quadratic equation.</p>
<h2>Example Problem</h2>
<p>Solve: x² - 5x + 6 = 0</p>
<p>Here, a = 1, b = -5, c = 6</p>
<p>Using the formula: x = (5 ± √(25-24))/2 = (5 ± 1)/2</p>
<p>Therefore, x = 3 or x = 2</p>
</article>
<footer>
<nav>Related articles...</nav>
</footer>
</body>
</html>"""
# Stylesheet for the demo page. It imports the Space Grotesk and JetBrains
# Mono web fonts, applies a dark gradient theme to the Gradio container and
# form controls, hides the page footer, and gives the output panes a fixed
# height with their own scrollbars. The string is injected into the page
# inside a <style> tag when the interface is built.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500&family=Space+Grotesk:wght@400;500;600;700&display=swap');
.gradio-container {
font-family: 'Space Grotesk', sans-serif !important;
background: linear-gradient(135deg, #0f0f23 0%, #1a1a3e 50%, #0f0f23 100%) !important;
min-height: 100vh;
}
.main-title {
font-family: 'Space Grotesk', sans-serif !important;
font-weight: 700 !important;
font-size: 2.5rem !important;
background: linear-gradient(90deg, #00d4ff, #7c3aed, #f472b6) !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: transparent !important;
background-clip: text !important;
text-align: center !important;
margin-bottom: 0.5rem !important;
}
.subtitle {
text-align: center !important;
color: #94a3b8 !important;
font-size: 1.1rem !important;
margin-bottom: 2rem !important;
}
.gr-box {
border-radius: 12px !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
background: rgba(15, 15, 35, 0.8) !important;
backdrop-filter: blur(10px) !important;
}
.gr-input, .gr-textarea {
font-family: 'JetBrains Mono', monospace !important;
background: rgba(30, 30, 60, 0.6) !important;
border: 1px solid rgba(124, 58, 237, 0.4) !important;
border-radius: 8px !important;
color: #e2e8f0 !important;
}
.gr-button-primary {
background: linear-gradient(135deg, #7c3aed 0%, #00d4ff 100%) !important;
border: none !important;
font-weight: 600 !important;
font-size: 1rem !important;
padding: 12px 32px !important;
border-radius: 8px !important;
transition: all 0.3s ease !important;
text-transform: uppercase !important;
letter-spacing: 1px !important;
}
.gr-button-primary:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 25px rgba(124, 58, 237, 0.4) !important;
}
.gr-button-secondary {
background: transparent !important;
border: 2px solid rgba(124, 58, 237, 0.5) !important;
color: #a78bfa !important;
font-weight: 500 !important;
border-radius: 8px !important;
}
.section-header {
color: #00d4ff !important;
font-weight: 600 !important;
font-size: 1.2rem !important;
margin-bottom: 1rem !important;
padding-bottom: 0.5rem !important;
border-bottom: 2px solid rgba(0, 212, 255, 0.3) !important;
}
.output-box {
background: rgba(20, 20, 45, 0.9) !important;
border: 1px solid rgba(0, 212, 255, 0.3) !important;
border-radius: 12px !important;
padding: 1rem !important;
}
.gr-markdown {
color: #e2e8f0 !important;
}
.gr-markdown code {
background: rgba(124, 58, 237, 0.2) !important;
padding: 2px 6px !important;
border-radius: 4px !important;
font-family: 'JetBrains Mono', monospace !important;
}
footer {
display: none !important;
}
.gr-accordion {
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 8px !important;
background: rgba(20, 20, 45, 0.6) !important;
}
.gr-check-radio {
accent-color: #7c3aed !important;
}
label {
color: #cbd5e1 !important;
}
/* Fixed height textbox with scrollbar */
.gr-textbox textarea {
overflow-y: auto !important;
resize: vertical !important;
}
/* Output textbox styling - match markdown box height */
.output-textbox textarea {
min-height: 560px !important;
max-height: 580px !important;
overflow-y: auto !important;
}
/* Markdown box styling */
.markdown-box {
background: rgba(255, 255, 255, 0.95) !important;
border: 1px solid rgba(124, 58, 237, 0.3) !important;
border-radius: 8px !important;
padding: 16px !important;
min-height: 580px !important;
max-height: 580px !important;
overflow-y: auto !important;
color: #1a1a2e !important;
}
.markdown-box * {
color: #1a1a2e !important;
}
.markdown-box code {
background: rgba(124, 58, 237, 0.1) !important;
padding: 2px 6px !important;
border-radius: 4px !important;
}
.markdown-box pre {
background: #f4f4f8 !important;
padding: 12px !important;
border-radius: 6px !important;
overflow-x: auto !important;
}
/* Custom scrollbar styling */
.gr-textbox textarea::-webkit-scrollbar {
width: 8px;
}
.gr-textbox textarea::-webkit-scrollbar-track {
background: rgba(30, 30, 60, 0.4);
border-radius: 4px;
}
.gr-textbox textarea::-webkit-scrollbar-thumb {
background: rgba(124, 58, 237, 0.6);
border-radius: 4px;
}
.gr-textbox textarea::-webkit-scrollbar-thumb:hover {
background: rgba(124, 58, 237, 0.8);
}
"""
# Build the Gradio interface: an input column (URL fetch, HTML box, parser
# options) next to an output column (title plus Markdown / plain-text / raw
# HTML views), followed by the event wiring and a footer.
# NOTE(review): the nesting of the layout below is inferred from the inline
# comments and the order of the statements; confirm against the deployed
# layout.
# NOTE(review): the leading glyphs in several labels (e.g. 'π₯', 'βοΈ')
# look like mis-encoded emoji; confirm the intended characters.
with gr.Blocks(title="UltraData Math Parser") as demo:
    # The stylesheet is injected as a raw <style> element.
    gr.HTML(f'<style>{custom_css}</style>')
    gr.HTML('<h1 class="main-title">π UltraData Math Parser</h1>')
    gr.HTML('<p class="subtitle">Unified HTML Parser for Mathematical Content Extraction</p>')
    with gr.Row():
        # Left column: inputs and parser options.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">π₯ Input</div>')
            with gr.Tabs():
                with gr.TabItem("π URL"):
                    url_input = gr.Textbox(
                        label="URL",
                        placeholder="Enter URL to fetch (e.g., https://example.com/math-article)",
                        lines=3,
                        max_lines=5,
                        value="https://math.stackexchange.com/questions/5120625/ode-problem-of-yt-sqrtyt-with-the-inital-value-y0-1-t-geq-0",
                    )
                    fetch_btn = gr.Button("π₯ Fetch & Parse", variant="primary", size="lg")
                with gr.TabItem("π HTML"):
                    pass  # HTML input will be below, shared between tabs
            # Pre-filled with the sample page; overwritten by a URL fetch.
            html_input = gr.Textbox(
                label="HTML Content",
                placeholder="Paste your HTML content here or fetch from URL above...",
                lines=10,
                max_lines=20,
                value=EXAMPLE_HTML,
            )
            base_url_input = gr.Textbox(
                label="Base URL (Auto-filled from URL fetch)",
                placeholder="https://example.com/page",
                lines=1,
            )
            # Parser options; these feed both the fetch and the parse handlers.
            with gr.Accordion("βοΈ Advanced Options", open=False):
                html_type = gr.Radio(
                    choices=["unified", "article", "forum"],
                    value="unified",
                    label="Parser Type",
                    info="Select the parsing strategy",
                )
                process_math = gr.Checkbox(
                    label="Process Math Expressions",
                    value=True,
                    info="Convert MathML and LaTeX to unified format",
                )
                include_tables = gr.Checkbox(
                    label="Include Tables",
                    value=True,
                    info="Preserve table elements in output",
                )
                enable_forum = gr.Checkbox(
                    label="Enable Forum Assembly",
                    value=True,
                    info="Assemble forum posts and comments",
                )
            with gr.Row():
                parse_btn = gr.Button("π Parse HTML", variant="primary", size="lg")
                clear_btn = gr.Button("ποΈ Clear", variant="secondary", size="lg")
        # Right column: read-only views of the parse result.
        with gr.Column(scale=1):
            gr.HTML('<div class="section-header">π€ Output</div>')
            title_output = gr.Textbox(
                label="Extracted Title",
                lines=1,
                interactive=False,
            )
            with gr.Tabs():
                with gr.TabItem("β¨ Markdown"):
                    # Math is rendered for $$…$$, $…$, \[…\] and \(…\) delimiters.
                    markdown_output = gr.Markdown(
                        label="Markdown Preview",
                        elem_classes=["markdown-box"],
                        latex_delimiters=[
                            {"left": "$$", "right": "$$", "display": True},
                            {"left": "$", "right": "$", "display": False},
                            {"left": "\\[", "right": "\\]", "display": True},
                            {"left": "\\(", "right": "\\)", "display": False},
                        ],
                    )
                with gr.TabItem("π Plain Text"):
                    text_output = gr.Textbox(
                        label="Plain Text (w3m rendered)",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )
                with gr.TabItem("π Raw HTML"):
                    html_output = gr.Textbox(
                        label="Extracted HTML",
                        lines=25,
                        max_lines=30,
                        interactive=False,
                        autoscroll=False,
                        elem_classes=["output-textbox"],
                    )
    # Event handlers
    # Fetch: downloads the URL, fills the HTML and base-URL inputs, and shows
    # the parse result. The outputs order must match fetch_and_parse's 6-tuple.
    fetch_btn.click(
        fn=fetch_and_parse,
        inputs=[url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )
    # Parse: runs the parser on whatever is currently in the HTML input. The
    # outputs order must match format_output's 4-tuple.
    parse_btn.click(
        fn=process_input,
        inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
        outputs=[title_output, html_output, text_output, markdown_output],
    )

    def clear_all():
        # One empty string per component listed in clear_btn's outputs.
        return "", "", "", "", "", "", ""

    clear_btn.click(
        fn=clear_all,
        outputs=[url_input, html_input, base_url_input, title_output, html_output, text_output, markdown_output],
    )
    # Footer info
    gr.HTML("""
<div style="text-align: center; margin-top: 2rem; padding: 1rem; color: #64748b; font-size: 0.9rem;">
<p>π¬ <strong>UltraData Math Parser</strong> - Part of the UltraData-Math Project</p>
<p>Specialized in extracting mathematical content from web pages with MathML, LaTeX, and formula support.</p>
</div>
""")

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)