ZhouChuYue committed on
Commit
06dc834
·
1 Parent(s): a579dd2
Files changed (2) hide show
  1. app.py +30 -8
  2. packages.txt +1 -0
app.py CHANGED
@@ -5,7 +5,7 @@ A unified HTML parser optimized for extracting mathematical content.
5
  """
6
 
7
  import gradio as gr
8
- from ultradata_math_parser.parsers.unified_parser import UnifiedParser
9
 
10
 
11
  def parse_html(
@@ -14,9 +14,10 @@ def parse_html(
14
  process_math: bool = True,
15
  include_tables: bool = True,
16
  enable_forum_assembly: bool = True,
 
17
  ) -> dict:
18
  """
19
- Parse HTML content using UnifiedParser.
20
 
21
  Args:
22
  html_content: Raw HTML string to parse
@@ -24,6 +25,7 @@ def parse_html(
24
  process_math: Whether to process and convert math expressions
25
  include_tables: Whether to preserve table elements
26
  enable_forum_assembly: Whether to enable forum post assembly
 
27
 
28
  Returns:
29
  Dictionary containing parsed results
@@ -32,6 +34,7 @@ def parse_html(
32
  return {
33
  "title": "",
34
  "html": "",
 
35
  "text_length": 0,
36
  "xp_num": "",
37
  "fallback_strategy": "",
@@ -39,7 +42,7 @@ def parse_html(
39
  "error": "Please provide HTML content to parse.",
40
  }
41
 
42
- parser = UnifiedParser()
43
 
44
  try:
45
  result = parser.extract(
@@ -48,11 +51,13 @@ def parse_html(
48
  process_math=process_math,
49
  include_tables=include_tables,
50
  enable_forum_assembly=enable_forum_assembly,
 
51
  )
52
 
53
  return {
54
  "title": result.get("title", ""),
55
  "html": result.get("html", ""),
 
56
  "text_length": result.get("text_length", 0),
57
  "xp_num": result.get("xp_num", ""),
58
  "fallback_strategy": result.get("fallback_strategy", ""),
@@ -63,6 +68,7 @@ def parse_html(
63
  return {
64
  "title": "",
65
  "html": "",
 
66
  "text_length": 0,
67
  "xp_num": "",
68
  "fallback_strategy": "",
@@ -79,6 +85,7 @@ def format_output(result: dict) -> tuple:
79
  "",
80
  "",
81
  "",
 
82
  )
83
 
84
  # Build metadata string
@@ -94,11 +101,12 @@ def format_output(result: dict) -> tuple:
94
  metadata,
95
  result.get("title", ""),
96
  result.get("html", ""),
 
97
  result.get("html", ""), # For HTML preview
98
  )
99
 
100
 
101
- def process_input(html_content, base_url, process_math, include_tables, enable_forum):
102
  """Main processing function for Gradio interface."""
103
  result = parse_html(
104
  html_content=html_content,
@@ -106,6 +114,7 @@ def process_input(html_content, base_url, process_math, include_tables, enable_f
106
  process_math=process_math,
107
  include_tables=include_tables,
108
  enable_forum_assembly=enable_forum,
 
109
  )
110
  return format_output(result)
111
 
@@ -298,6 +307,12 @@ with gr.Blocks(css=custom_css, title="UltraData Math Parser") as demo:
298
  )
299
 
300
  with gr.Accordion("⚙️ Advanced Options", open=False):
 
 
 
 
 
 
301
  process_math = gr.Checkbox(
302
  label="Process Math Expressions",
303
  value=True,
@@ -340,6 +355,13 @@ with gr.Blocks(css=custom_css, title="UltraData Math Parser") as demo:
340
  max_lines=20,
341
  interactive=False,
342
  )
 
 
 
 
 
 
 
343
  with gr.TabItem("👁️ Preview"):
344
  preview_output = gr.HTML(
345
  label="HTML Preview",
@@ -348,16 +370,16 @@ with gr.Blocks(css=custom_css, title="UltraData Math Parser") as demo:
348
  # Event handlers
349
  parse_btn.click(
350
  fn=process_input,
351
- inputs=[html_input, base_url_input, process_math, include_tables, enable_forum],
352
- outputs=[metadata_output, title_output, html_output, preview_output],
353
  )
354
 
355
  def clear_all():
356
- return "", "", "", "", "", ""
357
 
358
  clear_btn.click(
359
  fn=clear_all,
360
- outputs=[html_input, base_url_input, metadata_output, title_output, html_output, preview_output],
361
  )
362
 
363
  # Footer info
 
5
  """
6
 
7
  import gradio as gr
8
+ from ultradata_math_parser import GeneralParser
9
 
10
 
11
  def parse_html(
 
14
  process_math: bool = True,
15
  include_tables: bool = True,
16
  enable_forum_assembly: bool = True,
17
+ html_type: str = "unified",
18
  ) -> dict:
19
  """
20
+ Parse HTML content using GeneralParser.
21
 
22
  Args:
23
  html_content: Raw HTML string to parse
 
25
  process_math: Whether to process and convert math expressions
26
  include_tables: Whether to preserve table elements
27
  enable_forum_assembly: Whether to enable forum post assembly
28
+ html_type: Parser type (unified/article/forum)
29
 
30
  Returns:
31
  Dictionary containing parsed results
 
34
  return {
35
  "title": "",
36
  "html": "",
37
+ "text": "",
38
  "text_length": 0,
39
  "xp_num": "",
40
  "fallback_strategy": "",
 
42
  "error": "Please provide HTML content to parse.",
43
  }
44
 
45
+ parser = GeneralParser()
46
 
47
  try:
48
  result = parser.extract(
 
51
  process_math=process_math,
52
  include_tables=include_tables,
53
  enable_forum_assembly=enable_forum_assembly,
54
+ html_type=html_type,
55
  )
56
 
57
  return {
58
  "title": result.get("title", ""),
59
  "html": result.get("html", ""),
60
+ "text": result.get("text", ""),
61
  "text_length": result.get("text_length", 0),
62
  "xp_num": result.get("xp_num", ""),
63
  "fallback_strategy": result.get("fallback_strategy", ""),
 
68
  return {
69
  "title": "",
70
  "html": "",
71
+ "text": "",
72
  "text_length": 0,
73
  "xp_num": "",
74
  "fallback_strategy": "",
 
85
  "",
86
  "",
87
  "",
88
+ "",
89
  )
90
 
91
  # Build metadata string
 
101
  metadata,
102
  result.get("title", ""),
103
  result.get("html", ""),
104
+ result.get("text", ""),
105
  result.get("html", ""), # For HTML preview
106
  )
107
 
108
 
109
+ def process_input(html_content, base_url, process_math, include_tables, enable_forum, html_type):
110
  """Main processing function for Gradio interface."""
111
  result = parse_html(
112
  html_content=html_content,
 
114
  process_math=process_math,
115
  include_tables=include_tables,
116
  enable_forum_assembly=enable_forum,
117
+ html_type=html_type,
118
  )
119
  return format_output(result)
120
 
 
307
  )
308
 
309
  with gr.Accordion("⚙️ Advanced Options", open=False):
310
+ html_type = gr.Radio(
311
+ choices=["unified", "article", "forum"],
312
+ value="unified",
313
+ label="Parser Type",
314
+ info="Select the parsing strategy",
315
+ )
316
  process_math = gr.Checkbox(
317
  label="Process Math Expressions",
318
  value=True,
 
355
  max_lines=20,
356
  interactive=False,
357
  )
358
+ with gr.TabItem("📄 Plain Text"):
359
+ text_output = gr.Textbox(
360
+ label="Plain Text (w3m rendered)",
361
+ lines=12,
362
+ max_lines=20,
363
+ interactive=False,
364
+ )
365
  with gr.TabItem("👁️ Preview"):
366
  preview_output = gr.HTML(
367
  label="HTML Preview",
 
370
  # Event handlers
371
  parse_btn.click(
372
  fn=process_input,
373
+ inputs=[html_input, base_url_input, process_math, include_tables, enable_forum, html_type],
374
+ outputs=[metadata_output, title_output, html_output, text_output, preview_output],
375
  )
376
 
377
  def clear_all():
378
+ return "", "", "", "", "", "", ""
379
 
380
  clear_btn.click(
381
  fn=clear_all,
382
+ outputs=[html_input, base_url_input, metadata_output, title_output, html_output, text_output, preview_output],
383
  )
384
 
385
  # Footer info
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ w3m