Spaces:
Running
Running
| from app.pipeline.text_format import format_ocr_to_html, ocr_has_math | |
def test_empty_returns_empty_markup():
    """Both the empty string and None format to an empty string."""
    for blank_input in ("", None):
        assert str(format_ocr_to_html(blank_input)) == ""
def test_single_paragraph():
    """One line of plain text is wrapped in a single <p> element."""
    expected = "<p>Hello world</p>"
    assert str(format_ocr_to_html("Hello world")) == expected
def test_multi_line_paragraph_uses_br():
    """Two adjacent lines share one <p>, separated by a <br> tag."""
    rendered = str(format_ocr_to_html("Line one\nLine two"))
    expected = "<p>Line one<br>Line two</p>"
    assert rendered == expected
def test_blank_line_separates_paragraphs():
    """A blank line between two lines yields two separate <p> elements."""
    rendered = str(format_ocr_to_html("First\n\nSecond"))
    expected = "<p>First</p><p>Second</p>"
    assert rendered == expected
def test_html_special_chars_are_escaped():
    """HTML metacharacters in the OCR text must come out as entities.

    Checking for the raw characters "<", "&" and ">" is vacuous: the
    surrounding <p> tag already supplies "<" and ">", and "&" is part of
    every entity, so such a check passes even when nothing is escaped.
    The assertions therefore look for the entity forms and confirm that
    the raw comparison text does not survive into the output.
    """
    out = str(format_ocr_to_html("a < b & c > d"))
    for entity in ("&lt;", "&amp;", "&gt;"):
        assert entity in out
    # The unescaped source text must not appear verbatim in the markup.
    assert "a < b" not in out
    assert "c > d" not in out
def test_bullet_list_unordered():
    """Three bullet-glyph lines give output bounded by <ul>...</ul> with three <li>."""
    rendered = str(format_ocr_to_html("• First point\n• Second point\n• Third"))
    assert rendered.startswith("<ul>")
    assert rendered.endswith("</ul>")
    item_count = rendered.count("<li>")
    assert item_count == 3
def test_dash_bullets_recognized():
    """Lines prefixed with "- " render as <ul> items without the dash prefix."""
    expected = "<ul><li>alpha</li><li>beta</li></ul>"
    rendered = str(format_ocr_to_html("- alpha\n- beta"))
    assert rendered == expected
def test_numbered_list_ordered():
    """Three "N." numbered lines give output opening with <ol> and holding three <li>."""
    source = "1. step one\n2. step two\n3. step three"
    rendered = str(format_ocr_to_html(source))
    assert rendered.startswith("<ol>")
    item_count = rendered.count("<li>")
    assert item_count == 3
def test_continuation_line_joins_previous_item():
    """A non-bullet line following a bullet is merged into that bullet's item."""
    source = "• first item\n with continuation\n• second"
    rendered = str(format_ocr_to_html(source))
    joined_item = "first item with continuation"
    assert joined_item in rendered
    # Only the two bullet lines may open list items.
    assert rendered.count("<li>") == 2
def test_equation_line_wrapped_in_mathjax():
    """An equation line carries inline-math delimiters, "^2" and the ocr-equation class."""
    rendered = str(format_ocr_to_html("E = mc²"))
    assert "\\(" in rendered
    assert "\\)" in rendered
    # The superscript-two glyph is expected to surface as a caret exponent.
    assert "^2" in rendered
    assert 'class="ocr-equation"' in rendered
def test_equation_normalizes_greek_and_operators():
    """Greek letters and the multiplication sign appear as backslash commands."""
    rendered = str(format_ocr_to_html("α × β = γ"))
    expected_commands = (r"\alpha", r"\times", r"\beta", r"\gamma")
    for command in expected_commands:
        assert command in rendered
def test_long_prose_with_equals_is_not_treated_as_equation():
    """A long prose sentence is output as a paragraph without math delimiters."""
    sentence = (
        "If we set the value of x equal to zero, then the function evaluates "
        "to a constant of one as expected."
    )
    rendered = str(format_ocr_to_html(sentence))
    has_math_delimiter = "\\(" in rendered
    assert not has_math_delimiter
    assert rendered.startswith("<p>")
def test_mixed_blocks():
    """A paragraph, a bullet list and an equation in one input each appear in the output."""
    source = "Slide title\n\n• point one\n• point two\n\nx = 5"
    rendered = str(format_ocr_to_html(source))
    expected_fragments = (
        "<p>Slide title</p>",
        "<ul><li>point one</li><li>point two</li></ul>",
        "\\(x = 5\\)",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_ocr_has_math_detects_equation():
    """ocr_has_math returns True for text holding an equation or a summation sign."""
    math_samples = ("Title\n\nE = mc²", "∑ x_i")
    for sample in math_samples:
        assert ocr_has_math(sample) is True
def test_ocr_has_math_false_on_prose():
    """ocr_has_math returns False for a plain sentence, the empty string and None."""
    non_math_samples = (
        "Just a regular sentence about photosynthesis.",
        "",
        None,
    )
    for sample in non_math_samples:
        assert ocr_has_math(sample) is False