"""Tests for ``format_ocr_to_html`` and ``ocr_has_math``.

Each test feeds a small piece of OCR-style text to the formatter and checks
the resulting markup (converted with ``str``), or checks the boolean result
of the math detector.
"""

from app.pipeline.text_format import format_ocr_to_html, ocr_has_math

# NOTE(review): this module was restored from a copy in which the whole file
# had been collapsed onto a few lines and the HTML tags/entities inside the
# expected-value string literals had been stripped.  The tag names used in
# the assertions below (<p>, <br>, <ul>, <ol>, <li>, &lt;, &amp;, &gt;) are
# reconstructions -- confirm them against the real output of
# format_ocr_to_html (exact tag spelling, attributes, self-closing style)
# before relying on these tests.


def test_empty_returns_empty_markup():
    """Empty string and ``None`` both render to an empty string."""
    assert str(format_ocr_to_html("")) == ""
    assert str(format_ocr_to_html(None)) == ""


def test_single_paragraph():
    """A single line of text becomes one paragraph."""
    out = str(format_ocr_to_html("Hello world"))
    assert out == "<p>Hello world</p>"


def test_multi_line_paragraph_uses_br():
    """Consecutive lines stay in one paragraph, separated by a line break."""
    out = str(format_ocr_to_html("Line one\nLine two"))
    # NOTE(review): "<br>" spelling is assumed; could be "<br/>" or "<br />".
    assert out == "<p>Line one<br>Line two</p>"


def test_blank_line_separates_paragraphs():
    """A blank line splits the text into two paragraphs."""
    out = str(format_ocr_to_html("First\n\nSecond"))
    assert out == "<p>First</p><p>Second</p>"


def test_html_special_chars_are_escaped():
    """``<``, ``&`` and ``>`` in the input appear as entities in the output."""
    out = str(format_ocr_to_html("a < b & c > d"))
    assert "&lt;" in out and "&amp;" in out and "&gt;" in out


def test_bullet_list_unordered():
    """Lines starting with a bullet character form an unordered list."""
    text = "• First point\n• Second point\n• Third"
    out = str(format_ocr_to_html(text))
    assert out.startswith("<ul>")
    assert out.count("<li>") == 3


def test_dash_bullets_recognized():
    """Lines starting with ``- `` are treated as list items as well."""
    text = "- alpha\n- beta"
    out = str(format_ocr_to_html(text))
    # NOTE(review): the exact expected string of the original equality
    # assertion could not be recovered; these structural checks stand in for
    # it.  Presumably it was "<ul><li>alpha</li><li>beta</li></ul>" -- verify
    # and restore the strict comparison.
    assert out.startswith("<ul>")
    assert out.count("<li>") == 2
    assert "alpha" in out and "beta" in out


def test_numbered_list_ordered():
    """Lines starting with ``N.`` form an ordered list."""
    text = "1. step one\n2. step two\n3. step three"
    out = str(format_ocr_to_html(text))
    assert out.startswith("<ol>")
    assert out.count("<li>") == 3


def test_continuation_line_joins_previous_item():
    """A non-bullet line after a bullet is appended to that bullet's text."""
    text = "• first item\n with continuation\n• second"
    out = str(format_ocr_to_html(text))
    assert "first item with continuation" in out
    assert out.count("<li>") == 2


def test_equation_line_wrapped_in_mathjax():
    """An equation line is wrapped in ``\\(...\\)`` delimiters and tagged."""
    out = str(format_ocr_to_html("E = mc²"))
    assert "\\(" in out and "\\)" in out
    # The superscript-two character is expected to come out as "^2".
    assert "^2" in out
    assert 'class="ocr-equation"' in out


def test_equation_normalizes_greek_and_operators():
    """Greek letters and the multiplication sign become TeX commands."""
    out = str(format_ocr_to_html("α × β = γ"))
    assert r"\alpha" in out
    assert r"\times" in out
    assert r"\beta" in out
    assert r"\gamma" in out


def test_long_prose_with_equals_is_not_treated_as_equation():
    """A long prose sentence is rendered as a paragraph, not as math."""
    text = (
        "If we set the value of x equal to zero, then the function evaluates "
        "to a constant of one as expected."
    )
    out = str(format_ocr_to_html(text))
    assert "\\(" not in out
    assert out.startswith("<p>")


def test_mixed_blocks():
    """Paragraph, bullet list and equation blocks can coexist in one input."""
    text = "Slide title\n\n• point one\n• point two\n\nx = 5"
    out = str(format_ocr_to_html(text))
    assert "<p>Slide title</p>" in out
    assert "<ul><li>point one</li><li>point two</li></ul>" in out
    assert "\\(x = 5\\)" in out


def test_ocr_has_math_detects_equation():
    """Text containing an equation or a math symbol is reported as math."""
    assert ocr_has_math("Title\n\nE = mc²") is True
    assert ocr_has_math("∑ x_i") is True


def test_ocr_has_math_false_on_prose():
    """Plain prose, the empty string and ``None`` are not reported as math."""
    assert ocr_has_math("Just a regular sentence about photosynthesis.") is False
    assert ocr_has_math("") is False
    assert ocr_has_math(None) is False