from app.pipeline.text_format import format_ocr_to_html, ocr_has_math
def test_empty_returns_empty_markup():
    """Both an empty string and ``None`` must stringify to an empty result."""
    for blank_input in ("", None):
        rendered = str(format_ocr_to_html(blank_input))
        assert rendered == ""
def test_single_paragraph():
    """A lone line of text is expected to render as exactly one paragraph."""
    out = str(format_ocr_to_html("Hello world"))
    # NOTE(review): the expected literal had lost its markup and was split
    # across several physical lines (an unterminated string). It is rebuilt
    # here as a bare <p> element -- confirm against the formatter's real
    # output (attributes, whitespace) before relying on it.
    assert out == "<p>Hello world</p>"
def test_multi_line_paragraph_uses_br():
    """Adjacent lines stay in one paragraph, separated by a line break."""
    out = str(format_ocr_to_html("Line one\nLine two"))
    # NOTE(review): the expected literal had lost its markup and was split
    # across physical lines. Rebuilt as a single paragraph with a <br>
    # between the lines; the exact break spelling (<br>, <br/>, <br />)
    # must be confirmed against the formatter.
    assert out == "<p>Line one<br>Line two</p>"
def test_blank_line_separates_paragraphs():
    """A blank line in the input is expected to start a new paragraph."""
    out = str(format_ocr_to_html("First\n\nSecond"))
    # NOTE(review): the expected literal had lost its markup and was split
    # across physical lines. Rebuilt as two back-to-back <p> elements with
    # no separator -- confirm against the formatter's real output.
    assert out == "<p>First</p><p>Second</p>"
def test_html_special_chars_are_escaped():
    """Markup-significant characters in OCR text must come out as entities."""
    out = str(format_ocr_to_html("a < b & c > d"))
    # Looking for the raw characters proves nothing: "&" is part of every
    # entity and any wrapping tag contributes "<" and ">". Assert on the
    # escaped forms instead so the test fails when escaping is skipped.
    assert "&lt;" in out and "&amp;" in out and "&gt;" in out
    # The unescaped comparison must not survive verbatim.
    assert "a < b" not in out
def test_bullet_list_unordered():
    """Lines introduced by a bullet glyph should become an unordered list."""
    text = "• First point\n• Second point\n• Third"
    out = str(format_ocr_to_html(text))
    # The previous checks used empty strings: startswith("") is always true
    # and count("") returns len(out) + 1, so neither tested list output.
    # NOTE(review): <ul>/<li> are reconstructed from the test's intent;
    # confirm the tags carry no attributes in the real output.
    assert out.startswith("<ul>")
    assert out.count("<li>") == 3
def test_dash_bullets_recognized():
    """Leading dashes should be treated as bullets, like the bullet glyph."""
    text = "- alpha\n- beta"
    out = str(format_ocr_to_html(text))
    # Comparing against "" would assert that dash bullets produce no output
    # at all, which contradicts what this test is named for.
    # NOTE(review): the expected markup is reconstructed as a bare
    # unordered list -- confirm against the formatter's real output.
    assert out == "<ul><li>alpha</li><li>beta</li></ul>"
def test_numbered_list_ordered():
    """Lines numbered "1.", "2.", ... should become an ordered list."""
    text = "1. step one\n2. step two\n3. step three"
    out = str(format_ocr_to_html(text))
    # startswith("") is always true, and counting "- " looks for a plain-text
    # rendering of a list item rather than the item tag itself.
    # NOTE(review): <ol>/<li> are reconstructed from the test's intent;
    # confirm the tags carry no attributes in the real output.
    assert out.startswith("<ol>")
    assert out.count("<li>") == 3
def test_continuation_line_joins_previous_item():
    """An indented follow-on line is folded into the preceding list item."""
    text = "• first item\n with continuation\n• second"
    out = str(format_ocr_to_html(text))
    assert "first item with continuation" in out
    # NOTE(review): the counted substring was an unterminated literal split
    # across two physical lines; it is rebuilt as the list-item tag so that
    # the check means "exactly two items" -- confirm the tag spelling.
    assert out.count("<li>") == 2
def test_equation_line_wrapped_in_mathjax():
    """An equation line gets inline-math delimiters, ``^2`` and a CSS class."""
    rendered = str(format_ocr_to_html("E = mc²"))
    # Opening delimiter first, then the closing one.
    assert "\\(" in rendered
    assert "\\)" in rendered
    # The superscript-two glyph is expected to be rewritten as "^2".
    assert "^2" in rendered
    assert 'class="ocr-equation"' in rendered
def test_equation_normalizes_greek_and_operators():
    """Greek letters and the multiplication sign show up as LaTeX commands."""
    rendered = str(format_ocr_to_html("α × β = γ"))
    expected_commands = (r"\alpha", r"\times", r"\beta", r"\gamma")
    for command in expected_commands:
        assert command in rendered
def test_long_prose_with_equals_is_not_treated_as_equation():
    """A long sentence must be rendered as prose, not wrapped as math."""
    text = (
        "If we set the value of x equal to zero, then the function evaluates "
        "to a constant of one as expected."
    )
    out = str(format_ocr_to_html(text))
    # No inline-math opening delimiter may appear anywhere in the output.
    assert "\\(" not in out
    # NOTE(review): the prefix literal was an unterminated string split
    # across two physical lines; rebuilt as the opening paragraph tag --
    # confirm the tag carries no attributes in the real output.
    assert out.startswith("<p>")
def test_mixed_blocks():
    """A title, a bullet list and an equation each get their own block."""
    text = "Slide title\n\n• point one\n• point two\n\nx = 5"
    out = str(format_ocr_to_html(text))
    # NOTE(review): the paragraph literal was an unterminated multi-line
    # string, and the list check was `"" in out`, which is always true.
    # Both are rebuilt from the test's intent (<p>...</p> and <ul>) --
    # confirm the exact markup against the formatter.
    assert "<p>Slide title</p>" in out
    assert "<ul>" in out
    assert "\\(x = 5\\)" in out
def test_ocr_has_math_detects_equation():
    """Text containing an equation or a summation sign is flagged as math."""
    math_samples = ("Title\n\nE = mc²", "∑ x_i")
    for sample in math_samples:
        verdict = ocr_has_math(sample)
        assert verdict is True
def test_ocr_has_math_false_on_prose():
    """Plain prose, the empty string and ``None`` are all reported as non-math."""
    non_math_samples = (
        "Just a regular sentence about photosynthesis.",
        "",
        None,
    )
    for sample in non_math_samples:
        verdict = ocr_has_math(sample)
        assert verdict is False