File size: 5,110 Bytes
5d9b495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Tests for response_parser.py — JSON extraction, VLM correction, tolerant parsing.
"""
import pytest

from app.services.ai.response_parser import (
    ParseError,
    _extract_json_object,
    _fix_common_json_issues,
    _try_parse_json,
    parse_ai_response,
)


# ── _extract_json_object ─────────────────────────────────────────────────────

class TestExtractJsonObject:
    """Exercise ``_extract_json_object`` on clean, noisy and malformed inputs."""

    def test_simple_object(self):
        """A lone JSON object is returned unchanged."""
        payload = '{"a": 1}'
        assert _extract_json_object(payload) == payload

    def test_text_before_json(self):
        """Leading prose in front of the object is dropped."""
        extracted = _extract_json_object('Here is the JSON: {"a": 1}')
        assert extracted == '{"a": 1}'

    def test_text_after_json(self):
        """Trailing prose behind the object is dropped."""
        extracted = _extract_json_object('{"a": 1} and more text')
        assert extracted == '{"a": 1}'

    def test_nested_braces(self):
        """Nested objects are kept whole, up to the outermost closing brace."""
        payload = '{"a": {"b": {"c": 1}}}'
        extracted = _extract_json_object(payload)
        assert extracted == '{"a": {"b": {"c": 1}}}'

    def test_braces_inside_strings(self):
        """Braces inside a string value do not end the extracted object early."""
        payload = '{"text": "value with { and } inside"}'
        extracted = _extract_json_object(payload)
        assert extracted == '{"text": "value with { and } inside"}'

    def test_escaped_quotes(self):
        """Escaped double quotes inside a string value are preserved."""
        payload = '{"text": "he said \\"hello\\""}'
        extracted = _extract_json_object(payload)
        assert extracted == '{"text": "he said \\"hello\\""}'

    def test_no_json(self):
        """Input without any object is handed back as-is."""
        plain = "no json here"
        assert _extract_json_object(plain) == plain

    def test_unclosed_json(self):
        """An unterminated object still yields text starting at its opening brace."""
        extracted = _extract_json_object('some text {"a": 1')
        # Only the prefix is pinned; whatever follows it is left unspecified.
        assert extracted.startswith('{"a": 1')


# ── _fix_common_json_issues ──────────────────────────────────────────────────

class TestFixCommonJsonIssues:
    """Exercise ``_fix_common_json_issues`` on trailing-comma variants."""

    def test_trailing_comma_before_brace(self):
        """A comma directly before ``}`` is removed."""
        repaired = _fix_common_json_issues('{"a": 1,}')
        assert repaired == '{"a": 1}'

    def test_trailing_comma_before_bracket(self):
        """A comma directly before ``]`` is removed."""
        repaired = _fix_common_json_issues('[1, 2,]')
        assert repaired == '[1, 2]'

    def test_trailing_comma_with_whitespace(self):
        """A comma separated from ``}`` by whitespace is removed as well."""
        repaired = _fix_common_json_issues('{"a": 1 , }')
        # The expected output keeps exactly one space before the brace.
        assert repaired == '{"a": 1 }'

    def test_no_issues(self):
        """Already valid JSON text passes through untouched."""
        clean = '{"a": 1, "b": 2}'
        repaired = _fix_common_json_issues(clean)
        assert repaired == clean


# ── _try_parse_json ──────────────────────────────────────────────────────────

class TestTryParseJson:
    """Exercise ``_try_parse_json`` on valid, repairable and hopeless input."""

    def test_valid_json(self):
        """Well-formed JSON is decoded into the matching dict."""
        decoded = _try_parse_json('{"a": 1}')
        assert decoded == {"a": 1}

    def test_json_with_trailing_comma(self):
        """A trailing comma does not prevent decoding."""
        assert _try_parse_json('{"a": 1,}') == {"a": 1}

    def test_invalid_json(self):
        """Text that is not JSON yields ``None`` rather than an exception."""
        outcome = _try_parse_json("not json at all")
        assert outcome is None


# ── parse_ai_response ────────────────────────────────────────────────────────

class TestParseAiResponse:
    """End-to-end checks for ``parse_ai_response`` (returns a layout/ocr pair)."""

    def test_clean_json(self):
        """A well-formed response yields its single region and the OCR text."""
        payload = '{"layout": {"regions": [{"id": "r1", "type": "text_block", "bbox": [10, 20, 100, 200], "confidence": 0.9}]}, "ocr": {"diplomatic_text": "hello", "confidence": 0.8}}'
        layout_data, ocr_result = parse_ai_response(payload)
        regions = layout_data["regions"]
        assert len(regions) == 1
        assert regions[0]["id"] == "r1"
        assert ocr_result.diplomatic_text == "hello"

    def test_markdown_fenced_json(self):
        """A response wrapped in a Markdown ``json`` code fence is still parsed."""
        payload = '```json\n{"layout": {"regions": []}, "ocr": {"diplomatic_text": "test"}}\n```'
        layout_data, ocr_result = parse_ai_response(payload)
        assert layout_data["regions"] == []
        assert ocr_result.diplomatic_text == "test"

    def test_text_around_json(self):
        """Prose before and after the JSON object is ignored."""
        payload = 'Here is my analysis:\n{"layout": {"regions": []}, "ocr": {"diplomatic_text": "ok"}}\nHope this helps!'
        _, ocr_result = parse_ai_response(payload)
        assert ocr_result.diplomatic_text == "ok"

    def test_invalid_region_skipped(self):
        """Of two regions only ``r2`` is kept.

        ``r1`` carries a negative bbox coordinate, which is presumably why it
        is rejected -- confirm against the parser's validation rules.
        """
        payload = '{"layout": {"regions": [{"id": "r1", "type": "text_block", "bbox": [-1, 0, 100, 200], "confidence": 0.5}, {"id": "r2", "type": "miniature", "bbox": [10, 20, 100, 200], "confidence": 0.8}]}}'
        layout_data, _ = parse_ai_response(payload)
        kept = layout_data["regions"]
        assert len(kept) == 1
        assert kept[0]["id"] == "r2"

    def test_missing_ocr_returns_default(self):
        """Without an ``ocr`` key the OCR result has empty text and 0.0 confidence."""
        payload = '{"layout": {"regions": []}}'
        _, ocr_result = parse_ai_response(payload)
        assert ocr_result.diplomatic_text == ""
        assert ocr_result.confidence == 0.0

    def test_not_json_raises_parse_error(self):
        """Plain prose without any braces raises ``ParseError``."""
        prose = "This is not JSON at all, no braces anywhere"
        with pytest.raises(ParseError):
            parse_ai_response(prose)

    def test_json_array_raises_parse_error(self):
        """A top-level JSON array raises ``ParseError``."""
        array_text = "[1, 2, 3]"
        with pytest.raises(ParseError):
            parse_ai_response(array_text)

    def test_trailing_comma_tolerance(self):
        """Trailing commas inside nested objects and arrays are tolerated."""
        payload = '{"layout": {"regions": [],}, "ocr": {"diplomatic_text": "tolerant",}}'
        _, ocr_result = parse_ai_response(payload)
        assert ocr_result.diplomatic_text == "tolerant"