File size: 8,187 Bytes
a9630ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9674032
cfb4439
 
 
 
 
f5d7f0e
a9630ec
 
 
 
 
 
f5d7f0e
a9630ec
 
f5d7f0e
 
 
a9630ec
9674032
f5d7f0e
 
9674032
 
cfb4439
 
 
 
 
 
 
 
 
 
f5d7f0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9674032
 
 
 
 
 
 
 
 
 
 
f5d7f0e
9674032
 
 
f5d7f0e
a9630ec
f5d7f0e
a9630ec
f5d7f0e
 
 
 
 
 
a9630ec
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
CorrectionPatch — Immutable correction with dual coordinate spaces.
PatchSet — Deterministic container with greedy overlap resolution.

TERMINOLOGY:
  ORIGINAL_TEXT = user's raw input (immutable)
  CURRENT_TEXT  = pipeline's working copy (mutated by each stage)

COORDINATE OWNERSHIP:
  start_original / end_original → PatchSet overlap resolution + API response
  start_current / end_current   → StageLocker + pipeline internals
"""
import uuid
import logging
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)

PRIORITY = {'autocomplete': 0, 'spelling': 1, 'punctuation': 2, 'grammar': 3}


@dataclass
class CorrectionPatch:
    """
    Immutable correction suggestion with dual coordinate spaces.

    ORIGINAL coords (start_original, end_original):
      → Used by PatchSet.resolve_overlaps() for conflict resolution
      → Exported to frontend via to_dict() as 'start'/'end'
      → NEVER used for StageLocker or pipeline mutation

    CURRENT coords (start_current, end_current):
      → Used by StageLocker.lock() / is_locked()
      → Pipeline-internal range checking
      → NEVER sent to frontend
    """
    stage: str
    start_original: int
    end_original: int
    start_current: int
    end_current: int
    original: str
    replacement: str
    priority: int
    confidence: float = 1.0
    locked: bool = True
    alternatives: list = field(default_factory=list)
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    def to_dict(self) -> dict:
        """
        Serialize for API response.
        Exports ORIGINAL_TEXT coordinates ONLY as 'start' and 'end'.
        CURRENT_TEXT coordinates are pipeline-internal and never exposed.
        """
        return {
            'id': self.id,
            'start': self.start_original,
            'end': self.end_original,
            'original': self.original,
            'correction': self.replacement,
            'type': self.stage,
            'priority': self.priority,
            'confidence': self.confidence,
            'locked': self.locked,
            'alternatives': self.alternatives,
        }


class PatchSet:
    """
    Deterministic overlap resolution using greedy first-fit strategy.

    Resolution order: priority DESC → confidence DESC → start ASC → id ASC
    The id tiebreaker guarantees identical ordering for identical inputs.

    Strategy: Greedy — first non-overlapping patch wins its range.
    One range = one owner. No stacking.
    This is deterministic and sufficient for ≤3 pipeline stages.

    # FUTURE: If pipeline grows beyond 5 stages or requires minimal-loss
    # coverage optimization, consider weighted interval scheduling:
    #   - Model as weighted job scheduling problem
    #   - Use dynamic programming on sorted intervals
    #   - Maximize sum(priority * confidence) of selected non-overlapping patches
    # Not needed now — greedy is correct for the current architecture.
    """

    def __init__(self):
        self.patches: list = []

    def add(self, patch: CorrectionPatch):
        self.patches.append(patch)

    def resolve_overlaps(self) -> list:
        """
        Single owner per range. Deterministic resolution.
        Uses ORIGINAL coordinates for overlap detection.

        Phase 14: Relaxed overlap rules:
        1. Patches with < 50% overlap of the smaller patch coexist freely
        2. Spelling + Punctuation patches from different stages always coexist
           (they're compatible: one fixes the word, the other adds punct)
        3. Same-stage overlaps are always resolved (higher confidence wins)
        4. FIX-36: Grammar + Punctuation — merge trailing punct into grammar
        """
        sorted_patches = sorted(
            self.patches,
            key=lambda p: (-p.priority, -p.confidence, p.start_original, p.id)
        )

        claimed_ranges = []  # list of (start, end, stage, patch_index)
        resolved = []

        # FIX-36: Punctuation chars that can be merged into grammar corrections
        _PUNCT_CHARS = set('.,،؛;:!؟?')

        for patch in sorted_patches:
            has_substantial_overlap = False
            overlapping_resolved_idx = None
            for ci, (cs, ce, claimed_stage, res_idx) in enumerate(claimed_ranges):
                # Check if there's any overlap at all
                if patch.start_original < ce and patch.end_original > cs:
                    # ── Phase 14: Cross-stage compatibility ──
                    # Spelling + Punctuation are COMPATIBLE stages:
                    # spelling fixes the word, punctuation adds marks.
                    # They should never conflict.
                    _compatible_pair = {
                        frozenset({'spelling', 'punctuation'}),
                    }
                    if frozenset({patch.stage, claimed_stage}) in _compatible_pair:
                        continue  # Compatible stages — allow coexistence

                    # ── FIX-36: Grammar + Punctuation merge ──
                    # When punctuation adds a trailing character to a grammar
                    # correction at the same span, merge instead of dropping.
                    if (patch.stage == 'punctuation' and claimed_stage == 'grammar'
                            and patch.start_original == cs and patch.end_original == ce):
                        # Check if punctuation correction = grammar correction + punct char
                        grammar_patch = resolved[res_idx]
                        punc_correction = patch.replacement
                        gram_correction = grammar_patch.replacement
                        if (len(punc_correction) == len(gram_correction) + 1
                                and punc_correction.startswith(gram_correction)
                                and punc_correction[-1] in _PUNCT_CHARS):
                            # Merge: append the trailing punct to grammar correction
                            grammar_patch.replacement = punc_correction
                            logger.info(
                                f"[OVERLAP] Merged punctuation into grammar "
                                f"[{cs}:{ce}]: '{grammar_patch.original}' → "
                                f"'{grammar_patch.replacement}'"
                            )
                            has_substantial_overlap = True  # Don't add separately
                            break

                    # Calculate overlap amount
                    overlap_start = max(patch.start_original, cs)
                    overlap_end = min(patch.end_original, ce)
                    overlap_width = overlap_end - overlap_start
                    # Compare to the smaller patch's width
                    patch_width = max(1, patch.end_original - patch.start_original)
                    claimed_width = max(1, ce - cs)
                    smaller_width = min(patch_width, claimed_width)
                    overlap_ratio = overlap_width / smaller_width
                    if overlap_ratio > 0.5:
                        has_substantial_overlap = True
                        overlapping_resolved_idx = res_idx
                        break

            if not has_substantial_overlap:
                res_idx = len(resolved)
                resolved.append(patch)
                claimed_ranges.append((patch.start_original, patch.end_original, patch.stage, res_idx))
            else:
                # Only log "Dropped" if we didn't merge
                if overlapping_resolved_idx is not None or patch.stage != 'punctuation':
                    logger.info(
                        f"[OVERLAP] Dropped {patch.stage} [{patch.start_original}:{patch.end_original}] "
                        f"'{patch.original}' — conflicts with higher-priority span"
                    )

        dropped = len(self.patches) - len(resolved)
        if dropped > 0:
            logger.info(f"[OVERLAP] Resolved {dropped} overlapping suggestions")

        return resolved

    def to_list(self) -> list:
        """Serialize resolved patches for API response."""
        return [p.to_dict() for p in self.resolve_overlaps()]