youssefreda9 committed on
Commit
6f1ed4e
·
1 Parent(s): 32cefd4

Phase 11: Hierarchical StageLocker — grammar overrides spelling locks

Browse files

- Refactored StageLocker with STAGE_PRIORITY hierarchy:
protection(99) > grammar(3) > spelling(2) > punctuation(1)
- Added is_locked_for() and is_locked_by_for() hierarchy-aware methods
- Grammar stage now uses is_locked_for('grammar') to override spelling locks
- Punctuation stage uses is_locked_by_for('punctuation') to stay blocked
- Deduplicated pipeline_collision.json (PC021-PC050 were identical copies)
- Expanded to 50 unique collision test cases across 7 categories
- Fixed test_collisions.py hardcoded path, runs all 50 samples
- Added run_collision_benchmark.py with structured reporting
- Registered collision as 8th dataset in benchmark_runner.py
- All 13 StageLocker hierarchy unit tests passed

src/app.py CHANGED
@@ -1843,10 +1843,11 @@ def analyze_text():
1843
  logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})}')
1844
  _tel_events.append({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})
1845
  # StageLocker: skip diffs that overlap with locked ranges
1846
- if ctx.stage_locker.is_locked(d['start'], d['end']):
 
1847
  logger.info(
1848
  f"[LOCK] Grammar blocked on [{d['start']}:{d['end']}] "
1849
- f"'{d.get('original','')}' — locked by previous stage"
1850
  )
1851
  logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})}')
1852
  _tel_events.append({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})
@@ -2166,7 +2167,8 @@ def analyze_text():
2166
  for d in diffs:
2167
  # StageLocker: skip diffs that overlap with locked ranges
2168
  # BUT allow pure punctuation insertions near locked words
2169
- lock_info = ctx.stage_locker.is_locked_by(d['start'], d['end'])
 
2170
  if lock_info:
2171
  import re as _re
2172
  orig_alpha = _re.sub(r'[^\u0600-\u06FFa-zA-Z]', '', d.get('original', ''))
 
1843
  logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})}')
1844
  _tel_events.append({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})
1845
  # StageLocker: skip diffs that overlap with locked ranges
1846
+ # Phase 11: Hierarchy-aware — grammar (3) overrides spelling (2)
1847
+ if ctx.stage_locker.is_locked_for(d['start'], d['end'], 'grammar'):
1848
  logger.info(
1849
  f"[LOCK] Grammar blocked on [{d['start']}:{d['end']}] "
1850
+ f"'{d.get('original','')}' — locked by equal/higher priority stage"
1851
  )
1852
  logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})}')
1853
  _tel_events.append({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})
 
2167
  for d in diffs:
2168
  # StageLocker: skip diffs that overlap with locked ranges
2169
  # BUT allow pure punctuation insertions near locked words
2170
+ # Phase 11: Hierarchy-aware — punctuation (1) blocked by spelling (2) and grammar (3)
2171
+ lock_info = ctx.stage_locker.is_locked_by_for(d['start'], d['end'], 'punctuation')
2172
  if lock_info:
2173
  import re as _re
2174
  orig_alpha = _re.sub(r'[^\u0600-\u06FFa-zA-Z]', '', d.get('original', ''))
src/nlp/stage_locker.py CHANGED
@@ -10,8 +10,15 @@ STRICT RULES:
10
 
11
  TERMINOLOGY:
12
  lock(): registers a range in CURRENT_TEXT as owned
13
- is_locked(): checks if a range in CURRENT_TEXT overlaps any owned range
 
14
  update_via_mapper(): shifts all spans forward when CURRENT_TEXT mutates
 
 
 
 
 
 
15
  """
16
  import logging
17
 
@@ -20,6 +27,22 @@ logger = logging.getLogger(__name__)
20
  # Set to True for structured debug logging across all pipeline components
21
  PIPELINE_DEBUG = False
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  class StageLocker:
25
  """Protects corrected ranges in CURRENT_TEXT from being overwritten by later stages."""
@@ -34,7 +57,11 @@ class StageLocker:
34
  logger.debug(f"[StageLocker] LOCK [{start}:{end}] owner={owner}")
35
 
36
  def is_locked(self, start: int, end: int) -> bool:
37
- """Check if [start, end) in CURRENT_TEXT overlaps any locked range."""
 
 
 
 
38
  for ls, le, _ in self.locked_spans:
39
  if start < le and end > ls:
40
  if PIPELINE_DEBUG:
@@ -42,13 +69,65 @@ class StageLocker:
42
  return True
43
  return False
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def is_locked_by(self, start: int, end: int):
46
- """Return (locked_start, locked_end, owner) if locked, else None."""
 
 
 
47
  for ls, le, owner in self.locked_spans:
48
  if start < le and end > ls:
49
  return (ls, le, owner)
50
  return None
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def unlock(self, start: int, end: int) -> None:
53
  """FIX-18: Remove lock for a specific range (used when punctuation cap removes patches)."""
54
  self.locked_spans = [
 
10
 
11
  TERMINOLOGY:
12
  lock(): registers a range in CURRENT_TEXT as owned
13
+ is_locked(): checks if a range in CURRENT_TEXT overlaps any owned range (ABSOLUTE)
14
+ is_locked_for(): checks if a range is locked FOR A SPECIFIC STAGE (HIERARCHICAL)
15
  update_via_mapper(): shifts all spans forward when CURRENT_TEXT mutates
16
+
17
+ HIERARCHY (Phase 11):
18
+ protection (99) ─── Absolute, overrides everything
19
+ grammar (3) ─── May override spelling
20
+ spelling (2) ─── Blocks punctuation, blocked by grammar
21
+ punctuation(1) ─── Blocked by spelling and grammar
22
  """
23
  import logging
24
 
 
27
  # Set to True for structured debug logging across all pipeline components
28
  PIPELINE_DEBUG = False
29
 
30
+ # ═══════════════════════════════════════════════════════════════
31
+ # Phase 11: Hierarchical Priority Map
32
+ # ═══════════════════════════════════════════════════════════════
33
+ # A requesting stage is BLOCKED only by locks from stages with
34
+ # EQUAL or HIGHER priority. Lower-priority locks are overridden.
35
+ #
36
+ # Example: Grammar (3) requesting on a Spelling (2) lock → ALLOWED
37
+ # Example: Punctuation (1) requesting on a Spelling (2) lock → BLOCKED
38
+ # Example: Anything requesting on a Protection (99) lock → BLOCKED
39
+ STAGE_PRIORITY = {
40
+ 'punctuation': 1,
41
+ 'spelling': 2,
42
+ 'grammar': 3,
43
+ 'protection': 99,
44
+ }
45
+
46
 
47
  class StageLocker:
48
  """Protects corrected ranges in CURRENT_TEXT from being overwritten by later stages."""
 
57
  logger.debug(f"[StageLocker] LOCK [{start}:{end}] owner={owner}")
58
 
59
  def is_locked(self, start: int, end: int) -> bool:
60
+ """Check if [start, end) in CURRENT_TEXT overlaps any locked range.
61
+
62
+ ABSOLUTE check — ignores hierarchy. Any lock blocks.
63
+ Kept for backward compatibility and protection-level checks.
64
+ """
65
  for ls, le, _ in self.locked_spans:
66
  if start < le and end > ls:
67
  if PIPELINE_DEBUG:
 
69
  return True
70
  return False
71
 
72
def is_locked_for(self, start: int, end: int, requesting_stage: str) -> bool:
    """Tell whether ``requesting_stage`` is barred from editing [start, end).

    Each locked span that overlaps the half-open range is ranked through
    ``STAGE_PRIORITY`` and compared with the requester's rank:

      * owner rank >= requester rank -> blocked, returns True immediately
      * owner rank <  requester rank -> that lock is overridden and the
        scan continues with the remaining spans

    A stage name missing from ``STAGE_PRIORITY`` ranks 0, for the requester
    and for lock owners alike.

    Args:
        start: Inclusive start offset of the range being requested.
        end: Exclusive end offset of the range being requested.
        requesting_stage: Name of the stage that wants to modify the range.

    Returns:
        True if at least one overlapping lock has equal or higher priority
        than the requester; False if there is no overlap or the requester
        outranks every overlapping lock.
    """
    requester_rank = STAGE_PRIORITY.get(requesting_stage, 0)

    for lock_start, lock_end, owner in self.locked_spans:
        # Half-open interval overlap test; disjoint spans are irrelevant.
        if not (start < lock_end and end > lock_start):
            continue

        # NOTE(review): an owner name absent from STAGE_PRIORITY ranks 0, so
        # every registered stage overrides it. Confirm that all lock()
        # callers use a registered stage name.
        owner_rank = STAGE_PRIORITY.get(owner, 0)

        if owner_rank < requester_rank:
            # Outranked lock: keep scanning, a stronger lock may overlap too.
            if PIPELINE_DEBUG:
                logger.debug(
                    f"[StageLocker] HIERARCHY OVERRIDE [{start}:{end}] "
                    f"requester={requesting_stage}({requester_rank}) "
                    f"overrides owner={owner}({owner_rank})"
                )
            continue

        if PIPELINE_DEBUG:
            logger.debug(
                f"[StageLocker] HIERARCHY BLOCKED [{start}:{end}] "
                f"requester={requesting_stage}({requester_rank}) "
                f"owner={owner}({owner_rank})"
            )
        return True  # Owner has the same or a higher priority.

    return False  # Nothing overlapping was strong enough to block.
105
+
106
  def is_locked_by(self, start: int, end: int):
107
+ """Return (locked_start, locked_end, owner) if locked, else None.
108
+
109
+ ABSOLUTE check — ignores hierarchy.
110
+ """
111
  for ls, le, owner in self.locked_spans:
112
  if start < le and end > ls:
113
  return (ls, le, owner)
114
  return None
115
 
116
def is_locked_by_for(self, start: int, end: int, requesting_stage: str):
    """Return the lock that blocks ``requesting_stage`` on [start, end), if any.

    Hierarchy-aware counterpart of ``is_locked_by``: an overlapping lock
    only counts when its owner's rank in ``STAGE_PRIORITY`` is equal to or
    higher than the requester's. Names missing from ``STAGE_PRIORITY``
    rank 0.

    Args:
        start: Inclusive start offset of the range being requested.
        end: Exclusive end offset of the range being requested.
        requesting_stage: Name of the stage that wants to modify the range.

    Returns:
        ``(locked_start, locked_end, owner)`` of the first blocking lock in
        list order, or ``None`` when the requester outranks every
        overlapping lock (or nothing overlaps).
    """
    requester_rank = STAGE_PRIORITY.get(requesting_stage, 0)

    for span in self.locked_spans:
        span_start, span_end, owner = span

        # Disjoint from the requested range: nothing to decide here.
        if end <= span_start or start >= span_end:
            continue

        if STAGE_PRIORITY.get(owner, 0) >= requester_rank:
            return (span_start, span_end, owner)

    return None
130
+
131
  def unlock(self, start: int, end: int) -> None:
132
  """FIX-18: Remove lock for a specific range (used when punctuation cap removes patches)."""
133
  self.locked_spans = [
tests/phase10/benchmark_runner.py CHANGED
@@ -539,6 +539,83 @@ def run_hallucination_benchmark(api: API, samples: list) -> List[BenchResult]:
539
  results.append(r)
540
  return results
541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  # ═══════════════════════════════════════════════════════════════
543
  # Metrics
544
  # ═══════════════════════════════════════════════════════════════
@@ -618,6 +695,7 @@ def main():
618
  "religious": (GOLD_DIR/"religious.json", run_religious_benchmark),
619
  "structured": (GOLD_DIR/"structured_content.json", run_structured_benchmark),
620
  "hallucination":(GOLD_DIR/"hallucination.json", run_hallucination_benchmark),
 
621
  }
622
 
623
  for name, (path, runner) in DATASETS.items():
 
539
  results.append(r)
540
  return results
541
 
542
def _classify_collision_failure(category: str, stages: list) -> tuple:
    """Map a failed collision sample to ``(component, stage, detail)``.

    ``stages`` is the list of suggestion ``type`` values the pipeline
    returned for the sample.
    """
    if category == 'spelling_blocks_grammar':
        # Only spelling suggestions came back, so the miss is attributed to
        # the lock rather than to the grammar model.
        if 'spelling' in stages and 'grammar' not in stages:
            return ("PIPELINE", "integration", "Spelling lock blocked grammar (StageLocker)")
        return ("MODEL", "grammar", "Grammar model missed correction")
    if category in ('grammar_drops_spelling', 'spelling_grammar_overlap'):
        return ("PIPELINE", "integration", f"{category}: stage interaction failure")
    if category == 'multi_stage_collision':
        if 'grammar' in stages:
            return ("PIPELINE", "integration", "Multi-stage collision failure")
        return ("MODEL", "grammar", "Multi-stage collision failure")
    if category == 'three_stage_collision':
        return ("PIPELINE", "integration", "Three-stage collision failure")
    if category == 'punctuation_near_spelling':
        # This category is present in the gold dataset and must not fall
        # through to the "Unclassified" bucket.
        return ("MODEL", "spelling", "Spelling correction near punctuation boundary")
    if category == 'adjacent_corrections':
        return ("PIPELINE", "integration", "Adjacent corrections interfered")
    return ("UNKNOWN", "unknown", f"Unclassified: {category}")


def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]:
    """Phase 11: Pipeline collision benchmark (spelling↔grammar↔punctuation interactions).

    Sends every sample's ``input`` through ``api.analyze`` and compares the
    returned ``corrected`` text with the sample's ``expected`` text after
    stripping diacritics and collapsing whitespace.

    Verdicts:
        * ``TP``    - normalized output equals normalized expectation
        * ``FN``    - any mismatch; a root cause is attached per category
        * ``ERROR`` - the response carries an ``error`` key

    Args:
        api: Client whose ``analyze(text)`` returns the response dict.
        samples: Dicts with ``id`` and ``input`` (required) and optional
            ``category``, ``expected`` and ``severity``.

    Returns:
        One ``BenchResult`` per sample, in input order.
    """
    results = []
    icons = {"TP":"✅","TN":"✅","FP":"❌","FN":"⚠️","ERROR":"💥"}
    total = len(samples)

    for i, s in enumerate(samples):
        category = s.get('category', '')
        expected = s.get('expected', '')
        print(f" [{i+1}/{total}] {s['id']} ({category})... ", end="", flush=True)
        r = BenchResult(
            s['id'], 'collision', category, s['input'],
            expected=expected, severity=s.get('severity', '')
        )

        resp = api.analyze(s['input'])
        r.pipeline_ms = resp.get('_ms', 0)
        r.pipeline_timing = resp.get('timing_ms', {})

        if 'error' in resp:
            r.pipeline_verdict = "ERROR"
            r.pipeline_detail = resp.get('error', '')
            print("💥 ERROR")
            results.append(r)
            continue

        r.pipeline_output = resp.get('corrected', '')
        r.pipeline_suggestions = resp.get('suggestions', [])

        # Normalize for comparison (strip diacritics + collapse whitespace)
        norm_output = re.sub(r'\s+', ' ', _strip_diacritics(r.pipeline_output)).strip()
        norm_expected = re.sub(r'\s+', ' ', _strip_diacritics(expected)).strip()

        if norm_output == norm_expected:
            r.pipeline_verdict = "TP"
            r.pipeline_detail = "All corrections applied correctly"
        else:
            r.pipeline_verdict = "FN"
            stages = [sg.get('type', '') for sg in r.pipeline_suggestions]
            (
                r.root_cause_component,
                r.root_cause_stage,
                r.root_cause_detail,
            ) = _classify_collision_failure(category, stages)

            # Sorted so the reported words are stable from run to run
            # (iteration order of a set of strings is not).
            missing = sorted(set(norm_expected.split()) - set(norm_output.split()))
            r.pipeline_detail = f"Missing: {missing[:5]}" if missing else "Output mismatch"

        icon = icons.get(r.pipeline_verdict, "?")
        print(f"{icon} {r.pipeline_verdict} ({r.pipeline_ms}ms)")
        results.append(r)
    return results
618
+
619
  # ═══════════════════════════════════════════════════════════════
620
  # Metrics
621
  # ═══════════════════════════════════════════════════════════════
 
695
  "religious": (GOLD_DIR/"religious.json", run_religious_benchmark),
696
  "structured": (GOLD_DIR/"structured_content.json", run_structured_benchmark),
697
  "hallucination":(GOLD_DIR/"hallucination.json", run_hallucination_benchmark),
698
+ "collision": (GOLD_DIR/"pipeline_collision.json", run_collision_benchmark),
699
  }
700
 
701
  for name, (path, runner) in DATASETS.items():
tests/phase10/gold_datasets/pipeline_collision.json CHANGED
@@ -141,212 +141,212 @@
141
  },
142
  {
143
  "id": "PC021",
144
- "category": "multi_stage_collision",
145
- "input": "السياره سريع والرجال يعمل في المصنع",
146
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
147
  "severity": "critical"
148
  },
149
  {
150
  "id": "PC022",
151
- "category": "multi_stage_collision",
152
- "input": "السياره سريع والرجال يعمل في المصنع",
153
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
154
  "severity": "critical"
155
  },
156
  {
157
  "id": "PC023",
158
- "category": "multi_stage_collision",
159
- "input": "السياره سريع والرجال يعمل في المصنع",
160
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
161
  "severity": "critical"
162
  },
163
  {
164
  "id": "PC024",
165
- "category": "multi_stage_collision",
166
- "input": "السياره سريع والرجال يعمل في المصنع",
167
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
168
  "severity": "critical"
169
  },
170
  {
171
  "id": "PC025",
172
- "category": "multi_stage_collision",
173
- "input": "السياره سريع والرجال يعمل في المصنع",
174
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
175
  "severity": "critical"
176
  },
177
  {
178
  "id": "PC026",
179
- "category": "multi_stage_collision",
180
- "input": "السياره سريع والرجال يعمل في المصنع",
181
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
182
- "severity": "critical"
183
  },
184
  {
185
  "id": "PC027",
186
- "category": "multi_stage_collision",
187
- "input": "السياره سريع والرجال يعمل في المصنع",
188
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
189
- "severity": "critical"
190
  },
191
  {
192
  "id": "PC028",
193
- "category": "multi_stage_collision",
194
- "input": "السياره سريع والرجال يعمل في المصنع",
195
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
196
- "severity": "critical"
197
  },
198
  {
199
  "id": "PC029",
200
- "category": "multi_stage_collision",
201
- "input": "السياره سريع والرجال يعمل في المصنع",
202
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
203
- "severity": "critical"
204
  },
205
  {
206
  "id": "PC030",
207
- "category": "multi_stage_collision",
208
- "input": "السياره سريع والرجال يعمل في المصنع",
209
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
210
- "severity": "critical"
211
  },
212
  {
213
  "id": "PC031",
214
- "category": "multi_stage_collision",
215
  "input": "السياره سريع والرجال يعمل في المصنع",
216
  "expected": "السيارة سريعة والرجال يعملون في المصنع",
217
  "severity": "critical"
218
  },
219
  {
220
  "id": "PC032",
221
- "category": "multi_stage_collision",
222
- "input": "السياره سريع والرجال يعمل في المصنع",
223
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
224
  "severity": "critical"
225
  },
226
  {
227
  "id": "PC033",
228
- "category": "multi_stage_collision",
229
- "input": "السياره سريع والرجال يعمل في المصنع",
230
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
231
  "severity": "critical"
232
  },
233
  {
234
  "id": "PC034",
235
- "category": "multi_stage_collision",
236
- "input": "السياره سريع والرجال يعمل في المصنع",
237
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
238
  "severity": "critical"
239
  },
240
  {
241
  "id": "PC035",
242
- "category": "multi_stage_collision",
243
- "input": "السياره سريع والرجال يعمل في المصنع",
244
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
245
  "severity": "critical"
246
  },
247
  {
248
  "id": "PC036",
249
- "category": "multi_stage_collision",
250
- "input": "السياره سريع والرجال يعمل في المصنع",
251
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
252
  "severity": "critical"
253
  },
254
  {
255
  "id": "PC037",
256
- "category": "multi_stage_collision",
257
- "input": "السياره سريع والرجال يعمل في المصنع",
258
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
259
- "severity": "critical"
260
  },
261
  {
262
  "id": "PC038",
263
- "category": "multi_stage_collision",
264
- "input": "السياره سريع والرجال يعمل في المصنع",
265
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
266
- "severity": "critical"
267
  },
268
  {
269
  "id": "PC039",
270
- "category": "multi_stage_collision",
271
- "input": "السياره سريع والرجال يعمل في المصنع",
272
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
273
- "severity": "critical"
274
  },
275
  {
276
  "id": "PC040",
277
- "category": "multi_stage_collision",
278
- "input": "السياره سريع والرجال يعمل في المصنع",
279
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
280
- "severity": "critical"
281
  },
282
  {
283
  "id": "PC041",
284
  "category": "multi_stage_collision",
285
- "input": "السياره سريع والرجال يعمل في المصنع",
286
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
287
  "severity": "critical"
288
  },
289
  {
290
  "id": "PC042",
291
  "category": "multi_stage_collision",
292
- "input": "السياره سريع والرجال يعمل في المصنع",
293
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
294
  "severity": "critical"
295
  },
296
  {
297
  "id": "PC043",
298
  "category": "multi_stage_collision",
299
- "input": "السياره سريع والرجال يعمل في المصنع",
300
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
301
  "severity": "critical"
302
  },
303
  {
304
  "id": "PC044",
305
- "category": "multi_stage_collision",
306
- "input": "السياره سريع والرجال يعمل في المصنع",
307
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
308
  "severity": "critical"
309
  },
310
  {
311
  "id": "PC045",
312
- "category": "multi_stage_collision",
313
- "input": "السياره سريع والرجال يعمل في المصنع",
314
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
315
  "severity": "critical"
316
  },
317
  {
318
  "id": "PC046",
319
- "category": "multi_stage_collision",
320
- "input": "السياره سريع والرجال يعمل في المصنع",
321
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
322
  "severity": "critical"
323
  },
324
  {
325
  "id": "PC047",
326
- "category": "multi_stage_collision",
327
- "input": "السياره سريع والرجال يعمل في المصنع",
328
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
329
  "severity": "critical"
330
  },
331
  {
332
  "id": "PC048",
333
- "category": "multi_stage_collision",
334
- "input": "السياره سريع والرجال يعمل في المصنع",
335
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
336
  "severity": "critical"
337
  },
338
  {
339
  "id": "PC049",
340
- "category": "multi_stage_collision",
341
- "input": "السياره سريع والرجال يعمل في المصنع",
342
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
343
  "severity": "critical"
344
  },
345
  {
346
  "id": "PC050",
347
  "category": "multi_stage_collision",
348
- "input": "السياره سريع والرجال يعمل في المصنع",
349
- "expected": "السيارة سريعة والرجال يعملون في المصنع",
350
  "severity": "critical"
351
  }
352
  ]
 
141
  },
142
  {
143
  "id": "PC021",
144
+ "category": "spelling_blocks_grammar",
145
+ "input": "الجامعه قريب من البيت",
146
+ "expected": "الجامعة قريبة من البيت",
147
  "severity": "critical"
148
  },
149
  {
150
  "id": "PC022",
151
+ "category": "spelling_blocks_grammar",
152
+ "input": "الطائره سريع جدا",
153
+ "expected": "الطائرة سريعة جداً",
154
  "severity": "critical"
155
  },
156
  {
157
  "id": "PC023",
158
+ "category": "spelling_blocks_grammar",
159
+ "input": "القصه طويل ومملل",
160
+ "expected": "القصة طويلة ومملة",
161
  "severity": "critical"
162
  },
163
  {
164
  "id": "PC024",
165
+ "category": "spelling_blocks_grammar",
166
+ "input": "الحديقه واسع وجميل",
167
+ "expected": "الحديقة واسعة وجميلة",
168
  "severity": "critical"
169
  },
170
  {
171
  "id": "PC025",
172
+ "category": "spelling_blocks_grammar",
173
+ "input": "المكتبه كبير ومنظم",
174
+ "expected": "المكتبة كبيرة ومنظمة",
175
  "severity": "critical"
176
  },
177
  {
178
  "id": "PC026",
179
+ "category": "punctuation_near_spelling",
180
+ "input": "ذهبت الي المدرسه وقابلت صديقتي",
181
+ "expected": "ذهبت إلى المدرسة وقابلت صديقتي",
182
+ "severity": "major"
183
  },
184
  {
185
  "id": "PC027",
186
+ "category": "punctuation_near_spelling",
187
+ "input": "الكتاب مفيد جدا وانا احبه",
188
+ "expected": "الكتاب مفيد جداً وأنا أحبه",
189
+ "severity": "major"
190
  },
191
  {
192
  "id": "PC028",
193
+ "category": "punctuation_near_spelling",
194
+ "input": "المعلمه شرحت الدرس واعطت واجب",
195
+ "expected": "المعلمة شرحت الدرس وأعطت واجب",
196
+ "severity": "major"
197
  },
198
  {
199
  "id": "PC029",
200
+ "category": "punctuation_near_spelling",
201
+ "input": "قرأت الكتاب كله ولكن لم افهمه",
202
+ "expected": "قرأت الكتاب كله ولكن لم أفهمه",
203
+ "severity": "major"
204
  },
205
  {
206
  "id": "PC030",
207
+ "category": "punctuation_near_spelling",
208
+ "input": "الطقس بارد جدا واحتاج معطف",
209
+ "expected": "الطقس بارد جداً وأحتاج معطف",
210
+ "severity": "major"
211
  },
212
  {
213
  "id": "PC031",
214
+ "category": "three_stage_collision",
215
  "input": "السياره سريع والرجال يعمل في المصنع",
216
  "expected": "السيارة سريعة والرجال يعملون في المصنع",
217
  "severity": "critical"
218
  },
219
  {
220
  "id": "PC032",
221
+ "category": "three_stage_collision",
222
+ "input": "الطالبه ذكي وذهبت الي المدرسه",
223
+ "expected": "الطالبة ذكية وذهبت إلى المدرسة",
224
  "severity": "critical"
225
  },
226
  {
227
  "id": "PC033",
228
+ "category": "three_stage_collision",
229
+ "input": "المدرسه كبير والطلاب يدرس بجد",
230
+ "expected": "المدرسة كبيرة والطلاب يدرسون بجد",
231
  "severity": "critical"
232
  },
233
  {
234
  "id": "PC034",
235
+ "category": "three_stage_collision",
236
+ "input": "الحكومه اعلن قرار جديد والمواطنون يتابع",
237
+ "expected": "الحكومة أعلنت قراراً جديداً والمواطنون يتابعون",
238
  "severity": "critical"
239
  },
240
  {
241
  "id": "PC035",
242
+ "category": "three_stage_collision",
243
+ "input": "الشركه نجح في المشرووع والموظفون يحتفل",
244
+ "expected": "الشركة نجحت في المشروع والموظفون يحتفلون",
245
  "severity": "critical"
246
  },
247
  {
248
  "id": "PC036",
249
+ "category": "adjacent_corrections",
250
+ "input": "الولد الصغيره ذهب الي المدرسه",
251
+ "expected": "الولد الصغير ذهب إلى المدرسة",
252
  "severity": "critical"
253
  },
254
  {
255
  "id": "PC037",
256
+ "category": "adjacent_corrections",
257
+ "input": "قال المعلمون للطالبه ادرسي بجد",
258
+ "expected": "قال المعلمون للطالبة ادرسي بجد",
259
+ "severity": "major"
260
  },
261
  {
262
  "id": "PC038",
263
+ "category": "adjacent_corrections",
264
+ "input": "الكتب القديمه في المكتبه الكبيره",
265
+ "expected": "الكتب القديمة في المكتبة الكبيرة",
266
+ "severity": "major"
267
  },
268
  {
269
  "id": "PC039",
270
+ "category": "adjacent_corrections",
271
+ "input": "رأيت البنت الجميله في الحديقه",
272
+ "expected": "رأيت البنت الجميلة في الحديقة",
273
+ "severity": "major"
274
  },
275
  {
276
  "id": "PC040",
277
+ "category": "adjacent_corrections",
278
+ "input": "المعلمه الجديده شرحت الدرس",
279
+ "expected": "المعلمة الجديدة شرحت الدرس",
280
+ "severity": "major"
281
  },
282
  {
283
  "id": "PC041",
284
  "category": "multi_stage_collision",
285
+ "input": "الدكتور كتب التقريير والممرضات ساعد المرضى",
286
+ "expected": "الدكتور كتب التقرير والممرضات ساعدن المرضى",
287
  "severity": "critical"
288
  },
289
  {
290
  "id": "PC042",
291
  "category": "multi_stage_collision",
292
+ "input": "الطلاب حضر المحاضره والاستاذ شرح الدرس",
293
+ "expected": "الطلاب حضروا المحاضرة والأستاذ شرح الدرس",
294
  "severity": "critical"
295
  },
296
  {
297
  "id": "PC043",
298
  "category": "multi_stage_collision",
299
+ "input": "الامهات طبخ الطعام والاطفال لعب في الحديقه",
300
+ "expected": "الأمهات طبخن الطعام والأطفال لعبوا في الحديقة",
301
  "severity": "critical"
302
  },
303
  {
304
  "id": "PC044",
305
+ "category": "spelling_grammar_overlap",
306
+ "input": "المهندسات صممو المشرووع الكبير",
307
+ "expected": "المهندسات صممن المشروع الكبير",
308
  "severity": "critical"
309
  },
310
  {
311
  "id": "PC045",
312
+ "category": "spelling_grammar_overlap",
313
+ "input": "الطبيبات عالج المرضي في المستشفي",
314
+ "expected": "الطبيبات عالجن المرضى في المستشفى",
315
  "severity": "critical"
316
  },
317
  {
318
  "id": "PC046",
319
+ "category": "three_stage_collision",
320
+ "input": "المديره وافق علي المشرووع والموظفين يعمل بجد",
321
+ "expected": "المديرة وافقت على المشروع والموظفون يعملون بجد",
322
  "severity": "critical"
323
  },
324
  {
325
  "id": "PC047",
326
+ "category": "three_stage_collision",
327
+ "input": "الوزاره اصدر قرار والمعلمين ينفذ التعليمات",
328
+ "expected": "الوزارة أصدرت قراراً والمعلمون ينفذون التعليمات",
329
  "severity": "critical"
330
  },
331
  {
332
  "id": "PC048",
333
+ "category": "adjacent_corrections",
334
+ "input": "المنزل القديمه في الشارع الضيقه",
335
+ "expected": "المنزل القديم في الشارع الضيق",
336
  "severity": "critical"
337
  },
338
  {
339
  "id": "PC049",
340
+ "category": "adjacent_corrections",
341
+ "input": "الرجل الطويله وقف بجانب البنايه",
342
+ "expected": "الرجل الطويل وقف بجانب البناية",
343
  "severity": "critical"
344
  },
345
  {
346
  "id": "PC050",
347
  "category": "multi_stage_collision",
348
+ "input": "اللاعبون تدرب في الملعب والمدربه شجعتهم",
349
+ "expected": "اللاعبون تدربوا في الملعب والمدربة شجعتهم",
350
  "severity": "critical"
351
  }
352
  ]
tests/phase10/run_collision_benchmark.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phase 11 — Collision Benchmark Runner
3
+ ======================================
4
+ Integrates with the benchmark_runner.py API client pattern.
5
+ Produces a Markdown/CLI table report with pass/fail rates and root causes.
6
+
7
+ Usage:
8
+ python tests/phase10/run_collision_benchmark.py [--url URL]
9
+ """
10
+ import argparse
11
+ import json
12
+ import re
13
+ import sys
14
+ import time
15
+ from pathlib import Path
16
+ from dataclasses import dataclass, field, asdict
17
+ from typing import List
18
+
19
+ # Reuse API client from benchmark_runner
20
+ sys.path.insert(0, str(Path(__file__).parent))
21
+ from benchmark_runner import API, BenchResult, calc_metrics, strip_punct_only
22
+
23
+ GOLD_DIR = Path(__file__).parent / "gold_datasets"
24
+ REPORT_DIR = Path(__file__).parent / "reports"
25
+ DEFAULT_URL = "https://bayan10-bayan-api.hf.space"
26
+
27
+
28
+ def _strip_diacritics(text):
29
+ return re.sub(r'[\u064B-\u065F\u0670]', '', text)
30
+
31
+
32
def _normalize(text):
    """Canonicalize *text* for comparison.

    Diacritics are removed, every run of whitespace becomes a single space,
    and leading/trailing whitespace is trimmed.
    """
    without_marks = _strip_diacritics(text)
    single_spaced = re.sub(r'\s+', ' ', without_marks)
    return single_spaced.strip()
35
+
36
+
37
def _classify_collision_failure(category: str, stages: list) -> tuple:
    """Map a failed collision sample to ``(component, stage, detail)``.

    ``stages`` is the list of suggestion ``type`` values the pipeline
    returned for the sample.
    """
    if category == 'spelling_blocks_grammar':
        # Only spelling suggestions came back, so the miss is attributed to
        # the lock rather than to the grammar model.
        if 'spelling' in stages and 'grammar' not in stages:
            return ("PIPELINE", "integration",
                    "Spelling lock blocked grammar correction (StageLocker)")
        return ("MODEL", "grammar", "Grammar model missed gender agreement correction")
    if category == 'multi_stage_collision':
        if 'grammar' not in stages:
            return ("MODEL", "grammar", "Grammar model missed SV/gender agreement")
        return ("PIPELINE", "integration", "Multi-stage interaction failure")

    fixed_causes = {
        'grammar_drops_spelling':
            ("PIPELINE", "integration", "Grammar stage dropped spelling fix"),
        'spelling_grammar_overlap':
            ("PIPELINE", "integration", "Spelling and grammar corrections overlapped"),
        'three_stage_collision':
            ("PIPELINE", "integration", "Three-stage collision: spelling+grammar+punctuation"),
        'punctuation_near_spelling':
            ("MODEL", "spelling", "Spelling correction near punctuation boundary"),
        'adjacent_corrections':
            ("PIPELINE", "integration", "Adjacent word corrections interfered"),
    }
    return fixed_causes.get(
        category, ("UNKNOWN", "unknown", f"Unclassified failure in {category}")
    )


def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]:
    """Run the pipeline-collision samples through the API and grade them.

    Each sample's ``input`` is sent to ``api.analyze``. The returned
    ``corrected`` text is compared with the sample's ``expected`` text after
    ``_normalize`` (diacritics stripped, whitespace collapsed).

    Verdicts:
        * ``TP``    - normalized output equals normalized expectation
        * ``FN``    - any mismatch; a root cause is attached per category
        * ``ERROR`` - the response carries an ``error`` key

    For every non-error sample the suggestion spans are also checked
    against the original text; the first span whose slice differs from the
    suggestion's ``original`` marks the result as span-invalid.

    Args:
        api: Client whose ``analyze(text)`` returns the response dict.
        samples: Dicts with ``id`` and ``input`` (required) and optional
            ``category``, ``expected`` and ``severity``.

    Returns:
        One ``BenchResult`` per sample, in input order.
    """
    results = []
    icons = {"TP": "✅", "TN": "✅", "FP": "❌", "FN": "⚠️", "ERROR": "💥"}
    total = len(samples)

    for i, s in enumerate(samples):
        category = s.get('category', '')
        # Read once with a default so a sample without 'expected' cannot
        # raise KeyError later in the mismatch message.
        expected = s.get('expected', '')
        print(f" [{i+1}/{total}] {s['id']} ({category})... ", end="", flush=True)
        r = BenchResult(
            s['id'], 'collision', category, s['input'],
            expected=expected, severity=s.get('severity', '')
        )

        resp = api.analyze(s['input'])
        r.pipeline_ms = resp.get('_ms', 0)
        r.pipeline_timing = resp.get('timing_ms', {})

        if 'error' in resp:
            r.pipeline_verdict = "ERROR"
            r.pipeline_detail = resp.get('error', '')
            print("💥 ERROR")
            results.append(r)
            continue

        r.pipeline_output = resp.get('corrected', '')
        r.pipeline_suggestions = resp.get('suggestions', [])
        original = resp.get('original', s['input'])

        # Normalize for comparison
        norm_output = _normalize(r.pipeline_output)
        norm_expected = _normalize(expected)

        if norm_output == norm_expected:
            r.pipeline_verdict = "TP"
            r.pipeline_detail = "All corrections applied correctly"
        else:
            # Classify the failure
            stages = [sg.get('type', '') for sg in r.pipeline_suggestions]
            (
                r.root_cause_component,
                r.root_cause_stage,
                r.root_cause_detail,
            ) = _classify_collision_failure(category, stages)

            # Word-level difference, sorted so the report is stable from
            # run to run (iteration order of a set of strings is not).
            exp_words = set(norm_expected.split())
            act_words = set(norm_output.split())
            missing = sorted(exp_words - act_words)
            extra = sorted(act_words - exp_words)

            r.pipeline_verdict = "FN"
            if missing or extra:
                r.pipeline_detail = f"Missing: {missing[:5]}, Extra: {extra[:5]}"
            else:
                # Same word sets but different text (order or repetition).
                r.pipeline_detail = (
                    f"Output mismatch: '{r.pipeline_output[:60]}' vs '{expected[:60]}'"
                )

        # Span check: each suggestion's offsets must slice its own
        # 'original' text out of the analysed input.
        for sg in r.pipeline_suggestions:
            actual_slice = original[sg['start']:sg['end']]
            if actual_slice != sg.get('original', ''):
                r.span_valid = False
                r.span_detail = f"SPAN[{sg['start']}:{sg['end']}] exp='{sg.get('original','')}' got='{actual_slice}'"
                break

        icon = icons.get(r.pipeline_verdict, "?")
        print(f"{icon} {r.pipeline_verdict} ({r.pipeline_ms}ms)")
        results.append(r)

    return results
142
+
143
+
144
def main():
    """Run the Phase 11 collision benchmark from the command line.

    Loads ``pipeline_collision.json`` from ``GOLD_DIR``, passes the samples
    to ``run_collision_benchmark`` against the API at ``--url``, prints a
    Markdown-style report (summary, per-category table, failure details) and
    writes the full results to
    ``REPORT_DIR / "collision_benchmark_results.json"``.

    Exits with status 1 when the dataset file does not exist.
    """
    parser = argparse.ArgumentParser(description="Phase 11 Collision Benchmark")
    parser.add_argument("--url", default=DEFAULT_URL)
    args = parser.parse_args()

    api = API(args.url)
    dataset_path = GOLD_DIR / "pipeline_collision.json"

    if not dataset_path.exists():
        print(f"❌ Dataset not found: {dataset_path}")
        sys.exit(1)

    with open(dataset_path, 'r', encoding='utf-8') as f:
        samples = json.load(f)

    print(f"\n{'='*60}")
    print(f"COLLISION BENCHMARK ({len(samples)} samples)")
    print(f"Target: {args.url}")
    print(f"{'='*60}")

    results = run_collision_benchmark(api, samples)
    m = calc_metrics(results)

    # ── Per-category breakdown ──
    # Anything that is not TP/TN (including ERROR) counts as a failure here.
    categories = {}
    for r in results:
        data = categories.setdefault(r.category, {"total": 0, "pass": 0, "fail": 0})
        data["total"] += 1
        if r.pipeline_verdict in ("TP", "TN"):
            data["pass"] += 1
        else:
            data["fail"] += 1

    # ── Print report ──
    print(f"\n{'='*60}")
    print("COLLISION BENCHMARK RESULTS")
    print(f"{'='*60}")
    print(f"\n## Summary")
    print(f"| Metric | Value |")
    print(f"|---------------|-------|")
    print(f"| Total | {m['total']} |")
    print(f"| Passed (TP) | {m['TP']} |")
    print(f"| Failed (FN) | {m['FN']} |")
    print(f"| FP | {m['FP']} |")
    print(f"| Errors | {m['ERROR']} |")
    print(f"| Pass Rate | {m['pass_rate']:.1%} |")

    print(f"\n## By Category")
    print(f"| Category | Total | Pass | Fail | Rate |")
    print(f"|----------|-------|------|------|------|")
    for cat, data in sorted(categories.items()):
        rate = data['pass'] / data['total'] * 100 if data['total'] > 0 else 0
        print(f"| {cat} | {data['total']} | {data['pass']} | {data['fail']} | {rate:.0f}% |")

    # ── Root cause for failures ──
    # Only FN/FP rows are listed; ERROR rows carry no root-cause fields.
    failures = [r for r in results if r.pipeline_verdict in ("FN", "FP")]
    if failures:
        print(f"\n## Failure Details")
        print(f"| ID | Category | Input | Expected | Actual | Root Cause |")
        print(f"|----|----------|-------|----------|--------|------------|")
        for r in failures:
            print(
                f"| {r.id} | {r.category} | "
                f"{r.input[:30]}... | {r.expected[:30]}... | "
                f"{r.pipeline_output[:30]}... | {r.root_cause_detail[:40]} |"
            )

    # ── Save JSON report ──
    REPORT_DIR.mkdir(parents=True, exist_ok=True)
    report = {
        # The trailing "Z" denotes UTC, so format gmtime(); strftime() without
        # a time tuple would silently use the machine's local time instead.
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "target": args.url,
        "metrics": m,
        "by_category": categories,
        "results": [asdict(r) for r in results],
    }
    out_path = REPORT_DIR / "collision_benchmark_results.json"
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n[P11] Report → {out_path}")
226
+
227
+
228
+ if __name__ == "__main__":
229
+ main()
tests/phase10/test_collisions.py CHANGED
@@ -1,27 +1,195 @@
 
 
 
 
 
 
1
  import json
 
 
 
2
  import requests
 
3
 
4
- url = "https://bayan10-bayan-api.hf.space/api/analyze"
5
- with open("d:/BAYAN2/tests/phase10/gold_datasets/pipeline_collision.json", "r", encoding="utf-8") as f:
6
- samples = json.load(f)
7
-
8
- failures = []
9
- passed = 0
10
-
11
- for i, s in enumerate(samples[:10]): # Test first 10 for analysis
12
- try:
13
- r = requests.post(url, json={"text": s["input"]}).json()
14
- out = r.get("corrected", "")
15
- if out == s["expected"]:
16
- passed += 1
17
- print(f"[{s['id']}] PASS")
18
- else:
19
- failures.append((s, out))
20
- print(f"[{s['id']}] FAIL")
21
- print(f" Input: {s['input']}")
22
- print(f" Expected: {s['expected']}")
23
- print(f" Actual: {out}")
24
- except Exception as e:
25
- print(f"[{s['id']}] ERROR: {e}")
26
-
27
- print(f"\nResults: {passed} PASS, {len(failures)} FAIL")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Phase 11 — Pipeline Collision Test Runner
3
+ ==========================================
4
+ Runs ALL collision test cases against the live API and produces
5
+ a structured JSON report with per-failure classification.
6
+ """
7
  import json
8
+ import sys
9
+ import time
10
+ import re
11
  import requests
12
+ from pathlib import Path
13
 
14
# ── Configuration ──
# Live endpoint every collision sample is POSTed to.
API_URL = "https://bayan10-bayan-api.hf.space/api/analyze"

# Dataset and report locations are anchored to this file's directory.
_THIS_DIR = Path(__file__).parent
DATASET_PATH = _THIS_DIR.joinpath("gold_datasets", "pipeline_collision.json")
REPORT_PATH = _THIS_DIR.joinpath("reports", "collision_report.json")
18
+
19
def strip_diacritics(text):
    """Return *text* with Arabic diacritical marks removed.

    Every code point in U+064B–U+065F, plus U+0670, is dropped so that
    strings differing only in those marks compare equal.
    """
    marks = re.compile(r'[\u064B-\u065F\u0670]')
    return marks.sub('', text)
22
+
23
def normalize_for_compare(text):
    """Canonicalise *text* for a lenient equality check.

    Diacritics are removed via ``strip_diacritics``, each run of whitespace
    is collapsed to a single space, and leading/trailing whitespace is
    trimmed.
    """
    bare = strip_diacritics(text)
    return re.sub(r'\s+', ' ', bare).strip()
28
+
29
def classify_failure(sample, actual, suggestions):
    """Classify the probable root cause of a failed collision sample.

    Args:
        sample: Dataset entry. ``"expected"`` and ``"input"`` are required;
            ``"category"`` is optional and selects the classification rule.
        actual: Corrected text returned by the API.
        suggestions: Suggestion dicts from the API response; only their
            ``"type"`` key is inspected.

    Returns:
        A ``(component, detail, words)`` tuple. *component* is one of
        ``"STAGELOCKER"``, ``"MODEL"``, ``"PIPELINE"`` or ``"UNKNOWN"``,
        *detail* is a human-readable explanation, and *words* is a sorted
        list of expected words that are absent from *actual*.

    Raises:
        KeyError: If *sample* lacks ``"expected"`` or ``"input"``.
    """
    expected = sample["expected"]
    inp_words = set(sample["input"].split())
    category = sample.get("category", "")

    # Expected words that never made it into the output. Sorted so reports
    # are reproducible: iteration order of a set of strings varies between
    # interpreter runs (hash randomisation).
    missing_fixes = sorted(set(expected.split()) - set(actual.split()))

    # Which pipeline stages produced at least one suggestion.
    stages = [sg.get('type', '') for sg in suggestions]
    has_spelling = 'spelling' in stages
    has_grammar = 'grammar' in stages

    if category == "spelling_blocks_grammar":
        # Spelling fixed ه→ة but locked the range, grammar couldn't fix gender
        if has_spelling:
            # A missing word that was not in the input either is a correction
            # that should have been introduced but was not.
            grammar_words_missed = [w for w in missing_fixes if w not in inp_words]
            if grammar_words_missed:
                return "STAGELOCKER", "spelling→grammar lock collision", grammar_words_missed
        return "MODEL", "Grammar model missed correction", missing_fixes

    if category == "grammar_drops_spelling":
        return "PIPELINE", "Grammar stage dropped spelling fix", missing_fixes

    if category == "spelling_grammar_overlap":
        return "PIPELINE", "Spelling and grammar overlap conflict", missing_fixes

    if category == "multi_stage_collision":
        # Test the lock scenario first. It is the more specific condition;
        # if the generic "no grammar + missing words" test ran first, this
        # branch could only fire when nothing was missing at all.
        if has_spelling and not has_grammar:
            return "STAGELOCKER", "Spelling lock blocked grammar", missing_fixes
        if not has_grammar and missing_fixes:
            return "MODEL", "Grammar model missed correction", missing_fixes
        return "PIPELINE", "Multi-stage interaction failure", missing_fixes

    return "UNKNOWN", f"Unclassified failure in category '{category}'", missing_fixes
72
+
73
+
74
def main():
    """Run every collision sample against the live API and write a report.

    Loads the samples from ``DATASET_PATH``, POSTs each input to ``API_URL``,
    compares the normalized ``"corrected"`` text with the normalized expected
    text, classifies each mismatch with ``classify_failure``, prints a
    summary and saves the full results as JSON to ``REPORT_PATH``.

    Exits with status 1 when the dataset file does not exist. Any exception
    raised while processing a sample (network error, non-2xx status, invalid
    JSON, ...) is recorded as an ``ERROR`` result and the run continues.
    """
    # Load dataset
    if not DATASET_PATH.exists():
        print(f"❌ Dataset not found: {DATASET_PATH}")
        sys.exit(1)

    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        samples = json.load(f)

    print(f"[COLLISION] Running {len(samples)} test cases against {API_URL}")
    print(f"{'='*70}")

    results = []
    passed = 0
    failed = 0
    errors = 0

    for i, s in enumerate(samples):
        sid = s["id"]
        print(f" [{i+1}/{len(samples)}] {sid} ({s.get('category','')})... ", end="", flush=True)

        try:
            # perf_counter() is monotonic, so the measured latency cannot be
            # skewed by wall-clock adjustments during the request.
            t0 = time.perf_counter()
            r = requests.post(API_URL, json={"text": s["input"]}, timeout=120)
            elapsed_ms = int((time.perf_counter() - t0) * 1000)
            # A 4xx/5xx reply is an infrastructure error, not a pipeline
            # failure: surface it as ERROR instead of comparing its body.
            r.raise_for_status()
            resp = r.json()
            actual = resp.get("corrected", "")
            suggestions = resp.get("suggestions", [])

            # Normalize for comparison (strip diacritics, collapse spaces)
            norm_actual = normalize_for_compare(actual)
            norm_expected = normalize_for_compare(s["expected"])

            result = {
                "id": sid,
                "category": s.get("category", ""),
                "input": s["input"],
                "expected": s["expected"],
                "actual": actual,
                "suggestions": suggestions,
                "elapsed_ms": elapsed_ms,
            }

            if norm_actual == norm_expected:
                result["verdict"] = "PASS"
                passed += 1
                print(f"✅ PASS ({elapsed_ms}ms)")
            else:
                result["verdict"] = "FAIL"
                component, detail, missing = classify_failure(s, actual, suggestions)
                result["root_cause_component"] = component
                result["root_cause_detail"] = detail
                result["missing_words"] = missing
                failed += 1
                print(f"❌ FAIL ({elapsed_ms}ms)")
                print(f" Input: {s['input']}")
                print(f" Expected: {s['expected']}")
                print(f" Actual: {actual}")
                print(f" Cause: [{component}] {detail}")

            results.append(result)

        except Exception as e:
            # Deliberately broad: one broken sample must not abort the run.
            errors += 1
            results.append({
                "id": sid, "category": s.get("category", ""),
                "verdict": "ERROR", "error": str(e),
            })
            print(f"💥 ERROR: {e}")

    # ── Summary ──
    total = len(samples)
    pass_rate = (passed / total * 100) if total > 0 else 0

    print(f"\n{'='*70}")
    print(f"COLLISION BENCHMARK RESULTS")
    print(f"{'='*70}")
    print(f" Total: {total}")
    print(f" Passed: {passed}")
    print(f" Failed: {failed}")
    print(f" Errors: {errors}")
    print(f" Pass Rate: {pass_rate:.1f}%")

    # ── Root cause breakdown ──
    failures = [r for r in results if r.get("verdict") == "FAIL"]
    by_component = {}
    by_category = {}
    for r in failures:
        comp = r.get("root_cause_component", "UNKNOWN")
        cat = r.get("category", "unknown")
        by_component[comp] = by_component.get(comp, 0) + 1
        by_category[cat] = by_category.get(cat, 0) + 1

    if failures:
        print(f"\n Root Cause by Component:")
        for comp, count in sorted(by_component.items(), key=lambda x: -x[1]):
            print(f" {comp}: {count}")
        print(f"\n Failures by Category:")
        for cat, count in sorted(by_category.items(), key=lambda x: -x[1]):
            print(f" {cat}: {count}")

    # ── Save report ──
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
    report = {
        # The trailing "Z" denotes UTC, so format gmtime(); strftime() without
        # a time tuple would silently use the machine's local time instead.
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "target": API_URL,
        "total": total,
        "passed": passed,
        "failed": failed,
        "errors": errors,
        "pass_rate": round(pass_rate, 1),
        "root_cause_by_component": by_component,
        "failures_by_category": by_category,
        "results": results,
    }
    with open(REPORT_PATH, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"\n[COLLISION] Report saved → {REPORT_PATH}")
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()