Spaces:

bayan10
/

bayan-api

Running

youssefreda9 committed about 23 hours ago

Commit

6f1ed4e

1 Parent(s): 32cefd4

Phase 11: Hierarchical StageLocker — grammar overrides spelling locks

- Refactored StageLocker with STAGE_PRIORITY hierarchy:
protection(99) > grammar(3) > spelling(2) > punctuation(1)
- Added is_locked_for() and is_locked_by_for() hierarchy-aware methods
- Grammar stage now uses is_locked_for('grammar') to override spelling locks
- Punctuation stage uses is_locked_by_for('punctuation') to stay blocked
- Deduplicated pipeline_collision.json (PC021-PC050 were identical copies)
- Expanded to 50 unique collision test cases across 7 categories
- Fixed test_collisions.py hardcoded path, runs all 50 samples
- Added run_collision_benchmark.py with structured reporting
- Registered collision as 8th dataset in benchmark_runner.py
- All 13 StageLocker hierarchy unit tests passed

Files changed (6) hide show

src/app.py +5 -3
src/nlp/stage_locker.py +82 -3
tests/phase10/benchmark_runner.py +78 -0
tests/phase10/gold_datasets/pipeline_collision.json +93 -93
tests/phase10/run_collision_benchmark.py +229 -0
tests/phase10/test_collisions.py +192 -24

src/app.py CHANGED Viewed

@@ -1843,10 +1843,11 @@ def analyze_text():
                     logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})}')
                     _tel_events.append({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})
                     # StageLocker: skip diffs that overlap with locked ranges
-                    if ctx.stage_locker.is_locked(d['start'], d['end']):
                         logger.info(
                             f"[LOCK] Grammar blocked on [{d['start']}:{d['end']}] "
-                            f"'{d.get('original','')}' — locked by previous stage"
                         )
                         logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})}')
                         _tel_events.append({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})
@@ -2166,7 +2167,8 @@ def analyze_text():
                 for d in diffs:
                     # StageLocker: skip diffs that overlap with locked ranges
                     # BUT allow pure punctuation insertions near locked words
-                    lock_info = ctx.stage_locker.is_locked_by(d['start'], d['end'])
                     if lock_info:
                         import re as _re
                         orig_alpha = _re.sub(r'[^\u0600-\u06FFa-zA-Z]', '', d.get('original', ''))

                     logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})}')
                     _tel_events.append({"event":"grammar_diff","original":orig_text[:80],"correction":corr_text[:80],"start":d["start"],"end":d["end"]})
                     # StageLocker: skip diffs that overlap with locked ranges
+                    # Phase 11: Hierarchy-aware — grammar (3) overrides spelling (2)
+                    if ctx.stage_locker.is_locked_for(d['start'], d['end'], 'grammar'):
                         logger.info(
                             f"[LOCK] Grammar blocked on [{d['start']}:{d['end']}] "
+                            f"'{d.get('original','')}' — locked by equal/higher priority stage"
                         )
                         logger.info(f'[FILTER-TEL] {_tel_json.dumps({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})}')
                         _tel_events.append({"event":"filter_reject","filter":"StageLocker","original":orig_text[:80],"correction":corr_text[:80]})
                 for d in diffs:
                     # StageLocker: skip diffs that overlap with locked ranges
                     # BUT allow pure punctuation insertions near locked words
+                    # Phase 11: Hierarchy-aware — punctuation (1) blocked by spelling (2) and grammar (3)
+                    lock_info = ctx.stage_locker.is_locked_by_for(d['start'], d['end'], 'punctuation')
                     if lock_info:
                         import re as _re
                         orig_alpha = _re.sub(r'[^\u0600-\u06FFa-zA-Z]', '', d.get('original', ''))

src/nlp/stage_locker.py CHANGED Viewed

@@ -10,8 +10,15 @@ STRICT RULES:
 TERMINOLOGY:
   lock():               registers a range in CURRENT_TEXT as owned
-  is_locked():          checks if a range in CURRENT_TEXT overlaps any owned range
   update_via_mapper():  shifts all spans forward when CURRENT_TEXT mutates
 """
 import logging
@@ -20,6 +27,22 @@ logger = logging.getLogger(__name__)
 # Set to True for structured debug logging across all pipeline components
 PIPELINE_DEBUG = False
 class StageLocker:
     """Protects corrected ranges in CURRENT_TEXT from being overwritten by later stages."""
@@ -34,7 +57,11 @@ class StageLocker:
             logger.debug(f"[StageLocker] LOCK [{start}:{end}] owner={owner}")
     def is_locked(self, start: int, end: int) -> bool:
-        """Check if [start, end) in CURRENT_TEXT overlaps any locked range."""
         for ls, le, _ in self.locked_spans:
             if start < le and end > ls:
                 if PIPELINE_DEBUG:
@@ -42,13 +69,65 @@ class StageLocker:
                 return True
         return False
     def is_locked_by(self, start: int, end: int):
-        """Return (locked_start, locked_end, owner) if locked, else None."""
         for ls, le, owner in self.locked_spans:
             if start < le and end > ls:
                 return (ls, le, owner)
         return None
     def unlock(self, start: int, end: int) -> None:
         """FIX-18: Remove lock for a specific range (used when punctuation cap removes patches)."""
         self.locked_spans = [

 TERMINOLOGY:
   lock():               registers a range in CURRENT_TEXT as owned
+  is_locked():          checks if a range in CURRENT_TEXT overlaps any owned range (ABSOLUTE)
+  is_locked_for():      checks if a range is locked FOR A SPECIFIC STAGE (HIERARCHICAL)
   update_via_mapper():  shifts all spans forward when CURRENT_TEXT mutates
+HIERARCHY (Phase 11):
+  protection (99) ─── Absolute, overrides everything
+  grammar    (3)  ─── May override spelling
+  spelling   (2)  ─── Blocks punctuation, blocked by grammar
+  punctuation(1)  ─── Blocked by spelling and grammar
 """
 import logging
 # Set to True for structured debug logging across all pipeline components
 PIPELINE_DEBUG = False
+# ═══════════════════════════════════════════════════════════════
+# Phase 11: Hierarchical Priority Map
+# ═══════════════════════════════════════════════════════════════
+# A requesting stage is BLOCKED only by locks from stages with
+# EQUAL or HIGHER priority. Lower-priority locks are overridden.
+#
+# Example: Grammar (3) requesting on a Spelling (2) lock → ALLOWED
+# Example: Punctuation (1) requesting on a Spelling (2) lock → BLOCKED
+# Example: Anything requesting on a Protection (99) lock → BLOCKED
+#
+# NOTE: the hierarchy-aware checks read this map with
+# STAGE_PRIORITY.get(name, 0), so a stage or lock owner that is missing
+# from the map is treated as priority 0: such a requester is blocked by
+# every overlapping lock, and such a lock is overridden by every stage
+# listed here.
+STAGE_PRIORITY = {
+    'punctuation': 1,
+    'spelling': 2,
+    'grammar': 3,
+    'protection': 99,
+}
 class StageLocker:
     """Protects corrected ranges in CURRENT_TEXT from being overwritten by later stages."""
             logger.debug(f"[StageLocker] LOCK [{start}:{end}] owner={owner}")
     def is_locked(self, start: int, end: int) -> bool:
+        """Check if [start, end) in CURRENT_TEXT overlaps any locked range.
+        ABSOLUTE check — ignores hierarchy. Any lock blocks.
+        Kept for backward compatibility and protection-level checks.
+        """
         for ls, le, _ in self.locked_spans:
             if start < le and end > ls:
                 if PIPELINE_DEBUG:
                 return True
         return False
+    def is_locked_for(self, start: int, end: int, requesting_stage: str) -> bool:
+        """Hierarchy-aware lock check.
+        Returns True (BLOCKED) only if an overlapping lock has EQUAL or
+        HIGHER priority than the requesting stage.
+        Returns False (ALLOWED) if the requester outranks all overlapping locks.
+        Phase 11 examples:
+          is_locked_for(0, 5, 'grammar')     on spelling lock → False (grammar > spelling)
+          is_locked_for(0, 5, 'punctuation') on spelling lock → True  (spelling > punctuation)
+          is_locked_for(0, 5, 'grammar')     on protection lock → True (protection > grammar)
+        """
+        req_priority = STAGE_PRIORITY.get(requesting_stage, 0)
+        for ls, le, owner in self.locked_spans:
+            if start < le and end > ls:
+                owner_priority = STAGE_PRIORITY.get(owner, 0)
+                if owner_priority >= req_priority:
+                    if PIPELINE_DEBUG:
+                        logger.debug(
+                            f"[StageLocker] HIERARCHY BLOCKED [{start}:{end}] "
+                            f"requester={requesting_stage}({req_priority}) "
+                            f"owner={owner}({owner_priority})"
+                        )
+                    return True  # Blocked: owner is same or higher priority
+                else:
+                    if PIPELINE_DEBUG:
+                        logger.debug(
+                            f"[StageLocker] HIERARCHY OVERRIDE [{start}:{end}] "
+                            f"requester={requesting_stage}({req_priority}) "
+                            f"overrides owner={owner}({owner_priority})"
+                        )
+        return False  # Not blocked: requester outranks all overlapping locks
     def is_locked_by(self, start: int, end: int):
+        """Return (locked_start, locked_end, owner) if locked, else None.
+        ABSOLUTE check — ignores hierarchy.
+        When several locks overlap [start, end), the first one found in
+        ``self.locked_spans`` order is the one returned.
+        """
         for ls, le, owner in self.locked_spans:
+            # Two ranges overlap when each one starts before the other ends.
             if start < le and end > ls:
                 return (ls, le, owner)
         return None
+    def is_locked_by_for(self, start: int, end: int, requesting_stage: str):
+        """Hierarchy-aware lock info check.
+        Returns (locked_start, locked_end, owner) if the range is blocked
+        by a lock with EQUAL or HIGHER priority than the requesting stage.
+        Returns None if the requester outranks all overlapping locks.
+        """
+        req_priority = STAGE_PRIORITY.get(requesting_stage, 0)
+        for ls, le, owner in self.locked_spans:
+            if start < le and end > ls:
+                owner_priority = STAGE_PRIORITY.get(owner, 0)
+                if owner_priority >= req_priority:
+                    return (ls, le, owner)
+        return None
     def unlock(self, start: int, end: int) -> None:
         """FIX-18: Remove lock for a specific range (used when punctuation cap removes patches)."""
         self.locked_spans = [

tests/phase10/benchmark_runner.py CHANGED Viewed

@@ -539,6 +539,83 @@ def run_hallucination_benchmark(api: API, samples: list) -> List[BenchResult]:
         results.append(r)
     return results
 # ═══════════════════════════════════════════════════════════════
 # Metrics
 # ═══════════════════════════════════════════════════════════════
@@ -618,6 +695,7 @@ def main():
         "religious":    (GOLD_DIR/"religious.json",     run_religious_benchmark),
         "structured":   (GOLD_DIR/"structured_content.json", run_structured_benchmark),
         "hallucination":(GOLD_DIR/"hallucination.json", run_hallucination_benchmark),
     }
     for name, (path, runner) in DATASETS.items():

         results.append(r)
     return results
+def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]:
+    """Phase 11: Pipeline collision benchmark (spelling↔grammar↔punctuation interactions)."""
+    results = []
+    for i, s in enumerate(samples):
+        print(f"  [{i+1}/{len(samples)}] {s['id']} ({s.get('category','')})... ", end="", flush=True)
+        r = BenchResult(
+            s['id'], 'collision', s.get('category', ''), s['input'],
+            expected=s.get('expected', ''), severity=s.get('severity', '')
+        )
+        resp = api.analyze(s['input'])
+        r.pipeline_ms = resp.get('_ms', 0)
+        r.pipeline_timing = resp.get('timing_ms', {})
+        if 'error' in resp:
+            r.pipeline_verdict = "ERROR"
+            r.pipeline_detail = resp.get('error', '')
+            print(f"💥 ERROR")
+            results.append(r)
+            continue
+        r.pipeline_output = resp.get('corrected', '')
+        r.pipeline_suggestions = resp.get('suggestions', [])
+        # Normalize for comparison (strip diacritics + collapse whitespace)
+        norm_output = re.sub(r'\s+', ' ', _strip_diacritics(r.pipeline_output)).strip()
+        norm_expected = re.sub(r'\s+', ' ', _strip_diacritics(s.get('expected', ''))).strip()
+        if norm_output == norm_expected:
+            r.pipeline_verdict = "TP"
+            r.pipeline_detail = "All corrections applied correctly"
+        else:
+            r.pipeline_verdict = "FN"
+            category = s.get('category', '')
+            stages = [sg.get('type', '') for sg in r.pipeline_suggestions]
+            # Root cause classification
+            if category == 'spelling_blocks_grammar':
+                if 'spelling' in stages and 'grammar' not in stages:
+                    r.root_cause_component = "PIPELINE"
+                    r.root_cause_stage = "integration"
+                    r.root_cause_detail = "Spelling lock blocked grammar (StageLocker)"
+                else:
+                    r.root_cause_component = "MODEL"
+                    r.root_cause_stage = "grammar"
+                    r.root_cause_detail = "Grammar model missed correction"
+            elif category in ('grammar_drops_spelling', 'spelling_grammar_overlap'):
+                r.root_cause_component = "PIPELINE"
+                r.root_cause_stage = "integration"
+                r.root_cause_detail = f"{category}: stage interaction failure"
+            elif category == 'multi_stage_collision':
+                r.root_cause_component = "PIPELINE" if 'grammar' in stages else "MODEL"
+                r.root_cause_stage = "integration" if 'grammar' in stages else "grammar"
+                r.root_cause_detail = "Multi-stage collision failure"
+            elif category == 'three_stage_collision':
+                r.root_cause_component = "PIPELINE"
+                r.root_cause_stage = "integration"
+                r.root_cause_detail = "Three-stage collision failure"
+            elif category == 'adjacent_corrections':
+                r.root_cause_component = "PIPELINE"
+                r.root_cause_stage = "integration"
+                r.root_cause_detail = "Adjacent corrections interfered"
+            else:
+                r.root_cause_component = "UNKNOWN"
+                r.root_cause_stage = "unknown"
+                r.root_cause_detail = f"Unclassified: {category}"
+            exp_words = set(norm_expected.split())
+            act_words = set(norm_output.split())
+            missing = exp_words - act_words
+            r.pipeline_detail = f"Missing: {list(missing)[:5]}" if missing else "Output mismatch"
+        icon = {"TP":"✅","TN":"✅","FP":"❌","FN":"⚠️","ERROR":"💥"}.get(r.pipeline_verdict,"?")
+        print(f"{icon} {r.pipeline_verdict} ({r.pipeline_ms}ms)")
+        results.append(r)
+    return results
 # ═══════════════════════════════════════════════════════════════
 # Metrics
 # ═══════════════════════════════════════════════════════════════
         "religious":    (GOLD_DIR/"religious.json",     run_religious_benchmark),
         "structured":   (GOLD_DIR/"structured_content.json", run_structured_benchmark),
         "hallucination":(GOLD_DIR/"hallucination.json", run_hallucination_benchmark),
+        "collision":    (GOLD_DIR/"pipeline_collision.json", run_collision_benchmark),
     }
     for name, (path, runner) in DATASETS.items():

tests/phase10/gold_datasets/pipeline_collision.json CHANGED Viewed

@@ -141,212 +141,212 @@
   },
   {
     "id": "PC021",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC022",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC023",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC024",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC025",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC026",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC027",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC028",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC029",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC030",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC031",
-    "category": "multi_stage_collision",
     "input": "السياره سريع والرجال يعمل في المصنع",
     "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC032",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC033",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC034",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC035",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC036",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC037",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC038",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC039",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC040",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
-    "severity": "critical"
   },
   {
     "id": "PC041",
     "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC042",
     "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC043",
     "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC044",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC045",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC046",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC047",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC048",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC049",
-    "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC050",
     "category": "multi_stage_collision",
-    "input": "السياره سريع والرجال يعمل في المصنع",
-    "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   }
 ]

   },
   {
     "id": "PC021",
+    "category": "spelling_blocks_grammar",
+    "input": "الجامعه قريب من البيت",
+    "expected": "الجامعة قريبة من البيت",
     "severity": "critical"
   },
   {
     "id": "PC022",
+    "category": "spelling_blocks_grammar",
+    "input": "الطائره سريع جدا",
+    "expected": "الطائرة سريعة جداً",
     "severity": "critical"
   },
   {
     "id": "PC023",
+    "category": "spelling_blocks_grammar",
+    "input": "القصه طويل ومملل",
+    "expected": "القصة طويلة ومملة",
     "severity": "critical"
   },
   {
     "id": "PC024",
+    "category": "spelling_blocks_grammar",
+    "input": "الحديقه واسع وجميل",
+    "expected": "الحديقة واسعة وجميلة",
     "severity": "critical"
   },
   {
     "id": "PC025",
+    "category": "spelling_blocks_grammar",
+    "input": "المكتبه كبير ومنظم",
+    "expected": "المكتبة كبيرة ومنظمة",
     "severity": "critical"
   },
   {
     "id": "PC026",
+    "category": "punctuation_near_spelling",
+    "input": "ذهبت الي المدرسه وقابلت صديقتي",
+    "expected": "ذهبت إلى المدرسة وقابلت صديقتي",
+    "severity": "major"
   },
   {
     "id": "PC027",
+    "category": "punctuation_near_spelling",
+    "input": "الكتاب مفيد جدا وانا احبه",
+    "expected": "الكتاب مفيد جداً وأنا أحبه",
+    "severity": "major"
   },
   {
     "id": "PC028",
+    "category": "punctuation_near_spelling",
+    "input": "المعلمه شرحت الدرس واعطت واجب",
+    "expected": "المعلمة شرحت الدرس وأعطت واجب",
+    "severity": "major"
   },
   {
     "id": "PC029",
+    "category": "punctuation_near_spelling",
+    "input": "قرأت الكتاب كله ولكن لم افهمه",
+    "expected": "قرأت الكتاب كله ولكن لم أفهمه",
+    "severity": "major"
   },
   {
     "id": "PC030",
+    "category": "punctuation_near_spelling",
+    "input": "الطقس بارد جدا واحتاج معطف",
+    "expected": "الطقس بارد جداً وأحتاج معطف",
+    "severity": "major"
   },
   {
     "id": "PC031",
+    "category": "three_stage_collision",
     "input": "السياره سريع والرجال يعمل في المصنع",
     "expected": "السيارة سريعة والرجال يعملون في المصنع",
     "severity": "critical"
   },
   {
     "id": "PC032",
+    "category": "three_stage_collision",
+    "input": "الطالبه ذكي وذهبت الي المدرسه",
+    "expected": "الطالبة ذكية وذهبت إلى المدرسة",
     "severity": "critical"
   },
   {
     "id": "PC033",
+    "category": "three_stage_collision",
+    "input": "المدرسه كبير والطلاب يدرس بجد",
+    "expected": "المدرسة كبيرة والطلاب يدرسون بجد",
     "severity": "critical"
   },
   {
     "id": "PC034",
+    "category": "three_stage_collision",
+    "input": "الحكومه اعلن قرار جديد والمواطنون يتابع",
+    "expected": "الحكومة أعلنت قراراً جديداً والمواطنون يتابعون",
     "severity": "critical"
   },
   {
     "id": "PC035",
+    "category": "three_stage_collision",
+    "input": "الشركه نجح في المشرووع والموظفون يحتفل",
+    "expected": "الشركة نجحت في المشروع والموظفون يحتفلون",
     "severity": "critical"
   },
   {
     "id": "PC036",
+    "category": "adjacent_corrections",
+    "input": "الولد الصغيره ذهب الي المدرسه",
+    "expected": "الولد الصغير ذهب إلى المدرسة",
     "severity": "critical"
   },
   {
     "id": "PC037",
+    "category": "adjacent_corrections",
+    "input": "قال المعلمون للطالبه ادرسي بجد",
+    "expected": "قال المعلمون للطالبة ادرسي بجد",
+    "severity": "major"
   },
   {
     "id": "PC038",
+    "category": "adjacent_corrections",
+    "input": "الكتب القديمه في المكتبه الكبيره",
+    "expected": "الكتب القديمة في المكتبة الكبيرة",
+    "severity": "major"
   },
   {
     "id": "PC039",
+    "category": "adjacent_corrections",
+    "input": "رأيت البنت الجميله في الحديقه",
+    "expected": "رأيت البنت الجميلة في الحديقة",
+    "severity": "major"
   },
   {
     "id": "PC040",
+    "category": "adjacent_corrections",
+    "input": "المعلمه الجديده شرحت الدرس",
+    "expected": "المعلمة الجديدة شرحت الدرس",
+    "severity": "major"
   },
   {
     "id": "PC041",
     "category": "multi_stage_collision",
+    "input": "الدكتور كتب التقريير والممرضات ساعد المرضى",
+    "expected": "الدكتور كتب التقرير والممرضات ساعدن المرضى",
     "severity": "critical"
   },
   {
     "id": "PC042",
     "category": "multi_stage_collision",
+    "input": "الطلاب حضر المحاضره والاستاذ شرح الدرس",
+    "expected": "الطلاب حضروا المحاضرة والأستاذ شرح الدرس",
     "severity": "critical"
   },
   {
     "id": "PC043",
     "category": "multi_stage_collision",
+    "input": "الامهات طبخ الطعام والاطفال لعب في الحديقه",
+    "expected": "الأمهات طبخن الطعام والأطفال لعبوا في الحديقة",
     "severity": "critical"
   },
   {
     "id": "PC044",
+    "category": "spelling_grammar_overlap",
+    "input": "المهندسات صممو المشرووع الكبير",
+    "expected": "المهندسات صممن المشروع الكبير",
     "severity": "critical"
   },
   {
     "id": "PC045",
+    "category": "spelling_grammar_overlap",
+    "input": "الطبيبات عالج المرضي في المستشفي",
+    "expected": "الطبيبات عالجن المرضى في المستشفى",
     "severity": "critical"
   },
   {
     "id": "PC046",
+    "category": "three_stage_collision",
+    "input": "المديره وافق علي المشرووع والموظفين يعمل بجد",
+    "expected": "المديرة وافقت على المشروع والموظفون يعملون بجد",
     "severity": "critical"
   },
   {
     "id": "PC047",
+    "category": "three_stage_collision",
+    "input": "الوزاره اصدر قرار والمعلمين ينفذ التعليمات",
+    "expected": "الوزارة أصدرت قراراً والمعلمون ينفذون التعليمات",
     "severity": "critical"
   },
   {
     "id": "PC048",
+    "category": "adjacent_corrections",
+    "input": "المنزل القديمه في الشارع الضيقه",
+    "expected": "المنزل القديم في الشارع الضيق",
     "severity": "critical"
   },
   {
     "id": "PC049",
+    "category": "adjacent_corrections",
+    "input": "الرجل الطويله وقف بجانب البنايه",
+    "expected": "الرجل الطويل وقف بجانب البناية",
     "severity": "critical"
   },
   {
     "id": "PC050",
     "category": "multi_stage_collision",
+    "input": "اللاعبون تدرب في الملعب والمدربه شجعتهم",
+    "expected": "اللاعبون تدربوا في الملعب والمدربة شجعتهم",
     "severity": "critical"
   }
 ]

tests/phase10/run_collision_benchmark.py ADDED Viewed

	@@ -0,0 +1,229 @@

+"""
+Phase 11 — Collision Benchmark Runner
+======================================
+Integrates with the benchmark_runner.py API client pattern.
+Produces a Markdown/CLI table report with pass/fail rates and root causes.
+Usage:
+    python tests/phase10/run_collision_benchmark.py [--url URL]
+"""
+import argparse
+import json
+import re
+import sys
+import time
+from pathlib import Path
+from dataclasses import dataclass, field, asdict
+from typing import List
+# Reuse API client from benchmark_runner
+sys.path.insert(0, str(Path(__file__).parent))
+from benchmark_runner import API, BenchResult, calc_metrics, strip_punct_only
+GOLD_DIR = Path(__file__).parent / "gold_datasets"
+REPORT_DIR = Path(__file__).parent / "reports"
+DEFAULT_URL = "https://bayan10-bayan-api.hf.space"
+def _strip_diacritics(text):
+    """Return ``text`` with code points U+064B–U+065F and U+0670 removed.
+    These are Arabic combining marks, so vocalized and unvocalized spellings
+    of the same word compare equal afterwards.
+    """
+    return re.sub(r'[\u064B-\u065F\u0670]', '', text)
+def _normalize(text):
+    """Normalize for comparison: strip diacritics + collapse whitespace."""
+    return re.sub(r'\s+', ' ', _strip_diacritics(text)).strip()
+def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]:
+    results = []
+    for i, s in enumerate(samples):
+        print(f"  [{i+1}/{len(samples)}] {s['id']} ({s.get('category','')})... ", end="", flush=True)
+        r = BenchResult(
+            s['id'], 'collision', s.get('category', ''), s['input'],
+            expected=s.get('expected', ''), severity=s.get('severity', '')
+        )
+        resp = api.analyze(s['input'])
+        r.pipeline_ms = resp.get('_ms', 0)
+        r.pipeline_timing = resp.get('timing_ms', {})
+        if 'error' in resp:
+            r.pipeline_verdict = "ERROR"
+            r.pipeline_detail = resp.get('error', '')
+            print(f"💥 ERROR")
+            results.append(r)
+            continue
+        r.pipeline_output = resp.get('corrected', '')
+        r.pipeline_suggestions = resp.get('suggestions', [])
+        original = resp.get('original', s['input'])
+        # Normalize for comparison
+        norm_output = _normalize(r.pipeline_output)
+        norm_expected = _normalize(s.get('expected', ''))
+        if norm_output == norm_expected:
+            r.pipeline_verdict = "TP"
+            r.pipeline_detail = "All corrections applied correctly"
+        else:
+            # Classify the failure
+            category = s.get('category', '')
+            stages = [sg.get('type', '') for sg in r.pipeline_suggestions]
+            if category == 'spelling_blocks_grammar':
+                if 'spelling' in stages and 'grammar' not in stages:
+                    r.root_cause_component = "PIPELINE"
+                    r.root_cause_stage = "integration"
+                    r.root_cause_detail = "Spelling lock blocked grammar correction (StageLocker)"
+                else:
+                    r.root_cause_component = "MODEL"
+                    r.root_cause_stage = "grammar"
+                    r.root_cause_detail = "Grammar model missed gender agreement correction"
+            elif category == 'grammar_drops_spelling':
+                r.root_cause_component = "PIPELINE"
+                r.root_cause_stage = "integration"
+                r.root_cause_detail = "Grammar stage dropped spelling fix"
+            elif category == 'spelling_grammar_overlap':
+                r.root_cause_component = "PIPELINE"
+                r.root_cause_stage = "integration"
+                r.root_cause_detail = "Spelling and grammar corrections overlapped"
+            elif category == 'multi_stage_collision':
+                if not any(t == 'grammar' for t in stages):
+                    r.root_cause_component = "MODEL"
+                    r.root_cause_stage = "grammar"
+                    r.root_cause_detail = "Grammar model missed SV/gender agreement"
+                else:
+                    r.root_cause_component = "PIPELINE"
+                    r.root_cause_stage = "integration"
+                    r.root_cause_detail = "Multi-stage interaction failure"
+            elif category == 'three_stage_collision':
+                r.root_cause_component = "PIPELINE"
+                r.root_cause_stage = "integration"
+                r.root_cause_detail = "Three-stage collision: spelling+grammar+punctuation"
+            elif category == 'punctuation_near_spelling':
+                r.root_cause_component = "MODEL"
+                r.root_cause_stage = "spelling"
+                r.root_cause_detail = "Spelling correction near punctuation boundary"
+            elif category == 'adjacent_corrections':
+                r.root_cause_component = "PIPELINE"
+                r.root_cause_stage = "integration"
+                r.root_cause_detail = "Adjacent word corrections interfered"
+            else:
+                r.root_cause_component = "UNKNOWN"
+                r.root_cause_stage = "unknown"
+                r.root_cause_detail = f"Unclassified failure in {category}"
+            # Check what's wrong specifically
+            exp_words = set(norm_expected.split())
+            act_words = set(norm_output.split())
+            missing = exp_words - act_words
+            extra = act_words - exp_words
+            r.pipeline_verdict = "FN"
+            r.pipeline_detail = (
+                f"Missing: {list(missing)[:5]}, Extra: {list(extra)[:5]}"
+                if missing or extra
+                else f"Output mismatch: '{r.pipeline_output[:60]}' vs '{s['expected'][:60]}'"
+            )
+        # Span check
+        for sg in r.pipeline_suggestions:
+            actual_slice = original[sg['start']:sg['end']]
+            if actual_slice != sg.get('original', ''):
+                r.span_valid = False
+                r.span_detail = f"SPAN[{sg['start']}:{sg['end']}] exp='{sg.get('original','')}' got='{actual_slice}'"
+                break
+        icon = {"TP": "✅", "TN": "✅", "FP": "❌", "FN": "⚠️", "ERROR": "💥"}.get(r.pipeline_verdict, "?")
+        print(f"{icon} {r.pipeline_verdict} ({r.pipeline_ms}ms)")
+        results.append(r)
+    return results
+def main():
+    """Run the collision benchmark end to end and persist a JSON report.
+
+    Parses ``--url``, loads ``pipeline_collision.json`` from ``GOLD_DIR``
+    (exiting with status 1 when the file is missing), runs every sample
+    through ``run_collision_benchmark``, prints summary, per-category and
+    failure tables, and writes ``collision_benchmark_results.json`` under
+    ``REPORT_DIR``.
+    """
+    parser = argparse.ArgumentParser(description="Phase 11 Collision Benchmark")
+    parser.add_argument("--url", default=DEFAULT_URL)
+    args = parser.parse_args()
+    api = API(args.url)
+    dataset_path = GOLD_DIR / "pipeline_collision.json"
+    if not dataset_path.exists():
+        print(f"❌ Dataset not found: {dataset_path}")
+        sys.exit(1)
+    with open(dataset_path, 'r', encoding='utf-8') as f:
+        samples = json.load(f)
+    print(f"\n{'='*60}")
+    print(f"COLLISION BENCHMARK ({len(samples)} samples)")
+    print(f"Target: {args.url}")
+    print(f"{'='*60}")
+    results = run_collision_benchmark(api, samples)
+    m = calc_metrics(results)
+    # ── Per-category breakdown ──
+    # Only TP/TN count as a pass; every other verdict (FN, FP, ERROR) is a fail.
+    categories = {}
+    for r in results:
+        bucket = categories.setdefault(r.category, {"total": 0, "pass": 0, "fail": 0})
+        bucket["total"] += 1
+        if r.pipeline_verdict in ("TP", "TN"):
+            bucket["pass"] += 1
+        else:
+            bucket["fail"] += 1
+    # ── Print report ──
+    print(f"\n{'='*60}")
+    print("COLLISION BENCHMARK RESULTS")
+    print(f"{'='*60}")
+    print(f"\n## Summary")
+    print(f"| Metric        | Value |")
+    print(f"|---------------|-------|")
+    print(f"| Total         | {m['total']} |")
+    print(f"| Passed (TP)   | {m['TP']} |")
+    print(f"| Failed (FN)   | {m['FN']} |")
+    print(f"| FP            | {m['FP']} |")
+    print(f"| Errors        | {m['ERROR']} |")
+    print(f"| Pass Rate     | {m['pass_rate']:.1%} |")
+    print(f"\n## By Category")
+    print(f"| Category | Total | Pass | Fail | Rate |")
+    print(f"|----------|-------|------|------|------|")
+    for cat, data in sorted(categories.items()):
+        rate = data['pass'] / data['total'] * 100 if data['total'] > 0 else 0
+        print(f"| {cat} | {data['total']} | {data['pass']} | {data['fail']} | {rate:.0f}% |")
+    # ── Root cause for failures ──
+    # ERROR verdicts are deliberately not listed here; they carry no root cause.
+    failures = [r for r in results if r.pipeline_verdict in ("FN", "FP")]
+    if failures:
+        print(f"\n## Failure Details")
+        print(f"| ID | Category | Input | Expected | Actual | Root Cause |")
+        print(f"|----|----------|-------|----------|--------|------------|")
+        for r in failures:
+            print(
+                f"| {r.id} | {r.category} | "
+                f"{r.input[:30]}... | {r.expected[:30]}... | "
+                f"{r.pipeline_output[:30]}... | {r.root_cause_detail[:40]} |"
+            )
+    # ── Save JSON report ──
+    REPORT_DIR.mkdir(parents=True, exist_ok=True)
+    report = {
+        # The trailing "Z" means UTC, so format gmtime() explicitly;
+        # strftime() without a time tuple would use local time.
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "target": args.url,
+        "metrics": m,
+        "by_category": categories,
+        "results": [asdict(r) for r in results],
+    }
+    out_path = REPORT_DIR / "collision_benchmark_results.json"
+    with open(out_path, 'w', encoding='utf-8') as f:
+        json.dump(report, f, ensure_ascii=False, indent=2)
+    print(f"\n[P11] Report → {out_path}")
+if __name__ == "__main__":
+    main()

tests/phase10/test_collisions.py CHANGED Viewed

@@ -1,27 +1,195 @@
 import json
 import requests
-url = "https://bayan10-bayan-api.hf.space/api/analyze"
-with open("d:/BAYAN2/tests/phase10/gold_datasets/pipeline_collision.json", "r", encoding="utf-8") as f:
-    samples = json.load(f)
-failures = []
-passed = 0
-for i, s in enumerate(samples[:10]):  # Test first 10 for analysis
-    try:
-        r = requests.post(url, json={"text": s["input"]}).json()
-        out = r.get("corrected", "")
-        if out == s["expected"]:
-            passed += 1
-            print(f"[{s['id']}] PASS")
-        else:
-            failures.append((s, out))
-            print(f"[{s['id']}] FAIL")
-            print(f"  Input: {s['input']}")
-            print(f"  Expected: {s['expected']}")
-            print(f"  Actual:   {out}")
-    except Exception as e:
-        print(f"[{s['id']}] ERROR: {e}")
-print(f"\nResults: {passed} PASS, {len(failures)} FAIL")

+"""
+Phase 11 — Pipeline Collision Test Runner
+==========================================
+Runs ALL collision test cases against the live API and produces
+a structured JSON report with per-failure classification.
+"""
 import json
+import sys
+import time
+import re
 import requests
+from pathlib import Path
+# ── Configuration ──
+API_URL = "https://bayan10-bayan-api.hf.space/api/analyze"
+DATASET_PATH = Path(__file__).parent / "gold_datasets" / "pipeline_collision.json"
+REPORT_PATH = Path(__file__).parent / "reports" / "collision_report.json"
+def strip_diacritics(text):
+    """Return *text* with Arabic diacritic marks deleted.
+
+    Drops every code point in U+064B–U+065F as well as U+0670; all other
+    characters are kept unchanged.
+    """
+    diacritic_marks = re.compile(r'[\u064B-\u065F\u0670]')
+    return diacritic_marks.sub('', text)
+def normalize_for_compare(text):
+    """Return a comparison key for *text*.
+
+    Diacritics are removed via ``strip_diacritics``, each run of whitespace
+    becomes a single space, and leading/trailing whitespace is trimmed.
+    """
+    without_marks = strip_diacritics(text)
+    return re.sub(r'\s+', ' ', without_marks).strip()
+def classify_failure(sample, actual, suggestions):
+    """Classify the root cause of a failed collision sample.
+
+    Args:
+        sample: Gold-dataset entry; ``"expected"`` and ``"input"`` are
+            required, ``"category"`` is optional.
+        actual: Corrected text returned by the API.
+        suggestions: Suggestion dicts from the API; only ``"type"`` is read.
+
+    Returns:
+        A ``(component, detail, missing_words)`` tuple. ``missing_words``
+        lists the expected words absent from ``actual``, de-duplicated and
+        in expected-text order.
+    """
+    category = sample.get("category", "")
+    # NOTE: words are compared exactly as split on whitespace, with diacritics kept.
+    act_words = set(actual.split())
+    inp_words = set(sample["input"].split())
+    # dict.fromkeys de-duplicates while keeping first-seen order, so the report
+    # is reproducible across runs (iteration order of a set of str is not).
+    missing_fixes = [
+        w for w in dict.fromkeys(sample["expected"].split()) if w not in act_words
+    ]
+    stages = {s.get('type', '') for s in suggestions}
+    has_spelling = 'spelling' in stages
+    has_grammar = 'grammar' in stages
+    if category == "spelling_blocks_grammar":
+        # Spelling fixed ه→ة but locked the range, grammar couldn't fix gender
+        if has_spelling:
+            # Words missing from the output that were not in the input either
+            # are fixes the grammar stage should have introduced.
+            grammar_words_missed = [w for w in missing_fixes if w not in inp_words]
+            if grammar_words_missed:
+                return "STAGELOCKER", "spelling→grammar lock collision", grammar_words_missed
+        return "MODEL", "Grammar model missed correction", missing_fixes
+    if category == "multi_stage_collision":
+        if not has_grammar:
+            # Check for a spelling lock first: testing missing words first made
+            # this branch unreachable whenever any word was missing.
+            if has_spelling:
+                return "STAGELOCKER", "Spelling lock blocked grammar", missing_fixes
+            if missing_fixes:
+                return "MODEL", "Grammar model missed correction", missing_fixes
+        return "PIPELINE", "Multi-stage interaction failure", missing_fixes
+    # Categories whose cause does not depend on which stages fired. The last
+    # three mirror the classification used by run_collision_benchmark.py.
+    static_causes = {
+        "grammar_drops_spelling": ("PIPELINE", "Grammar stage dropped spelling fix"),
+        "spelling_grammar_overlap": ("PIPELINE", "Spelling and grammar overlap conflict"),
+        "three_stage_collision": ("PIPELINE", "Three-stage collision: spelling+grammar+punctuation"),
+        "punctuation_near_spelling": ("MODEL", "Spelling correction near punctuation boundary"),
+        "adjacent_corrections": ("PIPELINE", "Adjacent word corrections interfered"),
+    }
+    if category in static_causes:
+        component, detail = static_causes[category]
+        return component, detail, missing_fixes
+    return "UNKNOWN", f"Unclassified failure in category '{category}'", missing_fixes
+def main():
+    """Run every collision sample against the live API and write a JSON report.
+
+    Loads the dataset at ``DATASET_PATH`` (exiting with status 1 when it is
+    missing), POSTs each sample's input to ``API_URL``, compares the
+    normalized ``corrected`` text against the normalized expected text,
+    classifies each mismatch with ``classify_failure``, prints a summary and
+    saves the full report to ``REPORT_PATH``.
+    """
+    # Load dataset
+    if not DATASET_PATH.exists():
+        print(f"❌ Dataset not found: {DATASET_PATH}")
+        sys.exit(1)
+    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
+        samples = json.load(f)
+    print(f"[COLLISION] Running {len(samples)} test cases against {API_URL}")
+    print(f"{'='*70}")
+    results = []
+    passed = 0
+    failed = 0
+    errors = 0
+    for i, s in enumerate(samples):
+        sid = s["id"]
+        print(f"  [{i+1}/{len(samples)}] {sid} ({s.get('category','')})... ", end="", flush=True)
+        try:
+            # perf_counter is monotonic, so a wall-clock adjustment cannot skew the timing.
+            t0 = time.perf_counter()
+            r = requests.post(API_URL, json={"text": s["input"]}, timeout=120)
+            elapsed_ms = int((time.perf_counter() - t0) * 1000)
+            # An HTTP error must be recorded as ERROR; without this check a
+            # 4xx/5xx JSON body would be scored as FAIL with an empty output.
+            r.raise_for_status()
+            resp = r.json()
+            actual = resp.get("corrected", "")
+            suggestions = resp.get("suggestions", [])
+            # Normalize for comparison (strip diacritics, collapse spaces)
+            norm_actual = normalize_for_compare(actual)
+            norm_expected = normalize_for_compare(s["expected"])
+            result = {
+                "id": sid,
+                "category": s.get("category", ""),
+                "input": s["input"],
+                "expected": s["expected"],
+                "actual": actual,
+                "suggestions": suggestions,
+                "elapsed_ms": elapsed_ms,
+            }
+            if norm_actual == norm_expected:
+                result["verdict"] = "PASS"
+                passed += 1
+                print(f"✅ PASS ({elapsed_ms}ms)")
+            else:
+                result["verdict"] = "FAIL"
+                component, detail, missing = classify_failure(s, actual, suggestions)
+                result["root_cause_component"] = component
+                result["root_cause_detail"] = detail
+                result["missing_words"] = missing
+                failed += 1
+                print(f"❌ FAIL ({elapsed_ms}ms)")
+                print(f"       Input:    {s['input']}")
+                print(f"       Expected: {s['expected']}")
+                print(f"       Actual:   {actual}")
+                print(f"       Cause:    [{component}] {detail}")
+            results.append(result)
+        except Exception as e:
+            # Per-sample boundary: one bad request or malformed sample must not
+            # abort the run, so it is recorded as ERROR and the loop continues.
+            errors += 1
+            results.append({
+                "id": sid, "category": s.get("category", ""),
+                "verdict": "ERROR", "error": str(e),
+            })
+            print(f"💥 ERROR: {e}")
+    # ── Summary ──
+    total = len(samples)
+    pass_rate = (passed / total * 100) if total > 0 else 0
+    print(f"\n{'='*70}")
+    print(f"COLLISION BENCHMARK RESULTS")
+    print(f"{'='*70}")
+    print(f"  Total:     {total}")
+    print(f"  Passed:    {passed}")
+    print(f"  Failed:    {failed}")
+    print(f"  Errors:    {errors}")
+    print(f"  Pass Rate: {pass_rate:.1f}%")
+    # ── Root cause breakdown ──
+    failures = [r for r in results if r.get("verdict") == "FAIL"]
+    by_component = {}
+    by_category = {}
+    for r in failures:
+        comp = r.get("root_cause_component", "UNKNOWN")
+        cat = r.get("category", "unknown")
+        by_component[comp] = by_component.get(comp, 0) + 1
+        by_category[cat] = by_category.get(cat, 0) + 1
+    if failures:
+        print(f"\n  Root Cause by Component:")
+        for comp, count in sorted(by_component.items(), key=lambda x: -x[1]):
+            print(f"    {comp}: {count}")
+        print(f"\n  Failures by Category:")
+        for cat, count in sorted(by_category.items(), key=lambda x: -x[1]):
+            print(f"    {cat}: {count}")
+    # ── Save report ──
+    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    report = {
+        # The trailing "Z" means UTC, so format gmtime() explicitly;
+        # strftime() without a time tuple would use local time.
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "target": API_URL,
+        "total": total,
+        "passed": passed,
+        "failed": failed,
+        "errors": errors,
+        "pass_rate": round(pass_rate, 1),
+        "root_cause_by_component": by_component,
+        "failures_by_category": by_category,
+        "results": results,
+    }
+    with open(REPORT_PATH, 'w', encoding='utf-8') as f:
+        json.dump(report, f, ensure_ascii=False, indent=2)
+    print(f"\n[COLLISION] Report saved → {REPORT_PATH}")
+if __name__ == "__main__":
+    main()