Spaces:

bayan10
/

bayan-api

Running

App Files Community

Bayan AI committed about 18 hours ago

Commit

54052f7

1 Parent(s): 0087f00

Fix: Preserve input punctuation during grammar correction

Browse files

Files changed (2) hide show

src/nlp/grammar/grammar_service.py +57 -0
tests/phase10/gold_datasets/phase_b_punctuation_bug.json +34 -0

src/nlp/grammar/grammar_service.py CHANGED Viewed

@@ -31,6 +31,59 @@ class GrammarChecker:
         self.client = client
         self.rules = rules
     def correct(self, text: str) -> str:
         """
         Run grammar correction on text.
@@ -53,6 +106,10 @@ class GrammarChecker:
             # 2. Rule-based post-processing
             corrected = self.rules.process(text, model_output)
             logger.info(f"Grammar rules output: '{corrected[:80]}...'")
             return corrected

         self.client = client
         self.rules = rules
+    @staticmethod
+    def _preserve_punctuation(original: str, corrected: str) -> str:
+        """
+        Preserve punctuation from the original text if the grammar model removed it.
+        """
+        PUNCT_CHARS = set('.,;:!?،؛؟!.:«»"\'()-–—…')
+        orig_words = original.split()
+        corr_words = corrected.split()
+        if not orig_words or not corr_words:
+            return corrected
+        # If word count matches exactly, we can restore punctuation word-by-word
+        if len(orig_words) == len(corr_words):
+            result = []
+            for o_w, c_w in zip(orig_words, corr_words):
+                prefix = ""
+                for ch in o_w:
+                    if ch in PUNCT_CHARS: prefix += ch
+                    else: break
+                suffix = ""
+                for ch in reversed(o_w):
+                    if ch in PUNCT_CHARS: suffix = ch + suffix
+                    else: break
+                c_base = c_w.strip('.,;:!?،؛؟!.:«»"\'()-–—…')
+                if not c_base:
+                    c_base = c_w
+                result.append(prefix + c_base + suffix)
+            return " ".join(result)
+        # Global prefix/suffix if lengths differ
+        prefix = ""
+        for ch in original:
+            if ch in PUNCT_CHARS: prefix += ch
+            elif not ch.isspace(): break
+        suffix = ""
+        for ch in reversed(original):
+            if ch in PUNCT_CHARS: suffix = ch + suffix
+            elif not ch.isspace(): break
+        c_stripped = corrected.strip('.,;:!?،؛؟!.:«»"\'()-–—… \t\n')
+        # Only add prefix/suffix if the corrected text doesn't already have them
+        if prefix and c_stripped.startswith(prefix):
+            prefix = ""
+        if suffix and c_stripped.endswith(suffix):
+            suffix = ""
+        return prefix + c_stripped + suffix
     def correct(self, text: str) -> str:
         """
         Run grammar correction on text.
             # 2. Rule-based post-processing
             corrected = self.rules.process(text, model_output)
+            # 3. Preserve original punctuation if the model stripped it
+            corrected = self._preserve_punctuation(text, corrected)
             logger.info(f"Grammar rules output: '{corrected[:80]}...'")
             return corrected

tests/phase10/gold_datasets/phase_b_punctuation_bug.json ADDED Viewed

	@@ -0,0 +1,34 @@

+[
+  {
+    "id": "punct_bug_1",
+    "category": "entities",
+    "input": "شركة أبل",
+    "expected": "شركة أبل",
+    "severity": "low",
+    "description": "Ensure punctuation is not added to single entity names."
+  },
+  {
+    "id": "punct_bug_2",
+    "category": "short_phrase",
+    "input": "مرحبا بكم في لوحة التحكم",
+    "expected": "مرحبا بكم في لوحة التحكم",
+    "severity": "medium",
+    "description": "Ensure punctuation is not added to short standalone phrases."
+  },
+  {
+    "id": "punct_bug_3",
+    "category": "structured",
+    "input": "الاسم: أحمد، العمر: 30",
+    "expected": "الاسم: أحمد، العمر: 30",
+    "severity": "medium",
+    "description": "Ensure punctuation is not added to structured text fragments."
+  },
+  {
+    "id": "punct_bug_4",
+    "category": "grammar_preservation",
+    "input": "يذهبون المهندسون الى العمل",
+    "expected": "يذهب المهندسون إلى العمل",
+    "severity": "high",
+    "description": "Grammar fixes should not be overwritten or corrupted by the punctuation model."
+  }
+]