Bayan AI committed on
Commit
54052f7
·
1 Parent(s): 0087f00

Fix: Preserve input punctuation during grammar correction

Browse files
src/nlp/grammar/grammar_service.py CHANGED
@@ -31,6 +31,59 @@ class GrammarChecker:
31
  self.client = client
32
  self.rules = rules
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def correct(self, text: str) -> str:
35
  """
36
  Run grammar correction on text.
@@ -53,6 +106,10 @@ class GrammarChecker:
53
 
54
  # 2. Rule-based post-processing
55
  corrected = self.rules.process(text, model_output)
 
 
 
 
56
  logger.info(f"Grammar rules output: '{corrected[:80]}...'")
57
 
58
  return corrected
 
31
  self.client = client
32
  self.rules = rules
33
 
34
@staticmethod
def _preserve_punctuation(original: str, corrected: str) -> str:
    """
    Re-apply the edge punctuation of ``original`` onto ``corrected``.

    The input's punctuation is treated as authoritative: punctuation at
    the edges of the corrected text is discarded and replaced by whatever
    the original had in the same place (possibly nothing).

    Two strategies are used:

    * Same number of whitespace-separated tokens: every corrected token
      gets the leading/trailing punctuation of the original token at the
      same position. Tokens are re-joined with single spaces.
    * Different number of tokens: only the punctuation at the very start
      and very end of the original text is carried over.

    Tokens (or whole texts) that consist solely of punctuation have no
    word to attach punctuation to. They are emitted once instead of being
    wrapped in their own prefix and suffix, which previously turned
    ``"a - b"`` into ``"a --- b"``.

    Args:
        original: Text as it was given to the grammar pipeline.
        corrected: Text produced by the correction steps.

    Returns:
        ``corrected`` with the original's edge punctuation restored.
        ``corrected`` is returned unchanged when either text contains no
        tokens, or when there is no word content to anchor punctuation to.
    """
    punct = '.,;:!?،؛؟«»"\'()-–—…'
    punct_set = frozenset(punct)

    def edge_run(chars) -> str:
        """Collect punctuation from the start of ``chars``, skipping whitespace."""
        run = []
        for ch in chars:
            if ch in punct_set:
                run.append(ch)
            elif not ch.isspace():
                break
        return "".join(run)

    orig_words = original.split()
    corr_words = corrected.split()
    if not orig_words or not corr_words:
        return corrected

    # Equal token counts: assume tokens line up one-to-one.
    if len(orig_words) == len(corr_words):
        rebuilt = []
        for o_w, c_w in zip(orig_words, corr_words):
            o_core = o_w.strip(punct)
            c_core = c_w.strip(punct)
            if not o_core and not c_core:
                # Both are pure punctuation: keep the original mark once.
                rebuilt.append(o_w)
            elif not o_core or not c_core:
                # Only one side is pure punctuation, so the tokens do not
                # really correspond; leave the corrected token untouched.
                rebuilt.append(c_w)
            else:
                lead = edge_run(o_w)
                # Scan from the end, then flip back into reading order.
                trail = edge_run(reversed(o_w))[::-1]
                rebuilt.append(lead + c_core + trail)
        return " ".join(rebuilt)

    # Token counts differ: fall back to text-level prefix/suffix only.
    c_core = corrected.strip(punct + " \t\n")
    original_is_punct_only = all(
        ch in punct_set or ch.isspace() for ch in original
    )
    if not c_core or original_is_punct_only:
        # Nothing to anchor to; wrapping would only duplicate punctuation.
        return corrected

    lead = edge_run(original)
    trail = edge_run(reversed(original))[::-1]
    return lead + c_core + trail
85
+
86
+
87
  def correct(self, text: str) -> str:
88
  """
89
  Run grammar correction on text.
 
106
 
107
  # 2. Rule-based post-processing
108
  corrected = self.rules.process(text, model_output)
109
+
110
+ # 3. Preserve original punctuation if the model stripped it
111
+ corrected = self._preserve_punctuation(text, corrected)
112
+
113
  logger.info(f"Grammar rules output: '{corrected[:80]}...'")
114
 
115
  return corrected
tests/phase10/gold_datasets/phase_b_punctuation_bug.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "punct_bug_1",
4
+ "category": "entities",
5
+ "input": "شركة أبل",
6
+ "expected": "شركة أبل",
7
+ "severity": "low",
8
+ "description": "Ensure punctuation is not added to single entity names."
9
+ },
10
+ {
11
+ "id": "punct_bug_2",
12
+ "category": "short_phrase",
13
+ "input": "مرحبا بكم في لوحة التحكم",
14
+ "expected": "مرحبا بكم في لوحة التحكم",
15
+ "severity": "medium",
16
+ "description": "Ensure punctuation is not added to short standalone phrases."
17
+ },
18
+ {
19
+ "id": "punct_bug_3",
20
+ "category": "structured",
21
+ "input": "الاسم: أحمد، العمر: 30",
22
+ "expected": "الاسم: أحمد، العمر: 30",
23
+ "severity": "medium",
24
+ "description": "Ensure punctuation is not added to structured text fragments."
25
+ },
26
+ {
27
+ "id": "punct_bug_4",
28
+ "category": "grammar_preservation",
29
+ "input": "يذهبون المهندسون الى العمل",
30
+ "expected": "يذهب المهندسون إلى العمل",
31
+ "severity": "high",
32
+ "description": "Grammar fixes should not be overwritten or corrupted by the punctuation model."
33
+ }
34
+ ]