Commit · b62e8ec
1 Parent(s): e909fa9
Hotfix: Revert pronoun-ه guard to the ته-only pattern (the broader stem-validity guard was blocking legitimate ه→ة ta marbuta corrections); add directional blocks for عمله/لسانه/بيته/كتابه
Browse files
- src/app.py +14 -13
src/app.py
CHANGED
|
@@ -829,6 +829,11 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
|
|
| 829 |
'لكن': {'لاكن'}, # correct → misspelling = ALWAYS wrong
|
| 830 |
# Demonstrative: ذلك (correct) ↔ ذالك (common misspelling)
|
| 831 |
'ذلك': {'ذالك'}, # correct → misspelling = ALWAYS wrong
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
}
|
| 833 |
if corr_word in _DIRECTIONAL_BLOCKS.get(orig_word, set()):
|
| 834 |
return 0.0
|
|
@@ -872,19 +877,15 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
|
|
| 872 |
# E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
|
| 873 |
if (orig_word.endswith('ه') and corr_word.endswith('ة')
|
| 874 |
and orig_word[:-1] == corr_word[:-1]):
|
| 875 |
-
# Guard: if ه is a pronoun suffix
|
| 876 |
-
#
|
| 877 |
-
#
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
f"'{orig_word}'→'{corr_word}' "
|
| 885 |
-
f"('{stem_without_h}' is valid IV → ه = 'his/it')"
|
| 886 |
-
)
|
| 887 |
-
return 0.0
|
| 888 |
return 0.9
|
| 889 |
# 2. ة→ه at word end (less common but valid)
|
| 890 |
if (orig_word.endswith('ة') and corr_word.endswith('ه')
|
|
|
|
| 829 |
'لكن': {'لاكن'}, # correct → misspelling = ALWAYS wrong
|
| 830 |
# Demonstrative: ذلك (correct) ↔ ذالك (common misspelling)
|
| 831 |
'ذلك': {'ذالك'}, # correct → misspelling = ALWAYS wrong
|
| 832 |
+
# Pronoun suffix: ه→ة corruption (G037: عمله→عملة)
|
| 833 |
+
'عمله': {'عملة'}, # عمله (his work) → عملة (currency) = WRONG
|
| 834 |
+
'لسانه': {'لسانة'}, # his tongue
|
| 835 |
+
'بيته': {'بيتة'}, # his house
|
| 836 |
+
'كتابه': {'كتابة'}, # his book → writing
|
| 837 |
}
|
| 838 |
if corr_word in _DIRECTIONAL_BLOCKS.get(orig_word, set()):
|
| 839 |
return 0.0
|
|
|
|
| 877 |
# E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
|
| 878 |
if (orig_word.endswith('ه') and corr_word.endswith('ة')
|
| 879 |
and orig_word[:-1] == corr_word[:-1]):
|
| 880 |
+
# Guard: if word ends in ته, the ه is likely a pronoun suffix
|
| 881 |
+
# Pattern: verb+ته = "verb + him/it", NOT ta marbuta.
|
| 882 |
+
# E.g., فتأملته → فتأملتة is WRONG.
|
| 883 |
+
if len(orig_word) >= 3 and orig_word[-2] == 'ت':
|
| 884 |
+
logger.info(
|
| 885 |
+
f"[SPELLING] Blocked ه→ة at pronoun suffix: "
|
| 886 |
+
f"'{orig_word}'→'{corr_word}' (ته pattern = pronoun 'him/it')"
|
| 887 |
+
)
|
| 888 |
+
return 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 889 |
return 0.9
|
| 890 |
# 2. ة→ه at word end (less common but valid)
|
| 891 |
if (orig_word.endswith('ة') and corr_word.endswith('ه')
|