youssefreda9 committed on
Commit
b62e8ec
·
1 Parent(s): e909fa9

Hotfix: Revert pronoun-h guard to ته-only (was breaking ta_marbuta), add directional blocks for عمله/لسانه/بيته/كتابه

Browse files
Files changed (1) hide show
  1. src/app.py +14 -13
src/app.py CHANGED
@@ -829,6 +829,11 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
829
  'لكن': {'لاكن'}, # correct → misspelling = ALWAYS wrong
830
  # Demonstrative: ذلك (correct) ↔ ذالك (common misspelling)
831
  'ذلك': {'ذالك'}, # correct → misspelling = ALWAYS wrong
 
 
 
 
 
832
  }
833
  if corr_word in _DIRECTIONAL_BLOCKS.get(orig_word, set()):
834
  return 0.0
@@ -872,19 +877,15 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
872
  # E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
873
  if (orig_word.endswith('ه') and corr_word.endswith('ة')
874
  and orig_word[:-1] == corr_word[:-1]):
875
- # Guard: if ه is a pronoun suffix, block conversion to ة.
876
- # Heuristic: if removing ه gives a valid standalone word,
877
- # the ه is likely pronoun 'his/it' (عمله = عمل+ه).
878
- # If removing ه gives an invalid word, it's ta-marbuta (مدرسه→مدرسة).
879
- if len(orig_word) >= 3 and vocab_manager:
880
- stem_without_h = orig_word[:-1]
881
- if vocab_manager.is_iv(stem_without_h):
882
- logger.info(
883
- f"[SPELLING] Blocked ه→ة (pronoun suffix): "
884
- f"'{orig_word}'→'{corr_word}' "
885
- f"('{stem_without_h}' is valid IV → ه = 'his/it')"
886
- )
887
- return 0.0
888
  return 0.9
889
  # 2. ة→ه at word end (less common but valid)
890
  if (orig_word.endswith('ة') and corr_word.endswith('ه')
 
829
  'لكن': {'لاكن'}, # correct → misspelling = ALWAYS wrong
830
  # Demonstrative: ذلك (correct) ↔ ذالك (common misspelling)
831
  'ذلك': {'ذالك'}, # correct → misspelling = ALWAYS wrong
832
+ # Pronoun suffix: ه→ة corruption (G037: عمله→عملة)
833
+ 'عمله': {'عملة'}, # عمله (his work) → عملة (currency) = WRONG
834
+ 'لسانه': {'لسانة'}, # his tongue
835
+ 'بيته': {'بيتة'}, # his house
836
+ 'كتابه': {'كتابة'}, # his book → writing
837
  }
838
  if corr_word in _DIRECTIONAL_BLOCKS.get(orig_word, set()):
839
  return 0.0
 
877
  # E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
878
  if (orig_word.endswith('ه') and corr_word.endswith('ة')
879
  and orig_word[:-1] == corr_word[:-1]):
880
+ # Guard: if word ends in ته, the ه is likely a pronoun suffix
881
+ # Pattern: verb+ته = "verb + him/it", NOT ta marbuta.
882
+ # E.g., فتأملته → فتأملتة is WRONG.
883
+ if len(orig_word) >= 3 and orig_word[-2] == 'ت':
884
+ logger.info(
885
+ f"[SPELLING] Blocked ه→ة at pronoun suffix: "
886
+ f"'{orig_word}'→'{corr_word}' (ته pattern = pronoun 'him/it')"
887
+ )
888
+ return 0.0
 
 
 
 
889
  return 0.9
890
  # 2. ة→ه at word end (less common but valid)
891
  if (orig_word.endswith('ة') and corr_word.endswith('ه')