Ethosoft committed on
Commit ·
edec8b7
1
Parent(s): ec3d319
Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper
Browse files- .gitattributes +5 -37
- .gitignore +3 -0
- README.md +117 -240
- hf_benchmark.py +0 -327
- id_to_token_64k.json +0 -0
- nedo_turkish_tokenizer/__init__.py +11 -12
- nedo_turkish_tokenizer/{_acronym_dict.py → _acronym_table.py} +13 -35
- nedo_turkish_tokenizer/_allomorph.py +0 -46
- nedo_turkish_tokenizer/_compound.py +0 -76
- nedo_turkish_tokenizer/_context_aware.py +0 -61
- nedo_turkish_tokenizer/{_medical_vocab.py → _domain_vocab.py} +17 -2
- nedo_turkish_tokenizer/_preprocessor.py +0 -246
- nedo_turkish_tokenizer/_root_validator.py +0 -205
- nedo_turkish_tokenizer/_suffix_expander.py +0 -212
- nedo_turkish_tokenizer/_suffix_table.py +197 -0
- nedo_turkish_tokenizer/_tdk_vocab.py +0 -148
- nedo_turkish_tokenizer/apostrophe.py +138 -0
- nedo_turkish_tokenizer/engine.py +157 -0
- nedo_turkish_tokenizer/morphology.py +161 -0
- nedo_turkish_tokenizer/normalization.py +63 -0
- nedo_turkish_tokenizer/resources.py +107 -0
- nedo_turkish_tokenizer/segmentation.py +475 -0
- nedo_turkish_tokenizer/{_normalizer.py → special_spans.py} +142 -134
- nedo_turkish_tokenizer/tokenizer.py +48 -296
- nedo_turkish_tokenizer/types.py +109 -0
- paper_baseline_check.py +0 -106
- pyproject.toml +6 -10
- special_tokens_map.json +0 -9
- test_lattice.py +0 -72
- tests/test_tdk_vocab.py +0 -31
- tests/test_tokenizer.py +457 -0
- tests/test_zemberek_integration.py +0 -58
- tokenization_nedo_turkish.py +0 -172
- tokenizer_config.json +0 -12
- vocab_64k.json +0 -0
.gitattributes
CHANGED
|
@@ -1,37 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
nedo_turkish_tokenizer/data/zemberek-full.jar filter=lfs diff=lfs merge=lfs -text
|
| 37 |
-
*.jar filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Git LFS tracking rules.
|
| 2 |
+
# Only data files that are genuinely large are tracked.
|
| 3 |
+
# The HuggingFace boilerplate entries for model weights have been
|
| 4 |
+
# removed because this is a standalone tokenizer, not a model repo.
|
| 5 |
+
nedo_turkish_tokenizer/data/tdk_words.txt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
CHANGED
|
@@ -8,3 +8,6 @@ build/
|
|
| 8 |
*.egg
|
| 9 |
.env
|
| 10 |
.venv/
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
*.egg
|
| 9 |
.env
|
| 10 |
.venv/
|
| 11 |
+
.pytest_cache/
|
| 12 |
+
*.whl
|
| 13 |
+
*.tar.gz
|
README.md
CHANGED
|
@@ -1,291 +1,168 @@
|
|
| 1 |
-
---
|
| 2 |
-
language:
|
| 3 |
-
- tr
|
| 4 |
-
- en
|
| 5 |
-
tags:
|
| 6 |
-
- tokenizer
|
| 7 |
-
- morphology
|
| 8 |
-
- turkish
|
| 9 |
-
- nlp
|
| 10 |
-
- transformers
|
| 11 |
-
license: mit
|
| 12 |
-
library_name: nedo-turkish-tokenizer
|
| 13 |
-
pipeline_tag: token-classification
|
| 14 |
-
---
|
| 15 |
-
|
| 16 |
# NedoTurkishTokenizer
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
<p align="center">
|
| 22 |
-
Morphology-aware Turkish tokenization with roots, suffixes, canonical morphemes, compounds, acronyms, foreign-word handling, and context-sensitive analysis.
|
| 23 |
-
</p>
|
| 24 |
-
|
| 25 |
-
<p align="center">
|
| 26 |
-
<strong>TR-MMLU 92.64%</strong> · <strong>Turkish-first</strong> · <strong>Zemberek-powered</strong> · <strong>Transformers-compatible</strong>
|
| 27 |
-
</p>
|
| 28 |
-
|
| 29 |
-
---
|
| 30 |
-
<p align="center">
|
| 31 |
-
<img src="https://cdn-uploads.huggingface.co/production/uploads/684ffdf517ebbc34153de81b/fbFKRHdqH7x1Iz20QlXx7.png" alt="Resim" style="width: 100%; max-width: 1000px;" />
|
| 32 |
-
</p>
|
| 33 |
-
|
| 34 |
-
## Overview
|
| 35 |
-
|
| 36 |
-
**NedoTurkishTokenizer** is a Turkish morphological tokenizer built for people who want more than generic subword splitting.
|
| 37 |
-
|
| 38 |
-
Instead of chopping Turkish into arbitrary BPE fragments, it tokenizes text in a way that reflects the real structure of the language: **roots, suffixes, morphological positions, canonical morphemes, compounds, acronym expansions, foreign roots, and contextual disambiguation**.
|
| 39 |
-
|
| 40 |
-
For a language as morphologically rich as Turkish, that difference is massive.
|
| 41 |
-
|
| 42 |
-
This repository is built to make Turkish tokenization feel **intelligent, interpretable, and linguistically grounded**.
|
| 43 |
-
|
| 44 |
-
---
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
## Why it stands out
|
| 48 |
-
|
| 49 |
-
Most tokenizers are optimized for compression.
|
| 50 |
-
|
| 51 |
-
This one is optimized for **understanding Turkish properly**.
|
| 52 |
-
|
| 53 |
-
NedoTurkishTokenizer is designed to capture the structure that actually matters in Turkish NLP:
|
| 54 |
-
|
| 55 |
-
- **Root + suffix aware tokenization**
|
| 56 |
-
- **Morphological positions inside words**
|
| 57 |
-
- **Canonical suffix normalization**
|
| 58 |
-
- **Foreign word + Turkish suffix detection**
|
| 59 |
-
- **Compound decomposition**
|
| 60 |
-
- **Acronym expansion**
|
| 61 |
-
- **Sentence-level disambiguation**
|
| 62 |
-
- **Domain-aware vocabulary support**
|
| 63 |
-
- **Clean integration with Transformers**
|
| 64 |
-
- **Standalone Python usage for custom NLP pipelines**
|
| 65 |
|
| 66 |
-
This is
|
| 67 |
-
It is a much more linguistically faithful way to represent Turkish text.
|
| 68 |
-
|
| 69 |
-
---
|
| 70 |
-
|
| 71 |
-
## Benchmark
|
| 72 |
-
|
| 73 |
-
| Metric | Score |
|
| 74 |
-
|---|---:|
|
| 75 |
-
| **TR-MMLU** | **92.64%** |
|
| 76 |
-
|
| 77 |
-
**Current repo claim:** world record.
|
| 78 |
-
|
| 79 |
-
---
|
| 80 |
-
|
| 81 |
-
## Model Details
|
| 82 |
-
|
| 83 |
-
| Field | Value |
|
| 84 |
-
|---|---|
|
| 85 |
-
| **Developer** | [Ethosoft](https://huggingface.co/Ethosoft) |
|
| 86 |
-
| **Model** | `Ethosoft/NedoTurkishTokenizer` |
|
| 87 |
-
| **Language** | Turkish (`tr`) |
|
| 88 |
-
| **License** | MIT |
|
| 89 |
-
| **Morphological engine** | `zemberek-python` |
|
| 90 |
-
|
| 91 |
-
---
|
| 92 |
|
| 93 |
## Installation
|
| 94 |
|
| 95 |
```bash
|
| 96 |
-
pip install
|
| 97 |
```
|
| 98 |
|
| 99 |
-
|
| 100 |
|
| 101 |
## Quick Start
|
| 102 |
|
| 103 |
-
### Transformers Usage
|
| 104 |
-
|
| 105 |
```python
|
| 106 |
-
from
|
| 107 |
-
|
| 108 |
-
tok = AutoTokenizer.from_pretrained(
|
| 109 |
-
"Ethosoft/NedoTurkishTokenizer",
|
| 110 |
-
trust_remote_code=True
|
| 111 |
-
)
|
| 112 |
|
| 113 |
-
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
print(
|
|
|
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
```
|
| 122 |
|
| 123 |
-
##
|
|
|
|
|
|
|
| 124 |
|
| 125 |
```python
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
```
|
| 131 |
|
| 132 |
-
#
|
|
|
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
if t.get("_canonical"):
|
| 140 |
-
print(f" [{t['_canonical']}]", end="")
|
| 141 |
-
if t.get("_compound"):
|
| 142 |
-
print(f" compound={t['_parts']}", end="")
|
| 143 |
-
if t.get("_expansion"):
|
| 144 |
-
print(f" -> {t['_expansion']}", end="")
|
| 145 |
-
print()
|
| 146 |
```
|
| 147 |
|
| 148 |
-
|
| 149 |
|
| 150 |
-
|
| 151 |
|
| 152 |
-
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
|
| 156 |
|
| 157 |
-
#
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
#
|
| 163 |
-
results = tok.batch_tokenize(["metin 1", "metin 2", "metin 3"], workers=4)
|
| 164 |
|
| 165 |
-
|
| 166 |
-
s = tok.stats(tokens)
|
| 167 |
-
print(f"TR%: {s['tr_pct']} Pure%: {s['pure_pct']}")
|
| 168 |
-
```
|
| 169 |
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
-
##
|
| 173 |
|
| 174 |
-
**
|
| 175 |
|
| 176 |
-
```
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
| 178 |
```
|
| 179 |
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|---|---|---:|---|
|
| 184 |
-
| `<uppercase_word>` | ROOT | 0 | ALL CAPS marker |
|
| 185 |
-
| ` istanbul` | ROOT | 0 | lowercased normalization |
|
| 186 |
-
| `'` | PUNCT | 0 | fixed boundary |
|
| 187 |
-
| `da` | SUFFIX | 1 | `-LOC` |
|
| 188 |
-
| ` meeting` | FOREIGN | 0 | foreign root |
|
| 189 |
-
| `e` | SUFFIX | 1 | `-DAT` |
|
| 190 |
-
| ` katılmak` | ROOT | 0 | corrected root |
|
| 191 |
-
| `lama` | SUFFIX | 1 | `-VN+NEG` |
|
| 192 |
-
| `d` | SUFFIX | 2 | `-PAST` |
|
| 193 |
-
| `ım` | SUFFIX | 3 | `-1SG` |
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
-
|
| 199 |
|
| 200 |
-
|
| 201 |
|
| 202 |
-
|
| 203 |
|
| 204 |
-
|
|
| 205 |
|---|---|---|
|
| 206 |
-
| `
|
| 207 |
-
| `
|
| 208 |
-
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
-
|
| 211 |
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|---|---|---|
|
| 216 |
-
| `ROOT` | Turkish root word | `kitap`, `gel` |
|
| 217 |
-
| `SUFFIX` | Turkish morphological suffix | `lar`, `da`, `dı` |
|
| 218 |
-
| `FOREIGN` | Foreign or loanword root | `meeting`, `zoom`, `tweet` |
|
| 219 |
-
| `BPE` | Unknown subword fallback | rare / OOV fragments |
|
| 220 |
-
| `PUNCT` | Punctuation | `.`, `,`, `?` |
|
| 221 |
-
| `NUM` | Number | `3.5`, `%85` |
|
| 222 |
-
| `DATE` | Date | `14.03.2026` |
|
| 223 |
-
| `UNIT` | Measurement unit | `km`, `mg`, `TL` |
|
| 224 |
-
| `URL` | Web address | `https://...` |
|
| 225 |
-
| `MENTION` | Username mention | `@ethosoft` |
|
| 226 |
-
| `HASHTAG` | Hashtag | `#NLP` |
|
| 227 |
-
| `EMOJI` | Emoji | `🙂` |
|
| 228 |
-
|
| 229 |
-
---
|
| 230 |
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
| `_canonical` | Canonical morpheme mapping such as `"lar"/"ler" -> "PL"` |
|
| 236 |
-
| `_suffix_label` | Detailed morphological label such as `-PL+ACC`, `-P3+LOC` |
|
| 237 |
-
| `_foreign` | Foreign root detected |
|
| 238 |
-
| `_caps` | Word was originally ALL CAPS |
|
| 239 |
-
| `_domain` | Domain-specific term detected |
|
| 240 |
-
| `_compound` | Compound word detected |
|
| 241 |
-
| `_parts` | Compound parts |
|
| 242 |
-
| `_expansion` | Acronym expansion |
|
| 243 |
-
| `_pos` | POS tag from Zemberek |
|
| 244 |
-
| `_lemma` | Lemma |
|
| 245 |
-
| `_disambiguated` | Context-based disambiguation applied |
|
| 246 |
-
| `_root_corrected` | Root corrected using phonetic and morphological validation |
|
| 247 |
-
|
| 248 |
-
---
|
| 249 |
-
|
| 250 |
-
## How It Works
|
| 251 |
-
|
| 252 |
-
NedoTurkishTokenizer wraps the base `turkish-tokenizer` BPE model and applies **12 sequential morphological fixes** to make tokenization dramatically more faithful to Turkish.
|
| 253 |
-
|
| 254 |
-
| Fix | Problem | Solution |
|
| 255 |
-
|---:|---|---|
|
| 256 |
-
| 1 | `İSTANBUL` becomes many BPE fragments | Lowercase before tokenization, restore uppercase marker |
|
| 257 |
-
| 2 | `meeting'e` breaks badly | Detect foreign base + Turkish suffix and split correctly |
|
| 258 |
-
| 3 | Turkish suffixes appear as generic BPE | Reclassify 260+ suffix patterns as `SUFFIX` |
|
| 259 |
-
| 4 | Wrong roots can appear | Validate and correct roots with Zemberek |
|
| 260 |
-
| 5 | Punctuation gets counted as BPE | Classify punctuation explicitly |
|
| 261 |
-
| 6 | Domain terms fragment unnecessarily | Add domain-aware vocabulary |
|
| 262 |
-
| 7 | Foreign roots are mislabeled | TDK-backed lookup for foreign words |
|
| 263 |
-
| 8 | Numbers, URLs, mentions fragment | Normalize special spans before tokenization |
|
| 264 |
-
| 9 | Allomorphs get separate IDs | Canonicalize morphemes such as `PL`, `ACC`, `DAT` |
|
| 265 |
-
| 10 | Compounds remain opaque | Decompose compound words |
|
| 266 |
-
| 11 | Acronyms lose meaning | Expand known acronyms |
|
| 267 |
-
| 12 | Ambiguous forms stay unresolved | Use sentence-level context disambiguation |
|
| 268 |
-
|
| 269 |
-
---
|
| 270 |
-
|
| 271 |
-
## Why This Matters
|
| 272 |
-
|
| 273 |
-
Turkish is an agglutinative language.
|
| 274 |
-
A tokenizer that ignores morphology ignores a huge part of what makes Turkish meaningful.
|
| 275 |
-
|
| 276 |
-
NedoTurkishTokenizer is built to preserve that structure so Turkish text becomes:
|
| 277 |
-
|
| 278 |
-
- more interpretable
|
| 279 |
-
- more linguistically faithful
|
| 280 |
-
- more useful for analysis
|
| 281 |
-
- more powerful for Turkish NLP pipelines
|
| 282 |
-
|
| 283 |
-
This is why the project stands out.
|
| 284 |
-
It does not simply tokenize Turkish.
|
| 285 |
-
It **represents Turkish in a way that respects the language**.
|
| 286 |
-
|
| 287 |
-
---
|
| 288 |
|
| 289 |
## License
|
| 290 |
|
| 291 |
-
MIT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# NedoTurkishTokenizer
|
| 2 |
|
| 3 |
+
Self-contained Turkish morphological tokenizer.
|
| 4 |
+
**Zero external dependencies** — tokenizes Turkish text into morphologically meaningful units using a candidate-based segmentation engine, a bundled TDK dictionary, and 260+ suffix patterns.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
> This is a standalone tokenizer. It does not wrap `turkish-tokenizer`, `zemberek-python`, `requests`, or `transformers`. There are no hidden fallbacks or optional dependency paths. Install and use immediately.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
## Installation
|
| 9 |
|
| 10 |
```bash
|
| 11 |
+
pip install .
|
| 12 |
```
|
| 13 |
|
| 14 |
+
No additional packages required. Everything is bundled.
|
| 15 |
|
| 16 |
## Quick Start
|
| 17 |
|
|
|
|
|
|
|
| 18 |
```python
|
| 19 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
tok = NedoTurkishTokenizer()
|
| 22 |
|
| 23 |
+
tokens = tok.tokenize("İstanbul'da toplantıya katılamadım")
|
| 24 |
+
for t in tokens:
|
| 25 |
+
print(f"{t['token']:15s} {t['token_type']:10s} pos={t['morph_pos']}")
|
| 26 |
+
```
|
| 27 |
|
| 28 |
+
Output:
|
| 29 |
+
```
|
| 30 |
+
İstanbul ROOT pos=0
|
| 31 |
+
' PUNCT pos=0
|
| 32 |
+
da SUFFIX pos=1
|
| 33 |
+
toplantı ROOT pos=0
|
| 34 |
+
ya SUFFIX pos=1
|
| 35 |
+
katıl ROOT pos=0
|
| 36 |
+
a SUFFIX pos=1
|
| 37 |
+
ma SUFFIX pos=2
|
| 38 |
+
dım SUFFIX pos=3
|
| 39 |
```
|
| 40 |
|
| 41 |
+
## API
|
| 42 |
+
|
| 43 |
+
### `NedoTurkishTokenizer()`
|
| 44 |
|
| 45 |
```python
|
| 46 |
+
tok = NedoTurkishTokenizer()
|
| 47 |
+
|
| 48 |
+
# Single text
|
| 49 |
+
tokens = tok.tokenize("Merhaba dünya")
|
|
|
|
| 50 |
|
| 51 |
+
# Callable shorthand
|
| 52 |
+
tokens = tok("Merhaba dünya")
|
| 53 |
|
| 54 |
+
# Batch (parallel, uses multiprocessing)
|
| 55 |
+
results = tok.batch_tokenize(["text1", "text2", "text3"])
|
| 56 |
|
| 57 |
+
# Statistics
|
| 58 |
+
stats = tok.stats(tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
```
|
| 60 |
|
| 61 |
+
### Token Output Format
|
| 62 |
|
| 63 |
+
Each token is a `dict` with these guaranteed fields:
|
| 64 |
|
| 65 |
+
| Field | Type | Description |
|
| 66 |
+
|---|---|---|
|
| 67 |
+
| `token` | `str` | **Clean token text** — no leading/trailing whitespace. |
|
| 68 |
+
| `token_type` | `str` | One of the types below. |
|
| 69 |
+
| `morph_pos` | `int` | 0 = root/word-initial, 1+ = suffix position. |
|
| 70 |
|
| 71 |
+
**Token text does not encode spacing.** The `token` field contains only the clean surface form. Whether a token starts a new word is indicated by `morph_pos == 0`, not by whitespace in the string.
|
| 72 |
|
| 73 |
+
### Token Types
|
| 74 |
+
|
| 75 |
+
| Type | Description | Example |
|
| 76 |
+
|---|---|---|
|
| 77 |
+
| `ROOT` | Turkish word root | `ev`, `gel`, `kitap` |
|
| 78 |
+
| `SUFFIX` | Morphological suffix | `de`, `ler`, `yor` |
|
| 79 |
+
| `FOREIGN` | Non-Turkish word | `meeting`, `cloud` |
|
| 80 |
+
| `PUNCT` | Punctuation | `.`, `,`, `'` |
|
| 81 |
+
| `NUM` | Number | `42`, `%85`, `3.14` |
|
| 82 |
+
| `DATE` | Date | `14.03.2026` |
|
| 83 |
+
| `UNIT` | Unit | `kg`, `km`, `TL` |
|
| 84 |
+
| `URL` | URL | `https://...` |
|
| 85 |
+
| `MENTION` | Social mention | `@user` |
|
| 86 |
+
| `HASHTAG` | Hashtag | `#topic` |
|
| 87 |
+
| `EMOJI` | Emoji | `😀`, `:)` |
|
| 88 |
+
| `ACRONYM` | Acronym | `NATO`, `TBMM` |
|
| 89 |
|
| 90 |
+
### Optional Metadata Fields
|
|
|
|
| 91 |
|
| 92 |
+
Tokens may include `_`-prefixed metadata fields:
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
| Field | Type | Description |
|
| 95 |
+
|---|---|---|
|
| 96 |
+
| `_suffix_label` | `str` | Morphological label (e.g. `-LOC`, `-PL`, `-PST`) |
|
| 97 |
+
| `_canonical` | `str` | Canonical morpheme (e.g. `LOC`, `PL`, `PAST`) |
|
| 98 |
+
| `_caps` | `bool` | Word was originally ALL CAPS |
|
| 99 |
+
| `_foreign` | `bool` | Word detected as foreign |
|
| 100 |
+
| `_acronym` | `bool` | Token is an acronym |
|
| 101 |
+
| `_expansion` | `str` | Acronym expansion (e.g. `NATO` → `Kuzey Atlantik...`) |
|
| 102 |
+
| `_compound` | `bool` | Root is a compound word |
|
| 103 |
+
| `_parts` | `list[str]` | Compound decomposition |
|
| 104 |
+
| `_apo_suffix` | `bool` | Suffix follows an apostrophe |
|
| 105 |
+
| `_domain` | `bool` | Root from domain vocabulary |
|
| 106 |
|
| 107 |
+
## Architecture
|
| 108 |
|
| 109 |
+
The tokenizer uses a **candidate-based segmentation** pipeline:
|
| 110 |
|
| 111 |
+
```
|
| 112 |
+
Text → Normalize → Special Spans → Word Split → Per-Word Segmentation → Annotate → Strip
|
| 113 |
+
│
|
| 114 |
+
Generate Candidates
|
| 115 |
+
Score & Select Best
|
| 116 |
```
|
| 117 |
|
| 118 |
+
For each word, the engine:
|
| 119 |
+
1. Generates 2–5 segmentation candidates (whole ROOT, suffix chains, foreign)
|
| 120 |
+
2. Scores each candidate deterministically (TDK validation, root length, suffix recognition)
|
| 121 |
+
3. Selects the highest-scoring segmentation
|
| 122 |
+
4. Strips internal whitespace markers from the output
|
| 123 |
|
| 124 |
+
### Scoring Rules
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
+
| Factor | Score |
|
| 127 |
+
|---|---|
|
| 128 |
+
| Root in TDK dictionary | +10 |
|
| 129 |
+
| Whole word in TDK (unsplit) | +5 bonus |
|
| 130 |
+
| Root in domain vocabulary | +8 |
|
| 131 |
+
| Root length | +2 per character |
|
| 132 |
+
| Each recognised suffix | +2 |
|
| 133 |
+
| Short root penalty (≤2 chars) | −4 |
|
| 134 |
+
| Foreign root (fallback) | +3 base |
|
| 135 |
+
| Unknown root | +1 base |
|
| 136 |
|
| 137 |
+
### Known-Intact Words
|
| 138 |
|
| 139 |
+
Words in a curated set of common Turkish forms (inflected forms of `demek` and `yemek`, plus discourse particles) bypass candidate generation entirely and are always kept whole. This prevents false splits such as `dedi` → `de` + `di`, where the root `de` is a valid TDK conjunction.
|
| 140 |
|
| 141 |
+
### Bundled Resources
|
| 142 |
|
| 143 |
+
| Resource | Size | Purpose |
|
| 144 |
|---|---|---|
|
| 145 |
+
| `tdk_words.txt` | ~746 KB | TDK dictionary (64K+ lemmas + derived verb stems) |
|
| 146 |
+
| `turkish_proper_nouns.txt` | ~1 KB | Proper nouns (cities, regions, names) |
|
| 147 |
+
| Suffix table | 260+ entries | Turkish suffix patterns with morphological labels |
|
| 148 |
+
| Acronym table | 80+ entries | Acronym → Turkish expansion mappings |
|
| 149 |
+
| Domain vocabulary | 200+ entries | Medical, sports, tourism terms |
|
| 150 |
|
| 151 |
+
## Known Limitations
|
| 152 |
|
| 153 |
+
- **Not a full morphological analyzer.** This is a heuristic segmenter, not a Zemberek/TRMorph replacement. Morphological labels may be incorrect for ambiguous suffixes (e.g. "in" can be the genitive `-GEN` or the second-person singular possessive).
|
| 154 |
+
- **No disambiguation.** The tokenizer does not use sentence-level context to resolve ambiguous words (e.g. "gelir" as the noun "income" vs. the aorist verb form "comes").
|
| 155 |
+
- **Verb stem derivation is simple.** Only `-mak`/`-mek` infinitive stripping (for stems ≥3 characters) is used; vowel harmony alternations in stems are not modelled.
|
| 156 |
+
- **TDK dictionary coverage.** The bundled TDK list has ~64K entries. Words absent from TDK default to ROOT (unknown) or FOREIGN.
|
| 157 |
+
- **Not backward-compatible with v1.x.** The old wrapper relied on `turkish-tokenizer` BPE and `zemberek-python`. Output format and token boundaries will differ.
|
| 158 |
|
| 159 |
+
## Running Tests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
+
```bash
|
| 162 |
+
pip install -e ".[dev]"
|
| 163 |
+
pytest tests/ -v
|
| 164 |
+
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
## License
|
| 167 |
|
| 168 |
+
MIT
|
hf_benchmark.py
DELETED
|
@@ -1,327 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
hf_benchmark.py
|
| 3 |
-
---------------
|
| 4 |
-
NedoTurkishTokenizer'ı TR-MMLU üzerinde benchmark eder.
|
| 5 |
-
HuggingFace'den hem tokenizer hem dataset'i çeker.
|
| 6 |
-
|
| 7 |
-
Kullanım:
|
| 8 |
-
cd NedoTurkishTokenizer/
|
| 9 |
-
pip install huggingface_hub datasets
|
| 10 |
-
python hf_benchmark.py
|
| 11 |
-
|
| 12 |
-
# Sadece 2000 örnek (hızlı test):
|
| 13 |
-
python hf_benchmark.py --samples 2000
|
| 14 |
-
|
| 15 |
-
# Belirli kategori:
|
| 16 |
-
python hf_benchmark.py --category TUS
|
| 17 |
-
"""
|
| 18 |
-
|
| 19 |
-
import argparse
|
| 20 |
-
import json
|
| 21 |
-
import os
|
| 22 |
-
import sys
|
| 23 |
-
import csv
|
| 24 |
-
from pathlib import Path
|
| 25 |
-
|
| 26 |
-
# ── HuggingFace token ─────────────────────────────────────────────────────────
|
| 27 |
-
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 28 |
-
|
| 29 |
-
# ── Argümanlar ────────────────────────────────────────────────────────────────
|
| 30 |
-
parser = argparse.ArgumentParser()
|
| 31 |
-
parser.add_argument("--samples", default="all", help="Kaç örnek (all veya sayı)")
|
| 32 |
-
parser.add_argument("--category", default=None, help="Kategori filtresi (ör. TUS)")
|
| 33 |
-
parser.add_argument("--out", default=".", help="Çıktı klasörü")
|
| 34 |
-
args = parser.parse_args()
|
| 35 |
-
|
| 36 |
-
OUT = Path(args.out)
|
| 37 |
-
OUT.mkdir(parents=True, exist_ok=True)
|
| 38 |
-
|
| 39 |
-
# ── HuggingFace login ─────────────────────────────────────────────────────────
|
| 40 |
-
from huggingface_hub import login
|
| 41 |
-
if HF_TOKEN:
|
| 42 |
-
login(token=HF_TOKEN, add_to_git_credential=False)
|
| 43 |
-
else:
|
| 44 |
-
print("HF_TOKEN not set; using existing Hugging Face login state if available.")
|
| 45 |
-
print("HuggingFace login ✓")
|
| 46 |
-
|
| 47 |
-
# ── Tokenizer yükle ───────────────────────────────────────────────────────────
|
| 48 |
-
print("NedoTurkishTokenizer yükleniyor...")
|
| 49 |
-
sys.path.insert(0, str(Path(__file__).parent))
|
| 50 |
-
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 51 |
-
tok = NedoTurkishTokenizer()
|
| 52 |
-
print(f" Zemberek: {'✓' if tok.zemberek_available else '✗ (devre dışı)'}")
|
| 53 |
-
|
| 54 |
-
# ── Dataset yükle ─────────────────────────────────────────────────────────────
|
| 55 |
-
print("TR-MMLU dataset yükleniyor...")
|
| 56 |
-
from datasets import load_dataset
|
| 57 |
-
ds = load_dataset(
|
| 58 |
-
"alibayram/turkish_mmlu",
|
| 59 |
-
split="test",
|
| 60 |
-
token=HF_TOKEN,
|
| 61 |
-
)
|
| 62 |
-
print(f" {len(ds)} örnek")
|
| 63 |
-
|
| 64 |
-
# ── Kategori filtresi ─────────────────────────────────────────────────────────
|
| 65 |
-
samples = list(ds)
|
| 66 |
-
if args.category:
|
| 67 |
-
samples = [r for r in samples
|
| 68 |
-
if str(r.get("bolum","")).strip() == args.category]
|
| 69 |
-
print(f" Kategori '{args.category}': {len(samples)} örnek")
|
| 70 |
-
|
| 71 |
-
if args.samples != "all":
|
| 72 |
-
n = int(args.samples)
|
| 73 |
-
samples = samples[:n]
|
| 74 |
-
print(f" Kısıtlı: {len(samples)} örnek")
|
| 75 |
-
|
| 76 |
-
# ── Metin alanlarını birleştir ─────────────────────────────────────────────────
|
| 77 |
-
def get_text(row: dict) -> str:
|
| 78 |
-
parts = []
|
| 79 |
-
for field in ["soru", "question"]:
|
| 80 |
-
if row.get(field):
|
| 81 |
-
parts.append(str(row[field]))
|
| 82 |
-
break
|
| 83 |
-
choices = row.get("secenekler") or []
|
| 84 |
-
if isinstance(choices, list):
|
| 85 |
-
parts.extend(str(c) for c in choices)
|
| 86 |
-
for lbl in ["A","B","C","D"]:
|
| 87 |
-
if row.get(lbl):
|
| 88 |
-
parts.append(str(row[lbl]))
|
| 89 |
-
if row.get("aciklama"):
|
| 90 |
-
parts.append(str(row["aciklama"]))
|
| 91 |
-
return " ".join(parts)
|
| 92 |
-
|
| 93 |
-
# ── Token istatistikleri ──────────────────────────────────────────────────────
|
| 94 |
-
def token_stats(tokens: list[dict]) -> dict:
|
| 95 |
-
"""NedoTurkishTokenizer'ın token_type alanını kullan."""
|
| 96 |
-
total = len(tokens)
|
| 97 |
-
if total == 0:
|
| 98 |
-
return {"total":0,"bpe":0,"tr_pct":0.0,"pure_pct":0.0,"orig_tr_pct":0.0}
|
| 99 |
-
|
| 100 |
-
# Enhanced stats
|
| 101 |
-
roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
|
| 102 |
-
suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
|
| 103 |
-
foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
|
| 104 |
-
punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
|
| 105 |
-
bpe = sum(1 for t in tokens if t["token_type"] == "BPE")
|
| 106 |
-
special = sum(1 for t in tokens
|
| 107 |
-
if t["token_type"] in ("NUM","DATE","UNIT","URL",
|
| 108 |
-
"MENTION","HASHTAG","EMOJI","ACRONYM"))
|
| 109 |
-
tr = roots + suffixes + foreign + punct + special
|
| 110 |
-
pure = sum(1 for t in tokens
|
| 111 |
-
if t["token_type"] in ("ROOT","SUFFIX","FOREIGN")
|
| 112 |
-
and not t["token"].strip().startswith("<"))
|
| 113 |
-
return {
|
| 114 |
-
"total": total,
|
| 115 |
-
"roots": roots,
|
| 116 |
-
"suffixes": suffixes,
|
| 117 |
-
"foreign": foreign,
|
| 118 |
-
"bpe": bpe,
|
| 119 |
-
"punct": punct,
|
| 120 |
-
"special": special,
|
| 121 |
-
"tr_pct": round(tr / total * 100, 4),
|
| 122 |
-
"pure_pct": round(pure / total * 100, 4),
|
| 123 |
-
}
|
| 124 |
-
|
| 125 |
-
def orig_stats(tokens: list[dict]) -> dict:
|
| 126 |
-
"""Orijinal tokenizer istatistikleri (karşılaştırma için)."""
|
| 127 |
-
total = len(tokens)
|
| 128 |
-
if total == 0:
|
| 129 |
-
return {"total":0,"bpe":0,"tr_pct":0.0,"pure_pct":0.0}
|
| 130 |
-
roots = sum(1 for t in tokens if t.get("type") == "ROOT")
|
| 131 |
-
suffixes = sum(1 for t in tokens if t.get("type") == "SUFFIX")
|
| 132 |
-
bpe = sum(1 for t in tokens if t.get("type") == "BPE")
|
| 133 |
-
tr = roots + suffixes
|
| 134 |
-
pure = sum(1 for t in tokens
|
| 135 |
-
if t.get("type") in ("ROOT","SUFFIX")
|
| 136 |
-
and not t.get("token","").strip().startswith("<"))
|
| 137 |
-
return {
|
| 138 |
-
"total": total,
|
| 139 |
-
"bpe": bpe,
|
| 140 |
-
"tr_pct": round(tr / total * 100, 4),
|
| 141 |
-
"pure_pct": round(pure / total * 100, 4),
|
| 142 |
-
}
|
| 143 |
-
|
| 144 |
-
# ── Ana benchmark döngüsü ─────────────────────────────────────────────────────
|
| 145 |
-
print(f"\nBenchmark başlıyor: {len(samples)} örnek...")
|
| 146 |
-
|
| 147 |
-
per_sample = []
|
| 148 |
-
orig_tr_sum = 0.0
|
| 149 |
-
enh_tr_sum = 0.0
|
| 150 |
-
orig_pur_sum = 0.0
|
| 151 |
-
enh_pur_sum = 0.0
|
| 152 |
-
orig_tok_sum = 0
|
| 153 |
-
enh_tok_sum = 0
|
| 154 |
-
orig_bpe_sum = 0
|
| 155 |
-
enh_bpe_sum = 0
|
| 156 |
-
improved = 0
|
| 157 |
-
regressed = 0
|
| 158 |
-
unchanged = 0
|
| 159 |
-
|
| 160 |
-
REPORT_EVERY = 500
|
| 161 |
-
|
| 162 |
-
for idx, row in enumerate(samples):
|
| 163 |
-
text = get_text(row)
|
| 164 |
-
if not text.strip():
|
| 165 |
-
continue
|
| 166 |
-
|
| 167 |
-
# Orijinal tokenizer
|
| 168 |
-
orig_toks = tok._base.tokenize_text(text)
|
| 169 |
-
os_ = orig_stats(orig_toks)
|
| 170 |
-
|
| 171 |
-
# NedoTurkishTokenizer
|
| 172 |
-
enh_toks = tok.tokenize(text)
|
| 173 |
-
es_ = token_stats(enh_toks)
|
| 174 |
-
|
| 175 |
-
d_tr = round(es_["tr_pct"] - os_["tr_pct"], 4)
|
| 176 |
-
|
| 177 |
-
per_sample.append({
|
| 178 |
-
"idx": idx,
|
| 179 |
-
"bolum": str(row.get("bolum","")),
|
| 180 |
-
"orig_tr": os_["tr_pct"],
|
| 181 |
-
"enh_tr": es_["tr_pct"],
|
| 182 |
-
"d_tr": d_tr,
|
| 183 |
-
"orig_pure": os_["pure_pct"],
|
| 184 |
-
"enh_pure": es_["pure_pct"],
|
| 185 |
-
"orig_tok": os_["total"],
|
| 186 |
-
"enh_tok": es_["total"],
|
| 187 |
-
"orig_bpe": os_["bpe"],
|
| 188 |
-
"enh_bpe": es_["bpe"],
|
| 189 |
-
})
|
| 190 |
-
|
| 191 |
-
orig_tr_sum += os_["tr_pct"]
|
| 192 |
-
enh_tr_sum += es_["tr_pct"]
|
| 193 |
-
orig_pur_sum += os_["pure_pct"]
|
| 194 |
-
enh_pur_sum += es_["pure_pct"]
|
| 195 |
-
orig_tok_sum += os_["total"]
|
| 196 |
-
enh_tok_sum += es_["total"]
|
| 197 |
-
orig_bpe_sum += os_["bpe"]
|
| 198 |
-
enh_bpe_sum += es_["bpe"]
|
| 199 |
-
|
| 200 |
-
if d_tr > 0: improved += 1
|
| 201 |
-
elif d_tr < 0: regressed += 1
|
| 202 |
-
else: unchanged += 1
|
| 203 |
-
|
| 204 |
-
if (idx + 1) % REPORT_EVERY == 0:
|
| 205 |
-
n = idx + 1
|
| 206 |
-
print(f" [{n:>6}/{len(samples)}] "
|
| 207 |
-
f"TR%: {enh_tr_sum/n:.2f}% "
|
| 208 |
-
f"BPE/örnek: {enh_bpe_sum/n:.2f} "
|
| 209 |
-
f"Regressed: {regressed}")
|
| 210 |
-
|
| 211 |
-
n = len(per_sample)
|
| 212 |
-
if n == 0:
|
| 213 |
-
print("Hiç örnek işlenmedi!")
|
| 214 |
-
sys.exit(1)
|
| 215 |
-
|
| 216 |
-
# ── Özet ─────────────────────────────────────────────────────────────────────
|
| 217 |
-
summary = {
|
| 218 |
-
"n_samples": n,
|
| 219 |
-
"orig_tr_pct": round(orig_tr_sum / n, 4),
|
| 220 |
-
"orig_pure_pct": round(orig_pur_sum / n, 4),
|
| 221 |
-
"enh_tr_pct": round(enh_tr_sum / n, 4),
|
| 222 |
-
"enh_pure_pct": round(enh_pur_sum / n, 4),
|
| 223 |
-
"delta_tr_pct": round((enh_tr_sum - orig_tr_sum) / n, 4),
|
| 224 |
-
"delta_pure_pct": round((enh_pur_sum - orig_pur_sum) / n, 4),
|
| 225 |
-
"orig_avg_tokens": round(orig_tok_sum / n, 2),
|
| 226 |
-
"enh_avg_tokens": round(enh_tok_sum / n, 2),
|
| 227 |
-
"orig_avg_bpe": round(orig_bpe_sum / n, 2),
|
| 228 |
-
"enh_avg_bpe": round(enh_bpe_sum / n, 2),
|
| 229 |
-
"pct_improved": round(improved / n * 100, 2),
|
| 230 |
-
"pct_regressed": round(regressed / n * 100, 2),
|
| 231 |
-
"pct_unchanged": round(unchanged / n * 100, 2),
|
| 232 |
-
}
|
| 233 |
-
|
| 234 |
-
# ── Kategori bazında ──────────────────────────────────────────────────────────
|
| 235 |
-
from collections import defaultdict
|
| 236 |
-
cat_scores = defaultdict(list)
|
| 237 |
-
for row in per_sample:
|
| 238 |
-
cat_scores[row["bolum"]].append(row["enh_tr"])
|
| 239 |
-
|
| 240 |
-
cat_summary = {
|
| 241 |
-
cat: round(sum(v)/len(v), 2)
|
| 242 |
-
for cat, v in cat_scores.items()
|
| 243 |
-
if len(v) >= 3
|
| 244 |
-
}
|
| 245 |
-
cat_sorted = sorted(cat_summary.items(), key=lambda x: x[1])
|
| 246 |
-
|
| 247 |
-
# ── Rapor yazdır ─────────────────────────────────────────────────────────────
|
| 248 |
-
SEP = "═" * 65
|
| 249 |
-
print(f"\n{SEP}")
|
| 250 |
-
print(" NedoTurkishTokenizer — TR-MMLU Benchmark Sonuçları")
|
| 251 |
-
print(SEP)
|
| 252 |
-
print(f" N = {n:,} örnek\n")
|
| 253 |
-
print(f" {'Metrik':30s} {'Orijinal':>10} {'Enhanced':>10} {'Δ':>8}")
|
| 254 |
-
print(" " + "─"*55)
|
| 255 |
-
for label, orig, enh, delta in [
|
| 256 |
-
("TR%", summary["orig_tr_pct"], summary["enh_tr_pct"], summary["delta_tr_pct"]),
|
| 257 |
-
("Pure%", summary["orig_pure_pct"], summary["enh_pure_pct"], summary["delta_pure_pct"]),
|
| 258 |
-
("Avg token/örn",summary["orig_avg_tokens"],summary["enh_avg_tokens"],
|
| 259 |
-
round(summary["enh_avg_tokens"]-summary["orig_avg_tokens"],2)),
|
| 260 |
-
("Avg BPE/örn", summary["orig_avg_bpe"], summary["enh_avg_bpe"],
|
| 261 |
-
round(summary["enh_avg_bpe"]-summary["orig_avg_bpe"],2)),
|
| 262 |
-
]:
|
| 263 |
-
print(f" {label:30s} {orig:>10.2f} {enh:>10.2f} {delta:>+8.2f}")
|
| 264 |
-
|
| 265 |
-
print(f"\n İyileşen : {improved:>6,} (%{summary['pct_improved']:.2f})")
|
| 266 |
-
print(f" Gerileyen: {regressed:>6,} (%{summary['pct_regressed']:.2f})")
|
| 267 |
-
print(f" Değişmeyen:{unchanged:>6,} (%{summary['pct_unchanged']:.2f})")
|
| 268 |
-
|
| 269 |
-
print(f"\n En düşük TR% kategoriler:")
|
| 270 |
-
for cat, avg in cat_sorted[:10]:
|
| 271 |
-
n_cat = len(cat_scores[cat])
|
| 272 |
-
print(f" {cat:<35} {avg:>6.2f}% (n={n_cat})")
|
| 273 |
-
|
| 274 |
-
print(f"\n En yüksek TR% kategoriler:")
|
| 275 |
-
for cat, avg in cat_sorted[-8:]:
|
| 276 |
-
n_cat = len(cat_scores[cat])
|
| 277 |
-
print(f" {cat:<35} {avg:>6.2f}% (n={n_cat})")
|
| 278 |
-
|
| 279 |
-
print(SEP)
|
| 280 |
-
|
| 281 |
-
# ── Dosyalara yaz ─────────────────────────────────────────────────────────────
|
| 282 |
-
# Summary JSON
|
| 283 |
-
summary_path = OUT / "tr_mmlu_summary.json"
|
| 284 |
-
with open(summary_path, "w", encoding="utf-8") as f:
|
| 285 |
-
json.dump(summary, f, ensure_ascii=False, indent=2)
|
| 286 |
-
print(f"\n ✓ {summary_path}")
|
| 287 |
-
|
| 288 |
-
# Report Markdown
|
| 289 |
-
paper_tr = 90.29
|
| 290 |
-
paper_pur = 85.80
|
| 291 |
-
report_path = OUT / "tr_mmlu_report.md"
|
| 292 |
-
with open(report_path, "w", encoding="utf-8") as f:
|
| 293 |
-
f.write("# TR-MMLU Benchmark — NedoTurkishTokenizer\n\n")
|
| 294 |
-
f.write(f"**N = {n:,} örnek**\n\n")
|
| 295 |
-
f.write("## Ana Metrikler\n\n")
|
| 296 |
-
f.write("| Metrik | Orijinal | Enhanced | Δ |\n")
|
| 297 |
-
f.write("|--------|----------|----------|---|\n")
|
| 298 |
-
f.write(f"| TR% | {summary['orig_tr_pct']:.2f}% | {summary['enh_tr_pct']:.2f}% | {summary['delta_tr_pct']:+.2f}% |\n")
|
| 299 |
-
f.write(f"| Pure% | {summary['orig_pure_pct']:.2f}% | {summary['enh_pure_pct']:.2f}% | {summary['delta_pure_pct']:+.2f}% |\n")
|
| 300 |
-
f.write(f"| Avg token/örnek | {summary['orig_avg_tokens']:.2f} | {summary['enh_avg_tokens']:.2f} | {summary['enh_avg_tokens']-summary['orig_avg_tokens']:+.2f} |\n")
|
| 301 |
-
f.write(f"| Avg BPE/örnek | {summary['orig_avg_bpe']:.2f} | {summary['enh_avg_bpe']:.2f} | {summary['enh_avg_bpe']-summary['orig_avg_bpe']:+.2f} |\n")
|
| 302 |
-
f.write("\n## Paper ile Karşılaştırma\n\n")
|
| 303 |
-
f.write("| Metrik | Paper (orijinal) | Bizim (orijinal) | NedoTurkishTokenizer |\n")
|
| 304 |
-
f.write("|--------|-----------------|-----------------|---------------------|\n")
|
| 305 |
-
f.write(f"| TR% | {paper_tr}% | {summary['orig_tr_pct']:.2f}% | **{summary['enh_tr_pct']:.2f}%** |\n")
|
| 306 |
-
f.write(f"| Pure% | {paper_pur}% | {summary['orig_pure_pct']:.2f}% | **{summary['enh_pure_pct']:.2f}%** |\n")
|
| 307 |
-
f.write("\n## Örnek Dağılımı\n\n")
|
| 308 |
-
f.write(f"- İyileşen: {improved:,} (%{summary['pct_improved']:.2f})\n")
|
| 309 |
-
f.write(f"- Gerileyen: {regressed:,} (%{summary['pct_regressed']:.2f})\n")
|
| 310 |
-
f.write(f"- Değişmeyen: {unchanged:,} (%{summary['pct_unchanged']:.2f})\n")
|
| 311 |
-
f.write("\n## Kategori Bazında TR%\n\n")
|
| 312 |
-
f.write("| Kategori | TR% | N |\n")
|
| 313 |
-
f.write("|----------|-----|---|\n")
|
| 314 |
-
for cat, avg in cat_sorted:
|
| 315 |
-
n_cat = len(cat_scores[cat])
|
| 316 |
-
f.write(f"| {cat} | {avg:.2f}% | {n_cat} |\n")
|
| 317 |
-
print(f" ✓ {report_path}")
|
| 318 |
-
|
| 319 |
-
# Per-sample CSV
|
| 320 |
-
csv_path = OUT / "tr_mmlu_per_sample.csv"
|
| 321 |
-
with open(csv_path, "w", newline="", encoding="utf-8") as f:
|
| 322 |
-
writer = csv.DictWriter(f, fieldnames=per_sample[0].keys())
|
| 323 |
-
writer.writeheader()
|
| 324 |
-
writer.writerows(per_sample)
|
| 325 |
-
print(f" ✓ {csv_path}")
|
| 326 |
-
|
| 327 |
-
print("\nTamamlandı.\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
id_to_token_64k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nedo_turkish_tokenizer/__init__.py
CHANGED
|
@@ -1,21 +1,20 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
Usage:
|
| 6 |
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 7 |
|
| 8 |
tok = NedoTurkishTokenizer()
|
| 9 |
-
tokens = tok("İstanbul'da meeting'e katılamadım")
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# token : str — token string (with leading space if word-initial)
|
| 13 |
-
# token_type : str — ROOT | SUFFIX | FOREIGN | BPE | PUNCT |
|
| 14 |
-
# NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
|
| 15 |
-
# morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second...
|
| 16 |
"""
|
| 17 |
|
| 18 |
from .tokenizer import NedoTurkishTokenizer
|
| 19 |
|
| 20 |
__all__ = ["NedoTurkishTokenizer"]
|
| 21 |
-
__version__ = "
|
|
|
|
| 1 |
+
"""NedoTurkishTokenizer — self-contained Turkish morphological tokenizer.
|
| 2 |
+
|
| 3 |
+
Zero external dependencies. Segments Turkish text into morphologically
|
| 4 |
+
meaningful tokens using a candidate-based segmentation engine with a
|
| 5 |
+
bundled TDK dictionary, suffix heuristics, and domain-aware vocabulary.
|
| 6 |
+
|
| 7 |
+
Usage::
|
| 8 |
|
|
|
|
| 9 |
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 10 |
|
| 11 |
tok = NedoTurkishTokenizer()
|
| 12 |
+
tokens = tok.tokenize("İstanbul'da meeting'e katılamadım")
|
| 13 |
+
for t in tokens:
|
| 14 |
+
print(t["token"], t["token_type"], t["morph_pos"])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
"""
|
| 16 |
|
| 17 |
from .tokenizer import NedoTurkishTokenizer
|
| 18 |
|
| 19 |
__all__ = ["NedoTurkishTokenizer"]
|
| 20 |
+
__version__ = "2.0.0"
|
nedo_turkish_tokenizer/{_acronym_dict.py → _acronym_table.py}
RENAMED
|
@@ -1,9 +1,14 @@
|
|
| 1 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
ACRONYM_EXPANSIONS: dict[str, str] = {
|
| 6 |
-
# International organizations
|
| 7 |
"NATO": "Kuzey Atlantik Antlaşması Örgütü",
|
| 8 |
"UN": "Birleşmiş Milletler",
|
| 9 |
"UNESCO": "BM Eğitim, Bilim ve Kültür Örgütü",
|
|
@@ -16,7 +21,7 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
|
|
| 16 |
"FIFA": "Uluslararası Futbol Federasyonları Birliği",
|
| 17 |
"IOC": "Uluslararası Olimpiyat Komitesi",
|
| 18 |
"UEFA": "Avrupa Futbol Birliği",
|
| 19 |
-
# Turkish institutions
|
| 20 |
"TBMM": "Türkiye Büyük Millet Meclisi",
|
| 21 |
"MEB": "Milli Eğitim Bakanlığı",
|
| 22 |
"TDK": "Türk Dil Kurumu",
|
|
@@ -32,12 +37,12 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
|
|
| 32 |
"TÜİK": "Türkiye İstatistik Kurumu",
|
| 33 |
"TÜBİTAK": "Türkiye Bilimsel ve Teknolojik Araştırma Kurumu",
|
| 34 |
"ASELSAN": "Askeri Elektronik Sanayii",
|
| 35 |
-
# Turkish exams
|
| 36 |
"TUS": "Tıpta Uzmanlık Sınavı",
|
| 37 |
"DUS": "Diş Hekimliğinde Uzmanlık Sınavı",
|
| 38 |
"YDUS": "Yabancı Dil Uzmanlık Sınavı",
|
| 39 |
"KPSS": "Kamu Personeli Seçme Sınavı",
|
| 40 |
-
# Medical
|
| 41 |
"CMV": "Sitomegalovirüs", "EBV": "Epstein-Barr Virüsü",
|
| 42 |
"VZV": "Varisella-Zoster Virüsü", "HHV": "İnsan Herpes Virüsü",
|
| 43 |
"HSV": "Herpes Simplex Virüsü", "HIV": "İnsan İmmün Yetmezlik Virüsü",
|
|
@@ -61,7 +66,7 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
|
|
| 61 |
"SMMM": "Serbest Muhasebeci Mali Müşavir",
|
| 62 |
"YMM": "Yeminli Mali Müşavir",
|
| 63 |
"SM": "Serbest Muhasebeci",
|
| 64 |
-
# Technology
|
| 65 |
"AI": "Yapay Zeka", "ML": "Makine Öğrenmesi",
|
| 66 |
"LLM": "Büyük Dil Modeli", "NLP": "Doğal Dil İşleme",
|
| 67 |
"API": "Uygulama Programlama Arayüzü",
|
|
@@ -73,37 +78,10 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
|
|
| 73 |
"OS": "İşletim Sistemi",
|
| 74 |
"BERT": "Çift Yönlü Kodlayıcı Temsiller",
|
| 75 |
"GPT": "Üretici Önceden Eğitilmiş Dönüştürücü",
|
| 76 |
-
# Economics
|
| 77 |
"OPEC": "Petrol İhraç Eden Ülkeler Örgütü",
|
| 78 |
"NAFTA": "Kuzey Amerika Serbest Ticaret Anlaşması",
|
| 79 |
-
# Sports
|
| 80 |
"NBA": "Ulusal Basketbol Birliği",
|
| 81 |
"NFL": "Ulusal Futbol Ligi",
|
| 82 |
}
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
|
| 86 |
-
"""Add ``_expansion`` to known acronyms; promote CAPS ROOTs to ACRONYM."""
|
| 87 |
-
result: list[dict] = []
|
| 88 |
-
for tok in tokens:
|
| 89 |
-
token_upper = tok["token"].strip().upper()
|
| 90 |
-
expansion = ACRONYM_EXPANSIONS.get(token_upper)
|
| 91 |
-
|
| 92 |
-
if tok["type"] == "ACRONYM":
|
| 93 |
-
# Already typed as ACRONYM by span detection — add expansion
|
| 94 |
-
if expansion:
|
| 95 |
-
result.append({**tok, "_expansion": expansion, "_known_acronym": True})
|
| 96 |
-
else:
|
| 97 |
-
result.append(tok)
|
| 98 |
-
elif tok["type"] == "ROOT" and (tok.get("_acronym") or tok.get("_caps")):
|
| 99 |
-
# ALL CAPS ROOT that's in the acronym dict → promote to ACRONYM
|
| 100 |
-
if expansion:
|
| 101 |
-
result.append({
|
| 102 |
-
**tok, "type": "ACRONYM",
|
| 103 |
-
"_expansion": expansion, "_known_acronym": True,
|
| 104 |
-
})
|
| 105 |
-
else:
|
| 106 |
-
result.append(tok)
|
| 107 |
-
else:
|
| 108 |
-
result.append(tok)
|
| 109 |
-
return result
|
|
|
|
| 1 |
+
"""Acronym / abbreviation expansion dictionary.
|
| 2 |
+
|
| 3 |
+
Maps uppercase acronyms to their Turkish expansions. Used for:
|
| 4 |
+
- Acronym detection (is an ALL CAPS word a known acronym?)
|
| 5 |
+
- Expansion metadata (``_expansion`` field on ACRONYM tokens)
|
| 6 |
+
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
ACRONYM_EXPANSIONS: dict[str, str] = {
|
| 11 |
+
# ── International organizations ──────────────────────────────────────
|
| 12 |
"NATO": "Kuzey Atlantik Antlaşması Örgütü",
|
| 13 |
"UN": "Birleşmiş Milletler",
|
| 14 |
"UNESCO": "BM Eğitim, Bilim ve Kültür Örgütü",
|
|
|
|
| 21 |
"FIFA": "Uluslararası Futbol Federasyonları Birliği",
|
| 22 |
"IOC": "Uluslararası Olimpiyat Komitesi",
|
| 23 |
"UEFA": "Avrupa Futbol Birliği",
|
| 24 |
+
# ── Turkish institutions ─────────────────────────────────────────────
|
| 25 |
"TBMM": "Türkiye Büyük Millet Meclisi",
|
| 26 |
"MEB": "Milli Eğitim Bakanlığı",
|
| 27 |
"TDK": "Türk Dil Kurumu",
|
|
|
|
| 37 |
"TÜİK": "Türkiye İstatistik Kurumu",
|
| 38 |
"TÜBİTAK": "Türkiye Bilimsel ve Teknolojik Araştırma Kurumu",
|
| 39 |
"ASELSAN": "Askeri Elektronik Sanayii",
|
| 40 |
+
# ── Turkish exams ────────────────────────────────────────────────────
|
| 41 |
"TUS": "Tıpta Uzmanlık Sınavı",
|
| 42 |
"DUS": "Diş Hekimliğinde Uzmanlık Sınavı",
|
| 43 |
"YDUS": "Yabancı Dil Uzmanlık Sınavı",
|
| 44 |
"KPSS": "Kamu Personeli Seçme Sınavı",
|
| 45 |
+
# ── Medical ──────────────────────────────────────────────────────────
|
| 46 |
"CMV": "Sitomegalovirüs", "EBV": "Epstein-Barr Virüsü",
|
| 47 |
"VZV": "Varisella-Zoster Virüsü", "HHV": "İnsan Herpes Virüsü",
|
| 48 |
"HSV": "Herpes Simplex Virüsü", "HIV": "İnsan İmmün Yetmezlik Virüsü",
|
|
|
|
| 66 |
"SMMM": "Serbest Muhasebeci Mali Müşavir",
|
| 67 |
"YMM": "Yeminli Mali Müşavir",
|
| 68 |
"SM": "Serbest Muhasebeci",
|
| 69 |
+
# ── Technology ───────────────────────────────────────────────────────
|
| 70 |
"AI": "Yapay Zeka", "ML": "Makine Öğrenmesi",
|
| 71 |
"LLM": "Büyük Dil Modeli", "NLP": "Doğal Dil İşleme",
|
| 72 |
"API": "Uygulama Programlama Arayüzü",
|
|
|
|
| 78 |
"OS": "İşletim Sistemi",
|
| 79 |
"BERT": "Çift Yönlü Kodlayıcı Temsiller",
|
| 80 |
"GPT": "Üretici Önceden Eğitilmiş Dönüştürücü",
|
| 81 |
+
# ── Economics ────────────────────────────────────────────────────────
|
| 82 |
"OPEC": "Petrol İhraç Eden Ülkeler Örgütü",
|
| 83 |
"NAFTA": "Kuzey Amerika Serbest Ticaret Anlaşması",
|
| 84 |
+
# ── Sports ───────────────────────────────────────────────────────────
|
| 85 |
"NBA": "Ulusal Basketbol Birliği",
|
| 86 |
"NFL": "Ulusal Futbol Ligi",
|
| 87 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_allomorph.py
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
"""Fix 9: Allomorph canonicalization — map surface forms to morpheme IDs."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
ALLOMORPH_MAP: dict[str, str] = {
|
| 6 |
-
"lar": "PL", "ler": "PL",
|
| 7 |
-
"ı": "ACC", "i": "ACC", "u": "ACC", "ü": "ACC",
|
| 8 |
-
"yı": "ACC", "yi": "ACC", "yu": "ACC", "yü": "ACC",
|
| 9 |
-
"a": "DAT", "e": "DAT", "ya": "DAT", "ye": "DAT",
|
| 10 |
-
"da": "LOC", "de": "LOC", "ta": "LOC", "te": "LOC",
|
| 11 |
-
"dan": "ABL", "den": "ABL", "tan": "ABL", "ten": "ABL",
|
| 12 |
-
"ın": "GEN", "in": "GEN", "un": "GEN", "ün": "GEN",
|
| 13 |
-
"nın": "GEN", "nin": "GEN", "nun": "GEN", "nün": "GEN",
|
| 14 |
-
"la": "INS", "le": "INS", "yla": "INS", "yle": "INS",
|
| 15 |
-
"dı": "PAST", "di": "PAST", "du": "PAST", "dü": "PAST",
|
| 16 |
-
"tı": "PAST", "ti": "PAST", "tu": "PAST", "tü": "PAST",
|
| 17 |
-
"yor": "PROG",
|
| 18 |
-
"ar": "AOR", "er": "AOR",
|
| 19 |
-
"ır": "AOR", "ir": "AOR", "ur": "AOR", "ür": "AOR",
|
| 20 |
-
"mış": "EVID", "miş": "EVID", "muş": "EVID", "müş": "EVID",
|
| 21 |
-
"ma": "NEG", "me": "NEG",
|
| 22 |
-
"mak": "INF", "mek": "INF",
|
| 23 |
-
"ım": "1SG", "im": "1SG", "um": "1SG", "üm": "1SG",
|
| 24 |
-
"ın": "2SG", "in": "2SG", "un": "2SG", "ün": "2SG",
|
| 25 |
-
"iz": "1PL", "ız": "1PL", "uz": "1PL", "üz": "1PL",
|
| 26 |
-
"mı": "Q", "mi": "Q", "mu": "Q", "mü": "Q",
|
| 27 |
-
"lı": "WITH", "li": "WITH", "lu": "WITH", "lü": "WITH",
|
| 28 |
-
"sız": "WITHOUT","siz": "WITHOUT","suz": "WITHOUT","süz": "WITHOUT",
|
| 29 |
-
"cı": "AGT", "ci": "AGT", "cu": "AGT", "cü": "AGT",
|
| 30 |
-
"çı": "AGT", "çi": "AGT", "çu": "AGT", "çü": "AGT",
|
| 31 |
-
"lık": "ABSTR","lik": "ABSTR","luk": "ABSTR","lük": "ABSTR",
|
| 32 |
-
"sa": "COND", "se": "COND",
|
| 33 |
-
"ıl": "PASS", "il": "PASS", "ul": "PASS", "ül": "PASS",
|
| 34 |
-
}
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def add_canonical_labels(tokens: list[dict]) -> list[dict]:
|
| 38 |
-
"""Add ``_canonical`` field to SUFFIX tokens (e.g. 'lar'/'ler' → 'PL')."""
|
| 39 |
-
result: list[dict] = []
|
| 40 |
-
for tok in tokens:
|
| 41 |
-
if tok["type"] != "SUFFIX":
|
| 42 |
-
result.append(tok)
|
| 43 |
-
continue
|
| 44 |
-
canonical = ALLOMORPH_MAP.get(tok["token"].strip().lower())
|
| 45 |
-
result.append({**tok, "_canonical": canonical} if canonical else tok)
|
| 46 |
-
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_compound.py
DELETED
|
@@ -1,76 +0,0 @@
|
|
| 1 |
-
"""Fix 10: Turkish compound word annotation."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
KNOWN_COMPOUNDS: dict[str, list[str]] = {
|
| 6 |
-
"başbakan": ["baş", "bakan"],
|
| 7 |
-
"cumhurbaşkanı": ["cumhur", "başkan"],
|
| 8 |
-
"dışişleri": ["dış", "iş"],
|
| 9 |
-
"içişleri": ["iç", "iş"],
|
| 10 |
-
"maliye": ["mal", "iye"],
|
| 11 |
-
"belediye": ["beled", "iye"],
|
| 12 |
-
"ayakkabı": ["ayak", "kap"],
|
| 13 |
-
"yelkovan": ["yel", "kovan"],
|
| 14 |
-
"saatlik": ["saat", "lik"],
|
| 15 |
-
"günlük": ["gün", "lük"],
|
| 16 |
-
"yıllık": ["yıl", "lık"],
|
| 17 |
-
"aylık": ["ay", "lık"],
|
| 18 |
-
"haftalık": ["hafta", "lık"],
|
| 19 |
-
"gastrointestinal": ["gastro", "intestinal"],
|
| 20 |
-
"kardiyovasküler": ["kardio", "vasküler"],
|
| 21 |
-
"nöropsikiyatri": ["nöro", "psikiyatri"],
|
| 22 |
-
"biyokimya": ["biyo", "kimya"],
|
| 23 |
-
"mikrobiyoloji": ["mikro", "biyoloji"],
|
| 24 |
-
"farmakoloji": ["farma", "koloji"],
|
| 25 |
-
"patoloji": ["pato", "loji"],
|
| 26 |
-
"hematoloji": ["hemato", "loji"],
|
| 27 |
-
"nefroloji": ["nefro", "loji"],
|
| 28 |
-
"kardiyoloji": ["kardio", "loji"],
|
| 29 |
-
"radyoloji": ["radyo", "loji"],
|
| 30 |
-
"onkoloji": ["onko", "loji"],
|
| 31 |
-
"elektromanyetik": ["elektro", "manyetik"],
|
| 32 |
-
"termodinamik": ["termo", "dinamik"],
|
| 33 |
-
"hidroelektrik": ["hidro", "elektrik"],
|
| 34 |
-
"biyoinformatik": ["biyo", "informatik"],
|
| 35 |
-
"nanoteknoloji": ["nano", "teknoloji"],
|
| 36 |
-
"futbolcu": ["futbol", "cu"],
|
| 37 |
-
"basketbolcu": ["basketbol", "cu"],
|
| 38 |
-
"voleybolcu": ["voleybol", "cu"],
|
| 39 |
-
}
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
def _decompose_zemberek(word: str, morphology) -> list[str] | None:
|
| 43 |
-
try:
|
| 44 |
-
wa = morphology.analyze(word)
|
| 45 |
-
for sa in wa:
|
| 46 |
-
morphemes = [str(m) for m in sa.get_morphemes()]
|
| 47 |
-
roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
|
| 48 |
-
if len(roots) > 1:
|
| 49 |
-
return roots
|
| 50 |
-
except Exception: # noqa: BLE001
|
| 51 |
-
pass
|
| 52 |
-
return None
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
def add_compound_info(tokens: list[dict], morphology=None) -> list[dict]:
|
| 56 |
-
"""Annotate ROOT tokens that are compound words with ``_compound`` and ``_parts``."""
|
| 57 |
-
result: list[dict] = []
|
| 58 |
-
for tok in tokens:
|
| 59 |
-
if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
|
| 60 |
-
result.append(tok)
|
| 61 |
-
continue
|
| 62 |
-
|
| 63 |
-
surface = tok["token"].strip().lower()
|
| 64 |
-
|
| 65 |
-
if morphology is not None:
|
| 66 |
-
parts = _decompose_zemberek(surface, morphology)
|
| 67 |
-
if parts and len(parts) > 1:
|
| 68 |
-
result.append({**tok, "_compound": True, "_parts": parts, "_source": "zemberek"})
|
| 69 |
-
continue
|
| 70 |
-
|
| 71 |
-
if surface in KNOWN_COMPOUNDS:
|
| 72 |
-
result.append({**tok, "_compound": True, "_parts": KNOWN_COMPOUNDS[surface], "_source": "manual"})
|
| 73 |
-
else:
|
| 74 |
-
result.append(tok)
|
| 75 |
-
|
| 76 |
-
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_context_aware.py
DELETED
|
@@ -1,61 +0,0 @@
|
|
| 1 |
-
"""Fix 12: Context-aware Zemberek disambiguation.
|
| 2 |
-
|
| 3 |
-
Uses zemberek-python (pure Python) — no JVM required.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
from __future__ import annotations
|
| 7 |
-
|
| 8 |
-
from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
|
| 9 |
-
|
| 10 |
-
AMBIGUOUS_WORDS = {
|
| 11 |
-
"yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
|
| 12 |
-
"biter", "düşer", "tutar", "kalır", "gerekir", "uyar",
|
| 13 |
-
"uçar", "güzel", "büyük", "küçük", "yeni", "eski",
|
| 14 |
-
}
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
|
| 18 |
-
"""Enrich ROOT tokens with POS and lemma using Zemberek sentence-level disambiguation."""
|
| 19 |
-
if not ZEMBEREK_AVAILABLE:
|
| 20 |
-
return tokens
|
| 21 |
-
|
| 22 |
-
try:
|
| 23 |
-
sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
|
| 24 |
-
best_list = sa_result.best_analysis()
|
| 25 |
-
|
| 26 |
-
analyses: dict[str, dict] = {}
|
| 27 |
-
for sa in best_list:
|
| 28 |
-
try:
|
| 29 |
-
sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
|
| 30 |
-
if sf not in analyses:
|
| 31 |
-
analyses[sf] = {
|
| 32 |
-
"lemma": str(sa.item.lemma),
|
| 33 |
-
"pos": str(sa.item.primary_pos.short_form),
|
| 34 |
-
"morphemes": [str(m) for m in sa.get_morphemes()],
|
| 35 |
-
}
|
| 36 |
-
except Exception: # noqa: BLE001
|
| 37 |
-
continue
|
| 38 |
-
|
| 39 |
-
result: list[dict] = []
|
| 40 |
-
for tok in tokens:
|
| 41 |
-
if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
|
| 42 |
-
result.append(tok)
|
| 43 |
-
continue
|
| 44 |
-
|
| 45 |
-
surface = tok["token"].strip().lower()
|
| 46 |
-
z = analyses.get(surface)
|
| 47 |
-
if z:
|
| 48 |
-
result.append({
|
| 49 |
-
**tok,
|
| 50 |
-
"_pos": z["pos"],
|
| 51 |
-
"_lemma": z["lemma"],
|
| 52 |
-
"_morphemes": z["morphemes"],
|
| 53 |
-
"_disambiguated": surface in AMBIGUOUS_WORDS,
|
| 54 |
-
})
|
| 55 |
-
else:
|
| 56 |
-
result.append(tok)
|
| 57 |
-
|
| 58 |
-
return result
|
| 59 |
-
|
| 60 |
-
except Exception: # noqa: BLE001
|
| 61 |
-
return tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/{_medical_vocab.py → _domain_vocab.py}
RENAMED
|
@@ -1,4 +1,14 @@
|
|
| 1 |
-
"""Domain vocabulary: medical, sports, tourism roots
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -136,4 +146,9 @@ TOURISM_ROOTS: dict[str, str] = {
|
|
| 136 |
"delüks": "delüks",
|
| 137 |
}
|
| 138 |
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Domain vocabulary: medical, sports, tourism roots.
|
| 2 |
+
|
| 3 |
+
These are domain-specific terms that a generic Turkish dictionary may not
|
| 4 |
+
contain but that should be recognised as valid ROOT tokens rather than
|
| 5 |
+
left as unknown fragments.
|
| 6 |
+
|
| 7 |
+
The domain vocabulary is an **optional support layer** — the core
|
| 8 |
+
segmentation engine works without it. It is consulted during candidate
|
| 9 |
+
scoring to boost the score of candidates whose root matches a known
|
| 10 |
+
domain term.
|
| 11 |
+
"""
|
| 12 |
|
| 13 |
from __future__ import annotations
|
| 14 |
|
|
|
|
| 146 |
"delüks": "delüks",
|
| 147 |
}
|
| 148 |
|
| 149 |
+
# Combined set of all domain roots (lowercase) for fast lookup
|
| 150 |
+
ALL_DOMAIN_ROOTS: frozenset[str] = frozenset(
|
| 151 |
+
k.lower()
|
| 152 |
+
for d in (MEDICAL_ROOTS, SPORTS_ROOTS, TOURISM_ROOTS)
|
| 153 |
+
for k in d
|
| 154 |
+
)
|
nedo_turkish_tokenizer/_preprocessor.py
DELETED
|
@@ -1,246 +0,0 @@
|
|
| 1 |
-
"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import re
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
TR_CHARS = set("çğışöüÇĞİŞÖÜ")
|
| 9 |
-
|
| 10 |
-
_PROPER_NOUNS: set[str] | None = None
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def _load_proper_nouns() -> set[str]:
|
| 14 |
-
global _PROPER_NOUNS
|
| 15 |
-
if _PROPER_NOUNS is not None:
|
| 16 |
-
return _PROPER_NOUNS
|
| 17 |
-
path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
|
| 18 |
-
if path.exists():
|
| 19 |
-
_PROPER_NOUNS = {
|
| 20 |
-
line.strip().lower()
|
| 21 |
-
for line in path.read_text(encoding="utf-8").splitlines()
|
| 22 |
-
if line.strip() and not line.startswith("#")
|
| 23 |
-
}
|
| 24 |
-
else:
|
| 25 |
-
_PROPER_NOUNS = set()
|
| 26 |
-
return _PROPER_NOUNS
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def _turkish_lower(s: str) -> str:
|
| 30 |
-
"""Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
|
| 31 |
-
return s.replace("İ", "i").replace("I", "ı").lower()
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
|
| 35 |
-
[
|
| 36 |
-
"nın","nin","nun","nün","dan","den","tan","ten",
|
| 37 |
-
"da","de","ta","te","ya","ye","nda","nde",
|
| 38 |
-
"yı","yi","yu","yü","nı","ni","nu","nü",
|
| 39 |
-
"lar","ler","lara","lere","ları","leri",
|
| 40 |
-
"ım","im","um","üm","ın","in","un","ün",
|
| 41 |
-
"mız","miz","muz","müz","nız","niz","nuz","nüz",
|
| 42 |
-
"dır","dir","dur","dür","tır","tir","tur","tür",
|
| 43 |
-
"ki","li","lı","lu","lü","sız","siz","suz","süz",
|
| 44 |
-
"a","e","ı","i","u","ü",
|
| 45 |
-
],
|
| 46 |
-
key=len,
|
| 47 |
-
reverse=True,
|
| 48 |
-
)
|
| 49 |
-
|
| 50 |
-
_APO_RE = re.compile(
|
| 51 |
-
r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
|
| 52 |
-
)
|
| 53 |
-
_CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
def _is_turkish_base(word: str) -> bool:
|
| 57 |
-
"""Return True if the word should be treated as Turkish (don't split apostrophe)."""
|
| 58 |
-
wl = _turkish_lower(word)
|
| 59 |
-
# Fast path: Turkish-specific characters → definitely Turkish
|
| 60 |
-
if any(c in TR_CHARS for c in wl):
|
| 61 |
-
return True
|
| 62 |
-
# Turkish proper nouns (cities, regions) — not in TDK common-word list
|
| 63 |
-
if wl in _load_proper_nouns():
|
| 64 |
-
return True
|
| 65 |
-
# TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword)
|
| 66 |
-
from ._tdk_vocab import load_tdk_words # noqa: PLC0415
|
| 67 |
-
tdk = load_tdk_words()
|
| 68 |
-
if tdk and wl in tdk:
|
| 69 |
-
return True
|
| 70 |
-
# Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
|
| 71 |
-
try:
|
| 72 |
-
from ._root_validator import _morphology, ZEMBEREK_AVAILABLE # noqa: PLC0415
|
| 73 |
-
if ZEMBEREK_AVAILABLE and _morphology:
|
| 74 |
-
wa = _morphology.analyze(wl)
|
| 75 |
-
for sa in wa:
|
| 76 |
-
lemma = str(sa.item.lemma)
|
| 77 |
-
if any(c in TR_CHARS for c in lemma):
|
| 78 |
-
return True
|
| 79 |
-
except Exception: # noqa: BLE001
|
| 80 |
-
pass
|
| 81 |
-
# TDK unavailable + Zemberek unavailable: very short words are ambiguous
|
| 82 |
-
return len(wl) < 4
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────
|
| 86 |
-
|
| 87 |
-
def _fix_all_caps(text: str) -> tuple[str, set]:
|
| 88 |
-
caps: set[str] = set()
|
| 89 |
-
|
| 90 |
-
def _replace(m: re.Match) -> str:
|
| 91 |
-
w = m.group(1)
|
| 92 |
-
caps.add(_turkish_lower(w))
|
| 93 |
-
return _turkish_lower(w)
|
| 94 |
-
|
| 95 |
-
return _CAPS_RE.sub(_replace, text), caps
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
|
| 99 |
-
result: list[dict] = []
|
| 100 |
-
i = 0
|
| 101 |
-
while i < len(tokens):
|
| 102 |
-
tok = tokens[i]
|
| 103 |
-
raw_low = _turkish_lower(tok["token"].strip())
|
| 104 |
-
|
| 105 |
-
if tok["type"] == "ROOT" and raw_low in caps:
|
| 106 |
-
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
|
| 107 |
-
result.append(tok)
|
| 108 |
-
i += 1
|
| 109 |
-
continue
|
| 110 |
-
|
| 111 |
-
if tok["type"] == "BPE" and tok["token"].startswith(" "):
|
| 112 |
-
combined = raw_low
|
| 113 |
-
lookahead = [tok]
|
| 114 |
-
j = i + 1
|
| 115 |
-
while j < len(tokens):
|
| 116 |
-
nt = tokens[j]
|
| 117 |
-
if not nt["token"].startswith(" "):
|
| 118 |
-
combined += _turkish_lower(nt["token"].strip())
|
| 119 |
-
lookahead.append(nt)
|
| 120 |
-
j += 1
|
| 121 |
-
if combined in caps:
|
| 122 |
-
break
|
| 123 |
-
if len(combined) > 8:
|
| 124 |
-
break
|
| 125 |
-
else:
|
| 126 |
-
break
|
| 127 |
-
if combined in caps:
|
| 128 |
-
result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
|
| 129 |
-
result.append({"token": f" {combined}", "type": "ROOT",
|
| 130 |
-
"_acronym": True, "_caps": True})
|
| 131 |
-
i = j
|
| 132 |
-
continue
|
| 133 |
-
|
| 134 |
-
result.append(tok)
|
| 135 |
-
i += 1
|
| 136 |
-
|
| 137 |
-
return result
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
|
| 141 |
-
#
|
| 142 |
-
# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
|
| 143 |
-
# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
|
| 144 |
-
# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
|
| 145 |
-
# then marks the following word-initial suffix token as SUFFIX.
|
| 146 |
-
#
|
| 147 |
-
# Old approach used a \ue001 separator — the base tokenizer converts that to
|
| 148 |
-
# '<unknown>' so the separator was never found. Simple-space + pair-list is
|
| 149 |
-
# robust regardless of how the tokenizer handles the input.
|
| 150 |
-
|
| 151 |
-
def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
    """Replace FOREIGN'SUFFIX with "FOREIGN SUFFIX" (apostrophe -> space).

    Every match of ``_APO_RE`` is inspected.  The match is rewritten only
    when ``_is_turkish_base`` rejects the base *and* the lowercased suffix
    is listed in ``TURKISH_SUFFIXES_AFTER_APOSTROPHE``; all other matches
    (e.g. Turkish proper names such as İstanbul'da) are left untouched.

    Args:
        text: Raw input text.

    Returns:
        ``(modified_text, splits)`` where ``splits`` holds one
        ``(foreign_base_lower, suffix_lower)`` pair per rewritten match,
        in order of appearance.
    """
    splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)
        if _is_turkish_base(base):
            return m.group(0)  # leave Turkish names alone
        # Lowercase the suffix with the same helper that is applied to the
        # base below and to token text in _merge_apostrophe_tokens, so the
        # recorded pair can be matched again after tokenization.  A plain
        # str.lower() would disagree with it for I / İ.
        sl = _turkish_lower(suffix)
        if sl in TURKISH_SUFFIXES_AFTER_APOSTROPHE:
            splits.append((_turkish_lower(base), sl))
            return f"{base} {suffix}"  # just drop the apostrophe
        return m.group(0)

    return _APO_RE.sub(_repl, text), splits
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
def _merge_apostrophe_tokens(
|
| 173 |
-
tokens: list[dict], apo_splits: list[tuple[str, str]]
|
| 174 |
-
) -> list[dict]:
|
| 175 |
-
"""
|
| 176 |
-
For each (foreign_base, suffix) pair recorded during _split_apostrophe,
|
| 177 |
-
find the consecutive BPE/ROOT pieces that together spell foreign_base,
|
| 178 |
-
merge them into one FOREIGN ROOT token, and mark the next word-initial
|
| 179 |
-
token whose stripped form == suffix as SUFFIX.
|
| 180 |
-
"""
|
| 181 |
-
if not apo_splits:
|
| 182 |
-
return tokens
|
| 183 |
-
|
| 184 |
-
result = list(tokens)
|
| 185 |
-
|
| 186 |
-
for foreign_base, suffix in apo_splits:
|
| 187 |
-
n = len(result)
|
| 188 |
-
for j in range(1, n):
|
| 189 |
-
tok_j = result[j]
|
| 190 |
-
# Candidate suffix token: word-initial, stripped == suffix
|
| 191 |
-
if not tok_j["token"].startswith(" "):
|
| 192 |
-
continue
|
| 193 |
-
if _turkish_lower(tok_j["token"].strip()) != suffix:
|
| 194 |
-
continue
|
| 195 |
-
|
| 196 |
-
# Walk back to find pieces of the word before j (no leading space)
|
| 197 |
-
word_start = j - 1
|
| 198 |
-
while word_start > 0 and not result[word_start]["token"].startswith(" "):
|
| 199 |
-
word_start -= 1
|
| 200 |
-
|
| 201 |
-
pieces = result[word_start:j]
|
| 202 |
-
if not pieces:
|
| 203 |
-
continue
|
| 204 |
-
|
| 205 |
-
combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
|
| 206 |
-
if combined != foreign_base:
|
| 207 |
-
continue
|
| 208 |
-
|
| 209 |
-
# Merge pieces into one FOREIGN ROOT
|
| 210 |
-
merged = pieces[0]["token"] # keeps leading space
|
| 211 |
-
for p in pieces[1:]:
|
| 212 |
-
merged += p["token"].strip()
|
| 213 |
-
|
| 214 |
-
new_root = {"token": merged, "type": "ROOT", "_foreign": True}
|
| 215 |
-
new_suf = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}
|
| 216 |
-
|
| 217 |
-
result = (
|
| 218 |
-
result[:word_start]
|
| 219 |
-
+ [new_root, new_suf]
|
| 220 |
-
+ result[j + 1:]
|
| 221 |
-
)
|
| 222 |
-
break # this pair is handled
|
| 223 |
-
|
| 224 |
-
return result
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
# ── Combined pre / post ───────────────────────────────────────────────────────
|
| 228 |
-
|
| 229 |
-
def preprocess(text: str) -> tuple[str, set, list]:
    """Run the pre-tokenization fixes on *text*.

    ``_fix_all_caps`` is applied first and ``_split_apostrophe`` is applied
    to its output.

    Returns:
        ``(modified_text, caps_set, apo_splits)`` — the rewritten text, the
        set returned by ``_fix_all_caps`` and the pair list returned by
        ``_split_apostrophe``.
    """
    caps_fixed, caps = _fix_all_caps(text)
    rewritten, apo_splits = _split_apostrophe(caps_fixed)
    return rewritten, caps, apo_splits
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
def postprocess(
    tokens: list[dict], caps: set, apo_splits: list | None = None
) -> list[dict]:
    """Repair the token stream produced by the base tokenizer.

    Applies ``_restore_caps_tokens`` with *caps*, then
    ``_merge_apostrophe_tokens`` with *apo_splits* (a missing or empty value
    is treated as "no apostrophe splits").
    """
    restored = _restore_caps_tokens(tokens, caps)
    pairs = apo_splits or []
    return _merge_apostrophe_tokens(restored, pairs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_root_validator.py
DELETED
|
@@ -1,205 +0,0 @@
|
|
| 1 |
-
"""Zemberek-based root validation and correction (Fix 4).
|
| 2 |
-
|
| 3 |
-
Uses zemberek-python (pure Python) — no JVM or JPype required.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
from __future__ import annotations
|
| 7 |
-
|
| 8 |
-
# Module-level state filled in by _init_zemberek().
# Flipped to True only after TurkishMorphology.create_with_defaults() succeeds.
ZEMBEREK_AVAILABLE = False
# Shared analyzer instance used by the helpers in this module; None until
# initialisation succeeds.
_morphology = None
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
def _apply_zemberek_patch() -> None:
    """Swap in a faster lexicon loader for zemberek-python.

    Works around the O(N^2) loading time bug in zemberek-python 0.2.3 by
    replacing ``DictionaryReader.load_from_resources`` with a version that
    resolves reference items through a dict instead of rescanning the
    lexicon for every reference.
    """
    import csv
    import zemberek.morphology.lexicon.root_lexicon as rl

    def fast_load_from_resources(resource_path: str):
        csv.field_size_limit(100000000)
        with open(resource_path, 'r', encoding='utf-8') as handle:
            rows = list(csv.reader(handle, delimiter="\t", quoting=csv.QUOTE_NONE))

        # First column -> full row, so a reference resolves in O(1).
        rows_by_key = {row[0]: row for row in rows}
        make_item = rl.DictionaryReader.make_dict_item_from_line

        entries = []
        for row in rows:
            entry = make_item(row)
            target = row[7]  # column 7 names the referenced entry, or 'null'
            if target != 'null':
                target_row = rows_by_key.get(target)
                if target_row is not None:
                    entry.set_reference_item(make_item(target_row))
            entries.append(entry)
        return rl.RootLexicon(entries)

    rl.DictionaryReader.load_from_resources = fast_load_from_resources
|
| 36 |
-
|
| 37 |
-
def _init_zemberek() -> None:
    """Create the shared Zemberek analyzer, if the package can be loaded.

    On success ``_morphology`` holds the analyzer and ``ZEMBEREK_AVAILABLE``
    becomes True.  Any failure is reported on stdout and leaves both globals
    at their initial values, so callers fall back to the non-Zemberek paths.
    """
    global ZEMBEREK_AVAILABLE, _morphology

    try:
        from zemberek import TurkishMorphology  # noqa: PLC0415

        # Patch the lexicon loader before the analyzer reads its resources.
        _apply_zemberek_patch()
        analyzer = TurkishMorphology.create_with_defaults()
    except ImportError:
        print("[NedoTurkishTokenizer] zemberek-python not installed → pip install zemberek-python")
    except Exception as exc:  # noqa: BLE001
        print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
    else:
        _morphology = analyzer
        ZEMBEREK_AVAILABLE = True
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
_init_zemberek()
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
# ── Zemberek API helpers ──────────────────────────────────────────────────────
|
| 57 |
-
|
| 58 |
-
def analyze_word(word: str) -> list[dict]:
    """Return every Zemberek analysis of *word* as a plain dict.

    Each dict has the keys ``lemma``, ``pos``, ``morphemes`` (list of str)
    and ``surface`` (stem + ending).  An empty list is returned when
    Zemberek is unavailable or when the analysis raises.
    """
    if not ZEMBEREK_AVAILABLE:
        return []

    analyses: list[dict] = []
    try:
        for parse in _morphology.analyze(word):
            analyses.append(
                {
                    "lemma": str(parse.item.lemma),
                    "pos": str(parse.item.primary_pos.short_form),
                    "morphemes": [str(m) for m in parse.get_morphemes()],
                    "surface": str(parse.get_stem()) + str(parse.get_ending()),
                }
            )
    except Exception:  # noqa: BLE001
        # Best effort: a failing analysis is treated as "no analysis".
        return []
    return analyses
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def get_root_and_suffixes(word: str) -> dict | None:
    """Summarise the first analysis of *word*.

    Returns a dict with ``root`` (the lemma), ``suffixes`` (all morphemes
    after the first) and ``pos``, or ``None`` when ``analyze_word`` yields
    nothing.
    """
    analyses = analyze_word(word)
    if analyses:
        first = analyses[0]
        return {
            "root": first["lemma"],
            "suffixes": first["morphemes"][1:],
            "pos": first["pos"],
        }
    return None
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
# ── Heuristic fallback (no Zemberek) ─────────────────────────────────────────
|
| 87 |
-
|
| 88 |
-
_SPURIOUS_SHORT_ROOTS = {"oğ", "gök", "zo", "me", "im", "pro", "go", "da", "al"}
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
|
| 92 |
-
if root.strip().lower() not in _SPURIOUS_SHORT_ROOTS:
|
| 93 |
-
return False
|
| 94 |
-
return sum(1 for t in next_tokens[:3] if t["type"] == "BPE") >= 2
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
# ── Main validation ───────────────────────────────────────────────────────────
|
| 98 |
-
|
| 99 |
-
def build_correction_map(
    original_words: list[str], base_tokenizer
) -> dict[str, str]:
    """Build a ``{tokenizer_root: zemberek_root}`` correction map.

    Each word is lowercased and stripped of surrounding punctuation; words
    shorter than three characters are ignored.  A correction is recorded
    when the first ROOT token produced by ``base_tokenizer.tokenize_text``
    is a proper prefix of the Zemberek lemma and is at most four characters
    shorter than it.

    Args:
        original_words: Surface words of the input text.
        base_tokenizer: Object exposing ``tokenize_text(str)`` that returns
            token dicts with ``"token"`` and ``"type"`` keys.
    """
    fixes: dict[str, str] = {}

    for raw_word in original_words:
        cleaned = raw_word.lower().strip("'\".,!?;:()")
        if len(cleaned) < 3:
            continue

        parsed = get_root_and_suffixes(cleaned)
        if not parsed or parsed["root"] == "UNK":
            continue
        lemma = parsed["root"].lower()

        try:
            tokenizer_root = None
            for piece in base_tokenizer.tokenize_text(cleaned):
                if piece["type"] == "ROOT":
                    tokenizer_root = piece["token"].strip().lower()
                    break
        except Exception:  # noqa: BLE001
            # A tokenizer failure on one word must not abort the whole map.
            continue

        if not tokenizer_root or tokenizer_root == lemma:
            continue

        # Only extend a truncated root: the lemma has to start with it and
        # add no more than four characters.
        if lemma.startswith(tokenizer_root) and len(lemma) - len(tokenizer_root) <= 4:
            fixes[tokenizer_root] = lemma

    return fixes
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
def validate_roots(
    tokens: list[dict],
    original_words: list[str],
    base_tokenizer=None,
) -> list[dict]:
    """Apply root corrections to the token stream.

    Without Zemberek, ROOT tokens that ``_is_spurious_root`` flags are
    copied with ``_suspicious=True`` and nothing else changes.  With
    Zemberek, ROOT tokens whose lowercased text has an entry in the
    correction map are replaced by the corrected root; the original text is
    kept under ``_original_token``.  Tokens whose text starts with ``<`` are
    never touched.  A new list is always returned.
    """

    def _is_plain_root(tok: dict) -> bool:
        # ROOT tokens that are not angle-bracket markers.
        return tok["type"] == "ROOT" and not tok["token"].strip().startswith("<")

    if not ZEMBEREK_AVAILABLE:
        flagged = []
        for pos, tok in enumerate(tokens):
            if _is_plain_root(tok) and _is_spurious_root(
                tok["token"], tokens[pos + 1 : pos + 5]
            ):
                flagged.append({**tok, "_suspicious": True})
            else:
                flagged.append(tok)
        return flagged

    if base_tokenizer is None:
        corr = {}
    else:
        corr = build_correction_map(original_words, base_tokenizer)

    corrected = []
    for tok in tokens:
        if _is_plain_root(tok):
            surface = tok["token"].strip().lower()
            correct = corr.get(surface)
            if correct and correct != surface:
                # Preserve the word-boundary marker of the original token.
                prefix = " " if tok["token"].startswith(" ") else ""
                tok = {
                    **tok,
                    "token": prefix + correct,
                    "_original_token": tok["token"],
                    "_root_corrected": True,
                    "_note": f"root corrected: '{surface}' → '{correct}'",
                }
        corrected.append(tok)

    return corrected
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
def disambiguate_sentence(words: list[str]) -> list[dict | None]:
    """Return one analysis per word using sentence-level disambiguation.

    The words are joined with single spaces and passed to
    ``_morphology.analyze_and_disambiguate``.  Each best analysis becomes a
    dict with ``lemma``, ``pos`` and ``morphemes``; an analysis that cannot
    be converted becomes ``None``.  The result is padded with ``None`` or
    truncated so that it always has ``len(words)`` entries.

    When Zemberek is unavailable every entry is ``None``.  When the
    sentence-level call fails, each word is analysed on its own with
    ``analyze_word`` and its first analysis (or ``None``) is used.
    """
    if not ZEMBEREK_AVAILABLE:
        return [None] * len(words)
    try:
        sentence = " ".join(words)
        sa_result = _morphology.analyze_and_disambiguate(sentence)
        best = sa_result.best_analysis()
        out = []
        for sa in best:
            try:
                out.append({
                    "lemma": str(sa.item.lemma),
                    "pos": str(sa.item.primary_pos.short_form),
                    "morphemes": [str(m) for m in sa.get_morphemes()],
                })
            except Exception:  # noqa: BLE001
                out.append(None)
        # Keep the output aligned with ``words``.
        while len(out) < len(words):
            out.append(None)
        return out[: len(words)]
    except Exception:  # noqa: BLE001
        # Per-word fallback.  Analyse each word once; the previous version
        # called analyze_word() twice per word.
        fallback: list[dict | None] = []
        for w in words:
            analyses = analyze_word(w)
            fallback.append(analyses[0] if analyses else None)
        return fallback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_suffix_expander.py
DELETED
|
@@ -1,212 +0,0 @@
|
|
| 1 |
-
"""Fix 3: BPE → SUFFIX reclassification. Fix 5: Punctuation → PUNCT."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
PUNCT_CHARS = set(
|
| 6 |
-
"'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~"
|
| 7 |
-
"\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
|
| 8 |
-
"\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
|
| 9 |
-
)
|
| 10 |
-
_PUNCT_DIGITS = set("0123456789")
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def _is_punct(token: str) -> bool:
|
| 14 |
-
s = token.strip()
|
| 15 |
-
if not s:
|
| 16 |
-
return False
|
| 17 |
-
return all(
|
| 18 |
-
c in PUNCT_CHARS or c in _PUNCT_DIGITS or (ord(c) > 0x02FF and not c.isalpha())
|
| 19 |
-
for c in s
|
| 20 |
-
)
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
# ── Suffix dictionary (260+ entries) ─────────────────────────────────────────
|
| 24 |
-
|
| 25 |
-
EXTENDED_SUFFIX_MAP: dict[str, str] = {
|
| 26 |
-
# Plural + case
|
| 27 |
-
"leri": "-PL+ACC", "ları": "-PL+ACC",
|
| 28 |
-
"lere": "-PL+DAT", "lara": "-PL+DAT",
|
| 29 |
-
"lerin": "-PL+GEN", "ların": "-PL+GEN",
|
| 30 |
-
"lerde": "-PL+LOC", "larda": "-PL+LOC",
|
| 31 |
-
"lerden": "-PL+ABL","lardan": "-PL+ABL",
|
| 32 |
-
"lerle": "-PL+INS", "larla": "-PL+INS",
|
| 33 |
-
"lerce": "-PL+EQU", "larca": "-PL+EQU",
|
| 34 |
-
# -yon / loanword suffixes
|
| 35 |
-
"yon": "-YON", "iyon": "-YON", "asyon": "-YON", "izasyon": "-YON",
|
| 36 |
-
# Adjective derivation
|
| 37 |
-
"al": "-ADJ", "el": "-ADJ", "ik": "-ADJ",
|
| 38 |
-
"sal": "-ADJ.TR", "sel": "-ADJ.TR",
|
| 39 |
-
# 1st/2nd plural possessive
|
| 40 |
-
"imiz": "-P1PL", "ımız": "-P1PL", "umuz": "-P1PL", "ümüz": "-P1PL",
|
| 41 |
-
"iniz": "-P2PL", "ınız": "-P2PL", "unuz": "-P2PL", "ünüz": "-P2PL",
|
| 42 |
-
# Arabic long vowels
|
| 43 |
-
"\u00e2": "-LONG_A", "\u00ee": "-LONG_I", "\u00fb": "-LONG_U",
|
| 44 |
-
# Roman numerals
|
| 45 |
-
"ii": "-ROM", "iii": "-ROM", "iv": "-ROM", "vi": "-ROM",
|
| 46 |
-
"vii": "-ROM", "viii": "-ROM", "ix": "-ROM", "xi": "-ROM",
|
| 47 |
-
"xii": "-ROM", "xiii": "-ROM", "xiv": "-ROM", "xv": "-ROM",
|
| 48 |
-
# Frequent BPE pieces
|
| 49 |
-
"eri": "-PL.SFX", "una": "-P3+DAT", "iril": "-PASS.SFX",
|
| 50 |
-
"yan": "-PART.ACT","ren": "-PART.ACT", "ıda": "-LOC.SFX",
|
| 51 |
-
"maya": "-NEG.INF", "üler": "-PL.SFX", "ıler": "-PL.SFX",
|
| 52 |
-
"ni": "-ACC.SFX", "ri": "-PL.SFX", "lan": "-PASS+NZ",
|
| 53 |
-
"on": "-YON.SFX",
|
| 54 |
-
# Possessive + case compounds
|
| 55 |
-
"ımı": "-P1+ACC", "imi": "-P1+ACC", "umu": "-P1+ACC", "ümü": "-P1+ACC",
|
| 56 |
-
"ıyla": "-INS.COMP","iyle": "-INS.COMP","uyla": "-INS.COMP","üyle": "-INS.COMP",
|
| 57 |
-
"kten": "-ABL.COMP","ğından": "-ABL.COMP","ğinden": "-ABL.COMP",
|
| 58 |
-
"yla": "-COM", "yle": "-COM",
|
| 59 |
-
# Abstract noun + possessive
|
| 60 |
-
"liği": "-ABSTR+P3", "lığı": "-ABSTR+P3",
|
| 61 |
-
"luğu": "-ABSTR+P3", "lüğü": "-ABSTR+P3",
|
| 62 |
-
"liğini": "-ABSTR+P3+ACC", "lığını": "-ABSTR+P3+ACC",
|
| 63 |
-
# -izm (ideology)
|
| 64 |
-
"izm": "-ISM", "izmi": "-ISM+P3", "izmde": "-ISM+LOC",
|
| 65 |
-
"izmden": "-ISM+ABL", "izmin": "-ISM+GEN",
|
| 66 |
-
# Aorist
|
| 67 |
-
"lir": "-AOR3SG", "lır": "-AOR3SG", "lur": "-AOR3SG", "lür": "-AOR3SG",
|
| 68 |
-
# 3sg possessive + case
|
| 69 |
-
"ine": "-P3+DAT", "ına": "-P3+DAT", "une": "-P3+DAT", "üne": "-P3+DAT",
|
| 70 |
-
"inde": "-P3+LOC", "ında": "-P3+LOC", "unda": "-P3+LOC", "ünde": "-P3+LOC",
|
| 71 |
-
"ini": "-P3+ACC", "ını": "-P3+ACC", "unu": "-P3+ACC", "ünü": "-P3+ACC",
|
| 72 |
-
"inden": "-P3+ABL","ından": "-P3+ABL","undan": "-P3+ABL","ünden": "-P3+ABL",
|
| 73 |
-
# -daki
|
| 74 |
-
"daki": "-LOC+REL","deki": "-LOC+REL","taki": "-LOC+REL","teki": "-LOC+REL",
|
| 75 |
-
# Passive + nominalization
|
| 76 |
-
"lan": "-PASS+NZ", "len": "-PASS+NZ",
|
| 77 |
-
# Verbal noun
|
| 78 |
-
"mesi": "-VN3", "ması": "-VN3",
|
| 79 |
-
"mesini": "-VN3+ACC", "masını": "-VN3+ACC",
|
| 80 |
-
"mesine": "-VN3+DAT", "masına": "-VN3+DAT",
|
| 81 |
-
"mesinde": "-VN3+LOC", "masında": "-VN3+LOC",
|
| 82 |
-
# Genitive + possessive
|
| 83 |
-
"ının": "-GEN+P", "inin": "-GEN+P", "unun": "-GEN+P", "ünün": "-GEN+P",
|
| 84 |
-
# Participle
|
| 85 |
-
"diği": "-PART", "dığı": "-PART", "tiği": "-PART", "tığı": "-PART",
|
| 86 |
-
"duğu": "-PART", "düğü": "-PART", "tuğu": "-PART", "tüğü": "-PART",
|
| 87 |
-
"ği": "-PART.SFX","ğı": "-PART.SFX","gu": "-PART.SFX","gü": "-PART.SFX",
|
| 88 |
-
# Negative verbal noun
|
| 89 |
-
"mas": "-NEG.VN", "mes": "-NEG.VN",
|
| 90 |
-
# 2sg imperative
|
| 91 |
-
"sin": "-IMP2", "sın": "-IMP2", "sun": "-IMP2", "sün": "-IMP2",
|
| 92 |
-
# Passive short
|
| 93 |
-
"ıl": "-PASS", "il": "-PASS", "ul": "-PASS", "ül": "-PASS",
|
| 94 |
-
# Causative + VN
|
| 95 |
-
"irme": "-CAUS+VN","ırma": "-CAUS+VN","urma": "-CAUS+VN",
|
| 96 |
-
"ürme": "-CAUS+VN","erme": "-CAUS+VN","arma": "-CAUS+VN",
|
| 97 |
-
# Accusative
|
| 98 |
-
"ı": "-ACC", "i": "-ACC", "u": "-ACC", "ü": "-ACC",
|
| 99 |
-
# Past tense
|
| 100 |
-
"dım": "-DI1SG","dim": "-DI1SG","dum": "-DI1SG","düm": "-DI1SG",
|
| 101 |
-
"tım": "-DI1SG","tim": "-DI1SG","tum": "-DI1SG","tüm": "-DI1SG",
|
| 102 |
-
"dık": "-DI1PL","dik": "-DI1PL","duk": "-DI1PL","dük": "-DI1PL",
|
| 103 |
-
"tık": "-DI1PL","tik": "-DI1PL","tuk": "-DI1PL","tük": "-DI1PL",
|
| 104 |
-
"dın": "-DI2SG","din": "-DI2SG","dun": "-DI2SG","dün": "-DI2SG",
|
| 105 |
-
"tın": "-DI2SG","tin": "-DI2SG","tun": "-DI2SG","tün": "-DI2SG",
|
| 106 |
-
"d": "-PAST", "t": "-PAST",
|
| 107 |
-
# Conditional
|
| 108 |
-
"sa": "-COND", "se": "-COND",
|
| 109 |
-
# Progressive
|
| 110 |
-
"yor": "-PROG",
|
| 111 |
-
# Simple past
|
| 112 |
-
"dı": "-PST", "di": "-PST", "du": "-PST", "dü": "-PST",
|
| 113 |
-
"tı": "-PST", "ti": "-PST", "tu": "-PST", "tü": "-PST",
|
| 114 |
-
# Aorist short
|
| 115 |
-
"ir": "-AOR", "ır": "-AOR", "ur": "-AOR", "ür": "-AOR",
|
| 116 |
-
"er": "-AOR", "ar": "-AOR",
|
| 117 |
-
# Evidential past
|
| 118 |
-
"mış": "-EVID","miş": "-EVID","muş": "-EVID","müş": "-EVID",
|
| 119 |
-
# Negation
|
| 120 |
-
"ma": "-NEG", "me": "-NEG",
|
| 121 |
-
"lama": "-VN+NEG","leme": "-VN+NEG",
|
| 122 |
-
# Abilitative
|
| 123 |
-
"bil": "-ABIL",
|
| 124 |
-
# Necessitative
|
| 125 |
-
"malı": "-NECES","meli": "-NECES",
|
| 126 |
-
# Infinitive
|
| 127 |
-
"mak": "-INF", "mek": "-INF",
|
| 128 |
-
# -ken (while/when)
|
| 129 |
-
"ken": "-WHEN",
|
| 130 |
-
# Converb
|
| 131 |
-
"arak": "-CONV","erek": "-CONV",
|
| 132 |
-
# With / without
|
| 133 |
-
"lı": "-WITH", "li": "-WITH", "lu": "-WITH", "lü": "-WITH",
|
| 134 |
-
# Agentive
|
| 135 |
-
"cı": "-AGT", "ci": "-AGT", "cu": "-AGT", "cü": "-AGT",
|
| 136 |
-
"çı": "-AGT", "çi": "-AGT", "çu": "-AGT", "çü": "-AGT",
|
| 137 |
-
# Abstract noun
|
| 138 |
-
"lık": "-ABSTR","lik": "-ABSTR","luk": "-ABSTR","lük": "-ABSTR",
|
| 139 |
-
"lığ": "-ABSTR","liğ": "-ABSTR",
|
| 140 |
-
# Optative 1pl
|
| 141 |
-
"elim": "-OPT1PL","alım": "-OPT1PL",
|
| 142 |
-
# Person suffixes
|
| 143 |
-
"ım": "-1SG", "im": "-1SG", "um": "-1SG", "üm": "-1SG",
|
| 144 |
-
"ın": "-2SG", "in": "-2SG", "un": "-2SG", "ün": "-2SG",
|
| 145 |
-
"iz": "-1PL", "ız": "-1PL", "uz": "-1PL", "üz": "-1PL",
|
| 146 |
-
"nız": "-2PL","niz": "-2PL","nuz": "-2PL","nüz": "-2PL",
|
| 147 |
-
# Question
|
| 148 |
-
"mı": "-Q", "mi": "-Q", "mu": "-Q", "mü": "-Q",
|
| 149 |
-
# Dative
|
| 150 |
-
"a": "-DAT", "e": "-DAT", "ya": "-DAT", "ye": "-DAT",
|
| 151 |
-
# Ablative
|
| 152 |
-
"dan": "-ABL","den": "-ABL","tan": "-ABL","ten": "-ABL",
|
| 153 |
-
# Locative
|
| 154 |
-
"da": "-LOC", "de": "-LOC", "ta": "-LOC", "te": "-LOC",
|
| 155 |
-
# Plural
|
| 156 |
-
"lar": "-PL", "ler": "-PL",
|
| 157 |
-
# 3sg possessive short
|
| 158 |
-
"sı": "-P3", "si": "-P3", "su": "-P3", "sü": "-P3",
|
| 159 |
-
# Genitive
|
| 160 |
-
"nin": "-GEN","nın": "-GEN","nun": "-GEN","nün": "-GEN",
|
| 161 |
-
# Instrumental
|
| 162 |
-
"le": "-INS", "la": "-INS",
|
| 163 |
-
# Equative
|
| 164 |
-
"ce": "-EQU","ca": "-EQU","çe": "-EQU","ça": "-EQU",
|
| 165 |
-
# Glide
|
| 166 |
-
"y": "-GLIDE",
|
| 167 |
-
}
|
| 168 |
-
|
| 169 |
-
_SUFFIX_MAP_SORTED = sorted(
|
| 170 |
-
EXTENDED_SUFFIX_MAP.items(), key=lambda x: len(x[0]), reverse=True
|
| 171 |
-
)
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
def reclassify_bpe_suffixes(tokens: list[dict]) -> list[dict]:
    """Reclassify BPE tokens: punctuation -> PUNCT, word-internal suffixes -> SUFFIX.

    Tokens of any other type are passed through unchanged.  A BPE token is

    * retyped as ``PUNCT`` (and flagged ``_punct``) when ``_is_punct``
      accepts it;
    * retyped as ``SUFFIX`` (flagged ``_reclassified`` and annotated with
      ``_suffix_label``) when it has no surrounding whitespace, directly
      follows a ROOT, SUFFIX or BPE token in the output, and its lowercased
      text is a key of ``EXTENDED_SUFFIX_MAP``;
    * left as is otherwise.

    Args:
        tokens: Token dicts carrying at least ``"token"`` and ``"type"``.

    Returns:
        A new list; input dicts are reused unmodified where nothing changed.
    """
    result: list[dict] = []
    for tok in tokens:
        if tok["type"] != "BPE":
            result.append(tok)
            continue

        raw = tok["token"]
        stripped = raw.strip()

        if _is_punct(raw):
            result.append({**tok, "type": "PUNCT", "_punct": True})
            continue

        # Only reclassify word-internal pieces, i.e. tokens that carry no
        # surrounding whitespace.
        if raw != stripped:
            result.append(tok)
            continue

        # A suffix needs something to attach to.
        prev_ok = bool(result) and result[-1]["type"] in ("ROOT", "SUFFIX", "BPE")
        if not prev_ok:
            result.append(tok)
            continue

        # The match is exact, so a direct dict lookup gives the same label
        # as scanning the length-sorted list, in O(1).
        label = EXTENDED_SUFFIX_MAP.get(stripped.lower())
        if label:
            result.append({
                "token": raw,
                "type": "SUFFIX",
                "_reclassified": True,
                "_suffix_label": label,
                # Extra keys of the original token are carried over last, so
                # they win over the defaults above on a name clash.
                **{k: v for k, v in tok.items() if k not in ("token", "type")},
            })
        else:
            result.append(tok)

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/_suffix_table.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Turkish suffix pattern table (260+ entries).
|
| 2 |
+
|
| 3 |
+
Maps surface-form suffixes to morphological labels. Used by the
|
| 4 |
+
segmentation engine for candidate generation (suffix stripping) and by
|
| 5 |
+
the post-annotation layer for ``_suffix_label`` metadata.
|
| 6 |
+
|
| 7 |
+
Suffixes are sorted longest-first at module load time so that the
|
| 8 |
+
candidate generator always tries the most specific match first.
|
| 9 |
+
|
| 10 |
+
Design note: some surface forms are ambiguous (e.g. "in" can be GEN or
|
| 11 |
+
2SG). This table assigns a single canonical label per surface form —
|
| 12 |
+
the most common interpretation in written Turkish. The candidate scoring
|
| 13 |
+
system resolves segmentation ambiguity via root validation, not via
|
| 14 |
+
suffix-label disambiguation.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
# ── Raw suffix → label mapping ───────────────────────────────────────────────
|
| 20 |
+
# Organised by morphological category for readability.
|
| 21 |
+
|
| 22 |
+
SUFFIX_MAP: dict[str, str] = {
    # Surface-form suffix -> morphological label.
    #
    # Every key must appear exactly once: a repeated key in a dict literal
    # is silently overwritten by its last occurrence, which hides the
    # earlier label without any error.
    # ── Plural + case ────────────────────────────────────────────────────
    "leri": "-PL+ACC", "ları": "-PL+ACC",
    "lere": "-PL+DAT", "lara": "-PL+DAT",
    "lerin": "-PL+GEN", "ların": "-PL+GEN",
    "lerde": "-PL+LOC", "larda": "-PL+LOC",
    "lerden": "-PL+ABL", "lardan": "-PL+ABL",
    "lerle": "-PL+INS", "larla": "-PL+INS",
    "lerce": "-PL+EQU", "larca": "-PL+EQU",
    # ── Loanword / derivational ──────────────────────────────────────────
    "yon": "-YON", "iyon": "-YON", "asyon": "-YON", "izasyon": "-YON",
    # ── Adjective derivation ─────────────────────────────────────────────
    "sal": "-ADJ.TR", "sel": "-ADJ.TR",
    # ── 1st/2nd plural possessive ────────────────────────────────────────
    "imiz": "-P1PL", "ımız": "-P1PL", "umuz": "-P1PL", "ümüz": "-P1PL",
    "iniz": "-P2PL", "ınız": "-P2PL", "unuz": "-P2PL", "ünüz": "-P2PL",
    # ── Possessive + case compounds ──────────────────────────────────────
    "ımı": "-P1+ACC", "imi": "-P1+ACC", "umu": "-P1+ACC", "ümü": "-P1+ACC",
    "ıyla": "-INS.COMP", "iyle": "-INS.COMP", "uyla": "-INS.COMP", "üyle": "-INS.COMP",
    "kten": "-ABL.COMP", "ğından": "-ABL.COMP", "ğinden": "-ABL.COMP",
    "yla": "-COM", "yle": "-COM",
    # ── Abstract noun + possessive ───────────────────────────────────────
    "liği": "-ABSTR+P3", "lığı": "-ABSTR+P3",
    "luğu": "-ABSTR+P3", "lüğü": "-ABSTR+P3",
    "liğini": "-ABSTR+P3+ACC", "lığını": "-ABSTR+P3+ACC",
    # ── -izm (ideology) ──────────────────────────────────────────────────
    "izm": "-ISM", "izmi": "-ISM+P3", "izmde": "-ISM+LOC",
    "izmden": "-ISM+ABL", "izmin": "-ISM+GEN",
    # ── Aorist ───────────────────────────────────────────────────────────
    "lir": "-AOR3SG", "lır": "-AOR3SG", "lur": "-AOR3SG", "lür": "-AOR3SG",
    # ── 3sg possessive + case ────────────────────────────────────────────
    # NOTE(review): "une" breaks vowel harmony; "una" (listed under the
    # BPE-origin suffixes below) looks like the intended form. Confirm
    # before removing it.
    "ine": "-P3+DAT", "ına": "-P3+DAT", "une": "-P3+DAT", "üne": "-P3+DAT",
    "inde": "-P3+LOC", "ında": "-P3+LOC", "unda": "-P3+LOC", "ünde": "-P3+LOC",
    "ini": "-P3+ACC", "ını": "-P3+ACC", "unu": "-P3+ACC", "ünü": "-P3+ACC",
    "inden": "-P3+ABL", "ından": "-P3+ABL", "undan": "-P3+ABL", "ünden": "-P3+ABL",
    # ── Locative-relative ────────────────────────────────────────────────
    "daki": "-LOC+REL", "deki": "-LOC+REL", "taki": "-LOC+REL", "teki": "-LOC+REL",
    # ── Passive + nominalization ─────────────────────────────────────────
    "lan": "-PASS+NZ", "len": "-PASS+NZ",
    # ── Verbal noun ──────────────────────────────────────────────────────
    "mesi": "-VN3", "ması": "-VN3",
    "mesini": "-VN3+ACC", "masını": "-VN3+ACC",
    "mesine": "-VN3+DAT", "masına": "-VN3+DAT",
    "mesinde": "-VN3+LOC", "masında": "-VN3+LOC",
    # ── Genitive + possessive ────────────────────────────────────────────
    "ının": "-GEN+P", "inin": "-GEN+P", "unun": "-GEN+P", "ünün": "-GEN+P",
    # ── Participle ───────────────────────────────────────────────────────
    "diği": "-PART", "dığı": "-PART", "tiği": "-PART", "tığı": "-PART",
    "duğu": "-PART", "düğü": "-PART", "tuğu": "-PART", "tüğü": "-PART",
    "ği": "-PART.SFX", "ğı": "-PART.SFX", "gu": "-PART.SFX", "gü": "-PART.SFX",
    # ── Negative verbal noun ─────────────────────────────────────────────
    "mas": "-NEG.VN", "mes": "-NEG.VN",
    # ── -sIn (2sg person marker) ─────────────────────────────────────────
    # These four surface forms used to be listed twice: here as "-IMP2" and
    # again under "Person suffixes" as "-2SG". The later value always won,
    # so "-IMP2" was never observable. They are now listed once, at their
    # original position, with the label that was actually in effect; the
    # resulting dict (contents and insertion order) is unchanged.
    "sin": "-2SG", "sın": "-2SG", "sun": "-2SG", "sün": "-2SG",
    # ── Passive short ────────────────────────────────────────────────────
    "ıl": "-PASS", "il": "-PASS", "ul": "-PASS", "ül": "-PASS",
    # ── Causative + VN ───────────────────────────────────────────────────
    "irme": "-CAUS+VN", "ırma": "-CAUS+VN", "urma": "-CAUS+VN",
    "ürme": "-CAUS+VN", "erme": "-CAUS+VN", "arma": "-CAUS+VN",
    # ── Past tense ───────────────────────────────────────────────────────
    "dım": "-DI1SG", "dim": "-DI1SG", "dum": "-DI1SG", "düm": "-DI1SG",
    "tım": "-DI1SG", "tim": "-DI1SG", "tum": "-DI1SG", "tüm": "-DI1SG",
    "dık": "-DI1PL", "dik": "-DI1PL", "duk": "-DI1PL", "dük": "-DI1PL",
    "tık": "-DI1PL", "tik": "-DI1PL", "tuk": "-DI1PL", "tük": "-DI1PL",
    "dın": "-DI2SG", "din": "-DI2SG", "dun": "-DI2SG", "dün": "-DI2SG",
    "tın": "-DI2SG", "tin": "-DI2SG", "tun": "-DI2SG", "tün": "-DI2SG",
    # ── Conditional ──────────────────────────────────────────────────────
    "sa": "-COND", "se": "-COND",
    # ── Progressive ──────────────────────────────────────────────────────
    "iyor": "-PROG", "ıyor": "-PROG", "uyor": "-PROG", "üyor": "-PROG",
    "yor": "-PROG",
    # ── Simple past ──────────────────────────────────────────────────────
    "dı": "-PST", "di": "-PST", "du": "-PST", "dü": "-PST",
    "tı": "-PST", "ti": "-PST", "tu": "-PST", "tü": "-PST",
    # ── Aorist short ─────────────────────────────────────────────────────
    "ir": "-AOR", "ır": "-AOR", "ur": "-AOR", "ür": "-AOR",
    "er": "-AOR", "ar": "-AOR",
    # ── Evidential past ──────────────────────────────────────────────────
    "mış": "-EVID", "miş": "-EVID", "muş": "-EVID", "müş": "-EVID",
    # ── Negation ─────────────────────────────────────────────────────────
    "ma": "-NEG", "me": "-NEG",
    "lama": "-VN+NEG", "leme": "-VN+NEG",
    "maya": "-NEG.INF",
    # ── Abilitative ──────────────────────────────────────────────────────
    "bil": "-ABIL",
    # ── Necessitative ────────────────────────────────────────────────────
    "malı": "-NECES", "meli": "-NECES",
    # ── Infinitive ───────────────────────────────────────────────────────
    "mak": "-INF", "mek": "-INF",
    # ── -ken (while/when) ────────────────────────────────────────────────
    "ken": "-WHEN",
    # ── Converb ──────────────────────────────────────────────────────────
    "arak": "-CONV", "erek": "-CONV",
    # ── With / without ───────────────────────────────────────────────────
    "lı": "-WITH", "li": "-WITH", "lu": "-WITH", "lü": "-WITH",
    "sız": "-WITHOUT", "siz": "-WITHOUT", "suz": "-WITHOUT", "süz": "-WITHOUT",
    # ── Agentive ─────────────────────────────────────────────────────────
    "cı": "-AGT", "ci": "-AGT", "cu": "-AGT", "cü": "-AGT",
    "çı": "-AGT", "çi": "-AGT", "çu": "-AGT", "çü": "-AGT",
    # ── Abstract noun ────────────────────────────────────────────────────
    "lık": "-ABSTR", "lik": "-ABSTR", "luk": "-ABSTR", "lük": "-ABSTR",
    "lığ": "-ABSTR", "liğ": "-ABSTR",
    # ── Optative 1pl ─────────────────────────────────────────────────────
    "elim": "-OPT1PL", "alım": "-OPT1PL",
    # ── Person suffixes ──────────────────────────────────────────────────
    # ("sin"/"sın"/"sun"/"sün" -> "-2SG" are defined once, in the -sIn
    # section above.)
    "ım": "-1SG", "im": "-1SG", "um": "-1SG", "üm": "-1SG",
    "iz": "-1PL", "ız": "-1PL", "uz": "-1PL", "üz": "-1PL",
    "nız": "-2PL", "niz": "-2PL", "nuz": "-2PL", "nüz": "-2PL",
    # ── Question ─────────────────────────────────────────────────────────
    "mı": "-Q", "mi": "-Q", "mu": "-Q", "mü": "-Q",
    # ── Accusative ───────────────────────────────────────────────────────
    "yı": "-ACC", "yi": "-ACC", "yu": "-ACC", "yü": "-ACC",
    "nı": "-ACC", "ni": "-ACC", "nu": "-ACC", "nü": "-ACC",
    # ── Dative ───────────────────────────────────────────────────────────
    "ya": "-DAT", "ye": "-DAT",
    "a": "-DAT", "e": "-DAT",
    # ── Ablative ─────────────────────────────────────────────────────────
    "dan": "-ABL", "den": "-ABL", "tan": "-ABL", "ten": "-ABL",
    # ── Locative ─────────────────────────────────────────────────────────
    "da": "-LOC", "de": "-LOC", "ta": "-LOC", "te": "-LOC",
    # ── Plural ───────────────────────────────────────────────────────────
    "lar": "-PL", "ler": "-PL",
    # ── 3sg possessive ───────────────────────────────────────────────────
    "sı": "-P3", "si": "-P3", "su": "-P3", "sü": "-P3",
    # ── Genitive ─────────────────────────────────────────────────────────
    "nin": "-GEN", "nın": "-GEN", "nun": "-GEN", "nün": "-GEN",
    "ın": "-GEN", "in": "-GEN", "un": "-GEN", "ün": "-GEN",
    # ── Instrumental ─────────────────────────────────────────────────────
    "le": "-INS", "la": "-INS",
    # ── Equative ─────────────────────────────────────────────────────────
    "ce": "-EQU", "ca": "-EQU", "çe": "-EQU", "ça": "-EQU",
    # ── Frequent BPE-origin suffixes ─────────────────────────────────────
    "eri": "-PL.SFX", "una": "-P3+DAT", "iril": "-PASS.SFX",
    "yan": "-PART.ACT", "ren": "-PART.ACT", "ıda": "-LOC.SFX",
    "üler": "-PL.SFX", "ıler": "-PL.SFX",
    "ri": "-PL.SFX",
    # ── Single-vowel accusative (used cautiously by the scorer) ──────────
    "ı": "-ACC", "i": "-ACC", "u": "-ACC", "ü": "-ACC",
}
|
| 162 |
+
|
| 163 |
+
# Single-vowel suffixes: too short and too ambiguous for aggressive
# stripping. The segmentation engine is meant to apply extra constraints
# when one of these matches (e.g. minimum root length of 3, root must be
# in TDK).
SHORT_AMBIGUOUS_SUFFIXES: frozenset[str] = frozenset("aeıiuü")
|
| 169 |
+
|
| 170 |
+
# (surface_form, label) pairs ordered longest surface form first, so the
# most specific suffix is tried before any shorter suffix that could match
# at the same position. The sort is stable: entries of equal length keep
# their SUFFIX_MAP insertion order.
SUFFIX_ENTRIES: list[tuple[str, str]] = sorted(
    SUFFIX_MAP.items(),
    key=lambda entry: -len(entry[0]),
)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ── Turkish suffixes that can follow an apostrophe ───────────────────────────
# Consulted for apostrophe-based segmentation (e.g. İstanbul'da, meeting'e).
# Sorted longest-first; suffixes of equal length keep the order in which
# they are written below.

APOSTROPHE_SUFFIXES: list[str] = sorted(
    (
        "nın", "nin", "nun", "nün", "dan", "den", "tan", "ten",
        "da", "de", "ta", "te", "ya", "ye", "nda", "nde",
        "yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü",
        "lar", "ler", "lara", "lere", "ları", "leri",
        "ım", "im", "um", "üm", "ın", "in", "un", "ün",
        "mız", "miz", "muz", "müz", "nız", "niz", "nuz", "nüz",
        "dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür",
        "ki", "li", "lı", "lu", "lü", "sız", "siz", "suz", "süz",
        "inci", "ıncı", "uncu", "üncü", "nci", "ncı",
        "lık", "lik", "luk", "lük",
        "a", "e", "ı", "i", "u", "ü",
    ),
    key=lambda suffix: -len(suffix),
)
|
nedo_turkish_tokenizer/_tdk_vocab.py
DELETED
|
@@ -1,148 +0,0 @@
|
|
| 1 |
-
"""Fix 7: TDK-based FOREIGN word detection."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import json
|
| 6 |
-
import os
|
| 7 |
-
from pathlib import Path
|
| 8 |
-
|
| 9 |
-
# Per-user cache directory that holds the downloaded TDK word list.
_CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
# NOTE: executed at import time, so importing this module creates the
# directory (and any missing parents) as a side effect.
_CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Path of the cached word list, kept as a plain string.
TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
# Word list expected inside the package, in a "data" directory next to
# this module.
_BUNDLED_TDK_FILE = Path(__file__).parent / "data" / "tdk_words.txt"

# Letters specific to the Turkish alphabet, in both cases.
TR_CHARS = set("çğışöüÇĞİŞÖÜ")

# In-process cache filled by load_tdk_words(); None means "not loaded yet".
_TDK_WORDS: set | None = None


# Download location of the word list in the Hugging Face repository.
_HF_TDK_URL = (
    "https://huggingface.co/Ethosoft/NedoTurkishTokenizer/resolve/main"
    "/nedo_turkish_tokenizer/data/tdk_words.txt"
)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def _read_word_file(path: Path) -> set[str]:
|
| 26 |
-
with path.open(encoding="utf-8") as f:
|
| 27 |
-
return {line.strip().lower() for line in f if line.strip()}
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def _load_cached_or_bundled_words() -> tuple[set[str] | None, str | None]:
    """Load the word list from the first location that exists.

    The user cache file is preferred over the copy bundled with the package.

    Returns:
        ``(words, source)`` where *source* is ``"cache"`` or
        ``"package bundle"``; ``(None, None)`` when neither file exists.
    """
    cache_path = Path(TDK_CACHE_FILE)
    if cache_path.exists():
        return _read_word_file(cache_path), "cache"
    if _BUNDLED_TDK_FILE.exists():
        return _read_word_file(_BUNDLED_TDK_FILE), "package bundle"
    return None, None
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
def load_tdk_words() -> set:
    """Return the TDK word set, loading it on first use.

    The result is memoised in the module-level ``_TDK_WORDS``. Lookup order:
    local cache / bundled file first; if neither exists, a download is
    attempted (Hugging Face, then the TDK API) and the files are read again.
    When nothing can be obtained an empty set is cached and returned.
    """
    global _TDK_WORDS
    if _TDK_WORDS is not None:
        return _TDK_WORDS

    loaded, origin = _load_cached_or_bundled_words()
    if loaded is None:
        if not os.path.exists(TDK_CACHE_FILE):
            print("[NedoTurkishTokenizer] TDK word list not found - downloading...")
            # The downloaders write the cache file themselves; only their
            # success matters here.
            if not (_download_from_hf() or _download_from_tdk()):
                _TDK_WORDS = set()
                return _TDK_WORDS

        loaded, origin = _load_cached_or_bundled_words()
        if loaded is None:
            _TDK_WORDS = set()
            return _TDK_WORDS

    _TDK_WORDS = loaded
    print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {origin} [ok]")
    return _TDK_WORDS
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
def _download_from_hf() -> list[str]:
    """Fetch the TDK word list from the Hugging Face repo and cache it.

    Returns the downloaded words (blank lines removed), or an empty list if
    anything goes wrong. Failures are reported on stdout, never raised,
    so the caller can fall back to another source.
    """
    try:
        import urllib.request  # noqa: PLC0415

        with urllib.request.urlopen(_HF_TDK_URL, timeout=30) as response:
            payload = response.read().decode("utf-8")

        words = []
        for line in payload.splitlines():
            entry = line.strip()
            if entry:
                words.append(entry)

        with open(TDK_CACHE_FILE, "w", encoding="utf-8") as cache_file:
            cache_file.write("\n".join(words))

        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace [ok]")
        return words

    except Exception as exc:  # noqa: BLE001
        # Deliberate best-effort: any failure means "try the next source".
        print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} - trying TDK API...")
        return []
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
def _download_from_tdk() -> list[str]:
    """Fallback: fetch the word list from the TDK autocomplete endpoint.

    Collects the ``madde`` value of every entry (stripped, lowercased,
    de-duplicated, sorted), writes the result to the cache file and
    returns it. Returns an empty list on any failure, which is reported
    on stdout rather than raised.
    """
    try:
        import urllib.request  # noqa: PLC0415

        url = "https://sozluk.gov.tr/autocomplete.json"
        with urllib.request.urlopen(url, timeout=30) as response:
            entries = json.loads(response.read().decode("utf-8"))

        unique_words = set()
        for item in entries:
            if item.get("madde"):
                unique_words.add(item.get("madde", "").strip().lower())
        words = sorted(unique_words)

        with open(TDK_CACHE_FILE, "w", encoding="utf-8") as cache_file:
            cache_file.write("\n".join(words))

        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API [ok]")
        return words

    except Exception as exc:  # noqa: BLE001
        # Deliberate best-effort: the caller treats [] as "no word list".
        print(f"[NedoTurkishTokenizer] TDK API also failed: {exc}")
        print("  FOREIGN detection will be disabled for this session.")
        return []
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
def download_tdk_words() -> list[str]:
    """Force a fresh download of the TDK word list.

    Tries Hugging Face first and falls back to the TDK API only when the
    first source yields nothing.
    """
    words = _download_from_hf()
    if not words:
        words = _download_from_tdk()
    return words
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
def is_foreign_word(word: str) -> bool:
    """Return True if *word* looks foreign (non-Turkish).

    A word is considered foreign when, after stripping surrounding
    whitespace, it has at least two characters, contains no
    Turkish-specific letter, and its lowercase form is not in the TDK
    word set.

    Args:
        word: The candidate word; surrounding whitespace is ignored.

    Returns:
        False for words shorter than two characters or containing a
        letter from ``TR_CHARS``; otherwise whether the lowercased word is
        missing from ``load_tdk_words()``.
    """
    stripped = word.strip()
    if len(stripped) < 2:
        return False

    # Test for Turkish letters BEFORE lowercasing. str.lower() turns "İ"
    # (U+0130) into "i" followed by a combining dot (U+0307), neither of
    # which is in TR_CHARS, so a word whose only Turkish letter is "İ"
    # would otherwise slip through and be reported as foreign. TR_CHARS
    # holds both cases, so nothing is lost by checking the original text.
    if any(ch in TR_CHARS for ch in stripped):
        return False

    return stripped.lower() not in load_tdk_words()
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
def reclassify_foreign_words(tokens: list[dict]) -> list[dict]:
    """Promote word-initial BPE tokens to ROOT when they are foreign words.

    A BPE token counts as word-initial when its text starts with
    whitespace. Matching tokens are copied with ``type="ROOT"``,
    ``_foreign=True`` and ``_tdk=False``; every other token is passed
    through as the same object. If the TDK word set is empty the input list
    itself is returned unchanged.
    """
    if not load_tdk_words():
        return tokens

    def _is_word_initial_foreign(tok: dict) -> bool:
        if tok["type"] != "BPE":
            return False
        text = tok["token"]
        bare = text.lstrip()
        # No leading whitespace removed -> the token continues a word.
        return text != bare and is_foreign_word(bare)

    return [
        {**tok, "type": "ROOT", "_foreign": True, "_tdk": False}
        if _is_word_initial_foreign(tok)
        else tok
        for tok in tokens
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/apostrophe.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Apostrophe-aware segmentation for Turkish text.
|
| 2 |
+
|
| 3 |
+
Handles two distinct cases:
|
| 4 |
+
1. **Turkish proper names** — İstanbul'da, Ankara'ya
|
| 5 |
+
→ ROOT(İstanbul) + PUNCT(') + SUFFIX(da)
|
| 6 |
+
2. **Foreign stems with Turkish suffixes** — meeting'e, zoom'da
|
| 7 |
+
→ FOREIGN(meeting) + SUFFIX(e)
|
| 8 |
+
|
| 9 |
+
The decision between these two cases uses:
|
| 10 |
+
- Turkish character detection (ç,ğ,ı,ş,ö,ü → Turkish)
|
| 11 |
+
- TDK dictionary lookup
|
| 12 |
+
- Proper noun list
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import re
|
| 18 |
+
|
| 19 |
+
from ._suffix_table import APOSTROPHE_SUFFIXES, SUFFIX_MAP
|
| 20 |
+
from .normalization import has_turkish_chars, turkish_lower
|
| 21 |
+
from .resources import load_proper_nouns, load_tdk_words
|
| 22 |
+
|
| 23 |
+
# Matches word'suffix patterns (both ASCII and Unicode apostrophes).
# Group 1: the base, two or more letters/digits (ASCII plus the Turkish
# letters listed in the class). Group 2: one to six letters after the
# apostrophe, which must end at a word boundary.
_APO_RE = re.compile(
    r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def is_turkish_base(word: str) -> bool:
    """Decide whether *word* should be handled as a Turkish word.

    The answer selects how ``word'suffix`` is treated: a Turkish base keeps
    the apostrophe as a punctuation boundary, a foreign base is merged into
    FOREIGN root + SUFFIX.

    Checks, in order (first hit wins):
        1. The lowercased word contains Turkish-specific characters.
        2. It is in the proper-noun list.
        3. It is in the TDK dictionary (when that dictionary is non-empty).
        4. Otherwise it is Turkish only if it is shorter than 4 characters,
           because such short words are too ambiguous to call foreign.
    """
    lowered = turkish_lower(word)

    # Short-circuiting keeps the proper-noun list from being loaded when
    # the character test already settles the question.
    if has_turkish_chars(lowered) or lowered in load_proper_nouns():
        return True

    tdk_words = load_tdk_words()
    if tdk_words and lowered in tdk_words:
        return True

    return len(lowered) < 4
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def split_apostrophe_words(
    text: str,
) -> tuple[str, list[tuple[str, str]]]:
    """Process apostrophe patterns in *text*.

    For **foreign** stems followed by a known Turkish suffix after the
    apostrophe, the apostrophe is replaced with a space so the word can
    later be segmented as FOREIGN ROOT + SUFFIX.

    For **Turkish** bases (İstanbul'da), and for any suffix that is not in
    ``APOSTROPHE_SUFFIXES``, the matched text is left unchanged.

    Args:
        text: Input text that may contain ``word'suffix`` patterns.

    Returns:
        ``(modified_text, [(foreign_base_lower, suffix_lower), ...])``,
        with one pair per rewritten match, in match order.
    """
    foreign_splits: list[tuple[str, str]] = []

    def _repl(m: re.Match) -> str:
        base, suffix = m.group(1), m.group(2)

        if is_turkish_base(base):
            return m.group(0)  # Keep apostrophe for Turkish names

        # Lowercase the suffix with the same Turkish-aware helper used for
        # the base. Plain str.lower() maps "İ" to "i" + U+0307 (which can
        # never match a table entry) and "I" to "i" instead of "ı".
        suffix_lower = turkish_lower(suffix)
        if suffix_lower not in APOSTROPHE_SUFFIXES:
            return m.group(0)

        foreign_splits.append((turkish_lower(base), suffix_lower))
        return f"{base} {suffix}"  # Drop apostrophe → space

    modified = _APO_RE.sub(_repl, text)
    return modified, foreign_splits
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def build_apostrophe_tokens(
    word: str, suffix_str: str, *, is_foreign: bool
) -> list[dict[str, object]]:
    """Build the token dicts for a ``word'suffix`` pattern.

    Args:
        word: Base word (the part before the apostrophe).
        suffix_str: Suffix text (the part after the apostrophe).
        is_foreign: True to emit FOREIGN + SUFFIX; False to emit
            ROOT + PUNCT(') + SUFFIX.

    Returns:
        Two tokens for a foreign base, three for a Turkish one. The suffix
        token carries ``_suffix_label`` from ``SUFFIX_MAP`` (``"-SFX"`` when
        the lowercased suffix is not in the table).
    """
    # The suffix token is the same in both layouts, so build it once.
    suffix_token: dict[str, object] = {
        "token": suffix_str, "token_type": "SUFFIX", "morph_pos": 1,
        "_apo_suffix": True,
        "_suffix_label": SUFFIX_MAP.get(suffix_str.lower(), "-SFX"),
    }

    if is_foreign:
        foreign_root: dict[str, object] = {
            "token": f" {word}", "token_type": "FOREIGN", "morph_pos": 0,
            "_foreign": True,
        }
        return [foreign_root, suffix_token]

    turkish_root: dict[str, object] = {
        "token": f" {word}", "token_type": "ROOT", "morph_pos": 0,
    }
    apostrophe: dict[str, object] = {
        "token": "'", "token_type": "PUNCT", "morph_pos": 0,
        "_punct": True,
    }
    return [turkish_root, apostrophe, suffix_token]
|
nedo_turkish_tokenizer/engine.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tokenization engine — orchestrates the full pipeline.
|
| 2 |
+
|
| 3 |
+
This is the central pipeline that ties together all modules:
|
| 4 |
+
1. Text normalization (Unicode, whitespace)
|
| 5 |
+
2. ALL CAPS detection and lowercasing
|
| 6 |
+
3. Special span extraction (URLs, numbers, dates, acronyms, emojis)
|
| 7 |
+
4. Word-level segmentation with candidate generation/selection
|
| 8 |
+
5. Post-annotation (allomorph labels, compound info, acronym expansion)
|
| 9 |
+
6. Number/unit reclassification safety net
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from ._domain_vocab import ALL_DOMAIN_ROOTS
|
| 15 |
+
from .morphology import annotate_acronyms, annotate_canonical, annotate_compounds
|
| 16 |
+
from .normalization import detect_all_caps, normalize_text
|
| 17 |
+
from .resources import load_tdk_words
|
| 18 |
+
from .segmentation import segment_word, split_into_words
|
| 19 |
+
from .special_spans import find_special_spans, make_special_tokens, reclassify_numbers_in_tokens
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TokenizationEngine:
    """Core tokenization engine.

    Loads the TDK word set and the domain vocabulary once at construction
    and then runs every text through the same fixed sequence of steps.

    This class is NOT the public API. Use ``NedoTurkishTokenizer``
    instead, which delegates to this engine.
    """

    def __init__(self) -> None:
        self._tdk: set[str] = load_tdk_words()
        self._domain_roots: frozenset[str] = ALL_DOMAIN_ROOTS

    def tokenize(self, text: str) -> list[dict[str, object]]:
        """Run the full tokenization pipeline on *text*.

        Returns a list of token dicts, each with at minimum ``token``,
        ``token_type`` and ``morph_pos``. Empty or whitespace-only input
        yields an empty list.
        """
        if not (text and text.strip()):
            return []

        # ── 1. Normalize ─────────────────────────────────────────────────
        text = normalize_text(text)

        # ── 2. ALL CAPS detection ────────────────────────────────────────
        text, caps_set = detect_all_caps(text)

        # ── 3. Special spans, with plain text segmented in between ───────
        collected: list[dict[str, object]] = []

        def emit_plain(chunk: str) -> None:
            # Whitespace-only gaps between spans produce no tokens.
            if chunk.strip():
                collected.extend(self._tokenize_segment(chunk, caps_set))

        cursor = 0
        for start, end, span_type, original in find_special_spans(text):
            if cursor < start:
                emit_plain(text[cursor:start])
            collected.extend(make_special_tokens(span_type, original))
            cursor = end

        if cursor < len(text):
            emit_plain(text[cursor:])

        # ── 4. Post-annotation passes ────────────────────────────────────
        tokens = reclassify_numbers_in_tokens(collected)
        tokens = annotate_canonical(tokens)
        tokens = annotate_compounds(tokens)
        tokens = annotate_acronyms(tokens)

        # ── 5. Finalize morph_pos ────────────────────────────────────────
        tokens = _compute_morph_pos(tokens)

        # ── 6. Strip internal leading spaces from token text ─────────────
        # The leading space marks word starts for the morph_pos pass above
        # and is not part of the public output.
        return _strip_token_text(tokens)

    def _tokenize_segment(
        self, segment: str, caps_set: frozenset[str]
    ) -> list[dict[str, object]]:
        """Tokenize a plain-text segment (no special spans)."""
        return [
            token
            for word in split_into_words(segment)
            for token in segment_word(word, self._tdk, self._domain_roots, caps_set)
        ]
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ── Helper: compute morph_pos across the full token stream ───────────────────
|
| 109 |
+
|
| 110 |
+
def _compute_morph_pos(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
|
| 111 |
+
"""Recompute ``morph_pos`` consistently across the token stream.
|
| 112 |
+
|
| 113 |
+
Rules:
|
| 114 |
+
- Word-initial tokens (leading space, special types, PUNCT) → morph_pos = 0
|
| 115 |
+
- SUFFIX tokens increment the position counter
|
| 116 |
+
- Apostrophe suffixes continue from the previous word
|
| 117 |
+
"""
|
| 118 |
+
result: list[dict[str, object]] = []
|
| 119 |
+
word_pos = 0
|
| 120 |
+
|
| 121 |
+
for tok in tokens:
|
| 122 |
+
raw = str(tok["token"])
|
| 123 |
+
token_type = str(tok["token_type"])
|
| 124 |
+
|
| 125 |
+
is_word_start = raw.startswith(" ") or raw.strip().startswith("<")
|
| 126 |
+
|
| 127 |
+
# Apostrophe suffixes continue the previous word
|
| 128 |
+
if tok.get("_apo_suffix"):
|
| 129 |
+
is_word_start = False
|
| 130 |
+
|
| 131 |
+
if is_word_start or token_type in (
|
| 132 |
+
"NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM", "PUNCT"
|
| 133 |
+
):
|
| 134 |
+
word_pos = 0
|
| 135 |
+
morph_pos = 0
|
| 136 |
+
elif token_type == "SUFFIX":
|
| 137 |
+
word_pos += 1
|
| 138 |
+
morph_pos = word_pos
|
| 139 |
+
else:
|
| 140 |
+
# ROOT or FOREIGN within a word (shouldn't normally happen)
|
| 141 |
+
word_pos = 0
|
| 142 |
+
morph_pos = 0
|
| 143 |
+
|
| 144 |
+
result.append({**tok, "morph_pos": morph_pos})
|
| 145 |
+
|
| 146 |
+
return result
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def _strip_token_text(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
|
| 150 |
+
"""Remove internal leading whitespace from all token text strings.
|
| 151 |
+
|
| 152 |
+
During pipeline processing, a leading space in ``token`` signals
|
| 153 |
+
a word-initial token. Once ``morph_pos`` has been computed, this
|
| 154 |
+
space is no longer needed and must be stripped so the public API
|
| 155 |
+
returns clean text.
|
| 156 |
+
"""
|
| 157 |
+
return [{**tok, "token": str(tok["token"]).lstrip()} for tok in tokens]
|
nedo_turkish_tokenizer/morphology.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Morphology utilities: suffix analysis, allomorph canonicalization, compound annotation.
|
| 2 |
+
|
| 3 |
+
This module provides:
|
| 4 |
+
- Suffix stripping and matching against the suffix table
|
| 5 |
+
- Allomorph → canonical morpheme mapping (e.g. "lar"/"ler" → "PL")
|
| 6 |
+
- Compound word detection and decomposition
|
| 7 |
+
- Acronym expansion annotation
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from ._acronym_table import ACRONYM_EXPANSIONS
|
| 13 |
+
from ._suffix_table import SUFFIX_ENTRIES, SUFFIX_MAP
|
| 14 |
+
|
| 15 |
+
# ── Allomorph → canonical morpheme mapping ───────────────────────────────────
|
| 16 |
+
# Maps surface-form suffixes to a language-neutral canonical label.
|
| 17 |
+
# Used for the ``_canonical`` metadata field on SUFFIX tokens.
|
| 18 |
+
|
| 19 |
+
# Keys are lowercase surface forms; get_canonical() and annotate_canonical()
# lowercase their input before looking it up here.
# NOTE: every surface form maps to exactly one label, so this table cannot
# express ambiguity between identical-looking suffixes (for example "ma"/"me"
# always resolve to "NEG" and "a"/"e" always resolve to "DAT").
ALLOMORPH_MAP: dict[str, str] = {
    "lar": "PL", "ler": "PL",
    "ı": "ACC", "i": "ACC", "u": "ACC", "ü": "ACC",
    "yı": "ACC", "yi": "ACC", "yu": "ACC", "yü": "ACC",
    "a": "DAT", "e": "DAT", "ya": "DAT", "ye": "DAT",
    "da": "LOC", "de": "LOC", "ta": "LOC", "te": "LOC",
    "dan": "ABL", "den": "ABL", "tan": "ABL", "ten": "ABL",
    "ın": "GEN", "in": "GEN", "un": "GEN", "ün": "GEN",
    "nın": "GEN", "nin": "GEN", "nun": "GEN", "nün": "GEN",
    "la": "INS", "le": "INS", "yla": "INS", "yle": "INS",
    "dı": "PAST", "di": "PAST", "du": "PAST", "dü": "PAST",
    "tı": "PAST", "ti": "PAST", "tu": "PAST", "tü": "PAST",
    "yor": "PROG", "iyor": "PROG", "ıyor": "PROG", "uyor": "PROG", "üyor": "PROG",
    "ar": "AOR", "er": "AOR",
    "ır": "AOR", "ir": "AOR", "ur": "AOR", "ür": "AOR",
    "mış": "EVID", "miş": "EVID", "muş": "EVID", "müş": "EVID",
    "ma": "NEG", "me": "NEG",
    "mak": "INF", "mek": "INF",
    "ım": "1SG", "im": "1SG", "um": "1SG", "üm": "1SG",
    "iz": "1PL", "ız": "1PL", "uz": "1PL", "üz": "1PL",
    "mı": "Q", "mi": "Q", "mu": "Q", "mü": "Q",
    "lı": "WITH", "li": "WITH", "lu": "WITH", "lü": "WITH",
    "sız": "WITHOUT", "siz": "WITHOUT", "suz": "WITHOUT", "süz": "WITHOUT",
    "cı": "AGT", "ci": "AGT", "cu": "AGT", "cü": "AGT",
    "çı": "AGT", "çi": "AGT", "çu": "AGT", "çü": "AGT",
    "lık": "ABSTR", "lik": "ABSTR", "luk": "ABSTR", "lük": "ABSTR",
    "sa": "COND", "se": "COND",
    "ıl": "PASS", "il": "PASS", "ul": "PASS", "ül": "PASS",
}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ── Compound word dictionary ────────────────────────────────────────────────
|
| 51 |
+
|
| 52 |
+
# Maps a lowercase surface form to the list of parts that annotate_compounds()
# attaches as ``_parts``.  Parts are given in their base form, so they are not
# always literal substrings of the key (e.g. "ayakkabı" → "kap").
#
# The combining form of "kardiyo-" is spelled the Turkish way, matching the
# keys it appears in, and "farmakoloji" is split at the "-loji" boundary like
# every other "-loji" entry.
KNOWN_COMPOUNDS: dict[str, list[str]] = {
    "başbakan": ["baş", "bakan"],
    "cumhurbaşkanı": ["cumhur", "başkan"],
    "dışişleri": ["dış", "iş"],
    "içişleri": ["iç", "iş"],
    "maliye": ["mal", "iye"],
    "belediye": ["beled", "iye"],
    "ayakkabı": ["ayak", "kap"],
    "yelkovan": ["yel", "kovan"],
    "saatlik": ["saat", "lik"],
    "günlük": ["gün", "lük"],
    "yıllık": ["yıl", "lık"],
    "aylık": ["ay", "lık"],
    "haftalık": ["hafta", "lık"],
    "gastrointestinal": ["gastro", "intestinal"],
    "kardiyovasküler": ["kardiyo", "vasküler"],
    "nöropsikiyatri": ["nöro", "psikiyatri"],
    "biyokimya": ["biyo", "kimya"],
    "mikrobiyoloji": ["mikro", "biyoloji"],
    "farmakoloji": ["farmako", "loji"],
    "patoloji": ["pato", "loji"],
    "hematoloji": ["hemato", "loji"],
    "nefroloji": ["nefro", "loji"],
    "kardiyoloji": ["kardiyo", "loji"],
    "radyoloji": ["radyo", "loji"],
    "onkoloji": ["onko", "loji"],
    "elektromanyetik": ["elektro", "manyetik"],
    "termodinamik": ["termo", "dinamik"],
    "hidroelektrik": ["hidro", "elektrik"],
    "biyoinformatik": ["biyo", "informatik"],
    "nanoteknoloji": ["nano", "teknoloji"],
    "futbolcu": ["futbol", "cu"],
    "basketbolcu": ["basketbol", "cu"],
    "voleybolcu": ["voleybol", "cu"],
}
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ── Suffix label lookup ─────────────────────────────────────────────────────
|
| 90 |
+
|
| 91 |
+
def get_suffix_label(surface: str) -> str | None:
    """Return the morphological label for a suffix surface form, or None.

    The surface form is lowercased with Turkish casing rules before the
    lookup.  Plain ``str.lower()`` turns "İ" into "i" followed by a combining
    dot (U+0307) and "I" into "i", so capitalised suffixes containing either
    letter would miss the lowercase keys of ``SUFFIX_MAP``.

    Args:
        surface: Suffix text in any casing.

    Returns:
        The label stored in ``SUFFIX_MAP`` for the lowercased surface, or
        ``None`` when the suffix is not in the table.
    """
    # Imported here because this module has no top-level import of it.
    from .normalization import turkish_lower

    return SUFFIX_MAP.get(turkish_lower(surface))
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def get_canonical(surface: str) -> str | None:
    """Return the canonical morpheme label for a suffix, or None.

    The surface form is lowercased with Turkish casing rules before the
    lookup.  Plain ``str.lower()`` turns "İ" into "i" followed by a combining
    dot (U+0307) and "I" into "i", so a capitalised allomorph such as "İN"
    would not match its lowercase key in ``ALLOMORPH_MAP``.

    Args:
        surface: Suffix text in any casing.

    Returns:
        The canonical label (e.g. ``"PL"``) or ``None`` when the surface form
        is not a known allomorph.
    """
    # Imported here because this module has no top-level import of it.
    from .normalization import turkish_lower

    return ALLOMORPH_MAP.get(turkish_lower(surface))
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ── Post-annotation passes ──────────────────────────────────────────────────
|
| 102 |
+
|
| 103 |
+
def annotate_canonical(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
    """Attach a ``_canonical`` label to SUFFIX tokens with a known allomorph.

    For each token whose ``token_type`` is ``"SUFFIX"``, the stripped,
    lowercased text is looked up in ``ALLOMORPH_MAP`` (e.g. 'lar' → 'PL').
    A hit yields a copy of the token with ``_canonical`` added; all other
    tokens are passed through as the same dict objects.

    NOTE(review): the lookup uses ``str.lower()``, which is not Turkish-aware;
    token text is presumably already lowercased upstream — confirm.
    """
    annotated: list[dict[str, object]] = []
    for entry in tokens:
        label = None
        if entry["token_type"] == "SUFFIX":
            label = ALLOMORPH_MAP.get(str(entry["token"]).strip().lower())
        annotated.append({**entry, "_canonical": label} if label else entry)
    return annotated
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def annotate_compounds(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
    """Flag ROOT tokens whose text is a key of ``KNOWN_COMPOUNDS``.

    A matching token is replaced by a copy carrying ``_compound=True`` and
    ``_parts`` (the list stored in ``KNOWN_COMPOUNDS``).  Non-ROOT tokens,
    ROOT tokens whose stripped text starts with ``"<"``, and ROOT tokens that
    are not in the dictionary are passed through unchanged.
    """
    annotated: list[dict[str, object]] = []
    for entry in tokens:
        parts = None
        if entry["token_type"] == "ROOT":
            text = str(entry["token"]).strip()
            # Texts starting with "<" are never treated as compounds.
            if not text.startswith("<"):
                parts = KNOWN_COMPOUNDS.get(text.lower())

        if parts is None:
            annotated.append(entry)
        else:
            annotated.append({**entry, "_compound": True, "_parts": parts})
    return annotated
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def annotate_acronyms(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
    """Add ``_expansion`` to tokens whose text is a known acronym.

    The stripped, upper-cased token text is looked up in
    ``ACRONYM_EXPANSIONS``.  When an expansion exists:

    - ACRONYM tokens gain ``_expansion`` and ``_known_acronym=True``.
    - ROOT tokens flagged ``_acronym`` or ``_caps`` are additionally promoted
      to ``token_type="ACRONYM"``.

    Every other token, including ACRONYM tokens without a known expansion,
    is passed through unchanged.

    NOTE(review): the key is built with ``str.upper()``, which maps "i" to
    "I" rather than "İ"; whether that matches the table's keys cannot be
    seen from here — confirm.
    """
    annotated: list[dict[str, object]] = []
    for entry in tokens:
        expansion = ACRONYM_EXPANSIONS.get(str(entry["token"]).strip().upper())
        kind = entry["token_type"]

        if not expansion:
            annotated.append(entry)
            continue

        if kind == "ACRONYM":
            annotated.append(
                {**entry, "_expansion": expansion, "_known_acronym": True}
            )
        elif kind == "ROOT" and (entry.get("_acronym") or entry.get("_caps")):
            annotated.append({
                **entry, "token_type": "ACRONYM",
                "_expansion": expansion, "_known_acronym": True,
            })
        else:
            annotated.append(entry)
    return annotated
|
nedo_turkish_tokenizer/normalization.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Text normalization utilities for Turkish text.
|
| 2 |
+
|
| 3 |
+
Handles:
|
| 4 |
+
- Turkish-aware lowercasing (İ→i, I→ı)
|
| 5 |
+
- Unicode NFC normalization
|
| 6 |
+
- Whitespace cleanup
|
| 7 |
+
- ALL CAPS word detection and lowercasing
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import re
|
| 13 |
+
import unicodedata
|
| 14 |
+
|
| 15 |
+
# Turkish-specific characters — presence indicates a Turkish word
|
| 16 |
+
TR_CHARS: frozenset[str] = frozenset("çğışöüÇĞİŞÖÜ")
|
| 17 |
+
|
| 18 |
+
# Pattern for detecting ALL CAPS words (≥2 uppercase letters)
|
| 19 |
+
_CAPS_RE = re.compile(r"\b([A-ZÇĞİÖŞÜ]{2,})\b")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def turkish_lower(s: str) -> str:
    """Lowercase *s* using Turkish casing rules for the two capital I letters.

    "İ" (dotted capital) becomes "i" and "I" (dotless capital) becomes "ı";
    everything else goes through the standard ``str.lower()``.

    Plain ``str.lower()`` is wrong for Turkish on both letters: it maps "I"
    to "i", and it maps "İ" to "i" followed by a combining dot above
    (U+0307) rather than to a bare "i".
    """
    turkish_capitals = str.maketrans({"İ": "i", "I": "ı"})
    return s.translate(turkish_capitals).lower()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def normalize_text(text: str) -> str:
    """Return *text* in Unicode NFC form with whitespace collapsed.

    Every run of whitespace becomes a single space, and leading and trailing
    whitespace is removed.
    """
    composed = unicodedata.normalize("NFC", text)
    single_spaced = re.sub(r"\s+", " ", composed)
    return single_spaced.strip()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def has_turkish_chars(word: str) -> bool:
    """Return True if *word* contains at least one character from ``TR_CHARS``.

    ``TR_CHARS`` holds the Turkish-specific letters ç, ğ, ı, ş, ö, ü and their
    capital forms.
    """
    return not TR_CHARS.isdisjoint(word)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def detect_all_caps(text: str) -> tuple[str, frozenset[str]]:
    """Lowercase every ALL CAPS word in *text* and report which words changed.

    ALL CAPS words such as ``İSTANBUL`` get in the way of suffix-based
    segmentation, which works on lowercase text.  Each match of ``_CAPS_RE``
    is replaced by its ``turkish_lower`` form, and the lowered forms are
    collected so the resulting tokens can later be annotated with
    ``_caps=True``.

    Returns:
        ``(modified_text, frozenset_of_lowered_caps_words)``
    """
    lowered_words: set[str] = set()
    pieces: list[str] = []
    cursor = 0

    # Rebuild the text piecewise: untouched gap, lowered match, next gap, ...
    for match in _CAPS_RE.finditer(text):
        lowered = turkish_lower(match.group(1))
        lowered_words.add(lowered)
        pieces.append(text[cursor:match.start()])
        pieces.append(lowered)
        cursor = match.end()
    pieces.append(text[cursor:])

    return "".join(pieces), frozenset(lowered_words)
|
nedo_turkish_tokenizer/resources.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Static resource loading for the tokenizer.
|
| 2 |
+
|
| 3 |
+
Loads bundled data files (TDK dictionary, proper nouns) from the package
|
| 4 |
+
``data/`` directory. All resources are loaded lazily on first access and
|
| 5 |
+
cached in module-level globals.
|
| 6 |
+
|
| 7 |
+
**No network access. No runtime downloads. Fully offline.**
|
| 8 |
+
|
| 9 |
+
The TDK dictionary contains infinitive verb forms (e.g. "gelmek") but the
|
| 10 |
+
tokenizer needs bare verb stems (e.g. "gel") for suffix stripping. This
|
| 11 |
+
module automatically derives verb stems from infinitives at load time.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
_DATA_DIR = Path(__file__).parent / "data"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ── TDK Word List ────────────────────────────────────────────────────────────
|
| 22 |
+
|
| 23 |
+
_TDK_WORDS: set[str] | None = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _derive_verb_stems(raw_words: set[str]) -> set[str]:
|
| 27 |
+
"""Derive bare verb stems from TDK infinitive entries.
|
| 28 |
+
|
| 29 |
+
TDK lists verbs as infinitives ("gelmek", "bakmak"). The tokenizer
|
| 30 |
+
needs bare stems ("gel", "bak") for suffix stripping.
|
| 31 |
+
|
| 32 |
+
This function strips "-mak"/"-mek" from infinitives and adds the
|
| 33 |
+
resulting stems to the word set. Only stems of 2+ characters are
|
| 34 |
+
added to avoid spurious short matches.
|
| 35 |
+
"""
|
| 36 |
+
derived: set[str] = set()
|
| 37 |
+
for word in raw_words:
|
| 38 |
+
if word.endswith("mak") and len(word) > 4:
|
| 39 |
+
stem = word[:-3]
|
| 40 |
+
if len(stem) >= 2:
|
| 41 |
+
derived.add(stem)
|
| 42 |
+
elif word.endswith("mek") and len(word) > 4:
|
| 43 |
+
stem = word[:-3]
|
| 44 |
+
if len(stem) >= 2:
|
| 45 |
+
derived.add(stem)
|
| 46 |
+
return derived
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def load_tdk_words() -> set[str]:
    """Load the TDK (Türk Dil Kurumu) word list from the bundled data file.

    The result is computed on the first call and cached in ``_TDK_WORDS``;
    later calls return the same set object.  If ``tdk_words.txt`` is missing,
    an empty set is cached and returned.

    Returns a set of lowercase Turkish words including:
    - Original dictionary entries (one per non-blank line of the file)
    - Derived verb stems (stripped -mak/-mek from infinitives)

    Entries are lowercased with ``turkish_lower`` so they compare equal to
    the ``turkish_lower``-ed words the segmenter looks up.  Plain
    ``str.lower()`` would turn "I" into "i" and "İ" into "i" + U+0307, and
    such entries could never match.

    Used for:
    - Root validation during suffix stripping (is the remainder a real word?)
    - Foreign word detection (word absent from TDK → likely foreign)
    - Turkish-base detection for apostrophe handling
    """
    global _TDK_WORDS
    if _TDK_WORDS is not None:
        return _TDK_WORDS

    # Imported here because this module has no top-level import of it.
    from .normalization import turkish_lower

    tdk_path = _DATA_DIR / "tdk_words.txt"
    if tdk_path.exists():
        stripped_lines = map(
            str.strip, tdk_path.read_text(encoding="utf-8").splitlines()
        )
        raw_words = {turkish_lower(entry) for entry in stripped_lines if entry}
        # Derive verb stems from infinitives (gelmek→gel, bakmak→bak)
        _TDK_WORDS = raw_words | _derive_verb_stems(raw_words)
    else:
        _TDK_WORDS = set()

    return _TDK_WORDS
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ── Proper Nouns ─────────────────────────────────────────────────────────────
|
| 82 |
+
|
| 83 |
+
_PROPER_NOUNS: set[str] | None = None
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def load_proper_nouns() -> set[str]:
    """Load Turkish proper nouns (cities, regions, names) from bundled data.

    The result is computed on the first call and cached in ``_PROPER_NOUNS``;
    later calls return the same set object.  If ``turkish_proper_nouns.txt``
    is missing, an empty set is cached and returned.

    Blank lines and comment lines are skipped.  A line is a comment when its
    first non-whitespace character is ``#``, so indented comments are skipped
    as well.

    Entries are lowercased with ``turkish_lower`` so that names beginning
    with "İ" or "I" (İstanbul, Isparta) compare equal to the
    ``turkish_lower``-ed words the segmenter looks up.  Plain ``str.lower()``
    would produce "i" + U+0307 and "i" respectively, and those entries could
    never match.

    Used in apostrophe handling to distinguish Turkish proper names
    (İstanbul'da → keep as Turkish ROOT) from foreign words
    (meeting'e → mark as FOREIGN ROOT).
    """
    global _PROPER_NOUNS
    if _PROPER_NOUNS is not None:
        return _PROPER_NOUNS

    # Imported here because this module has no top-level import of it.
    from .normalization import turkish_lower

    nouns: set[str] = set()
    path = _DATA_DIR / "turkish_proper_nouns.txt"
    if path.exists():
        for line in path.read_text(encoding="utf-8").splitlines():
            entry = line.strip()
            if entry and not entry.startswith("#"):
                nouns.add(turkish_lower(entry))

    _PROPER_NOUNS = nouns
    return _PROPER_NOUNS
|
nedo_turkish_tokenizer/segmentation.py
ADDED
|
@@ -0,0 +1,475 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Word-level segmentation with candidate generation and selection.
|
| 2 |
+
|
| 3 |
+
This is the core of the tokenizer. For each word it:
|
| 4 |
+
1. Generates multiple segmentation candidates (whole-word ROOT, suffix
|
| 5 |
+
chains, foreign root, etc.)
|
| 6 |
+
2. Scores each candidate deterministically
|
| 7 |
+
3. Selects the highest-scoring segmentation
|
| 8 |
+
|
| 9 |
+
The scoring rules are transparent and tunable:
|
| 10 |
+
- TDK root match gives a large bonus
|
| 11 |
+
- Domain vocabulary match gives a moderate bonus
|
| 12 |
+
- Longer roots are preferred over shorter ones
|
| 13 |
+
- Each recognised suffix adds a small bonus
|
| 14 |
+
- Unknown / unvalidated roots get a low base score
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import re
|
| 20 |
+
from typing import Any
|
| 21 |
+
|
| 22 |
+
from ._domain_vocab import ALL_DOMAIN_ROOTS
|
| 23 |
+
from ._suffix_table import (
|
| 24 |
+
SHORT_AMBIGUOUS_SUFFIXES,
|
| 25 |
+
SUFFIX_ENTRIES,
|
| 26 |
+
SUFFIX_MAP,
|
| 27 |
+
)
|
| 28 |
+
from .normalization import has_turkish_chars, turkish_lower
|
| 29 |
+
from .resources import load_proper_nouns, load_tdk_words
|
| 30 |
+
from .types import PUNCT_CHARS, SegmentationCandidate, Token, is_punct_token
|
| 31 |
+
|
| 32 |
+
# ── Scoring constants ────────────────────────────────────────────────────────
|
| 33 |
+
# Why these values: TDK_BONUS dominates so that a TDK-validated root almost
|
| 34 |
+
# always wins over an unvalidated one. SUFFIX_BONUS is small enough that
|
| 35 |
+
# over-segmentation (many tiny suffixes) doesn't beat a valid longer root.
|
| 36 |
+
|
| 37 |
+
# All scores are sums of these integer constants; they are applied in
# generate_candidates() and _generate_suffix_candidates().
_TDK_BONUS = 10  # Root found in TDK dictionary
_DOMAIN_BONUS = 8  # Root found in domain vocabulary
_SUFFIX_BONUS = 2  # Each recognised suffix
_ROOT_LEN_WEIGHT = 2  # Per-character bonus for root length (prefer longer roots)
# Extra bonus for the unsplit whole-word candidate when the word is in TDK,
# the proper-noun list or the domain vocabulary; also part of the fixed score
# given to KNOWN_INTACT words.
_WHOLE_WORD_BONUS = 5
# Base score for foreign root (intentionally low); added to the plain word
# length, without _ROOT_LEN_WEIGHT.
_FOREIGN_BASE = 3
_UNKNOWN_BASE = 1  # Base score for unrecognised root
# Penalty when the root is no longer than _MIN_ROOT_LEN chars (the check is
# ``<=``; shorter remainders are rejected before scoring).
_SHORT_ROOT_PENALTY = 4
_MIN_ROOT_LEN = 2  # Minimum root length for suffix stripping
_MAX_SUFFIX_DEPTH = 5  # Maximum number of suffixes to strip
|
| 47 |
+
|
| 48 |
+
# ── Known-intact words ───────────────────────────────────────────────────────
|
| 49 |
+
# Common Turkish words that *look* like root+suffix but must stay whole.
|
| 50 |
+
# Without this set, "dedi" would split into "de" (TDK conjunction) + "di"
|
| 51 |
+
# (past tense suffix) because both are individually valid.
|
| 52 |
+
#
|
| 53 |
+
# This set covers inflected forms of very short verb stems (de-, ye-) and
|
| 54 |
+
# common discourse particles that happen to end in suffix-like sequences.
|
| 55 |
+
|
| 56 |
+
# Membership is tested against the turkish_lower()-ed word in
# generate_candidates(), so every entry must be lowercase.
KNOWN_INTACT: frozenset[str] = frozenset({
    # Forms of "demek" (to say) — stem "de" is a TDK conjunction,
    # causing false splits like de+di, de+miş, de+se, etc.
    "dedi", "dedim", "dedin", "dedik", "dediniz", "dediler",
    "demiş", "demişti", "demiştir",
    "dese", "desem", "desen", "desek",
    "der", "derim", "dersin", "deriz",
    "denir", "dendi", "denmiş",
    # Forms of "yemek" (to eat) — stem "ye" is in TDK
    "yemiş", "yese", "yesem", "yesen",
    "yer", "yerim", "yersin", "yeriz",
    "yenir", "yendi", "yenmiş",
    # Common particles / conjunctions that end in suffix-like sequences
    # (most already protected by TDK WHOLE_WORD_BONUS, but double-guarding)
    "diye", "niye", "nice",
})
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ── Punctuation splitting ────────────────────────────────────────────────────
|
| 75 |
+
|
| 76 |
+
# Regex to split a word at apostrophes (keeping the apostrophe)
|
| 77 |
+
_APOSTROPHE_RE = re.compile(r"(['\u2019])")
|
| 78 |
+
|
| 79 |
+
# Regex to split leading/trailing punctuation from a word
|
| 80 |
+
_LEADING_PUNCT_RE = re.compile(r"^([^\w]+)")
|
| 81 |
+
_TRAILING_PUNCT_RE = re.compile(r"([^\w]+)$")
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _split_punctuation(word: str) -> list[tuple[str, str]]:
    """Break a raw word token into ``(text, type)`` pairs.

    Leading and trailing runs of non-word characters are peeled off and
    emitted one character at a time as ``"PUNCT"``; whatever remains in the
    middle is emitted as a single ``"WORD"`` pair.
    For example: ``'"hello,'`` → ``[('"', 'PUNCT'), ('hello', 'WORD'), (',', 'PUNCT')]``

    An empty string yields an empty list, and a token that
    ``is_punct_token`` accepts is returned whole as one ``"PUNCT"`` pair.
    """
    if not word:
        return []

    # A token made up only of punctuation is kept as a single unit.
    if is_punct_token(word):
        return [(word, "PUNCT")]

    core = word

    leading: list[tuple[str, str]] = []
    head = _LEADING_PUNCT_RE.match(core)
    if head is not None:
        leading = [(ch, "PUNCT") for ch in head.group(1)]
        core = core[head.end():]

    trailing: list[tuple[str, str]] = []
    tail = _TRAILING_PUNCT_RE.search(core)
    if tail is not None:
        trailing = [(ch, "PUNCT") for ch in tail.group(1)]
        core = core[:tail.start()]

    middle: list[tuple[str, str]] = [(core, "WORD")] if core else []
    return leading + middle + trailing
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ── Word splitting ───────────────────────────────────────────────────────────
|
| 122 |
+
|
| 123 |
+
def split_into_words(text: str) -> list[str]:
    """Return the whitespace-delimited chunks of *text*.

    Each chunk keeps its original casing and any punctuation attached to it;
    runs of whitespace never produce empty chunks.
    """
    words = text.split(sep=None)
    return words
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# ── Candidate generation ────────────────────────────────────────────────────
|
| 132 |
+
|
| 133 |
+
def _generate_suffix_candidates(
    word_lower: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    depth: int = 0,
) -> list[SegmentationCandidate]:
    """Recursively generate segmentation candidates by stripping suffixes.

    Iterates over ``SUFFIX_ENTRIES`` in table order (documented as longest
    first).  For each suffix that ends *word_lower*:

    - If the remainder is in *tdk* or *domain_roots*, a two-token
      ``[ROOT, SUFFIX]`` candidate is produced.
    - The remainder is also passed to a recursive call, and every accepted
      sub-candidate is extended with the current suffix.

    NOTE(review): recursion happens for every matching suffix, not only when
    the remainder failed root validation.

    Args:
        word_lower: Lowercased word, or the remainder left by an outer call.
        tdk: TDK dictionary set used for root validation.
        domain_roots: Domain vocabulary used for root validation.
        depth: Current recursion depth; recursion stops at
            ``_MAX_SUFFIX_DEPTH``.

    Returns:
        Candidates in discovery order (not sorted).  Empty when the depth
        limit is reached or *word_lower* is shorter than ``_MIN_ROOT_LEN``.
    """
    if depth >= _MAX_SUFFIX_DEPTH or len(word_lower) < _MIN_ROOT_LEN:
        return []

    candidates: list[SegmentationCandidate] = []

    for suffix_surface, suffix_label in SUFFIX_ENTRIES:
        if not word_lower.endswith(suffix_surface):
            continue

        remainder = word_lower[: -len(suffix_surface)]
        if len(remainder) < _MIN_ROOT_LEN:
            continue

        # Extra caution for very short / ambiguous suffixes
        if suffix_surface in SHORT_AMBIGUOUS_SUFFIXES and len(remainder) < 3:
            continue

        # One Token instance per suffix; it is shared by the direct candidate
        # and by every extended candidate built below.
        suffix_token = Token(
            text=suffix_surface,
            token_type="SUFFIX",
            metadata={"_suffix_label": suffix_label},
        )

        # Check if remainder is a valid root
        root_in_tdk = remainder in tdk
        root_in_domain = remainder in domain_roots
        root_score = len(remainder) * _ROOT_LEN_WEIGHT

        if root_in_tdk:
            root_score += _TDK_BONUS
        elif root_in_domain:
            root_score += _DOMAIN_BONUS
        else:
            # This score is never emitted: a direct candidate is only created
            # below when the root is in TDK or the domain vocabulary.
            root_score += _UNKNOWN_BASE

        # Penalise very short roots: 2-char roots like "de", "ye", "al"
        # are valid TDK entries but produce many false splits on short
        # words (e.g. "dedi" → de+di).  The penalty makes it harder for
        # a 2-char root to beat the whole-word hypothesis.
        if len(remainder) <= _MIN_ROOT_LEN:
            root_score -= _SHORT_ROOT_PENALTY

        if root_in_tdk or root_in_domain:
            # Valid root found → create single-level candidate
            root_token = Token(
                text=remainder,
                token_type="ROOT",
                metadata={"_tdk": root_in_tdk, "_domain": root_in_domain} if root_in_domain else {},
            )
            total_score = root_score + _SUFFIX_BONUS
            candidates.append(SegmentationCandidate(
                tokens=[root_token, suffix_token],
                score=total_score,
                source="suffix_chain",
            ))

        # Recurse: try stripping more suffixes from the remainder
        if depth < _MAX_SUFFIX_DEPTH - 1:
            sub_candidates = _generate_suffix_candidates(
                remainder, tdk, domain_roots, depth + 1
            )
            for sc in sub_candidates:
                # Only accept recursive results that found a real root
                # NOTE(review): this threshold uses the plain remainder length
                # without _ROOT_LEN_WEIGHT, unlike the root scores above —
                # confirm that is intended.
                if sc.score > len(remainder) + _UNKNOWN_BASE:
                    extended = SegmentationCandidate(
                        tokens=sc.tokens + [suffix_token],
                        score=sc.score + _SUFFIX_BONUS,
                        source="suffix_chain",
                    )
                    candidates.append(extended)

    return candidates
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def generate_candidates(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[SegmentationCandidate]:
    """Generate all plausible segmentation candidates for a single word.

    Candidates are appended in this order: the whole word as ROOT, the
    suffix-stripping candidates, then (when applicable) a FOREIGN candidate.
    The list is finally sorted by score, highest first; the sort is stable,
    so candidates with equal scores keep that order.

    Args:
        word: The word to segment; it is lowercased with ``turkish_lower``
            before any lookup.
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary roots.
        caps_set: Lowercased words whose ROOT token should carry
            ``_caps=True`` (presumably the set returned by
            ``detect_all_caps`` — confirm against the caller).

    Returns:
        A list of candidates sorted by score (highest first).  For words in
        ``KNOWN_INTACT`` the list holds exactly one whole-word candidate.
    """
    wl = turkish_lower(word)
    candidates: list[SegmentationCandidate] = []

    is_caps = wl in caps_set
    is_tr_chars = has_turkish_chars(wl)

    # ── Fast path: known-intact words bypass candidate generation ────────
    # These are common words that look splittable but must stay whole.
    if wl in KNOWN_INTACT:
        root_meta_intact: dict[str, Any] = {}
        if is_caps:
            root_meta_intact["_caps"] = True
        return [SegmentationCandidate(
            tokens=[Token(text=wl, token_type="ROOT", metadata=root_meta_intact)],
            score=len(wl) * _ROOT_LEN_WEIGHT + _TDK_BONUS + _WHOLE_WORD_BONUS,
            source="known_intact",
        )]

    # ── Candidate 1: whole word as ROOT ──────────────────────────────────
    in_tdk = wl in tdk
    in_proper = wl in load_proper_nouns()
    in_domain = wl in domain_roots
    whole_score = len(wl) * _ROOT_LEN_WEIGHT
    if in_tdk or in_proper:
        # Whole-word TDK/proper-noun match gets an extra bonus to prevent
        # over-segmenting valid dictionary words like "dünya" into
        # "dün" + "ya".
        whole_score += _TDK_BONUS + _WHOLE_WORD_BONUS
    elif in_domain:
        whole_score += _DOMAIN_BONUS + _WHOLE_WORD_BONUS
    else:
        whole_score += _UNKNOWN_BASE

    root_meta: dict[str, Any] = {}
    if is_caps:
        root_meta["_caps"] = True
    if in_domain:
        root_meta["_domain"] = True

    whole_root = Token(text=wl, token_type="ROOT", metadata=root_meta)
    candidates.append(SegmentationCandidate(
        tokens=[whole_root],
        score=whole_score,
        source="whole_word",
    ))

    # ── Candidate 2+: suffix stripping ───────────────────────────────────
    suffix_cands = _generate_suffix_candidates(wl, tdk, domain_roots)
    for sc in suffix_cands:
        # Propagate caps flag to the root token
        # (this mutates the metadata dict of the candidate's first token)
        if is_caps and sc.tokens:
            sc.tokens[0].metadata["_caps"] = True
        candidates.append(sc)

    # ── Candidate N: foreign root ────────────────────────────────────────
    # Only offered when the word is unknown to TDK and the proper-noun list
    # and contains no Turkish-specific characters.
    if not in_tdk and not in_proper and not is_tr_chars and len(wl) >= 2:
        foreign_token = Token(
            text=wl, token_type="FOREIGN",
            metadata={"_foreign": True},
        )
        # Foreign score uses flat weight 1 (not ROOT_LEN_WEIGHT) so that
        # valid suffix chains with a TDK root always beat FOREIGN.
        foreign_score = _FOREIGN_BASE + len(wl)
        candidates.append(SegmentationCandidate(
            tokens=[foreign_token],
            score=foreign_score,
            source="foreign",
        ))

    # Sort by score descending (highest first)
    candidates.sort(key=lambda c: c.score, reverse=True)
    return candidates
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
# ── Candidate selection ──────────────────────────────────────────────────────
|
| 304 |
+
|
| 305 |
+
def select_best_candidate(
    candidates: list[SegmentationCandidate],
) -> SegmentationCandidate:
    """Select the best segmentation among candidates.

    Picks the highest-scoring candidate. The maximum score is computed
    here, so the result does not depend on *candidates* being pre-sorted.
    Ties on the top score are broken by:
      1. Fewer tokens (less fragmentation)
      2. Longer root token (only tokens whose ``token_type`` is ``"ROOT"``
         count; a candidate without one is treated as root length 0)
    Candidates still tied after both keys resolve to whichever appears
    first in *candidates*.

    Args:
        candidates: Scored segmentation candidates for a single word.

    Returns:
        The winning candidate. For an empty list, a placeholder candidate
        holding one empty ROOT token, score ``0.0`` and source
        ``"fallback"``.
    """
    if not candidates:
        # Fallback: should never happen, but safety net
        return SegmentationCandidate(
            tokens=[Token(text="", token_type="ROOT")],
            score=0.0,
            source="fallback",
        )

    def _tie_key(cand):
        # Smaller tuple wins: fewer tokens first, then the longer root
        # (negated so that "longer" sorts as "smaller").
        root_len = max(
            (len(t.text) for t in cand.tokens if t.token_type == "ROOT"),
            default=0,
        )
        return (len(cand.tokens), -root_len)

    # Do not assume candidates[0] carries the top score.
    best_score = max(c.score for c in candidates)
    tied = [c for c in candidates if c.score == best_score]

    # min() returns the first minimal element, matching a stable sort + [0].
    return min(tied, key=_tie_key)
|
| 341 |
+
|
| 342 |
+
|
| 343 |
+
# ── Full word segmentation ───────────────────────────────────────────────────
|
| 344 |
+
|
| 345 |
+
def segment_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Turn one whitespace-delimited word into a list of token dicts.

    The word is first cut into punctuation and word pieces. Punctuation
    pieces become PUNCT tokens; word pieces containing an apostrophe are
    delegated to ``_segment_apostrophe_word``; every other word piece goes
    through candidate generation and best-candidate selection.

    Only the very first emitted token of the word gets a leading space
    (the word-initial marker).

    Args:
        word: Raw word string (may include surrounding punctuation).
        tdk: TDK dictionary set.
        domain_roots: Domain vocabulary set.
        caps_set: Set of words that were originally ALL CAPS.

    Returns:
        List of token dicts ready for inclusion in the output.
    """
    output: list[dict[str, object]] = []
    at_word_start = True

    for piece, kind in _split_punctuation(word):
        if kind == "PUNCT":
            lead = " " if at_word_start else ""
            output.append({
                "token": f"{lead}{piece}",
                "token_type": "PUNCT",
                "morph_pos": 0,
                "_punct": True,
            })
        elif "'" in piece or "\u2019" in piece:
            # Apostrophe inside the word piece: dedicated splitter.
            apo_tokens = _segment_apostrophe_word(piece, tdk, domain_roots, caps_set)
            if at_word_start and apo_tokens:
                head = apo_tokens[0]
                head["token"] = f" {head['token'].lstrip()}"
            output.extend(apo_tokens)
        else:
            # Plain word piece: score all candidates and keep the winner.
            winner = select_best_candidate(
                generate_candidates(piece, tdk, domain_roots, caps_set)
            )
            for position, tok in enumerate(winner.tokens):
                entry = tok.to_dict()
                if position == 0 and at_word_start:
                    entry["token"] = f" {entry['token'].lstrip()}"
                # morph_pos is simply the index inside the word (0 = root).
                entry["morph_pos"] = position
                output.append(entry)

        at_word_start = False

    return output
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
def _segment_apostrophe_word(
    word: str,
    tdk: set[str],
    domain_roots: frozenset[str],
    caps_set: frozenset[str],
) -> list[dict[str, object]]:
    """Segment a word containing an apostrophe.

    The word is split at the first ASCII apostrophe (``'``) or, when there
    is none, at the first typographic apostrophe (``\u2019``). The part
    before it is the base, the part after it the suffix.

    * Turkish base (per ``is_turkish_base``): ROOT + PUNCT(``'``) + SUFFIX.
      The PUNCT token is always the ASCII apostrophe.
    * Otherwise: FOREIGN + SUFFIX (no apostrophe token is emitted).

    The SUFFIX token is only added when the suffix is non-empty. Its label
    is looked up in ``SUFFIX_MAP`` using Turkish-aware lowercasing, so a
    capitalised suffix such as ``NIN`` is looked up as ``nın`` rather than
    ``nin``; unknown suffixes get the label ``"-SFX"``.

    Args:
        word: Word piece expected to contain an apostrophe.
        tdk: TDK dictionary set (used only on the no-apostrophe fallback).
        domain_roots: Domain vocabulary set (fallback only).
        caps_set: Set of lowercased words that were originally ALL CAPS.

    Returns:
        List of token dicts for the word.
    """
    from .apostrophe import is_turkish_base  # avoid circular at module level

    # Find the apostrophe position
    apo_pos = word.find("'")
    if apo_pos == -1:
        apo_pos = word.find("\u2019")
    if apo_pos == -1:
        # No apostrophe found (shouldn't happen) — treat as regular word
        candidates = generate_candidates(word, tdk, domain_roots, caps_set)
        best = select_best_candidate(candidates)
        return [t.to_dict() for t in best.tokens]

    base = word[:apo_pos]
    suffix = word[apo_pos + 1:]

    is_caps = turkish_lower(base) in caps_set

    tokens: list[dict[str, object]]
    if is_turkish_base(base):
        # Turkish proper name: ROOT + PUNCT(') + SUFFIX
        root: dict[str, object] = {
            "token": base, "token_type": "ROOT", "morph_pos": 0,
        }
        if is_caps:
            root["_caps"] = True
        tokens = [
            root,
            {
                "token": "'", "token_type": "PUNCT", "morph_pos": 0,
                "_punct": True,
            },
        ]
    else:
        # Foreign word: FOREIGN + SUFFIX
        tokens = [
            {
                "token": base, "token_type": "FOREIGN", "morph_pos": 0,
                "_foreign": True,
            },
        ]

    if suffix:
        # str.lower() would map "I" -> "i" and "İ" -> "i̇" (i + combining
        # dot), missing the Turkish keys; use the Turkish-aware helper.
        suffix_label = SUFFIX_MAP.get(turkish_lower(suffix), "-SFX")
        tokens.append({
            "token": suffix, "token_type": "SUFFIX", "morph_pos": 1,
            "_apo_suffix": True, "_suffix_label": suffix_label,
        })
    return tokens
|
nedo_turkish_tokenizer/{_normalizer.py → special_spans.py}
RENAMED
|
@@ -1,100 +1,85 @@
|
|
| 1 |
-
"""
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
import re
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
"
|
| 20 |
-
"
|
| 21 |
-
"
|
| 22 |
-
"
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
"lar","ler","lara","lere","ları","leri",
|
| 46 |
-
"ım","im","um","üm","ın","in","un","ün",
|
| 47 |
-
"mız","miz","muz","müz","nız","niz","nuz","nüz",
|
| 48 |
-
"dır","dir","dur","dür","tır","tir","tur","tür",
|
| 49 |
-
"ki","li","lı","lu","lü","sız","siz","suz","süz",
|
| 50 |
-
"inci","ıncı","uncu","üncü","nci","ncı",
|
| 51 |
-
"lık","lik","luk","lük",
|
| 52 |
-
"a","e","ı","i","u","ü",
|
| 53 |
-
],
|
| 54 |
-
key=len,
|
| 55 |
-
reverse=True,
|
| 56 |
-
)
|
| 57 |
|
| 58 |
-
_SUFFIX_ALT =
|
| 59 |
|
| 60 |
-
# Number
|
| 61 |
NUM_APOSTROPHE_RE = re.compile(
|
| 62 |
r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
|
| 63 |
re.IGNORECASE,
|
| 64 |
)
|
| 65 |
|
| 66 |
-
DATE_RE
|
| 67 |
-
r
|
| 68 |
-
r
|
| 69 |
)
|
| 70 |
-
CURRENCY_RE
|
| 71 |
-
NUMBER_RE
|
| 72 |
-
r
|
| 73 |
-
r
|
| 74 |
-
r
|
| 75 |
-
r
|
| 76 |
-
r
|
| 77 |
)
|
| 78 |
-
TIME_RE
|
| 79 |
-
PLAIN_NUM_RE
|
| 80 |
-
|
| 81 |
-
#
|
| 82 |
-
# Matches standalone uppercase sequences (+ optional trailing digits).
|
| 83 |
-
# [A-Z]{2,}[0-9]* → HTML, GPT, CSS3, HTML5, MP3
|
| 84 |
-
# [A-Z][0-9]+ → F16, H264, A4
|
| 85 |
-
# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
|
| 86 |
ACRONYM_RE = re.compile(
|
| 87 |
r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
|
| 88 |
r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
|
| 89 |
)
|
| 90 |
|
| 91 |
-
# Acronym
|
| 92 |
ACRONYM_APOSTROPHE_RE = re.compile(
|
| 93 |
r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
|
| 94 |
+ _SUFFIX_ALT + r")+\b"
|
| 95 |
)
|
| 96 |
|
| 97 |
-
TEXT_EMOJI_RE
|
| 98 |
UNICODE_EMOJI_RE = re.compile(
|
| 99 |
"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
|
| 100 |
"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
|
|
@@ -103,20 +88,20 @@ UNICODE_EMOJI_RE = re.compile(
|
|
| 103 |
flags=re.UNICODE,
|
| 104 |
)
|
| 105 |
|
| 106 |
-
#
|
| 107 |
-
_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
|
| 108 |
-
(URL_RE,
|
| 109 |
-
(MENTION_RE,
|
| 110 |
-
(HASHTAG_RE,
|
| 111 |
-
(DATE_RE,
|
| 112 |
-
(CURRENCY_RE,
|
| 113 |
-
(NUM_APOSTROPHE_RE,
|
| 114 |
-
(ACRONYM_APOSTROPHE_RE,
|
| 115 |
-
(ACRONYM_RE,
|
| 116 |
-
(NUMBER_RE,
|
| 117 |
-
(TIME_RE,
|
| 118 |
-
(PLAIN_NUM_RE,
|
| 119 |
-
(UNICODE_EMOJI_RE,
|
| 120 |
(TEXT_EMOJI_RE, "EMOJI"),
|
| 121 |
]
|
| 122 |
|
|
@@ -124,42 +109,34 @@ _SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
|
|
| 124 |
# ── Acronym vs Turkish word disambiguation ───────────────────────────────────
|
| 125 |
|
| 126 |
def _is_known_turkish_word(word_upper: str) -> bool:
|
| 127 |
-
"""Return True if *word_upper* (ALL CAPS) is a
|
| 128 |
|
| 129 |
-
Checks
|
| 130 |
1. ACRONYM_EXPANSIONS dict → always acronym (return False)
|
| 131 |
-
2.
|
| 132 |
-
3.
|
| 133 |
-
4.
|
| 134 |
-
5. Otherwise → treat as acronym (return False)
|
| 135 |
"""
|
| 136 |
-
from ._acronym_dict import ACRONYM_EXPANSIONS # noqa: PLC0415
|
| 137 |
-
from ._preprocessor import _turkish_lower, _load_proper_nouns # noqa: PLC0415
|
| 138 |
-
from ._tdk_vocab import load_tdk_words # noqa: PLC0415
|
| 139 |
-
|
| 140 |
# Known acronyms always win
|
| 141 |
if word_upper in ACRONYM_EXPANSIONS:
|
| 142 |
return False
|
| 143 |
-
# Also check without trailing digits (HTML5 → HTML)
|
| 144 |
base = word_upper.rstrip("0123456789")
|
| 145 |
if base and base != word_upper and base in ACRONYM_EXPANSIONS:
|
| 146 |
return False
|
| 147 |
|
| 148 |
-
wl =
|
| 149 |
|
| 150 |
-
# TDK dictionary: if the lowercase form is a real Turkish word → not acronym
|
| 151 |
tdk = load_tdk_words()
|
| 152 |
if tdk and wl in tdk:
|
| 153 |
return True
|
| 154 |
|
| 155 |
-
|
| 156 |
-
if wl in _load_proper_nouns():
|
| 157 |
return True
|
| 158 |
|
| 159 |
return False
|
| 160 |
|
| 161 |
|
| 162 |
-
# ──
|
| 163 |
|
| 164 |
def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
| 165 |
"""Find all special-token spans in *text*.
|
|
@@ -172,9 +149,8 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
|
| 172 |
for m in pattern.finditer(text):
|
| 173 |
original = m.group(0)
|
| 174 |
|
| 175 |
-
# Acronym filtering: skip if it's actually a Turkish word
|
| 176 |
if ttype in ("ACRONYM", "ACRONYM_APO"):
|
| 177 |
-
# Extract the uppercase base (before apostrophe for APO)
|
| 178 |
if ttype == "ACRONYM_APO":
|
| 179 |
apo = original.find("'")
|
| 180 |
if apo == -1:
|
|
@@ -200,28 +176,38 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
|
| 200 |
return result
|
| 201 |
|
| 202 |
|
| 203 |
-
def
|
| 204 |
-
"""Split a suffix string (after apostrophe) into individual
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
remaining = suffix_str.lower()
|
| 207 |
while remaining:
|
| 208 |
matched = False
|
| 209 |
-
for s in
|
| 210 |
if remaining.startswith(s):
|
| 211 |
-
|
|
|
|
| 212 |
remaining = remaining[len(s):]
|
| 213 |
matched = True
|
| 214 |
break
|
| 215 |
if not matched:
|
| 216 |
-
|
|
|
|
| 217 |
break
|
| 218 |
-
return
|
| 219 |
|
| 220 |
|
| 221 |
-
def make_special_tokens(
|
|
|
|
|
|
|
| 222 |
"""Create token dict(s) for a matched special span.
|
| 223 |
|
| 224 |
-
``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX
|
|
|
|
| 225 |
"""
|
| 226 |
# ── Number + apostrophe + suffix (3'te, 1990'larda) ──────────────────
|
| 227 |
if span_type == "NUM_APO":
|
|
@@ -229,10 +215,16 @@ def make_special_tokens(span_type: str, original: str) -> list[dict]:
|
|
| 229 |
if apo_pos == -1:
|
| 230 |
apo_pos = original.find("\u2019")
|
| 231 |
num_part = original[:apo_pos]
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
# ── Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ──────────────
|
| 238 |
if span_type == "ACRONYM_APO":
|
|
@@ -240,43 +232,59 @@ def make_special_tokens(span_type: str, original: str) -> list[dict]:
|
|
| 240 |
if apo_pos == -1:
|
| 241 |
apo_pos = original.find("\u2019")
|
| 242 |
acr_part = original[:apo_pos]
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
# ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
|
| 249 |
if span_type == "ACRONYM":
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
# ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
|
| 253 |
return [{
|
| 254 |
"token": f" {original}",
|
| 255 |
-
"
|
|
|
|
| 256 |
f"_{span_type.lower()}": True,
|
| 257 |
}]
|
| 258 |
|
| 259 |
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
"""Catch remaining number/unit tokens missed by span detection."""
|
| 264 |
-
result: list[dict] = []
|
| 265 |
for tok in tokens:
|
| 266 |
-
|
|
|
|
| 267 |
result.append(tok)
|
| 268 |
continue
|
| 269 |
|
| 270 |
-
raw = tok["token"].strip()
|
| 271 |
|
| 272 |
if NUMBER_RE.fullmatch(raw):
|
| 273 |
-
result.append({**tok, "
|
| 274 |
-
elif raw.lower() in UNITS
|
| 275 |
-
result.append({**tok, "
|
| 276 |
-
elif raw.lower() in ROMAN_NUMERALS
|
| 277 |
-
result.append({**tok, "
|
| 278 |
-
elif raw.lower() in MONTH_NAMES
|
| 279 |
-
result.append({**tok, "
|
| 280 |
else:
|
| 281 |
result.append(tok)
|
| 282 |
|
|
|
|
| 1 |
+
"""Special span detection: URLs, numbers, dates, mentions, hashtags, emojis, acronyms.
|
| 2 |
|
| 3 |
+
Detects non-textual spans in the input text **before** the word-level
|
| 4 |
+
segmentation runs, so they are never mistakenly split by suffix
|
| 5 |
+
stripping. Returns a sorted, non-overlapping list of spans.
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import re
|
| 11 |
|
| 12 |
+
from ._acronym_table import ACRONYM_EXPANSIONS
|
| 13 |
+
from ._suffix_table import APOSTROPHE_SUFFIXES
|
| 14 |
+
from .normalization import turkish_lower
|
| 15 |
+
from .resources import load_proper_nouns, load_tdk_words
|
| 16 |
+
|
| 17 |
+
# ── Static vocabulary sets ───────────────────────────────────────────────────
|
| 18 |
+
|
| 19 |
+
MONTH_NAMES: frozenset[str] = frozenset({
|
| 20 |
+
"ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
|
| 21 |
+
"temmuz", "ağustos", "eylül", "ekim", "kasım", "aralık",
|
| 22 |
+
"january", "february", "march", "april", "may", "june",
|
| 23 |
+
"july", "august", "september", "october", "november", "december",
|
| 24 |
+
})
|
| 25 |
+
|
| 26 |
+
UNITS: frozenset[str] = frozenset({
|
| 27 |
+
"km", "m", "cm", "mm", "nm",
|
| 28 |
+
"kg", "g", "mg", "ton",
|
| 29 |
+
"sn", "dk", "sa", "ms",
|
| 30 |
+
"tl", "usd", "eur", "gbp",
|
| 31 |
+
"kb", "mb", "gb", "tb", "pb",
|
| 32 |
+
"ml", "mcg", "meq", "iu", "mmhg", "mosm",
|
| 33 |
+
"hz", "mhz", "ghz", "watt", "kw", "mw", "kcal", "cal",
|
| 34 |
+
})
|
| 35 |
+
|
| 36 |
+
ROMAN_NUMERALS: frozenset[str] = frozenset({
|
| 37 |
+
"i", "ii", "iii", "iv", "vi", "vii", "viii", "ix",
|
| 38 |
+
"xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
|
| 39 |
+
})
|
| 40 |
+
|
| 41 |
+
# ── Regex patterns ───────────────────────────────────────────────────────────
|
| 42 |
+
|
| 43 |
+
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
|
| 44 |
+
MENTION_RE = re.compile(r"@[\w\u00C0-\u024F]+")
|
| 45 |
+
HASHTAG_RE = re.compile(r"#[\w\u00C0-\u024F]+")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
+
_SUFFIX_ALT = "|".join(re.escape(s) for s in APOSTROPHE_SUFFIXES)
|
| 48 |
|
| 49 |
+
# Number + apostrophe + Turkish suffix(es)
|
| 50 |
NUM_APOSTROPHE_RE = re.compile(
|
| 51 |
r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
|
| 52 |
re.IGNORECASE,
|
| 53 |
)
|
| 54 |
|
| 55 |
+
DATE_RE = re.compile(
|
| 56 |
+
r"\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}"
|
| 57 |
+
r"|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}"
|
| 58 |
)
|
| 59 |
+
CURRENCY_RE = re.compile(r"[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]")
|
| 60 |
+
NUMBER_RE = re.compile(
|
| 61 |
+
r"%\d+[\.,]?\d*"
|
| 62 |
+
r"|\d{1,3}(?:\.\d{3})+" # thousands (1.000.000)
|
| 63 |
+
r"|\d+[\.,]\d+" # decimal
|
| 64 |
+
r"|\d+%"
|
| 65 |
+
r"|\d+/\d+"
|
| 66 |
)
|
| 67 |
+
TIME_RE = re.compile(r"\d{1,2}:\d{2}(?::\d{2})?")
|
| 68 |
+
PLAIN_NUM_RE = re.compile(r"\b\d+\b")
|
| 69 |
+
|
| 70 |
+
# Acronyms: standalone uppercase 2+ letters (optionally + digits)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
ACRONYM_RE = re.compile(
|
| 72 |
r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
|
| 73 |
r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
|
| 74 |
)
|
| 75 |
|
| 76 |
+
# Acronym + apostrophe + Turkish suffix(es)
|
| 77 |
ACRONYM_APOSTROPHE_RE = re.compile(
|
| 78 |
r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
|
| 79 |
+ _SUFFIX_ALT + r")+\b"
|
| 80 |
)
|
| 81 |
|
| 82 |
+
TEXT_EMOJI_RE = re.compile(r"[:;=]-?[\)\(\]\[dDpPoO3]|<3")
|
| 83 |
UNICODE_EMOJI_RE = re.compile(
|
| 84 |
"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
|
| 85 |
"\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
|
|
|
|
| 88 |
flags=re.UNICODE,
|
| 89 |
)
|
| 90 |
|
| 91 |
+
# Priority order: earlier entries win when spans overlap
|
| 92 |
+
_SPAN_PATTERNS: list[tuple[re.Pattern[str], str]] = [
|
| 93 |
+
(URL_RE, "URL"),
|
| 94 |
+
(MENTION_RE, "MENTION"),
|
| 95 |
+
(HASHTAG_RE, "HASHTAG"),
|
| 96 |
+
(DATE_RE, "DATE"),
|
| 97 |
+
(CURRENCY_RE, "UNIT"),
|
| 98 |
+
(NUM_APOSTROPHE_RE, "NUM_APO"),
|
| 99 |
+
(ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
|
| 100 |
+
(ACRONYM_RE, "ACRONYM"),
|
| 101 |
+
(NUMBER_RE, "NUM"),
|
| 102 |
+
(TIME_RE, "NUM"),
|
| 103 |
+
(PLAIN_NUM_RE, "NUM"),
|
| 104 |
+
(UNICODE_EMOJI_RE, "EMOJI"),
|
| 105 |
(TEXT_EMOJI_RE, "EMOJI"),
|
| 106 |
]
|
| 107 |
|
|
|
|
| 109 |
# ── Acronym vs Turkish word disambiguation ───────────────────────────────────
|
| 110 |
|
| 111 |
def _is_known_turkish_word(word_upper: str) -> bool:
    """Decide whether an ALL-CAPS match is a real Turkish word, not an acronym.

    Decision order:
      1. Listed in ACRONYM_EXPANSIONS (as-is, or after stripping trailing
         digits) → acronym, return False
      2. Turkish-lowercased form is in the TDK dictionary → return True
      3. Turkish-lowercased form is a known proper noun → return True
      4. Anything else → acronym, return False
    """
    # Known acronyms always win, with or without a numeric tail (HTML5 → HTML).
    stem = word_upper.rstrip("0123456789")
    if word_upper in ACRONYM_EXPANSIONS or (
        stem and stem != word_upper and stem in ACRONYM_EXPANSIONS
    ):
        return False

    lowered = turkish_lower(word_upper)

    dictionary = load_tdk_words()
    if dictionary and lowered in dictionary:
        return True

    # Last resort: the proper-noun list decides.
    return lowered in load_proper_nouns()
|
| 137 |
|
| 138 |
|
| 139 |
+
# ── Public API ───────────────────────────────────────────────────────────────
|
| 140 |
|
| 141 |
def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
|
| 142 |
"""Find all special-token spans in *text*.
|
|
|
|
| 149 |
for m in pattern.finditer(text):
|
| 150 |
original = m.group(0)
|
| 151 |
|
| 152 |
+
# Acronym filtering: skip if it's actually a common Turkish word
|
| 153 |
if ttype in ("ACRONYM", "ACRONYM_APO"):
|
|
|
|
| 154 |
if ttype == "ACRONYM_APO":
|
| 155 |
apo = original.find("'")
|
| 156 |
if apo == -1:
|
|
|
|
| 176 |
return result
|
| 177 |
|
| 178 |
|
| 179 |
+
def split_apostrophe_suffixes(suffix_str: str) -> list[tuple[str, str]]:
    """Split a suffix string (after apostrophe) into individual suffix pieces.

    The string is lowercased with the Turkish-aware ``turkish_lower`` and
    then consumed greedily from the left: at each step the first entry of
    ``APOSTROPHE_SUFFIXES`` that prefixes the remainder is taken. If no
    entry matches, the whole remainder is emitted as one piece labelled
    ``"-SFX"`` and splitting stops.

    NOTE(review): "first match wins" only yields longest-match behaviour
    if ``APOSTROPHE_SUFFIXES`` is ordered longest-first — confirm in
    ``_suffix_table``.

    Args:
        suffix_str: Text following the apostrophe, in any letter case.

    Returns:
        A list of ``(surface_form, label)`` tuples; surface forms are
        lowercase. Empty input yields an empty list.
    """
    from ._suffix_table import SUFFIX_MAP  # avoid circular at module level

    pieces: list[tuple[str, str]] = []
    # str.lower() turns "I" into "i" and "İ" into "i̇" (i + combining dot),
    # neither of which matches Turkish suffixes such as "ın" / "in".
    remaining = turkish_lower(suffix_str)
    while remaining:
        # Skip empty table entries: "" prefixes everything and would never
        # shorten `remaining`, looping forever.
        hit = next(
            (s for s in APOSTROPHE_SUFFIXES if s and remaining.startswith(s)),
            None,
        )
        if hit is None:
            # Unrecognised remainder → emit as a single suffix chunk
            pieces.append((remaining, "-SFX"))
            break
        pieces.append((hit, SUFFIX_MAP.get(hit, "-SFX")))
        remaining = remaining[len(hit):]
    return pieces
|
| 202 |
|
| 203 |
|
| 204 |
+
def make_special_tokens(
|
| 205 |
+
span_type: str, original: str
|
| 206 |
+
) -> list[dict[str, object]]:
|
| 207 |
"""Create token dict(s) for a matched special span.
|
| 208 |
|
| 209 |
+
``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX
|
| 210 |
+
tokens.
|
| 211 |
"""
|
| 212 |
# ── Number + apostrophe + suffix (3'te, 1990'larda) ──────────────────
|
| 213 |
if span_type == "NUM_APO":
|
|
|
|
| 215 |
if apo_pos == -1:
|
| 216 |
apo_pos = original.find("\u2019")
|
| 217 |
num_part = original[:apo_pos]
|
| 218 |
+
suffix_pieces = split_apostrophe_suffixes(original[apo_pos + 1:])
|
| 219 |
+
result: list[dict[str, object]] = [
|
| 220 |
+
{"token": f" {num_part}", "token_type": "NUM", "morph_pos": 0, "_num": True},
|
| 221 |
]
|
| 222 |
+
for idx, (surf, label) in enumerate(suffix_pieces, start=1):
|
| 223 |
+
result.append({
|
| 224 |
+
"token": surf, "token_type": "SUFFIX", "morph_pos": idx,
|
| 225 |
+
"_apo_suffix": True, "_suffix_label": label,
|
| 226 |
+
})
|
| 227 |
+
return result
|
| 228 |
|
| 229 |
# ── Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ──────────────
|
| 230 |
if span_type == "ACRONYM_APO":
|
|
|
|
| 232 |
if apo_pos == -1:
|
| 233 |
apo_pos = original.find("\u2019")
|
| 234 |
acr_part = original[:apo_pos]
|
| 235 |
+
suffix_pieces = split_apostrophe_suffixes(original[apo_pos + 1:])
|
| 236 |
+
expansion = ACRONYM_EXPANSIONS.get(acr_part.upper())
|
| 237 |
+
meta: dict[str, object] = {"_acronym": True}
|
| 238 |
+
if expansion:
|
| 239 |
+
meta["_expansion"] = expansion
|
| 240 |
+
meta["_known_acronym"] = True
|
| 241 |
+
result = [
|
| 242 |
+
{"token": f" {acr_part}", "token_type": "ACRONYM", "morph_pos": 0, **meta},
|
| 243 |
]
|
| 244 |
+
for idx, (surf, label) in enumerate(suffix_pieces, start=1):
|
| 245 |
+
result.append({
|
| 246 |
+
"token": surf, "token_type": "SUFFIX", "morph_pos": idx,
|
| 247 |
+
"_apo_suffix": True, "_suffix_label": label,
|
| 248 |
+
})
|
| 249 |
+
return result
|
| 250 |
|
| 251 |
# ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
|
| 252 |
if span_type == "ACRONYM":
|
| 253 |
+
expansion = ACRONYM_EXPANSIONS.get(original.upper())
|
| 254 |
+
meta = {"_acronym": True}
|
| 255 |
+
if expansion:
|
| 256 |
+
meta["_expansion"] = expansion
|
| 257 |
+
meta["_known_acronym"] = True
|
| 258 |
+
return [{"token": f" {original}", "token_type": "ACRONYM", "morph_pos": 0, **meta}]
|
| 259 |
|
| 260 |
# ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
|
| 261 |
return [{
|
| 262 |
"token": f" {original}",
|
| 263 |
+
"token_type": span_type,
|
| 264 |
+
"morph_pos": 0,
|
| 265 |
f"_{span_type.lower()}": True,
|
| 266 |
}]
|
| 267 |
|
| 268 |
|
| 269 |
+
def reclassify_numbers_in_tokens(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
|
| 270 |
+
"""Post-pass: catch remaining numbers / units missed by span detection."""
|
| 271 |
+
result: list[dict[str, object]] = []
|
|
|
|
|
|
|
| 272 |
for tok in tokens:
|
| 273 |
+
tt = tok["token_type"]
|
| 274 |
+
if tt not in ("ROOT", "FOREIGN"):
|
| 275 |
result.append(tok)
|
| 276 |
continue
|
| 277 |
|
| 278 |
+
raw = str(tok["token"]).strip()
|
| 279 |
|
| 280 |
if NUMBER_RE.fullmatch(raw):
|
| 281 |
+
result.append({**tok, "token_type": "NUM", "_num": True})
|
| 282 |
+
elif raw.lower() in UNITS:
|
| 283 |
+
result.append({**tok, "token_type": "UNIT", "_unit": True})
|
| 284 |
+
elif raw.lower() in ROMAN_NUMERALS:
|
| 285 |
+
result.append({**tok, "token_type": "NUM", "_roman": True})
|
| 286 |
+
elif raw.lower() in MONTH_NAMES:
|
| 287 |
+
result.append({**tok, "token_type": "ROOT", "_month": True})
|
| 288 |
else:
|
| 289 |
result.append(tok)
|
| 290 |
|
nedo_turkish_tokenizer/tokenizer.py
CHANGED
|
@@ -1,24 +1,22 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
11. Acronym expansion
|
| 16 |
-
12. Context-aware Zemberek disambiguation
|
| 17 |
|
| 18 |
Output fields per token:
|
| 19 |
token : str — token string (leading space = word-initial)
|
| 20 |
-
token_type : str — ROOT | SUFFIX | FOREIGN |
|
| 21 |
-
NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
|
| 22 |
morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second suffix…
|
| 23 |
(+ optional _* metadata fields)
|
| 24 |
"""
|
|
@@ -26,48 +24,14 @@ Output fields per token:
|
|
| 26 |
from __future__ import annotations
|
| 27 |
|
| 28 |
import os
|
| 29 |
-
import re
|
| 30 |
import multiprocessing
|
| 31 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 32 |
-
from pathlib import Path
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
from ._preprocessor import preprocess, postprocess
|
| 36 |
-
from ._suffix_expander import reclassify_bpe_suffixes
|
| 37 |
-
from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
|
| 38 |
-
from ._medical_vocab import ALL_DOMAIN_ROOTS
|
| 39 |
-
from ._tdk_vocab import reclassify_foreign_words
|
| 40 |
-
from ._normalizer import (
|
| 41 |
-
find_special_spans,
|
| 42 |
-
make_special_tokens,
|
| 43 |
-
reclassify_numbers_in_tokens,
|
| 44 |
-
)
|
| 45 |
-
from ._allomorph import add_canonical_labels
|
| 46 |
-
from ._compound import add_compound_info
|
| 47 |
-
from ._acronym_dict import reclassify_acronyms
|
| 48 |
-
from ._context_aware import annotate_with_context
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
except Exception:
|
| 53 |
-
_zemb_morphology = None
|
| 54 |
|
| 55 |
-
_DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
|
| 56 |
|
| 57 |
-
# ──
|
| 58 |
-
|
| 59 |
-
_SPECIAL_TYPES = frozenset(
|
| 60 |
-
("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM")
|
| 61 |
-
)
|
| 62 |
-
|
| 63 |
-
_TYPE_SYM = {
|
| 64 |
-
"ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
|
| 65 |
-
"NUM": "N", "DATE": "D", "UNIT": "U",
|
| 66 |
-
"URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E", "ACRONYM": "A",
|
| 67 |
-
}
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
# ── Parallel worker helpers ───────────────────────────────────────────────────
|
| 71 |
|
| 72 |
_worker_tok: "NedoTurkishTokenizer | None" = None
|
| 73 |
|
|
@@ -84,9 +48,12 @@ def _tokenize_one(text: str) -> list[dict]:
|
|
| 84 |
|
| 85 |
# ══════════════════════════════════════════════════════════════════════════════
|
| 86 |
|
|
|
|
| 87 |
class NedoTurkishTokenizer:
|
| 88 |
-
"""
|
| 89 |
-
|
|
|
|
|
|
|
| 90 |
|
| 91 |
Example::
|
| 92 |
|
|
@@ -99,83 +66,22 @@ class NedoTurkishTokenizer:
|
|
| 99 |
"""
|
| 100 |
|
| 101 |
def __init__(self) -> None:
|
|
|
|
| 102 |
|
| 103 |
-
|
| 104 |
-
self._base = TurkishTokenizer()
|
| 105 |
-
self.zemberek_available = ZEMBEREK_AVAILABLE
|
| 106 |
-
|
| 107 |
-
# ── Public API ────────────────────────────────────────────────────────────
|
| 108 |
|
| 109 |
def __call__(self, text: str) -> list[dict]:
|
|
|
|
| 110 |
return self.tokenize(text)
|
| 111 |
|
| 112 |
def tokenize(self, text: str) -> list[dict]:
|
| 113 |
"""Tokenize a single text string.
|
| 114 |
|
| 115 |
-
Returns a list of token dicts, each
|
| 116 |
-
|
|
|
|
| 117 |
"""
|
| 118 |
-
|
| 119 |
-
# and split text into segments so they never enter the base tokenizer.
|
| 120 |
-
spans = find_special_spans(text)
|
| 121 |
-
|
| 122 |
-
tokens: list[dict] = []
|
| 123 |
-
pos = 0
|
| 124 |
-
|
| 125 |
-
for start, end, ttype, original in spans:
|
| 126 |
-
# Tokenize normal text before this special span
|
| 127 |
-
if pos < start:
|
| 128 |
-
segment = text[pos:start]
|
| 129 |
-
if segment.strip():
|
| 130 |
-
seg_proc, caps, apo = preprocess(segment)
|
| 131 |
-
seg_raw = self._base.tokenize_text(seg_proc)
|
| 132 |
-
seg_tokens = postprocess(seg_raw, caps, apo)
|
| 133 |
-
tokens.extend(seg_tokens)
|
| 134 |
-
|
| 135 |
-
# Insert the special token(s) directly
|
| 136 |
-
tokens.extend(make_special_tokens(ttype, original))
|
| 137 |
-
pos = end
|
| 138 |
-
|
| 139 |
-
# Tokenize remaining text after the last special span
|
| 140 |
-
if pos < len(text):
|
| 141 |
-
segment = text[pos:]
|
| 142 |
-
if segment.strip():
|
| 143 |
-
seg_proc, caps, apo = preprocess(segment)
|
| 144 |
-
seg_raw = self._base.tokenize_text(seg_proc)
|
| 145 |
-
seg_tokens = postprocess(seg_raw, caps, apo)
|
| 146 |
-
tokens.extend(seg_tokens)
|
| 147 |
-
|
| 148 |
-
# Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
|
| 149 |
-
tokens = reclassify_bpe_suffixes(tokens)
|
| 150 |
-
|
| 151 |
-
# Fix 8b: remaining numbers / units
|
| 152 |
-
tokens = reclassify_numbers_in_tokens(tokens)
|
| 153 |
-
|
| 154 |
-
# Fix 6: domain vocabulary (medical / sports / tourism)
|
| 155 |
-
tokens = _reclassify_domain_roots(tokens, _DOMAIN_ROOTS_LOWER)
|
| 156 |
-
|
| 157 |
-
# Fix 7: TDK FOREIGN detection
|
| 158 |
-
tokens = reclassify_foreign_words(tokens)
|
| 159 |
-
|
| 160 |
-
# Fix 11: acronym expansions
|
| 161 |
-
tokens = reclassify_acronyms(tokens)
|
| 162 |
-
|
| 163 |
-
# Fix 9: allomorph canonical labels
|
| 164 |
-
tokens = add_canonical_labels(tokens)
|
| 165 |
-
|
| 166 |
-
# Fix 10: compound word annotation
|
| 167 |
-
tokens = add_compound_info(tokens, morphology=_zemb_morphology)
|
| 168 |
-
|
| 169 |
-
# Fix 12: context-aware Zemberek disambiguation
|
| 170 |
-
tokens = annotate_with_context(tokens, text)
|
| 171 |
-
|
| 172 |
-
# Fix 4: Zemberek root validation & correction
|
| 173 |
-
tokens = validate_roots(tokens, text.split(), base_tokenizer=self._base)
|
| 174 |
-
|
| 175 |
-
# Add public output fields
|
| 176 |
-
tokens = _add_output_fields(tokens)
|
| 177 |
-
|
| 178 |
-
return tokens
|
| 179 |
|
| 180 |
def batch_tokenize(
|
| 181 |
self,
|
|
@@ -187,11 +93,11 @@ class NedoTurkishTokenizer:
|
|
| 187 |
|
| 188 |
Args:
|
| 189 |
texts: List of strings to tokenize.
|
| 190 |
-
workers: Number of worker processes (None = all CPUs).
|
| 191 |
-
chunk_size: Below this count, run sequentially
|
| 192 |
|
| 193 |
Returns:
|
| 194 |
-
List of token lists, in the same order as
|
| 195 |
"""
|
| 196 |
if not texts:
|
| 197 |
return []
|
|
@@ -209,126 +115,30 @@ class NedoTurkishTokenizer:
|
|
| 209 |
i = futs[fut]
|
| 210 |
try:
|
| 211 |
results[i] = fut.result()
|
| 212 |
-
except Exception as exc:
|
| 213 |
-
|
|
|
|
| 214 |
print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}")
|
| 215 |
|
| 216 |
return results # type: ignore[return-value]
|
| 217 |
|
| 218 |
-
# ──
|
| 219 |
-
|
| 220 |
-
@classmethod
|
| 221 |
-
def from_pretrained(cls, _model_id: str = "Ethosoft/NedoTurkishTokenizer") -> "NedoTurkishTokenizer":
|
| 222 |
-
"""Load tokenizer (rules-based, no weights to download)."""
|
| 223 |
-
return cls()
|
| 224 |
-
|
| 225 |
-
def save_pretrained(self, save_directory: str) -> None:
|
| 226 |
-
"""Save tokenizer config to a directory (for HF Hub compatibility)."""
|
| 227 |
-
import json
|
| 228 |
-
path = Path(save_directory)
|
| 229 |
-
path.mkdir(parents=True, exist_ok=True)
|
| 230 |
-
config = {
|
| 231 |
-
"tokenizer_class": "NedoTurkishTokenizer",
|
| 232 |
-
"model_type": "nedo-turkish-tokenizer",
|
| 233 |
-
"version": "1.0.0",
|
| 234 |
-
"zemberek_available": self.zemberek_available,
|
| 235 |
-
}
|
| 236 |
-
(path / "tokenizer_config.json").write_text(
|
| 237 |
-
json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
|
| 238 |
-
)
|
| 239 |
-
|
| 240 |
-
# ── Morphological Lattice API ─────────────────────────────────────────────
|
| 241 |
-
|
| 242 |
-
def get_morphological_lattice(self, word: str) -> list[dict]:
|
| 243 |
-
"""Return all possible morphological analyses for *word* as a lattice.
|
| 244 |
-
|
| 245 |
-
Each entry in the returned list is a dict with:
|
| 246 |
-
``root`` – the lemma / root form
|
| 247 |
-
``suffixes`` – list of surface-form suffixes
|
| 248 |
-
``pos`` – abbreviated POS tag (Noun, Verb, Adj, …)
|
| 249 |
-
``lexical_form`` – full lexical representation from Zemberek
|
| 250 |
-
|
| 251 |
-
Returns an **empty list** when Zemberek cannot analyse the word
|
| 252 |
-
(unknown word) or when Zemberek is not available.
|
| 253 |
-
"""
|
| 254 |
-
if _zemb_morphology is None:
|
| 255 |
-
return []
|
| 256 |
-
|
| 257 |
-
try:
|
| 258 |
-
word_analysis = _zemb_morphology.analyze(word)
|
| 259 |
-
|
| 260 |
-
lattice: list[dict] = []
|
| 261 |
-
for sa in word_analysis:
|
| 262 |
-
try:
|
| 263 |
-
root = str(sa.item.lemma)
|
| 264 |
-
pos = str(sa.item.primary_pos.short_form)
|
| 265 |
-
lexical_form = str(sa.format_string())
|
| 266 |
-
|
| 267 |
-
# Build suffix list from morpheme chain (skip the root morpheme)
|
| 268 |
-
morphemes = list(sa.get_morphemes())
|
| 269 |
-
suffixes = [str(m) for m in morphemes[1:]] if len(morphemes) > 1 else []
|
| 270 |
-
|
| 271 |
-
lattice.append({
|
| 272 |
-
"root": root,
|
| 273 |
-
"suffixes": suffixes,
|
| 274 |
-
"pos": pos,
|
| 275 |
-
"lexical_form": lexical_form,
|
| 276 |
-
})
|
| 277 |
-
except Exception: # noqa: BLE001
|
| 278 |
-
continue
|
| 279 |
-
|
| 280 |
-
return lattice
|
| 281 |
-
|
| 282 |
-
except Exception: # noqa: BLE001
|
| 283 |
-
return []
|
| 284 |
-
|
| 285 |
-
def tokenize_lattice(self, text: str) -> dict:
|
| 286 |
-
"""Tokenize *text* and return a morphological lattice for every word.
|
| 287 |
-
|
| 288 |
-
Returns a dict with:
|
| 289 |
-
``input`` – the original text
|
| 290 |
-
``words`` – list of per-word dicts, each containing
|
| 291 |
-
``word`` (str) and ``lattice`` (list of analyses)
|
| 292 |
-
|
| 293 |
-
Example::
|
| 294 |
-
|
| 295 |
-
tok = NedoTurkishTokenizer()
|
| 296 |
-
data = tok.tokenize_lattice("Evin güzel gelir")
|
| 297 |
-
for w in data["words"]:
|
| 298 |
-
print(w["word"], "→", len(w["lattice"]), "analysis(es)")
|
| 299 |
-
"""
|
| 300 |
-
# Split text on whitespace, respecting punctuation
|
| 301 |
-
words = re.findall(r"\S+", text)
|
| 302 |
-
|
| 303 |
-
result_words: list[dict] = []
|
| 304 |
-
for w in words:
|
| 305 |
-
lattice = self.get_morphological_lattice(w)
|
| 306 |
-
result_words.append({
|
| 307 |
-
"word": w,
|
| 308 |
-
"lattice": lattice,
|
| 309 |
-
})
|
| 310 |
-
|
| 311 |
-
return {
|
| 312 |
-
"input": text,
|
| 313 |
-
"words": result_words,
|
| 314 |
-
}
|
| 315 |
-
|
| 316 |
-
# ── Utility ───────────────────────────────────────────────────────────────
|
| 317 |
|
| 318 |
def stats(self, tokens: list[dict]) -> dict:
|
| 319 |
"""Compute morphological coverage statistics for a token list."""
|
| 320 |
total = len(tokens)
|
| 321 |
if total == 0:
|
| 322 |
-
return {k: 0 for k in (
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
| 325 |
suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
|
| 326 |
-
foreign
|
| 327 |
-
punct
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
pure = sum(
|
| 332 |
1 for t in tokens
|
| 333 |
if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
|
| 334 |
and not t["token"].strip().startswith("<")
|
|
@@ -338,66 +148,8 @@ class NedoTurkishTokenizer:
|
|
| 338 |
"roots": roots,
|
| 339 |
"suffixes": suffixes,
|
| 340 |
"foreign": foreign,
|
| 341 |
-
"bpe": bpe,
|
| 342 |
"punct": punct,
|
| 343 |
"special": special,
|
| 344 |
"tr_pct": round(tr / total * 100, 2),
|
| 345 |
"pure_pct": round(pure / total * 100, 2),
|
| 346 |
}
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
# ── Internal helpers ──────────────────────────────────────────────────────────
|
| 350 |
-
|
| 351 |
-
def _reclassify_domain_roots(tokens: list[dict], domain_lower: set) -> list[dict]:
|
| 352 |
-
result = []
|
| 353 |
-
for tok in tokens:
|
| 354 |
-
if tok["type"] != "BPE":
|
| 355 |
-
result.append(tok)
|
| 356 |
-
continue
|
| 357 |
-
raw = tok["token"]
|
| 358 |
-
if raw == raw.lstrip(): # no leading space → not word-initial
|
| 359 |
-
result.append(tok)
|
| 360 |
-
continue
|
| 361 |
-
if raw.lstrip().lower() in domain_lower:
|
| 362 |
-
result.append({**tok, "type": "ROOT", "_domain": True})
|
| 363 |
-
else:
|
| 364 |
-
result.append(tok)
|
| 365 |
-
return result
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
def _add_output_fields(tokens: list[dict]) -> list[dict]:
|
| 369 |
-
"""Compute token_type and morph_pos and add them to every token."""
|
| 370 |
-
result = []
|
| 371 |
-
word_pos = 0
|
| 372 |
-
|
| 373 |
-
for tok in tokens:
|
| 374 |
-
raw = tok["token"]
|
| 375 |
-
base_type = tok["type"]
|
| 376 |
-
stripped = raw.strip()
|
| 377 |
-
|
| 378 |
-
# ── token_type: FOREIGN for foreign ROOTs ─────────────────────────
|
| 379 |
-
if base_type == "ROOT" and tok.get("_foreign"):
|
| 380 |
-
token_type = "FOREIGN"
|
| 381 |
-
else:
|
| 382 |
-
token_type = base_type
|
| 383 |
-
|
| 384 |
-
# ── morph_pos ─────────────────────────────────────────────────────
|
| 385 |
-
is_word_start = raw.startswith(" ") or stripped.startswith("<")
|
| 386 |
-
# Apostrophe suffixes are word-initial in text but continue the word
|
| 387 |
-
if tok.get("_apo_suffix"):
|
| 388 |
-
is_word_start = False
|
| 389 |
-
|
| 390 |
-
if is_word_start or base_type in _SPECIAL_TYPES or base_type == "PUNCT":
|
| 391 |
-
word_pos = 0
|
| 392 |
-
morph_pos = 0
|
| 393 |
-
elif base_type == "SUFFIX":
|
| 394 |
-
word_pos += 1
|
| 395 |
-
morph_pos = word_pos
|
| 396 |
-
else:
|
| 397 |
-
# ROOT or BPE within a word (no leading space)
|
| 398 |
-
word_pos = 0
|
| 399 |
-
morph_pos = 0
|
| 400 |
-
|
| 401 |
-
result.append({**tok, "token_type": token_type, "morph_pos": morph_pos})
|
| 402 |
-
|
| 403 |
-
return result
|
|
|
|
| 1 |
+
"""NedoTurkishTokenizer — self-contained Turkish morphological tokenizer.
|
| 2 |
+
|
| 3 |
+
A zero-dependency Turkish tokenizer that segments text into
|
| 4 |
+
morphologically meaningful tokens using deterministic heuristics,
|
| 5 |
+
a bundled TDK dictionary, and a candidate-based segmentation engine.
|
| 6 |
+
|
| 7 |
+
Usage::
|
| 8 |
+
|
| 9 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 10 |
+
|
| 11 |
+
tok = NedoTurkishTokenizer()
|
| 12 |
+
tokens = tok.tokenize("İstanbul'da meeting'e katılamadım")
|
| 13 |
+
for t in tokens:
|
| 14 |
+
print(t["token"], t["token_type"], t["morph_pos"])
|
|
|
|
|
|
|
| 15 |
|
| 16 |
Output fields per token:
|
| 17 |
token : str — token string (leading space = word-initial)
|
| 18 |
+
token_type : str — ROOT | SUFFIX | FOREIGN | PUNCT |
|
| 19 |
+
NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI | ACRONYM
|
| 20 |
morph_pos : int — 0=root/word-initial, 1=first suffix, 2=second suffix…
|
| 21 |
(+ optional _* metadata fields)
|
| 22 |
"""
|
|
|
|
| 24 |
from __future__ import annotations
|
| 25 |
|
| 26 |
import os
|
|
|
|
| 27 |
import multiprocessing
|
| 28 |
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
from .engine import TokenizationEngine
|
| 31 |
+
from .types import SPECIAL_TYPES
|
|
|
|
|
|
|
| 32 |
|
|
|
|
| 33 |
|
| 34 |
+
# ── Parallel worker helpers ──────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
_worker_tok: "NedoTurkishTokenizer | None" = None
|
| 37 |
|
|
|
|
| 48 |
|
| 49 |
# ══════════════════════════════════════════════════════════════════════════════
|
| 50 |
|
| 51 |
+
|
| 52 |
class NedoTurkishTokenizer:
|
| 53 |
+
"""Self-contained Turkish morphological tokenizer.
|
| 54 |
+
|
| 55 |
+
Requires **no external dependencies** — all tokenization logic,
|
| 56 |
+
dictionaries, and heuristics are bundled within the package.
|
| 57 |
|
| 58 |
Example::
|
| 59 |
|
|
|
|
| 66 |
"""
|
| 67 |
|
| 68 |
def __init__(self) -> None:
|
| 69 |
+
self._engine = TokenizationEngine()
|
| 70 |
|
| 71 |
+
# ── Public API ─────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
def __call__(self, text: str) -> list[dict]:
|
| 74 |
+
"""Shorthand for ``tokenize(text)``."""
|
| 75 |
return self.tokenize(text)
|
| 76 |
|
| 77 |
def tokenize(self, text: str) -> list[dict]:
|
| 78 |
"""Tokenize a single text string.
|
| 79 |
|
| 80 |
+
Returns a list of token dicts, each containing at minimum:
|
| 81 |
+
``token``, ``token_type``, ``morph_pos``, plus optional
|
| 82 |
+
``_*`` metadata fields.
|
| 83 |
"""
|
| 84 |
+
return self._engine.tokenize(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
def batch_tokenize(
|
| 87 |
self,
|
|
|
|
| 93 |
|
| 94 |
Args:
|
| 95 |
texts: List of strings to tokenize.
|
| 96 |
+
workers: Number of worker processes (``None`` = all CPUs).
|
| 97 |
+
chunk_size: Below this count, run sequentially.
|
| 98 |
|
| 99 |
Returns:
|
| 100 |
+
List of token lists, in the same order as *texts*.
|
| 101 |
"""
|
| 102 |
if not texts:
|
| 103 |
return []
|
|
|
|
| 115 |
i = futs[fut]
|
| 116 |
try:
|
| 117 |
results[i] = fut.result()
|
| 118 |
+
except Exception as exc:
|
| 119 |
+
# Fallback: tokenize in the main process
|
| 120 |
+
results[i] = self.tokenize(texts[i])
|
| 121 |
print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}")
|
| 122 |
|
| 123 |
return results # type: ignore[return-value]
|
| 124 |
|
| 125 |
+
# ── Statistics ─────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
def stats(self, tokens: list[dict]) -> dict:
|
| 128 |
"""Compute morphological coverage statistics for a token list."""
|
| 129 |
total = len(tokens)
|
| 130 |
if total == 0:
|
| 131 |
+
return {k: 0 for k in (
|
| 132 |
+
"total", "roots", "suffixes", "foreign",
|
| 133 |
+
"punct", "special", "tr_pct", "pure_pct",
|
| 134 |
+
)}
|
| 135 |
+
roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
|
| 136 |
suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
|
| 137 |
+
foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
|
| 138 |
+
punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
|
| 139 |
+
special = sum(1 for t in tokens if t["token_type"] in SPECIAL_TYPES)
|
| 140 |
+
tr = roots + suffixes + foreign + punct + special
|
| 141 |
+
pure = sum(
|
|
|
|
| 142 |
1 for t in tokens
|
| 143 |
if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
|
| 144 |
and not t["token"].strip().startswith("<")
|
|
|
|
| 148 |
"roots": roots,
|
| 149 |
"suffixes": suffixes,
|
| 150 |
"foreign": foreign,
|
|
|
|
| 151 |
"punct": punct,
|
| 152 |
"special": special,
|
| 153 |
"tr_pct": round(tr / total * 100, 2),
|
| 154 |
"pure_pct": round(pure / total * 100, 2),
|
| 155 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nedo_turkish_tokenizer/types.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Core type definitions for NedoTurkishTokenizer.
|
| 2 |
+
|
| 3 |
+
Defines the Token dataclass, SegmentationCandidate for the candidate-based
|
| 4 |
+
segmentation engine, token type constants, and punctuation character sets.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# ── Token type constants ──────────────────────────────────────────────────────
|
| 14 |
+
|
| 15 |
+
ROOT = "ROOT"
|
| 16 |
+
SUFFIX = "SUFFIX"
|
| 17 |
+
FOREIGN = "FOREIGN"
|
| 18 |
+
PUNCT = "PUNCT"
|
| 19 |
+
NUM = "NUM"
|
| 20 |
+
DATE = "DATE"
|
| 21 |
+
UNIT = "UNIT"
|
| 22 |
+
URL = "URL"
|
| 23 |
+
MENTION = "MENTION"
|
| 24 |
+
HASHTAG = "HASHTAG"
|
| 25 |
+
EMOJI = "EMOJI"
|
| 26 |
+
ACRONYM = "ACRONYM"
|
| 27 |
+
|
| 28 |
+
# Special token types that represent non-textual entities
|
| 29 |
+
SPECIAL_TYPES: frozenset[str] = frozenset(
|
| 30 |
+
{NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM}
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
# All recognized token types
|
| 34 |
+
ALL_TYPES: frozenset[str] = frozenset(
|
| 35 |
+
{ROOT, SUFFIX, FOREIGN, PUNCT, NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM}
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# ── Punctuation character set ────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
PUNCT_CHARS: frozenset[str] = frozenset(
|
| 41 |
+
"'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~"
|
| 42 |
+
"\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
|
| 43 |
+
"\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
# Digits — used alongside PUNCT_CHARS for pure-punctuation detection
|
| 47 |
+
_DIGITS: frozenset[str] = frozenset("0123456789")
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def is_punct_token(text: str) -> bool:
|
| 51 |
+
"""Return True if *text* consists entirely of punctuation / digit characters."""
|
| 52 |
+
stripped = text.strip()
|
| 53 |
+
if not stripped:
|
| 54 |
+
return False
|
| 55 |
+
return all(
|
| 56 |
+
c in PUNCT_CHARS or c in _DIGITS or (ord(c) > 0x02FF and not c.isalpha())
|
| 57 |
+
for c in stripped
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ── Token dataclass ──────────────────────────────────────────────────────────
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@dataclass
|
| 65 |
+
class Token:
|
| 66 |
+
"""Internal token representation.
|
| 67 |
+
|
| 68 |
+
*text* uses the leading-space convention: a space prefix indicates
|
| 69 |
+
that this token starts a new word. Suffixes within a word have
|
| 70 |
+
no leading space.
|
| 71 |
+
|
| 72 |
+
The *metadata* dict carries optional annotation fields (all prefixed
|
| 73 |
+
with ``_``), for example ``_caps``, ``_foreign``, ``_canonical``.
|
| 74 |
+
"""
|
| 75 |
+
|
| 76 |
+
text: str
|
| 77 |
+
token_type: str
|
| 78 |
+
morph_pos: int = 0
|
| 79 |
+
metadata: dict[str, Any] = field(default_factory=dict)
|
| 80 |
+
|
| 81 |
+
def to_dict(self) -> dict[str, Any]:
|
| 82 |
+
"""Convert to the public API dict format."""
|
| 83 |
+
result: dict[str, Any] = {
|
| 84 |
+
"token": self.text,
|
| 85 |
+
"token_type": self.token_type,
|
| 86 |
+
"morph_pos": self.morph_pos,
|
| 87 |
+
}
|
| 88 |
+
result.update(self.metadata)
|
| 89 |
+
return result
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ── Segmentation candidate ───────────────────────────────────────────────────
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
@dataclass
|
| 96 |
+
class SegmentationCandidate:
|
| 97 |
+
"""One possible way to segment a word into tokens.
|
| 98 |
+
|
| 99 |
+
The candidate-generation engine produces multiple candidates per word,
|
| 100 |
+
then the selection step picks the highest-scoring one.
|
| 101 |
+
|
| 102 |
+
*source* is a short human-readable tag describing the strategy that
|
| 103 |
+
produced this candidate (e.g. ``"tdk_root"``, ``"suffix_chain"``,
|
| 104 |
+
``"foreign"``).
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
tokens: list[Token]
|
| 108 |
+
score: float
|
| 109 |
+
source: str
|
paper_baseline_check.py
DELETED
|
@@ -1,106 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
paper_baseline_check.py
|
| 3 |
-
-----------------------
|
| 4 |
-
Paper'ın %90.29 baseline'ını neden biz %75.57 olarak görüyoruz?
|
| 5 |
-
İki metodolojinin farkını somut olarak gösterir.
|
| 6 |
-
|
| 7 |
-
Kullanım:
|
| 8 |
-
cd NedoTurkishTokenizer/
|
| 9 |
-
python paper_baseline_check.py
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
import os
|
| 13 |
-
from huggingface_hub import login
|
| 14 |
-
from datasets import load_dataset
|
| 15 |
-
from turkish_tokenizer import TurkishTokenizer
|
| 16 |
-
|
| 17 |
-
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 18 |
-
if HF_TOKEN:
|
| 19 |
-
login(token=HF_TOKEN, add_to_git_credential=False)
|
| 20 |
-
else:
|
| 21 |
-
print("HF_TOKEN not set; using existing Hugging Face login state if available.")
|
| 22 |
-
|
| 23 |
-
base = TurkishTokenizer()
|
| 24 |
-
|
| 25 |
-
print("TR-MMLU yükleniyor...")
|
| 26 |
-
ds = load_dataset("alibayram/turkish_mmlu", split="test", token=HF_TOKEN)
|
| 27 |
-
rows = list(ds)
|
| 28 |
-
print(f"{len(rows)} örnek\n")
|
| 29 |
-
|
| 30 |
-
def orig_tr_pct(tokens):
|
| 31 |
-
total = len(tokens)
|
| 32 |
-
if total == 0: return 0.0
|
| 33 |
-
tr = sum(1 for t in tokens if t["type"] in ("ROOT","SUFFIX"))
|
| 34 |
-
return tr / total * 100
|
| 35 |
-
|
| 36 |
-
# ── Metodoloji A: Sadece soru (paper'ın yaptığı) ─────────────────────────────
|
| 37 |
-
a_sum = 0.0
|
| 38 |
-
for row in rows:
|
| 39 |
-
text = str(row.get("soru") or row.get("question") or "")
|
| 40 |
-
if not text.strip(): continue
|
| 41 |
-
toks = base.tokenize_text(text)
|
| 42 |
-
a_sum += orig_tr_pct(toks)
|
| 43 |
-
a_avg = a_sum / len(rows)
|
| 44 |
-
|
| 45 |
-
# ── Metodoloji B: Soru + şıklar (bizim yaptığımız) ───────────────────────────
|
| 46 |
-
b_sum = 0.0
|
| 47 |
-
for row in rows:
|
| 48 |
-
parts = []
|
| 49 |
-
for f in ["soru","question"]:
|
| 50 |
-
if row.get(f):
|
| 51 |
-
parts.append(str(row[f]))
|
| 52 |
-
break
|
| 53 |
-
choices = row.get("secenekler") or []
|
| 54 |
-
if isinstance(choices, list):
|
| 55 |
-
parts.extend(str(c) for c in choices)
|
| 56 |
-
text = " ".join(parts)
|
| 57 |
-
if not text.strip(): continue
|
| 58 |
-
toks = base.tokenize_text(text)
|
| 59 |
-
b_sum += orig_tr_pct(toks)
|
| 60 |
-
b_avg = b_sum / len(rows)
|
| 61 |
-
|
| 62 |
-
# ── Metodoloji C: Soru + şıklar + açıklama (tam) ─────────────────────────────
|
| 63 |
-
c_sum = 0.0
|
| 64 |
-
for row in rows:
|
| 65 |
-
parts = []
|
| 66 |
-
for f in ["soru","question"]:
|
| 67 |
-
if row.get(f):
|
| 68 |
-
parts.append(str(row[f]))
|
| 69 |
-
break
|
| 70 |
-
choices = row.get("secenekler") or []
|
| 71 |
-
if isinstance(choices, list):
|
| 72 |
-
parts.extend(str(c) for c in choices)
|
| 73 |
-
if row.get("aciklama"):
|
| 74 |
-
parts.append(str(row["aciklama"]))
|
| 75 |
-
text = " ".join(parts)
|
| 76 |
-
if not text.strip(): continue
|
| 77 |
-
toks = base.tokenize_text(text)
|
| 78 |
-
c_sum += orig_tr_pct(toks)
|
| 79 |
-
c_avg = c_sum / len(rows)
|
| 80 |
-
|
| 81 |
-
# ── Şıklardaki içerik analizi ─────────────────────────────────────────────────
|
| 82 |
-
# İlk 20 sorunun şıklarına bak
|
| 83 |
-
print("İlk 20 sorunun şık örnekleri:")
|
| 84 |
-
for row in rows[:20]:
|
| 85 |
-
choices = row.get("secenekler") or []
|
| 86 |
-
if isinstance(choices, list) and choices:
|
| 87 |
-
sample = " | ".join(str(c)[:20] for c in choices[:4])
|
| 88 |
-
print(f" {sample}")
|
| 89 |
-
|
| 90 |
-
print(f"""
|
| 91 |
-
{'='*60}
|
| 92 |
-
METODOLOJİ KARŞILAŞTIRMASI (TürkishTokenizer baseline)
|
| 93 |
-
{'='*60}
|
| 94 |
-
|
| 95 |
-
A) Sadece soru alanı (paper'ın yöntemi): {a_avg:.2f}%
|
| 96 |
-
B) Soru + şıklar (kısmi): {b_avg:.2f}%
|
| 97 |
-
C) Soru + şıklar + açıkl. (bizim yöntemimiz): {c_avg:.2f}%
|
| 98 |
-
|
| 99 |
-
Paper değeri: 90.29%
|
| 100 |
-
Bizim A değeri: {a_avg:.2f}% ← paper ile fark: {a_avg-90.29:+.2f}
|
| 101 |
-
Bizim C değeri: {c_avg:.2f}% ← biz bunu kullanıyoruz
|
| 102 |
-
|
| 103 |
-
Sonuç: {c_avg:.2f}% vs {a_avg:.2f}% = {c_avg-a_avg:.2f} puan fark
|
| 104 |
-
Bu fark şıklardaki kısaltmalar ve yabancı terimlerden kaynaklanıyor.
|
| 105 |
-
{'='*60}
|
| 106 |
-
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
CHANGED
|
@@ -4,31 +4,27 @@ build-backend = "setuptools.build_meta"
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "nedo-turkish-tokenizer"
|
| 7 |
-
version = "
|
| 8 |
-
description = "Turkish morphological tokenizer
|
| 9 |
readme = "README.md"
|
| 10 |
license = { text = "MIT" }
|
| 11 |
authors = [{ name = "Ethosoft", email = "info@ethosoft.ai" }]
|
| 12 |
requires-python = ">=3.10"
|
| 13 |
-
keywords = ["turkish", "nlp", "tokenizer", "morphology", "
|
| 14 |
classifiers = [
|
| 15 |
"Programming Language :: Python :: 3",
|
| 16 |
"License :: OSI Approved :: MIT License",
|
| 17 |
"Operating System :: OS Independent",
|
| 18 |
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
|
|
| 19 |
"Natural Language :: Turkish",
|
| 20 |
]
|
| 21 |
-
dependencies = [
|
| 22 |
-
"turkish-tokenizer>=0.1.0",
|
| 23 |
-
"zemberek-python>=0.2.3",
|
| 24 |
-
"requests>=2.28.0",
|
| 25 |
-
]
|
| 26 |
|
| 27 |
[project.optional-dependencies]
|
| 28 |
-
dev = ["pytest"
|
| 29 |
|
| 30 |
[project.urls]
|
| 31 |
-
Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
|
| 32 |
Repository = "https://github.com/ethosoftai/NedoTurkishTokenizer"
|
| 33 |
|
| 34 |
[tool.setuptools.packages.find]
|
|
|
|
| 4 |
|
| 5 |
[project]
|
| 6 |
name = "nedo-turkish-tokenizer"
|
| 7 |
+
version = "2.0.0"
|
| 8 |
+
description = "Self-contained Turkish morphological tokenizer with zero external dependencies"
|
| 9 |
readme = "README.md"
|
| 10 |
license = { text = "MIT" }
|
| 11 |
authors = [{ name = "Ethosoft", email = "info@ethosoft.ai" }]
|
| 12 |
requires-python = ">=3.10"
|
| 13 |
+
keywords = ["turkish", "nlp", "tokenizer", "morphology", "segmentation"]
|
| 14 |
classifiers = [
|
| 15 |
"Programming Language :: Python :: 3",
|
| 16 |
"License :: OSI Approved :: MIT License",
|
| 17 |
"Operating System :: OS Independent",
|
| 18 |
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 19 |
+
"Topic :: Text Processing :: Linguistic",
|
| 20 |
"Natural Language :: Turkish",
|
| 21 |
]
|
| 22 |
+
dependencies = []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
[project.optional-dependencies]
|
| 25 |
+
dev = ["pytest"]
|
| 26 |
|
| 27 |
[project.urls]
|
|
|
|
| 28 |
Repository = "https://github.com/ethosoftai/NedoTurkishTokenizer"
|
| 29 |
|
| 30 |
[tool.setuptools.packages.find]
|
special_tokens_map.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"bos_token": "[BOS]",
|
| 3 |
-
"cls_token": "[CLS]",
|
| 4 |
-
"eos_token": "[EOS]",
|
| 5 |
-
"mask_token": "[MASK]",
|
| 6 |
-
"pad_token": "[PAD]",
|
| 7 |
-
"sep_token": "[SEP]",
|
| 8 |
-
"unk_token": "[UNK]"
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_lattice.py
DELETED
|
@@ -1,72 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Test / demo script for the Morphological Lattice API.
|
| 4 |
-
|
| 5 |
-
Shows how ambiguous Turkish words like "evin" and "gelir" produce
|
| 6 |
-
multiple alternative analyses in the lattice.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
import json
|
| 10 |
-
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
def section(title: str) -> None:
|
| 14 |
-
print(f"\n{'═' * 60}")
|
| 15 |
-
print(f" {title}")
|
| 16 |
-
print(f"{'═' * 60}")
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def main() -> None:
|
| 20 |
-
tok = NedoTurkishTokenizer()
|
| 21 |
-
|
| 22 |
-
# ── 1. Tek kelime lattice testi ──────────────────────────────────────
|
| 23 |
-
section("1) get_morphological_lattice — tek kelime örnekleri")
|
| 24 |
-
|
| 25 |
-
test_words = ["evin", "gelir", "yüz", "çıkar", "koşar"]
|
| 26 |
-
for word in test_words:
|
| 27 |
-
lattice = tok.get_morphological_lattice(word)
|
| 28 |
-
print(f"\n▸ \"{word}\" → {len(lattice)} analiz:")
|
| 29 |
-
for i, entry in enumerate(lattice):
|
| 30 |
-
print(f" [{i}] root={entry['root']:<12} "
|
| 31 |
-
f"pos={entry['pos']:<6} "
|
| 32 |
-
f"suffixes={entry['suffixes']}")
|
| 33 |
-
print(f" lexical_form = {entry['lexical_form']}")
|
| 34 |
-
|
| 35 |
-
# ── 2. Bilinmeyen kelime (boş lattice) ───────────────────────────────
|
| 36 |
-
section("2) Bilinmeyen / yabancı kelime → boş lattice")
|
| 37 |
-
|
| 38 |
-
unknown_words = ["xyzfoo", "meeting", "blockchain"]
|
| 39 |
-
for word in unknown_words:
|
| 40 |
-
lattice = tok.get_morphological_lattice(word)
|
| 41 |
-
print(f" \"{word}\" → lattice boş mu? {len(lattice) == 0} (len={len(lattice)})")
|
| 42 |
-
|
| 43 |
-
# ── 3. tokenize_lattice — cümle bazlı test ──────────────────────────
|
| 44 |
-
section("3) tokenize_lattice — cümle testi")
|
| 45 |
-
|
| 46 |
-
sentences = [
|
| 47 |
-
"Evin güzel gelir",
|
| 48 |
-
"Çocuk okula koşar adım gitti",
|
| 49 |
-
"Yüz yıllık çınar",
|
| 50 |
-
]
|
| 51 |
-
|
| 52 |
-
for sent in sentences:
|
| 53 |
-
print(f"\n▸ Input: \"{sent}\"")
|
| 54 |
-
result = tok.tokenize_lattice(sent)
|
| 55 |
-
for winfo in result["words"]:
|
| 56 |
-
n = len(winfo["lattice"])
|
| 57 |
-
print(f" {winfo['word']:<16} → {n} analiz(ler)")
|
| 58 |
-
for entry in winfo["lattice"]:
|
| 59 |
-
print(f" root={entry['root']:<12} pos={entry['pos']:<6} "
|
| 60 |
-
f"suffixes={entry['suffixes']}")
|
| 61 |
-
|
| 62 |
-
# ── 4. JSON çıktı formatı ────────────────────────────────────────────
|
| 63 |
-
section("4) tokenize_lattice JSON çıktı")
|
| 64 |
-
|
| 65 |
-
data = tok.tokenize_lattice("evin gelir")
|
| 66 |
-
print(json.dumps(data, ensure_ascii=False, indent=2))
|
| 67 |
-
|
| 68 |
-
print("\n✅ Tüm testler başarıyla tamamlandı.")
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
if __name__ == "__main__":
|
| 72 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_tdk_vocab.py
DELETED
|
@@ -1,31 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import tempfile
|
| 4 |
-
import unittest
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from unittest import mock
|
| 7 |
-
|
| 8 |
-
from nedo_turkish_tokenizer import _tdk_vocab
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
class TdkVocabTests(unittest.TestCase):
|
| 12 |
-
def setUp(self) -> None:
|
| 13 |
-
self._original_words = _tdk_vocab._TDK_WORDS
|
| 14 |
-
_tdk_vocab._TDK_WORDS = None
|
| 15 |
-
|
| 16 |
-
def tearDown(self) -> None:
|
| 17 |
-
_tdk_vocab._TDK_WORDS = self._original_words
|
| 18 |
-
|
| 19 |
-
def test_load_tdk_words_uses_bundled_file_before_network(self) -> None:
|
| 20 |
-
with tempfile.TemporaryDirectory() as tmpdir:
|
| 21 |
-
cache_path = str(Path(tmpdir) / "tdk_words.txt")
|
| 22 |
-
|
| 23 |
-
with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
|
| 24 |
-
with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
|
| 25 |
-
with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
|
| 26 |
-
words = _tdk_vocab.load_tdk_words()
|
| 27 |
-
|
| 28 |
-
self.assertGreater(len(words), 50_000)
|
| 29 |
-
self.assertIn("zemberek", words)
|
| 30 |
-
download_hf.assert_not_called()
|
| 31 |
-
download_tdk.assert_not_called()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tests/test_tokenizer.py
ADDED
|
@@ -0,0 +1,457 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Comprehensive regression test suite for NedoTurkishTokenizer.
|
| 2 |
+
|
| 3 |
+
Tests the public API and core segmentation with gold-standard examples
|
| 4 |
+
covering: basic Turkish, suffix chains, apostrophes, foreign words,
|
| 5 |
+
acronyms, special spans, ALL CAPS, compound words, and edge cases.
|
| 6 |
+
|
| 7 |
+
TOKEN FORMAT CONTRACT:
|
| 8 |
+
token text does NOT include leading whitespace.
|
| 9 |
+
Whether a token is word-initial is indicated by morph_pos == 0.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import unittest
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class TestTokenizerPublicAPI(unittest.TestCase):
|
| 18 |
+
"""Smoke tests for the public API surface."""
|
| 19 |
+
|
| 20 |
+
@classmethod
|
| 21 |
+
def setUpClass(cls) -> None:
|
| 22 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 23 |
+
cls.tok = NedoTurkishTokenizer()
|
| 24 |
+
|
| 25 |
+
def test_import_and_instantiate(self) -> None:
|
| 26 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 27 |
+
t = NedoTurkishTokenizer()
|
| 28 |
+
self.assertIsNotNone(t)
|
| 29 |
+
|
| 30 |
+
def test_version(self) -> None:
|
| 31 |
+
from nedo_turkish_tokenizer import __version__
|
| 32 |
+
self.assertEqual(__version__, "2.0.0")
|
| 33 |
+
|
| 34 |
+
def test_empty_input(self) -> None:
|
| 35 |
+
self.assertEqual(self.tok.tokenize(""), [])
|
| 36 |
+
self.assertEqual(self.tok.tokenize(" "), [])
|
| 37 |
+
|
| 38 |
+
def test_callable_shorthand(self) -> None:
|
| 39 |
+
result = self.tok("Merhaba")
|
| 40 |
+
self.assertTrue(len(result) > 0)
|
| 41 |
+
|
| 42 |
+
def test_token_dict_fields(self) -> None:
|
| 43 |
+
tokens = self.tok.tokenize("ev")
|
| 44 |
+
self.assertTrue(len(tokens) >= 1)
|
| 45 |
+
t = tokens[0]
|
| 46 |
+
self.assertIn("token", t)
|
| 47 |
+
self.assertIn("token_type", t)
|
| 48 |
+
self.assertIn("morph_pos", t)
|
| 49 |
+
|
| 50 |
+
def test_batch_tokenize(self) -> None:
|
| 51 |
+
texts = ["ev", "araba", "merhaba"]
|
| 52 |
+
results = self.tok.batch_tokenize(texts, chunk_size=1000)
|
| 53 |
+
self.assertEqual(len(results), 3)
|
| 54 |
+
for r in results:
|
| 55 |
+
self.assertIsInstance(r, list)
|
| 56 |
+
self.assertTrue(len(r) >= 1)
|
| 57 |
+
|
| 58 |
+
def test_stats(self) -> None:
|
| 59 |
+
tokens = self.tok.tokenize("evde oturuyorum")
|
| 60 |
+
stats = self.tok.stats(tokens)
|
| 61 |
+
self.assertIn("total", stats)
|
| 62 |
+
self.assertIn("roots", stats)
|
| 63 |
+
self.assertIn("suffixes", stats)
|
| 64 |
+
self.assertIn("tr_pct", stats)
|
| 65 |
+
self.assertGreater(stats["total"], 0)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class TestTokenFormat(unittest.TestCase):
|
| 69 |
+
"""Token text must NOT include leading whitespace."""
|
| 70 |
+
|
| 71 |
+
@classmethod
|
| 72 |
+
def setUpClass(cls) -> None:
|
| 73 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 74 |
+
cls.tok = NedoTurkishTokenizer()
|
| 75 |
+
|
| 76 |
+
def test_no_leading_space_root(self) -> None:
|
| 77 |
+
tokens = self.tok.tokenize("merhaba")
|
| 78 |
+
self.assertEqual(tokens[0]["token"], "merhaba")
|
| 79 |
+
|
| 80 |
+
def test_no_leading_space_suffix(self) -> None:
|
| 81 |
+
tokens = self.tok.tokenize("evde")
|
| 82 |
+
for t in tokens:
|
| 83 |
+
self.assertFalse(
|
| 84 |
+
t["token"].startswith(" "),
|
| 85 |
+
f"Token {t['token']!r} has a leading space",
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
def test_no_leading_space_url(self) -> None:
|
| 89 |
+
tokens = self.tok.tokenize("https://example.com")
|
| 90 |
+
self.assertEqual(tokens[0]["token"], "https://example.com")
|
| 91 |
+
|
| 92 |
+
def test_no_leading_space_num(self) -> None:
|
| 93 |
+
tokens = self.tok.tokenize("%85")
|
| 94 |
+
self.assertEqual(tokens[0]["token"], "%85")
|
| 95 |
+
|
| 96 |
+
def test_no_leading_space_any_token(self) -> None:
|
| 97 |
+
"""No token in the output should ever start with a space."""
|
| 98 |
+
text = "İstanbul'da meeting'e katılamadım https://example.com %85"
|
| 99 |
+
tokens = self.tok.tokenize(text)
|
| 100 |
+
for t in tokens:
|
| 101 |
+
self.assertFalse(
|
| 102 |
+
t["token"].startswith(" "),
|
| 103 |
+
f"Token {t['token']!r} (type={t['token_type']}) has a leading space",
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class TestBasicTurkish(unittest.TestCase):
|
| 108 |
+
"""Core Turkish morphology tokenization."""
|
| 109 |
+
|
| 110 |
+
@classmethod
|
| 111 |
+
def setUpClass(cls) -> None:
|
| 112 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 113 |
+
cls.tok = NedoTurkishTokenizer()
|
| 114 |
+
|
| 115 |
+
def _roots(self, text: str) -> list[str]:
|
| 116 |
+
return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "ROOT"]
|
| 117 |
+
|
| 118 |
+
def _types(self, text: str) -> list[str]:
|
| 119 |
+
return [t["token_type"] for t in self.tok.tokenize(text)]
|
| 120 |
+
|
| 121 |
+
def _suffixes(self, text: str) -> list[str]:
|
| 122 |
+
return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "SUFFIX"]
|
| 123 |
+
|
| 124 |
+
# ── Single words ─────────────────────────────────────────────────────
|
| 125 |
+
|
| 126 |
+
def test_simple_root(self) -> None:
|
| 127 |
+
tokens = self.tok.tokenize("merhaba")
|
| 128 |
+
self.assertEqual(tokens[0]["token"], "merhaba")
|
| 129 |
+
self.assertEqual(tokens[0]["token_type"], "ROOT")
|
| 130 |
+
|
| 131 |
+
def test_whole_word_tdk_preserved(self) -> None:
|
| 132 |
+
"""'dünya' is in TDK — must NOT be split into 'dün' + 'ya'."""
|
| 133 |
+
roots = self._roots("dünya")
|
| 134 |
+
self.assertIn("dünya", roots)
|
| 135 |
+
|
| 136 |
+
def test_suffix_loc(self) -> None:
|
| 137 |
+
tokens = self.tok.tokenize("evde")
|
| 138 |
+
self.assertEqual(tokens[0]["token"], "ev")
|
| 139 |
+
self.assertEqual(tokens[0]["token_type"], "ROOT")
|
| 140 |
+
self.assertEqual(tokens[1]["token"], "de")
|
| 141 |
+
self.assertEqual(tokens[1]["token_type"], "SUFFIX")
|
| 142 |
+
|
| 143 |
+
def test_suffix_plural_acc(self) -> None:
|
| 144 |
+
tokens = self.tok.tokenize("kitapları")
|
| 145 |
+
self.assertEqual(tokens[0]["token"], "kitap")
|
| 146 |
+
types = [t["token_type"] for t in tokens]
|
| 147 |
+
self.assertIn("SUFFIX", types)
|
| 148 |
+
|
| 149 |
+
def test_verb_stem_past(self) -> None:
|
| 150 |
+
"""Verb stems derived from infinitives must be found."""
|
| 151 |
+
roots = self._roots("geldim")
|
| 152 |
+
self.assertIn("gel", roots)
|
| 153 |
+
|
| 154 |
+
def test_verb_stem_progressive(self) -> None:
|
| 155 |
+
roots = self._roots("geliyorum")
|
| 156 |
+
self.assertIn("gel", roots)
|
| 157 |
+
|
| 158 |
+
def test_verb_otur(self) -> None:
|
| 159 |
+
roots = self._roots("oturuyorum")
|
| 160 |
+
self.assertIn("otur", roots)
|
| 161 |
+
|
| 162 |
+
def test_katil_root(self) -> None:
|
| 163 |
+
roots = self._roots("katılamadım")
|
| 164 |
+
self.assertIn("katıl", roots)
|
| 165 |
+
|
| 166 |
+
def test_longer_root_wins(self) -> None:
|
| 167 |
+
"""'toplantısı' should segment as 'toplantı' + 'sı', not 'toplan' + 'tı' + 'sı'."""
|
| 168 |
+
roots = self._roots("toplantısı")
|
| 169 |
+
self.assertIn("toplantı", roots)
|
| 170 |
+
|
| 171 |
+
def test_morph_pos_increments(self) -> None:
|
| 172 |
+
tokens = self.tok.tokenize("evlerden")
|
| 173 |
+
suffix_positions = [t["morph_pos"] for t in tokens if t["token_type"] == "SUFFIX"]
|
| 174 |
+
for i, pos in enumerate(suffix_positions):
|
| 175 |
+
self.assertGreater(pos, 0, f"Suffix at index {i} should have morph_pos > 0")
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
class TestFalseSuffixSplits(unittest.TestCase):
|
| 179 |
+
"""Regression tests: common words that must NOT be over-segmented.
|
| 180 |
+
|
| 181 |
+
These words look like root+suffix but are standalone units.
|
| 182 |
+
"""
|
| 183 |
+
|
| 184 |
+
@classmethod
|
| 185 |
+
def setUpClass(cls) -> None:
|
| 186 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 187 |
+
cls.tok = NedoTurkishTokenizer()
|
| 188 |
+
|
| 189 |
+
def _assert_single_root(self, word: str) -> None:
|
| 190 |
+
"""Assert that *word* tokenizes to exactly one ROOT token."""
|
| 191 |
+
tokens = self.tok.tokenize(word)
|
| 192 |
+
roots = [t for t in tokens if t["token_type"] == "ROOT"]
|
| 193 |
+
self.assertEqual(
|
| 194 |
+
len(roots), 1,
|
| 195 |
+
f"'{word}' should be a single ROOT, got: "
|
| 196 |
+
f"{[(t['token'], t['token_type']) for t in tokens]}",
|
| 197 |
+
)
|
| 198 |
+
self.assertEqual(len(tokens), 1, f"'{word}' should produce 1 token, got {len(tokens)}")
|
| 199 |
+
self.assertEqual(tokens[0]["token"], word)
|
| 200 |
+
|
| 201 |
+
# ── Forms of "demek" (to say) ────────────────────────────────────────
|
| 202 |
+
# Stem "de" is a TDK conjunction, causing false splits like de+di.
|
| 203 |
+
|
| 204 |
+
def test_dedi(self) -> None:
|
| 205 |
+
self._assert_single_root("dedi")
|
| 206 |
+
|
| 207 |
+
def test_dedim(self) -> None:
|
| 208 |
+
self._assert_single_root("dedim")
|
| 209 |
+
|
| 210 |
+
def test_demis(self) -> None:
|
| 211 |
+
self._assert_single_root("demiş")
|
| 212 |
+
|
| 213 |
+
def test_denir(self) -> None:
|
| 214 |
+
self._assert_single_root("denir")
|
| 215 |
+
|
| 216 |
+
def test_dese(self) -> None:
|
| 217 |
+
self._assert_single_root("dese")
|
| 218 |
+
|
| 219 |
+
# ── Discourse particles / conjunctions ───────────────────────────────
|
| 220 |
+
# These are in TDK and should be protected by WHOLE_WORD_BONUS.
|
| 221 |
+
|
| 222 |
+
def test_yani(self) -> None:
|
| 223 |
+
self._assert_single_root("yani")
|
| 224 |
+
|
| 225 |
+
def test_belki(self) -> None:
|
| 226 |
+
self._assert_single_root("belki")
|
| 227 |
+
|
| 228 |
+
def test_cunku(self) -> None:
|
| 229 |
+
self._assert_single_root("çünkü")
|
| 230 |
+
|
| 231 |
+
def test_sanki(self) -> None:
|
| 232 |
+
self._assert_single_root("sanki")
|
| 233 |
+
|
| 234 |
+
# ── "dedi mi" phrase ─────────────────────────────────────────────────
|
| 235 |
+
|
| 236 |
+
def test_dedi_mi(self) -> None:
|
| 237 |
+
tokens = self.tok.tokenize("dedi mi")
|
| 238 |
+
roots = [t for t in tokens if t["token_type"] == "ROOT"]
|
| 239 |
+
self.assertEqual(len(roots), 2, "Both 'dedi' and 'mi' should be roots")
|
| 240 |
+
root_texts = [t["token"] for t in roots]
|
| 241 |
+
self.assertIn("dedi", root_texts)
|
| 242 |
+
|
| 243 |
+
# ── TDK-protected words should never be split ────────────────────────
|
| 244 |
+
|
| 245 |
+
def test_bile(self) -> None:
|
| 246 |
+
self._assert_single_root("bile")
|
| 247 |
+
|
| 248 |
+
def test_daha(self) -> None:
|
| 249 |
+
self._assert_single_root("daha")
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
class TestApostrophe(unittest.TestCase):
|
| 253 |
+
"""Apostrophe handling for Turkish proper names and foreign stems."""
|
| 254 |
+
|
| 255 |
+
@classmethod
|
| 256 |
+
def setUpClass(cls) -> None:
|
| 257 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 258 |
+
cls.tok = NedoTurkishTokenizer()
|
| 259 |
+
|
| 260 |
+
def test_turkish_proper_name(self) -> None:
|
| 261 |
+
"""İstanbul'da → ROOT + PUNCT(') + SUFFIX(da)."""
|
| 262 |
+
tokens = self.tok.tokenize("İstanbul'da")
|
| 263 |
+
types = [t["token_type"] for t in tokens]
|
| 264 |
+
self.assertIn("ROOT", types)
|
| 265 |
+
self.assertIn("PUNCT", types)
|
| 266 |
+
self.assertIn("SUFFIX", types)
|
| 267 |
+
|
| 268 |
+
def test_foreign_stem(self) -> None:
|
| 269 |
+
"""meeting'e → FOREIGN + SUFFIX."""
|
| 270 |
+
tokens = self.tok.tokenize("meeting'e")
|
| 271 |
+
types = [t["token_type"] for t in tokens]
|
| 272 |
+
self.assertIn("FOREIGN", types)
|
| 273 |
+
self.assertIn("SUFFIX", types)
|
| 274 |
+
|
| 275 |
+
def test_apostrophe_suffix_label(self) -> None:
|
| 276 |
+
tokens = self.tok.tokenize("İstanbul'da")
|
| 277 |
+
suffix_tokens = [t for t in tokens if t["token_type"] == "SUFFIX"]
|
| 278 |
+
self.assertTrue(len(suffix_tokens) >= 1)
|
| 279 |
+
self.assertEqual(suffix_tokens[0].get("_suffix_label"), "-LOC")
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
class TestSpecialSpans(unittest.TestCase):
|
| 283 |
+
"""URL, date, number, acronym, emoji detection."""
|
| 284 |
+
|
| 285 |
+
@classmethod
|
| 286 |
+
def setUpClass(cls) -> None:
|
| 287 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 288 |
+
cls.tok = NedoTurkishTokenizer()
|
| 289 |
+
|
| 290 |
+
def _find_type(self, text: str, ttype: str) -> list[dict]:
|
| 291 |
+
return [t for t in self.tok.tokenize(text) if t["token_type"] == ttype]
|
| 292 |
+
|
| 293 |
+
def test_url_detection(self) -> None:
|
| 294 |
+
urls = self._find_type("https://example.com sitesine bak", "URL")
|
| 295 |
+
self.assertEqual(len(urls), 1)
|
| 296 |
+
self.assertIn("example.com", urls[0]["token"])
|
| 297 |
+
|
| 298 |
+
def test_date_detection(self) -> None:
|
| 299 |
+
dates = self._find_type("14.03.2026 tarihinde", "DATE")
|
| 300 |
+
self.assertEqual(len(dates), 1)
|
| 301 |
+
|
| 302 |
+
def test_number_detection(self) -> None:
|
| 303 |
+
nums = self._find_type("%85 başarı", "NUM")
|
| 304 |
+
self.assertEqual(len(nums), 1)
|
| 305 |
+
|
| 306 |
+
def test_acronym_detection(self) -> None:
|
| 307 |
+
tokens = self.tok.tokenize("NATO güçlü")
|
| 308 |
+
acr = [t for t in tokens if t["token_type"] == "ACRONYM"]
|
| 309 |
+
self.assertEqual(len(acr), 1)
|
| 310 |
+
self.assertTrue(acr[0].get("_expansion"))
|
| 311 |
+
|
| 312 |
+
def test_mention_detection(self) -> None:
|
| 313 |
+
mentions = self._find_type("@kullanici çok iyi", "MENTION")
|
| 314 |
+
self.assertEqual(len(mentions), 1)
|
| 315 |
+
|
| 316 |
+
def test_hashtag_detection(self) -> None:
|
| 317 |
+
tags = self._find_type("#türkiye çok güzel", "HASHTAG")
|
| 318 |
+
self.assertEqual(len(tags), 1)
|
| 319 |
+
|
| 320 |
+
|
| 321 |
+
class TestAllCaps(unittest.TestCase):
|
| 322 |
+
"""ALL CAPS word handling."""
|
| 323 |
+
|
| 324 |
+
@classmethod
|
| 325 |
+
def setUpClass(cls) -> None:
|
| 326 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 327 |
+
cls.tok = NedoTurkishTokenizer()
|
| 328 |
+
|
| 329 |
+
def test_caps_detected(self) -> None:
|
| 330 |
+
tokens = self.tok.tokenize("İSTANBUL güzel")
|
| 331 |
+
istanbul_tok = [t for t in tokens if "istanbul" in t["token"]]
|
| 332 |
+
self.assertTrue(len(istanbul_tok) >= 1)
|
| 333 |
+
self.assertTrue(istanbul_tok[0].get("_caps"))
|
| 334 |
+
|
| 335 |
+
def test_caps_lowered(self) -> None:
|
| 336 |
+
tokens = self.tok.tokenize("İSTANBUL")
|
| 337 |
+
self.assertEqual(tokens[0]["token"], "istanbul")
|
| 338 |
+
|
| 339 |
+
def test_caps_acronym(self) -> None:
|
| 340 |
+
"""Known acronyms in ALL CAPS should be ACRONYM type."""
|
| 341 |
+
tokens = self.tok.tokenize("TBMM toplantısı")
|
| 342 |
+
tbmm = [t for t in tokens if t["token_type"] == "ACRONYM"]
|
| 343 |
+
self.assertTrue(len(tbmm) >= 1)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
class TestCanonicalLabels(unittest.TestCase):
|
| 347 |
+
"""Allomorph canonicalization metadata."""
|
| 348 |
+
|
| 349 |
+
@classmethod
|
| 350 |
+
def setUpClass(cls) -> None:
|
| 351 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 352 |
+
cls.tok = NedoTurkishTokenizer()
|
| 353 |
+
|
| 354 |
+
def test_loc_canonical(self) -> None:
|
| 355 |
+
tokens = self.tok.tokenize("evde")
|
| 356 |
+
suffix = [t for t in tokens if t["token_type"] == "SUFFIX"]
|
| 357 |
+
self.assertTrue(any(t.get("_canonical") == "LOC" for t in suffix))
|
| 358 |
+
|
| 359 |
+
def test_pl_canonical(self) -> None:
|
| 360 |
+
tokens = self.tok.tokenize("evler")
|
| 361 |
+
suffix = [t for t in tokens if t["token_type"] == "SUFFIX"]
|
| 362 |
+
self.assertTrue(any(t.get("_canonical") == "PL" for t in suffix))
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
class TestCompoundAnnotation(unittest.TestCase):
|
| 366 |
+
"""Compound word detection."""
|
| 367 |
+
|
| 368 |
+
@classmethod
|
| 369 |
+
def setUpClass(cls) -> None:
|
| 370 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 371 |
+
cls.tok = NedoTurkishTokenizer()
|
| 372 |
+
|
| 373 |
+
def test_known_compound(self) -> None:
|
| 374 |
+
tokens = self.tok.tokenize("başbakan")
|
| 375 |
+
root = [t for t in tokens if t["token_type"] == "ROOT"]
|
| 376 |
+
if root and root[0]["token"] == "başbakan":
|
| 377 |
+
self.assertTrue(root[0].get("_compound"))
|
| 378 |
+
self.assertIn("baş", root[0].get("_parts", []))
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
class TestNoDependencies(unittest.TestCase):
|
| 382 |
+
"""Verify no external runtime dependencies are imported."""
|
| 383 |
+
|
| 384 |
+
def test_no_external_imports(self) -> None:
|
| 385 |
+
import ast
|
| 386 |
+
from pathlib import Path
|
| 387 |
+
|
| 388 |
+
pkg_dir = Path(__file__).parent.parent / "nedo_turkish_tokenizer"
|
| 389 |
+
banned = {"turkish_tokenizer", "zemberek", "requests", "transformers"}
|
| 390 |
+
|
| 391 |
+
for py_file in pkg_dir.glob("*.py"):
|
| 392 |
+
tree = ast.parse(py_file.read_text(encoding="utf-8"))
|
| 393 |
+
for node in ast.walk(tree):
|
| 394 |
+
if isinstance(node, ast.Import):
|
| 395 |
+
for alias in node.names:
|
| 396 |
+
top = alias.name.split(".")[0]
|
| 397 |
+
self.assertNotIn(
|
| 398 |
+
top, banned,
|
| 399 |
+
f"{py_file.name} imports banned dependency: {alias.name}"
|
| 400 |
+
)
|
| 401 |
+
elif isinstance(node, ast.ImportFrom):
|
| 402 |
+
if node.module:
|
| 403 |
+
top = node.module.split(".")[0]
|
| 404 |
+
self.assertNotIn(
|
| 405 |
+
top, banned,
|
| 406 |
+
f"{py_file.name} imports banned dependency: {node.module}"
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
class TestEdgeCases(unittest.TestCase):
|
| 411 |
+
"""Edge cases and regression guards."""
|
| 412 |
+
|
| 413 |
+
@classmethod
|
| 414 |
+
def setUpClass(cls) -> None:
|
| 415 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 416 |
+
cls.tok = NedoTurkishTokenizer()
|
| 417 |
+
|
| 418 |
+
def test_punctuation_only(self) -> None:
|
| 419 |
+
tokens = self.tok.tokenize("...")
|
| 420 |
+
self.assertTrue(all(t["token_type"] == "PUNCT" for t in tokens))
|
| 421 |
+
|
| 422 |
+
def test_mixed_punctuation(self) -> None:
|
| 423 |
+
tokens = self.tok.tokenize('"Merhaba," dedi.')
|
| 424 |
+
types = [t["token_type"] for t in tokens]
|
| 425 |
+
self.assertIn("PUNCT", types)
|
| 426 |
+
self.assertIn("ROOT", types)
|
| 427 |
+
|
| 428 |
+
def test_unicode_normalized(self) -> None:
|
| 429 |
+
tokens = self.tok.tokenize(" merhaba dünya ")
|
| 430 |
+
roots = [t["token"] for t in tokens if t["token_type"] == "ROOT"]
|
| 431 |
+
self.assertIn("merhaba", roots)
|
| 432 |
+
self.assertIn("dünya", roots)
|
| 433 |
+
|
| 434 |
+
def test_single_char_word(self) -> None:
|
| 435 |
+
tokens = self.tok.tokenize("a")
|
| 436 |
+
self.assertTrue(len(tokens) >= 1)
|
| 437 |
+
|
| 438 |
+
def test_number_apostrophe_suffix(self) -> None:
|
| 439 |
+
"""3'te, 1990'larda should be NUM + SUFFIX."""
|
| 440 |
+
tokens = self.tok.tokenize("3'te geldim")
|
| 441 |
+
num = [t for t in tokens if t["token_type"] == "NUM"]
|
| 442 |
+
self.assertTrue(len(num) >= 1)
|
| 443 |
+
|
| 444 |
+
def test_integration_full_sentence(self) -> None:
|
| 445 |
+
"""Full integration test with mixed content."""
|
| 446 |
+
tokens = self.tok.tokenize("İstanbul'da meeting'e katılamadım")
|
| 447 |
+
self.assertTrue(len(tokens) > 0)
|
| 448 |
+
# Verify the critical acceptance criteria
|
| 449 |
+
from nedo_turkish_tokenizer import NedoTurkishTokenizer
|
| 450 |
+
t = NedoTurkishTokenizer()
|
| 451 |
+
result = t.tokenize("İstanbul'da meeting'e katılamadım")
|
| 452 |
+
self.assertIsInstance(result, list)
|
| 453 |
+
self.assertTrue(all("token" in tok and "token_type" in tok for tok in result))
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
if __name__ == "__main__":
|
| 457 |
+
unittest.main()
|
tests/test_zemberek_integration.py
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import tempfile
|
| 4 |
-
import unittest
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
from unittest import mock
|
| 7 |
-
|
| 8 |
-
from nedo_turkish_tokenizer import NedoTurkishTokenizer, _tdk_vocab
|
| 9 |
-
from nedo_turkish_tokenizer._root_validator import (
|
| 10 |
-
ZEMBEREK_AVAILABLE,
|
| 11 |
-
disambiguate_sentence,
|
| 12 |
-
)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
@unittest.skipUnless(ZEMBEREK_AVAILABLE, "zemberek-python is required for these tests")
|
| 16 |
-
class ZemberekIntegrationTests(unittest.TestCase):
|
| 17 |
-
def setUp(self) -> None:
|
| 18 |
-
self._original_words = _tdk_vocab._TDK_WORDS
|
| 19 |
-
_tdk_vocab._TDK_WORDS = None
|
| 20 |
-
|
| 21 |
-
def tearDown(self) -> None:
|
| 22 |
-
_tdk_vocab._TDK_WORDS = self._original_words
|
| 23 |
-
|
| 24 |
-
def test_sentence_disambiguation_uses_zemberek_python(self) -> None:
|
| 25 |
-
analyses = disambiguate_sentence(["Bug\u00fcn", "geldi"])
|
| 26 |
-
|
| 27 |
-
self.assertEqual(2, len(analyses))
|
| 28 |
-
self.assertEqual("bug\u00fcn", analyses[0]["lemma"])
|
| 29 |
-
self.assertEqual("gelmek", analyses[1]["lemma"])
|
| 30 |
-
self.assertEqual("Verb", analyses[1]["pos"])
|
| 31 |
-
|
| 32 |
-
def test_tokenizer_smoke_uses_bundled_tdk_words(self) -> None:
|
| 33 |
-
with tempfile.TemporaryDirectory() as tmpdir:
|
| 34 |
-
cache_path = str(Path(tmpdir) / "tdk_words.txt")
|
| 35 |
-
|
| 36 |
-
with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
|
| 37 |
-
with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
|
| 38 |
-
with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
|
| 39 |
-
tokenizer = NedoTurkishTokenizer()
|
| 40 |
-
tokens = tokenizer.tokenize("Bug\u00fcn \u0130stanbul'a gidiyorum.")
|
| 41 |
-
|
| 42 |
-
self.assertTrue(
|
| 43 |
-
any(t["token"].strip() == "bug\u00fcn" and t["token_type"] == "ROOT" for t in tokens)
|
| 44 |
-
)
|
| 45 |
-
self.assertTrue(any(t["token"] == "'" and t["token_type"] == "PUNCT" for t in tokens))
|
| 46 |
-
self.assertTrue(
|
| 47 |
-
any(
|
| 48 |
-
t["token"].strip() == "a"
|
| 49 |
-
and t["token_type"] == "SUFFIX"
|
| 50 |
-
and t["morph_pos"] == 1
|
| 51 |
-
for t in tokens
|
| 52 |
-
)
|
| 53 |
-
)
|
| 54 |
-
self.assertTrue(
|
| 55 |
-
any(t["token"].strip() == "gitmek" and t.get("_root_corrected") for t in tokens)
|
| 56 |
-
)
|
| 57 |
-
download_hf.assert_not_called()
|
| 58 |
-
download_tdk.assert_not_called()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenization_nedo_turkish.py
DELETED
|
@@ -1,172 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
NedoTurkishTokenizer — HuggingFace AutoTokenizer compatible class.
|
| 3 |
-
|
| 4 |
-
Usage:
|
| 5 |
-
from transformers import AutoTokenizer
|
| 6 |
-
|
| 7 |
-
tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
|
| 8 |
-
out = tok("İstanbul'da meeting'e katılamadım")
|
| 9 |
-
|
| 10 |
-
out["input_ids"] # hash-stable int IDs of morphological tokens
|
| 11 |
-
out["attention_mask"] # all 1s
|
| 12 |
-
out["token_type_ids"] # 0=root/other, 1=suffix
|
| 13 |
-
out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
from __future__ import annotations
|
| 17 |
-
|
| 18 |
-
import hashlib
|
| 19 |
-
from typing import Any
|
| 20 |
-
|
| 21 |
-
from transformers import PreTrainedTokenizer
|
| 22 |
-
|
| 23 |
-
# ── Morphological type → token_type_id ───────────────────────────────────────
|
| 24 |
-
_MTYPE_ID = {
|
| 25 |
-
"ROOT": 0,
|
| 26 |
-
"FOREIGN": 0,
|
| 27 |
-
"SUFFIX": 1,
|
| 28 |
-
"BPE": 2,
|
| 29 |
-
"PUNCT": 3,
|
| 30 |
-
"NUM": 4,
|
| 31 |
-
"DATE": 4,
|
| 32 |
-
"UNIT": 4,
|
| 33 |
-
"URL": 5,
|
| 34 |
-
"MENTION": 5,
|
| 35 |
-
"HASHTAG": 5,
|
| 36 |
-
"EMOJI": 5,
|
| 37 |
-
}
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def _stable_hash(s: str) -> int:
|
| 41 |
-
"""MD5-based stable hash that does NOT change between Python runs."""
|
| 42 |
-
return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
class NedoTurkishTokenizer(PreTrainedTokenizer):
|
| 46 |
-
"""
|
| 47 |
-
Turkish morphological tokenizer — HuggingFace compatible.
|
| 48 |
-
|
| 49 |
-
``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
|
| 50 |
-
For downstream transformer use, embed by ``token_type_ids`` or learn a
|
| 51 |
-
projection from the ``morphological_tokens`` metadata.
|
| 52 |
-
|
| 53 |
-
All standard HuggingFace fields are present:
|
| 54 |
-
input_ids, attention_mask, token_type_ids
|
| 55 |
-
|
| 56 |
-
Extra field:
|
| 57 |
-
morphological_tokens — list[dict] with token, token_type, morph_pos, ...
|
| 58 |
-
"""
|
| 59 |
-
|
| 60 |
-
vocab_files_names: dict = {}
|
| 61 |
-
model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
|
| 62 |
-
|
| 63 |
-
def __init__(self, **kwargs: Any) -> None:
|
| 64 |
-
super().__init__(**kwargs)
|
| 65 |
-
self._morph: "NedoTurkishTokenizer_core | None" = None # lazy init
|
| 66 |
-
|
| 67 |
-
def _get_morph(self):
|
| 68 |
-
if self._morph is None:
|
| 69 |
-
from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core # noqa: PLC0415
|
| 70 |
-
self._morph = _Core()
|
| 71 |
-
return self._morph
|
| 72 |
-
|
| 73 |
-
# ── PreTrainedTokenizer required interface ────────────────────────────────
|
| 74 |
-
|
| 75 |
-
@property
|
| 76 |
-
def vocab_size(self) -> int:
|
| 77 |
-
return 16_777_216 # 2^24 — MD5 hash space
|
| 78 |
-
|
| 79 |
-
def get_vocab(self) -> dict:
|
| 80 |
-
return {} # no fixed vocabulary
|
| 81 |
-
|
| 82 |
-
def _tokenize(self, text: str) -> list[str]:
|
| 83 |
-
"""Return token strings from the morphological pipeline."""
|
| 84 |
-
tokens = self._get_morph().tokenize(text)
|
| 85 |
-
return [t["token"] for t in tokens]
|
| 86 |
-
|
| 87 |
-
def _convert_token_to_id(self, token: str) -> int:
|
| 88 |
-
return _stable_hash(token)
|
| 89 |
-
|
| 90 |
-
def _convert_id_to_token(self, index: int) -> str:
|
| 91 |
-
return "" # no inverse mapping without a vocab
|
| 92 |
-
|
| 93 |
-
def save_vocabulary(
|
| 94 |
-
self,
|
| 95 |
-
save_directory: str,
|
| 96 |
-
filename_prefix: str | None = None,
|
| 97 |
-
) -> tuple:
|
| 98 |
-
return () # no vocab file
|
| 99 |
-
|
| 100 |
-
# ── Main __call__ override ────────────────────────────────────────────────
|
| 101 |
-
|
| 102 |
-
def __call__(
|
| 103 |
-
self,
|
| 104 |
-
text: str | list[str],
|
| 105 |
-
return_morphological_tokens: bool = True,
|
| 106 |
-
**kwargs: Any,
|
| 107 |
-
) -> dict:
|
| 108 |
-
"""
|
| 109 |
-
Tokenize text and return a dict with standard HuggingFace fields
|
| 110 |
-
plus ``morphological_tokens``.
|
| 111 |
-
|
| 112 |
-
Args:
|
| 113 |
-
text: Single string or list of strings.
|
| 114 |
-
return_morphological_tokens: Include full morphological dicts.
|
| 115 |
-
|
| 116 |
-
Returns:
|
| 117 |
-
dict with:
|
| 118 |
-
input_ids : list[int] or list[list[int]]
|
| 119 |
-
attention_mask : list[int] or list[list[int]]
|
| 120 |
-
token_type_ids : list[int] or list[list[int]]
|
| 121 |
-
morphological_tokens : list[dict] or list[list[dict]]
|
| 122 |
-
"""
|
| 123 |
-
if isinstance(text, list):
|
| 124 |
-
results = [self._encode_single(t, return_morphological_tokens) for t in text]
|
| 125 |
-
return {
|
| 126 |
-
"input_ids": [r["input_ids"] for r in results],
|
| 127 |
-
"attention_mask": [r["attention_mask"] for r in results],
|
| 128 |
-
"token_type_ids": [r["token_type_ids"] for r in results],
|
| 129 |
-
"morphological_tokens": [r["morphological_tokens"] for r in results],
|
| 130 |
-
}
|
| 131 |
-
return self._encode_single(text, return_morphological_tokens)
|
| 132 |
-
|
| 133 |
-
def _encode_single(self, text: str, with_morph: bool) -> dict:
|
| 134 |
-
morph = self._get_morph()
|
| 135 |
-
tokens = morph.tokenize(text)
|
| 136 |
-
|
| 137 |
-
input_ids = [_stable_hash(t["token"]) for t in tokens]
|
| 138 |
-
attn_mask = [1] * len(tokens)
|
| 139 |
-
type_ids = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]
|
| 140 |
-
|
| 141 |
-
out: dict = {
|
| 142 |
-
"input_ids": input_ids,
|
| 143 |
-
"attention_mask": attn_mask,
|
| 144 |
-
"token_type_ids": type_ids,
|
| 145 |
-
}
|
| 146 |
-
if with_morph:
|
| 147 |
-
out["morphological_tokens"] = tokens
|
| 148 |
-
return out
|
| 149 |
-
|
| 150 |
-
# ── Convenience helpers ───────────────────────────────────────────────────
|
| 151 |
-
|
| 152 |
-
def encode(self, text: str, **kwargs) -> list[int]: # type: ignore[override]
|
| 153 |
-
return self._encode_single(text, with_morph=False)["input_ids"]
|
| 154 |
-
|
| 155 |
-
def decode(self, token_ids: list[int], **kwargs) -> str: # type: ignore[override]
|
| 156 |
-
"""Not meaningful without a fixed vocab — returns empty string."""
|
| 157 |
-
return ""
|
| 158 |
-
|
| 159 |
-
def tokenize(self, text: str, **kwargs) -> list[str]:
|
| 160 |
-
return self._tokenize(text)
|
| 161 |
-
|
| 162 |
-
def morphological_tokenize(self, text: str) -> list[dict]:
|
| 163 |
-
"""Return full morphological token dicts (main NedoTurkishTokenizer output)."""
|
| 164 |
-
return self._get_morph().tokenize(text)
|
| 165 |
-
|
| 166 |
-
def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
|
| 167 |
-
"""Parallel morphological tokenization."""
|
| 168 |
-
return self._get_morph().batch_tokenize(texts, workers=workers)
|
| 169 |
-
|
| 170 |
-
def stats(self, tokens: list[dict]) -> dict:
|
| 171 |
-
"""Compute TR% and other morphological coverage stats."""
|
| 172 |
-
return self._get_morph().stats(tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer_config.json
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"bos_token": "[BOS]",
|
| 3 |
-
"clean_up_tokenization_spaces": true,
|
| 4 |
-
"cls_token": "[CLS]",
|
| 5 |
-
"eos_token": "[EOS]",
|
| 6 |
-
"mask_token": "[MASK]",
|
| 7 |
-
"model_max_length": 1000000000000000019884624838656,
|
| 8 |
-
"pad_token": "[PAD]",
|
| 9 |
-
"sep_token": "[SEP]",
|
| 10 |
-
"tokenizer_class": "NedoPreTrainedTokenizer",
|
| 11 |
-
"unk_token": "[UNK]"
|
| 12 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab_64k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|