Ethosoft committed on Apr 17

Commit

edec8b7

1 Parent(s): ec3d319

Refactor to standalone v2.0: zero dependencies, internal engine, removed zemberek/hf wrapper

Files changed (35) hide show

.gitattributes +5 -37
.gitignore +3 -0
README.md +117 -240
hf_benchmark.py +0 -327
id_to_token_64k.json +0 -0
nedo_turkish_tokenizer/__init__.py +11 -12
nedo_turkish_tokenizer/{_acronym_dict.py → _acronym_table.py} +13 -35
nedo_turkish_tokenizer/_allomorph.py +0 -46
nedo_turkish_tokenizer/_compound.py +0 -76
nedo_turkish_tokenizer/_context_aware.py +0 -61
nedo_turkish_tokenizer/{_medical_vocab.py → _domain_vocab.py} +17 -2
nedo_turkish_tokenizer/_preprocessor.py +0 -246
nedo_turkish_tokenizer/_root_validator.py +0 -205
nedo_turkish_tokenizer/_suffix_expander.py +0 -212
nedo_turkish_tokenizer/_suffix_table.py +197 -0
nedo_turkish_tokenizer/_tdk_vocab.py +0 -148
nedo_turkish_tokenizer/apostrophe.py +138 -0
nedo_turkish_tokenizer/engine.py +157 -0
nedo_turkish_tokenizer/morphology.py +161 -0
nedo_turkish_tokenizer/normalization.py +63 -0
nedo_turkish_tokenizer/resources.py +107 -0
nedo_turkish_tokenizer/segmentation.py +475 -0
nedo_turkish_tokenizer/{_normalizer.py → special_spans.py} +142 -134
nedo_turkish_tokenizer/tokenizer.py +48 -296
nedo_turkish_tokenizer/types.py +109 -0
paper_baseline_check.py +0 -106
pyproject.toml +6 -10
special_tokens_map.json +0 -9
test_lattice.py +0 -72
tests/test_tdk_vocab.py +0 -31
tests/test_tokenizer.py +457 -0
tests/test_zemberek_integration.py +0 -58
tokenization_nedo_turkish.py +0 -172
tokenizer_config.json +0 -12
vocab_64k.json +0 -0

.gitattributes CHANGED Viewed

@@ -1,37 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-nedo_turkish_tokenizer/data/zemberek-full.jar filter=lfs diff=lfs merge=lfs -text
-*.jar filter=lfs diff=lfs merge=lfs -text

+# Git LFS tracking rules.
+# Only data files that are genuinely large are tracked.
+# The HuggingFace boilerplate entries for model weights have been
+# removed because this is a standalone tokenizer, not a model repo.
+nedo_turkish_tokenizer/data/tdk_words.txt filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -8,3 +8,6 @@ build/
 *.egg
 .env
 .venv/

 *.egg
 .env
 .venv/
+.pytest_cache/
+*.whl
+*.tar.gz

README.md CHANGED Viewed

@@ -1,291 +1,168 @@
----
-language:
-  - tr
-  - en
-tags:
-  - tokenizer
-  - morphology
-  - turkish
-  - nlp
-  - transformers
-license: mit
-library_name: nedo-turkish-tokenizer
-pipeline_tag: token-classification
----
 # NedoTurkishTokenizer
-<h1 align="center">NedoTurkishTokenizer</h1>
-<p align="center"><strong>The Turkish tokenizer that actually understands Turkish morphology.</strong></p>
-<p align="center">
-Morphology-aware Turkish tokenization with roots, suffixes, canonical morphemes, compounds, acronyms, foreign-word handling, and context-sensitive analysis.
-</p>
-<p align="center">
-<strong>TR-MMLU 92.64%</strong> · <strong>Turkish-first</strong> · <strong>Zemberek-powered</strong> · <strong>Transformers-compatible</strong>
-</p>
----
-<p align="center">
-  <img src="https://cdn-uploads.huggingface.co/production/uploads/684ffdf517ebbc34153de81b/fbFKRHdqH7x1Iz20QlXx7.png" alt="Resim" style="width: 100%; max-width: 1000px;" />
-</p>
-## Overview
-**NedoTurkishTokenizer** is a Turkish morphological tokenizer built for people who want more than generic subword splitting.
-Instead of chopping Turkish into arbitrary BPE fragments, it tokenizes text in a way that reflects the real structure of the language: **roots, suffixes, morphological positions, canonical morphemes, compounds, acronym expansions, foreign roots, and contextual disambiguation**.
-For a language as morphologically rich as Turkish, that difference is massive.
-This repository is built to make Turkish tokenization feel **intelligent, interpretable, and linguistically grounded**.
----
-## Why it stands out
-Most tokenizers are optimized for compression.
-This one is optimized for **understanding Turkish properly**.
-NedoTurkishTokenizer is designed to capture the structure that actually matters in Turkish NLP:
-- **Root + suffix aware tokenization**
-- **Morphological positions inside words**
-- **Canonical suffix normalization**
-- **Foreign word + Turkish suffix detection**
-- **Compound decomposition**
-- **Acronym expansion**
-- **Sentence-level disambiguation**
-- **Domain-aware vocabulary support**
-- **Clean integration with Transformers**
-- **Standalone Python usage for custom NLP pipelines**
-This is not just a tokenizer.
-It is a much more linguistically faithful way to represent Turkish text.
----
-## Benchmark
-| Metric | Score |
-|---|---:|
-| **TR-MMLU** | **92.64%** |
-**Current repo claim:** world record.
----
-## Model Details
-| Field | Value |
-|---|---|
-| **Developer** | [Ethosoft](https://huggingface.co/Ethosoft) |
-| **Model** | `Ethosoft/NedoTurkishTokenizer` |
-| **Language** | Turkish (`tr`) |
-| **License** | MIT |
-| **Morphological engine** | `zemberek-python` |
----
 ## Installation
 ```bash
-pip install git+https://huggingface.co/Ethosoft/NedoTurkishTokenizer
 ```
----
 ## Quick Start
-### Transformers Usage
 ```python
-from transformers import AutoTokenizer
-tok = AutoTokenizer.from_pretrained(
-    "Ethosoft/NedoTurkishTokenizer",
-    trust_remote_code=True
-)
-out = tok("Türk dili, morfolojik açıdan zengin bir dildir.")
-print(out["input_ids"])
-print(out["attention_mask"])
-print(out["token_type_ids"])
-for t in out["morphological_tokens"]:
-    print(t["token"], t["token_type"], t["morph_pos"])
 ```
-### Batch Tokenization
 ```python
-out = tok([
-    "Türkçe metin.",
-    "Another sentence with code-switching."
-])
-```
-### Direct Morphological Tokenization
-```python
-tokens = tok.morphological_tokenize("Başbakan Ankara'da toplantı yaptı.")
-for t in tokens:
-    print(f"{t['token']:20s} {t['token_type']:8s} pos={t['morph_pos']}", end="")
-    if t.get("_canonical"):
-        print(f"  [{t['_canonical']}]", end="")
-    if t.get("_compound"):
-        print(f"  compound={t['_parts']}", end="")
-    if t.get("_expansion"):
-        print(f"  -> {t['_expansion']}", end="")
-    print()
 ```
----
-## Standalone Usage
-```python
-from nedo_turkish_tokenizer import NedoTurkishTokenizer
-tok = NedoTurkishTokenizer()
-# Single text
-tokens = tok("İSTANBUL'da meeting'e katılamadım")
-for t in tokens:
-    print(t["token"], t["token_type"], t["morph_pos"])
-# Batch (parallel, all CPUs)
-results = tok.batch_tokenize(["metin 1", "metin 2", "metin 3"], workers=4)
-# Coverage statistics
-s = tok.stats(tokens)
-print(f"TR%: {s['tr_pct']}  Pure%: {s['pure_pct']}")
-```
----
-## Example Output
-**Input**
-```text
-İSTANBUL'da meeting'e katılamadım
 ```
-**Output**
-| token | token_type | morph_pos | notes |
-|---|---|---:|---|
-| `<uppercase_word>` | ROOT | 0 | ALL CAPS marker |
-| ` istanbul` | ROOT | 0 | lowercased normalization |
-| `'` | PUNCT | 0 | fixed boundary |
-| `da` | SUFFIX | 1 | `-LOC` |
-| ` meeting` | FOREIGN | 0 | foreign root |
-| `e` | SUFFIX | 1 | `-DAT` |
-| ` katılmak` | ROOT | 0 | corrected root |
-| `lama` | SUFFIX | 1 | `-VN+NEG` |
-| `d` | SUFFIX | 2 | `-PAST` |
-| `ım` | SUFFIX | 3 | `-1SG` |
-This tokenizer does not just split text.
-It exposes the **morphological logic** inside Turkish words and sentences.
----
-## Output Fields
-Every token dictionary contains:
-| Field | Type | Description |
 |---|---|---|
-| `token` | `str` | Token string — leading space means word-initial |
-| `token_type` | `str` | Morphological type such as `ROOT`, `SUFFIX`, `FOREIGN`, `PUNCT` |
-| `morph_pos` | `int` | Position within word: `0` = root/initial, `1` = first suffix, `2` = second suffix, ... |
----
-## Token Types
-| Type | Description | Example |
-|---|---|---|
-| `ROOT` | Turkish root word | `kitap`, `gel` |
-| `SUFFIX` | Turkish morphological suffix | `lar`, `da`, `dı` |
-| `FOREIGN` | Foreign or loanword root | `meeting`, `zoom`, `tweet` |
-| `BPE` | Unknown subword fallback | rare / OOV fragments |
-| `PUNCT` | Punctuation | `.`, `,`, `?` |
-| `NUM` | Number | `3.5`, `%85` |
-| `DATE` | Date | `14.03.2026` |
-| `UNIT` | Measurement unit | `km`, `mg`, `TL` |
-| `URL` | Web address | `https://...` |
-| `MENTION` | Username mention | `@ethosoft` |
-| `HASHTAG` | Hashtag | `#NLP` |
-| `EMOJI` | Emoji | `🙂` |
----
-## Optional Metadata Fields
-| Field | Description |
-|---|---|
-| `_canonical` | Canonical morpheme mapping such as `"lar"/"ler" -> "PL"` |
-| `_suffix_label` | Detailed morphological label such as `-PL+ACC`, `-P3+LOC` |
-| `_foreign` | Foreign root detected |
-| `_caps` | Word was originally ALL CAPS |
-| `_domain` | Domain-specific term detected |
-| `_compound` | Compound word detected |
-| `_parts` | Compound parts |
-| `_expansion` | Acronym expansion |
-| `_pos` | POS tag from Zemberek |
-| `_lemma` | Lemma |
-| `_disambiguated` | Context-based disambiguation applied |
-| `_root_corrected` | Root corrected using phonetic and morphological validation |
----
-## How It Works
-NedoTurkishTokenizer wraps the base `turkish-tokenizer` BPE model and applies **12 sequential morphological fixes** to make tokenization dramatically more faithful to Turkish.
-| Fix | Problem | Solution |
-|---:|---|---|
-| 1 | `İSTANBUL` becomes many BPE fragments | Lowercase before tokenization, restore uppercase marker |
-| 2 | `meeting'e` breaks badly | Detect foreign base + Turkish suffix and split correctly |
-| 3 | Turkish suffixes appear as generic BPE | Reclassify 260+ suffix patterns as `SUFFIX` |
-| 4 | Wrong roots can appear | Validate and correct roots with Zemberek |
-| 5 | Punctuation gets counted as BPE | Classify punctuation explicitly |
-| 6 | Domain terms fragment unnecessarily | Add domain-aware vocabulary |
-| 7 | Foreign roots are mislabeled | TDK-backed lookup for foreign words |
-| 8 | Numbers, URLs, mentions fragment | Normalize special spans before tokenization |
-| 9 | Allomorphs get separate IDs | Canonicalize morphemes such as `PL`, `ACC`, `DAT` |
-| 10 | Compounds remain opaque | Decompose compound words |
-| 11 | Acronyms lose meaning | Expand known acronyms |
-| 12 | Ambiguous forms stay unresolved | Use sentence-level context disambiguation |
----
-## Why This Matters
-Turkish is an agglutinative language.
-A tokenizer that ignores morphology ignores a huge part of what makes Turkish meaningful.
-NedoTurkishTokenizer is built to preserve that structure so Turkish text becomes:
-- more interpretable
-- more linguistically faithful
-- more useful for analysis
-- more powerful for Turkish NLP pipelines
-This is why the project stands out.
-It does not simply tokenize Turkish.
-It **represents Turkish in a way that respects the language**.
----
 ## License
-MIT © [Ethosoft](https://huggingface.co/Ethosoft)

 # NedoTurkishTokenizer
+Self-contained Turkish morphological tokenizer.
+**Zero external dependencies** — tokenizes Turkish text into morphologically meaningful units using a candidate-based segmentation engine, a bundled TDK dictionary, and 260+ suffix patterns.
+> This is a standalone tokenizer. It does not wrap `turkish-tokenizer`, `zemberek-python`, `requests`, or `transformers`. There are no hidden fallbacks or optional dependency paths. Install and use immediately.
 ## Installation
 ```bash
+pip install .
 ```
+No additional packages required. Everything is bundled.
 ## Quick Start
 ```python
+from nedo_turkish_tokenizer import NedoTurkishTokenizer
+tok = NedoTurkishTokenizer()
+tokens = tok.tokenize("İstanbul'da toplantıya katılamadım")
+for t in tokens:
+    print(f"{t['token']:15s}  {t['token_type']:10s}  pos={t['morph_pos']}")
+```
+Output:
+```
+İstanbul        ROOT        pos=0
+'               PUNCT       pos=0
+da              SUFFIX      pos=1
+toplantı        ROOT        pos=0
+ya              SUFFIX      pos=1
+katıl           ROOT        pos=0
+a               SUFFIX      pos=1
+ma              SUFFIX      pos=2
+dım             SUFFIX      pos=3
 ```
+## API
+### `NedoTurkishTokenizer()`
 ```python
+tok = NedoTurkishTokenizer()
+# Single text
+tokens = tok.tokenize("Merhaba dünya")
+# Callable shorthand
+tokens = tok("Merhaba dünya")
+# Batch (parallel, uses multiprocessing)
+results = tok.batch_tokenize(["text1", "text2", "text3"])
+# Statistics
+stats = tok.stats(tokens)
 ```
+### Token Output Format
+Each token is a `dict` with these guaranteed fields:
+| Field | Type | Description |
+|---|---|---|
+| `token` | `str` | **Clean token text** — no leading/trailing whitespace. |
+| `token_type` | `str` | One of the types below. |
+| `morph_pos` | `int` | 0 = root/word-initial, 1+ = suffix position. |
+**Token text does not encode spacing.** The `token` field contains only the clean surface form. Whether a token starts a new word is indicated by `morph_pos == 0`, not by whitespace in the string.
+### Token Types
+| Type | Description | Example |
+|---|---|---|
+| `ROOT` | Turkish word root | `ev`, `gel`, `kitap` |
+| `SUFFIX` | Morphological suffix | `de`, `ler`, `yor` |
+| `FOREIGN` | Non-Turkish word | `meeting`, `cloud` |
+| `PUNCT` | Punctuation | `.`, `,`, `'` |
+| `NUM` | Number | `42`, `%85`, `3.14` |
+| `DATE` | Date | `14.03.2026` |
+| `UNIT` | Unit | `kg`, `km`, `TL` |
+| `URL` | URL | `https://...` |
+| `MENTION` | Social mention | `@user` |
+| `HASHTAG` | Hashtag | `#topic` |
+| `EMOJI` | Emoji | 😀, :) |
+| `ACRONYM` | Acronym | `NATO`, `TBMM` |
+### Optional Metadata Fields
+Tokens may include `_`-prefixed metadata fields:
+| Field | Type | Description |
+|---|---|---|
+| `_suffix_label` | `str` | Morphological label (e.g. `-LOC`, `-PL`, `-PST`) |
+| `_canonical` | `str` | Canonical morpheme (e.g. `LOC`, `PL`, `PAST`) |
+| `_caps` | `bool` | Word was originally ALL CAPS |
+| `_foreign` | `bool` | Word detected as foreign |
+| `_acronym` | `bool` | Token is an acronym |
+| `_expansion` | `str` | Acronym expansion (e.g. `NATO` → `Kuzey Atlantik...`) |
+| `_compound` | `bool` | Root is a compound word |
+| `_parts` | `list[str]` | Compound decomposition |
+| `_apo_suffix` | `bool` | Suffix follows an apostrophe |
+| `_domain` | `bool` | Root from domain vocabulary |
+## Architecture
+The tokenizer uses a **candidate-based segmentation** pipeline:
+```
+Text → Normalize → Special Spans → Word Split → Per-Word Segmentation → Annotate → Strip
+                                                      │
+                                              Generate Candidates
+                                              Score & Select Best
 ```
+For each word, the engine:
+1. Generates 2–5 segmentation candidates (whole ROOT, suffix chains, foreign)
+2. Scores each candidate deterministically (TDK validation, root length, suffix recognition)
+3. Selects the highest-scoring segmentation
+4. Strips internal whitespace markers from the output
+### Scoring Rules
+| Factor | Score |
+|---|---|
+| Root in TDK dictionary | +10 |
+| Whole word in TDK (unsplit) | +5 bonus |
+| Root in domain vocabulary | +8 |
+| Root length | +2 per character |
+| Each recognised suffix | +2 |
+| Short root penalty (≤2 chars) | −4 |
+| Foreign root (fallback) | +3 base |
+| Unknown root | +1 base |
+### Known-Intact Words
+A curated set of common Turkish words (inflected forms of `demek` and `yemek`, plus discourse particles) bypasses candidate generation entirely; these words are always kept whole. This prevents false splits like `dedi` → `de` + `di`, where the root `de` is a valid TDK conjunction.
+### Bundled Resources
+| Resource | Size | Purpose |
 |---|---|---|
+| `tdk_words.txt` | ~746 KB | TDK dictionary (64K+ lemmas + derived verb stems) |
+| `turkish_proper_nouns.txt` | ~1 KB | Proper nouns (cities, regions, names) |
+| Suffix table | 260+ entries | Turkish suffix patterns with morphological labels |
+| Acronym table | 80+ entries | Acronym → Turkish expansion mappings |
+| Domain vocabulary | 200+ entries | Medical, sports, tourism terms |
+## Known Limitations
+- **Not a full morphological analyzer.** This is a heuristic segmenter, not a Zemberek/TRMorph replacement. Morphological labels may be incorrect for ambiguous suffixes (e.g. `-ACC` vs `-GEN` for "in").
+- **No disambiguation.** The tokenizer does not use sentence-level context to resolve ambiguous words (e.g. "gelir" as the noun "income" vs. the aorist verb form "comes").
+- **Verb stem derivation is simple.** Only `-mak`/`-mek` infinitive stripping (for stems ≥3 characters) is used; vowel harmony alternations in stems are not modelled.
+- **TDK dictionary coverage.** The bundled TDK list has ~64K entries. Words absent from TDK default to ROOT (unknown) or FOREIGN.
+- **Not backward-compatible with v1.x.** The old wrapper relied on `turkish-tokenizer` BPE and `zemberek-python`. Output format and token boundaries will differ.
+## Running Tests
+```bash
+pip install -e ".[dev]"
+pytest tests/ -v
+```
 ## License
+MIT

hf_benchmark.py DELETED Viewed

@@ -1,327 +0,0 @@
-"""
-hf_benchmark.py
----------------
-NedoTurkishTokenizer'ı TR-MMLU üzerinde benchmark eder.
-HuggingFace'den hem tokenizer hem dataset'i çeker.
-Kullanım:
-    cd NedoTurkishTokenizer/
-    pip install huggingface_hub datasets
-    python hf_benchmark.py
-    # Sadece 2000 örnek (hızlı test):
-    python hf_benchmark.py --samples 2000
-    # Belirli kategori:
-    python hf_benchmark.py --category TUS
-"""
-import argparse
-import json
-import os
-import sys
-import csv
-from pathlib import Path
-# ── HuggingFace token ─────────────────────────────────────────────────────────
-HF_TOKEN = os.environ.get("HF_TOKEN")
-# ── Argümanlar ────────────────────────────────────────────────────────────────
-parser = argparse.ArgumentParser()
-parser.add_argument("--samples",  default="all",  help="Kaç örnek (all veya sayı)")
-parser.add_argument("--category", default=None,   help="Kategori filtresi (ör. TUS)")
-parser.add_argument("--out",      default=".",    help="Çıktı klasörü")
-args = parser.parse_args()
-OUT = Path(args.out)
-OUT.mkdir(parents=True, exist_ok=True)
-# ── HuggingFace login ─────────────────────────────────────────────────────────
-from huggingface_hub import login
-if HF_TOKEN:
-    login(token=HF_TOKEN, add_to_git_credential=False)
-else:
-    print("HF_TOKEN not set; using existing Hugging Face login state if available.")
-print("HuggingFace login ✓")
-# ── Tokenizer yükle ───────────────────────────────────────────────────────────
-print("NedoTurkishTokenizer yükleniyor...")
-sys.path.insert(0, str(Path(__file__).parent))
-from nedo_turkish_tokenizer import NedoTurkishTokenizer
-tok = NedoTurkishTokenizer()
-print(f"  Zemberek: {'✓' if tok.zemberek_available else '✗ (devre dışı)'}")
-# ── Dataset yükle ─────────────────────────────────────────────────────────────
-print("TR-MMLU dataset yükleniyor...")
-from datasets import load_dataset
-ds = load_dataset(
-    "alibayram/turkish_mmlu",
-    split="test",
-    token=HF_TOKEN,
-)
-print(f"  {len(ds)} örnek")
-# ── Kategori filtresi ─────────────────────────────────────────────────────────
-samples = list(ds)
-if args.category:
-    samples = [r for r in samples
-               if str(r.get("bolum","")).strip() == args.category]
-    print(f"  Kategori '{args.category}': {len(samples)} örnek")
-if args.samples != "all":
-    n = int(args.samples)
-    samples = samples[:n]
-    print(f"  Kısıtlı: {len(samples)} örnek")
-# ── Metin alanlarını birleştir ─────────────────────────────────────────────────
-def get_text(row: dict) -> str:
-    parts = []
-    for field in ["soru", "question"]:
-        if row.get(field):
-            parts.append(str(row[field]))
-            break
-    choices = row.get("secenekler") or []
-    if isinstance(choices, list):
-        parts.extend(str(c) for c in choices)
-    for lbl in ["A","B","C","D"]:
-        if row.get(lbl):
-            parts.append(str(row[lbl]))
-    if row.get("aciklama"):
-        parts.append(str(row["aciklama"]))
-    return " ".join(parts)
-# ── Token istatistikleri ──────────────────────────────────────────────────────
-def token_stats(tokens: list[dict]) -> dict:
-    """NedoTurkishTokenizer'ın token_type alanını kullan."""
-    total    = len(tokens)
-    if total == 0:
-        return {"total":0,"bpe":0,"tr_pct":0.0,"pure_pct":0.0,"orig_tr_pct":0.0}
-    # Enhanced stats
-    roots    = sum(1 for t in tokens if t["token_type"] == "ROOT")
-    suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
-    foreign  = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
-    punct    = sum(1 for t in tokens if t["token_type"] == "PUNCT")
-    bpe      = sum(1 for t in tokens if t["token_type"] == "BPE")
-    special  = sum(1 for t in tokens
-                   if t["token_type"] in ("NUM","DATE","UNIT","URL",
-                                          "MENTION","HASHTAG","EMOJI","ACRONYM"))
-    tr       = roots + suffixes + foreign + punct + special
-    pure     = sum(1 for t in tokens
-                   if t["token_type"] in ("ROOT","SUFFIX","FOREIGN")
-                   and not t["token"].strip().startswith("<"))
-    return {
-        "total":    total,
-        "roots":    roots,
-        "suffixes": suffixes,
-        "foreign":  foreign,
-        "bpe":      bpe,
-        "punct":    punct,
-        "special":  special,
-        "tr_pct":   round(tr / total * 100, 4),
-        "pure_pct": round(pure / total * 100, 4),
-    }
-def orig_stats(tokens: list[dict]) -> dict:
-    """Orijinal tokenizer istatistikleri (karşılaştırma için)."""
-    total = len(tokens)
-    if total == 0:
-        return {"total":0,"bpe":0,"tr_pct":0.0,"pure_pct":0.0}
-    roots    = sum(1 for t in tokens if t.get("type") == "ROOT")
-    suffixes = sum(1 for t in tokens if t.get("type") == "SUFFIX")
-    bpe      = sum(1 for t in tokens if t.get("type") == "BPE")
-    tr       = roots + suffixes
-    pure     = sum(1 for t in tokens
-                   if t.get("type") in ("ROOT","SUFFIX")
-                   and not t.get("token","").strip().startswith("<"))
-    return {
-        "total":    total,
-        "bpe":      bpe,
-        "tr_pct":   round(tr / total * 100, 4),
-        "pure_pct": round(pure / total * 100, 4),
-    }
-# ── Ana benchmark döngüsü ─────────────────────────────────────────────────────
-print(f"\nBenchmark başlıyor: {len(samples)} örnek...")
-per_sample   = []
-orig_tr_sum  = 0.0
-enh_tr_sum   = 0.0
-orig_pur_sum = 0.0
-enh_pur_sum  = 0.0
-orig_tok_sum = 0
-enh_tok_sum  = 0
-orig_bpe_sum = 0
-enh_bpe_sum  = 0
-improved     = 0
-regressed    = 0
-unchanged    = 0
-REPORT_EVERY = 500
-for idx, row in enumerate(samples):
-    text = get_text(row)
-    if not text.strip():
-        continue
-    # Orijinal tokenizer
-    orig_toks = tok._base.tokenize_text(text)
-    os_      = orig_stats(orig_toks)
-    # NedoTurkishTokenizer
-    enh_toks  = tok.tokenize(text)
-    es_       = token_stats(enh_toks)
-    d_tr = round(es_["tr_pct"] - os_["tr_pct"], 4)
-    per_sample.append({
-        "idx":       idx,
-        "bolum":     str(row.get("bolum","")),
-        "orig_tr":   os_["tr_pct"],
-        "enh_tr":    es_["tr_pct"],
-        "d_tr":      d_tr,
-        "orig_pure": os_["pure_pct"],
-        "enh_pure":  es_["pure_pct"],
-        "orig_tok":  os_["total"],
-        "enh_tok":   es_["total"],
-        "orig_bpe":  os_["bpe"],
-        "enh_bpe":   es_["bpe"],
-    })
-    orig_tr_sum  += os_["tr_pct"]
-    enh_tr_sum   += es_["tr_pct"]
-    orig_pur_sum += os_["pure_pct"]
-    enh_pur_sum  += es_["pure_pct"]
-    orig_tok_sum += os_["total"]
-    enh_tok_sum  += es_["total"]
-    orig_bpe_sum += os_["bpe"]
-    enh_bpe_sum  += es_["bpe"]
-    if d_tr > 0:   improved  += 1
-    elif d_tr < 0: regressed += 1
-    else:          unchanged += 1
-    if (idx + 1) % REPORT_EVERY == 0:
-        n = idx + 1
-        print(f"  [{n:>6}/{len(samples)}]  "
-              f"TR%: {enh_tr_sum/n:.2f}%  "
-              f"BPE/örnek: {enh_bpe_sum/n:.2f}  "
-              f"Regressed: {regressed}")
-n = len(per_sample)
-if n == 0:
-    print("Hiç örnek işlenmedi!")
-    sys.exit(1)
-# ── Özet ─────────────────────────────────────────────────────────────────────
-summary = {
-    "n_samples":        n,
-    "orig_tr_pct":      round(orig_tr_sum / n, 4),
-    "orig_pure_pct":    round(orig_pur_sum / n, 4),
-    "enh_tr_pct":       round(enh_tr_sum / n, 4),
-    "enh_pure_pct":     round(enh_pur_sum / n, 4),
-    "delta_tr_pct":     round((enh_tr_sum - orig_tr_sum) / n, 4),
-    "delta_pure_pct":   round((enh_pur_sum - orig_pur_sum) / n, 4),
-    "orig_avg_tokens":  round(orig_tok_sum / n, 2),
-    "enh_avg_tokens":   round(enh_tok_sum / n, 2),
-    "orig_avg_bpe":     round(orig_bpe_sum / n, 2),
-    "enh_avg_bpe":      round(enh_bpe_sum / n, 2),
-    "pct_improved":     round(improved / n * 100, 2),
-    "pct_regressed":    round(regressed / n * 100, 2),
-    "pct_unchanged":    round(unchanged / n * 100, 2),
-}
-# ── Kategori bazında ──────────────────────────────────────────────────────────
-from collections import defaultdict
-cat_scores = defaultdict(list)
-for row in per_sample:
-    cat_scores[row["bolum"]].append(row["enh_tr"])
-cat_summary = {
-    cat: round(sum(v)/len(v), 2)
-    for cat, v in cat_scores.items()
-    if len(v) >= 3
-}
-cat_sorted = sorted(cat_summary.items(), key=lambda x: x[1])
-# ── Rapor yazdır ─────────────────────────────────────────────────────────────
-SEP = "═" * 65
-print(f"\n{SEP}")
-print("  NedoTurkishTokenizer — TR-MMLU Benchmark Sonuçları")
-print(SEP)
-print(f"  N = {n:,} örnek\n")
-print(f"  {'Metrik':30s} {'Orijinal':>10} {'Enhanced':>10} {'Δ':>8}")
-print("  " + "─"*55)
-for label, orig, enh, delta in [
-    ("TR%",          summary["orig_tr_pct"],   summary["enh_tr_pct"],   summary["delta_tr_pct"]),
-    ("Pure%",        summary["orig_pure_pct"], summary["enh_pure_pct"], summary["delta_pure_pct"]),
-    ("Avg token/örn",summary["orig_avg_tokens"],summary["enh_avg_tokens"],
-     round(summary["enh_avg_tokens"]-summary["orig_avg_tokens"],2)),
-    ("Avg BPE/örn",  summary["orig_avg_bpe"],  summary["enh_avg_bpe"],
-     round(summary["enh_avg_bpe"]-summary["orig_avg_bpe"],2)),
-]:
-    print(f"  {label:30s} {orig:>10.2f} {enh:>10.2f} {delta:>+8.2f}")
-print(f"\n  İyileşen : {improved:>6,} (%{summary['pct_improved']:.2f})")
-print(f"  Gerileyen: {regressed:>6,} (%{summary['pct_regressed']:.2f})")
-print(f"  Değişmeyen:{unchanged:>6,} (%{summary['pct_unchanged']:.2f})")
-print(f"\n  En düşük TR% kategoriler:")
-for cat, avg in cat_sorted[:10]:
-    n_cat = len(cat_scores[cat])
-    print(f"    {cat:<35} {avg:>6.2f}%  (n={n_cat})")
-print(f"\n  En yüksek TR% kategoriler:")
-for cat, avg in cat_sorted[-8:]:
-    n_cat = len(cat_scores[cat])
-    print(f"    {cat:<35} {avg:>6.2f}%  (n={n_cat})")
-print(SEP)
-# ── Dosyalara yaz ─────────────────────────────────────────────────────────────
-# Summary JSON
-summary_path = OUT / "tr_mmlu_summary.json"
-with open(summary_path, "w", encoding="utf-8") as f:
-    json.dump(summary, f, ensure_ascii=False, indent=2)
-print(f"\n  ✓ {summary_path}")
-# Report Markdown
-paper_tr  = 90.29
-paper_pur = 85.80
-report_path = OUT / "tr_mmlu_report.md"
-with open(report_path, "w", encoding="utf-8") as f:
-    f.write("# TR-MMLU Benchmark — NedoTurkishTokenizer\n\n")
-    f.write(f"**N = {n:,} örnek**\n\n")
-    f.write("## Ana Metrikler\n\n")
-    f.write("| Metrik | Orijinal | Enhanced | Δ |\n")
-    f.write("|--------|----------|----------|---|\n")
-    f.write(f"| TR% | {summary['orig_tr_pct']:.2f}% | {summary['enh_tr_pct']:.2f}% | {summary['delta_tr_pct']:+.2f}% |\n")
-    f.write(f"| Pure% | {summary['orig_pure_pct']:.2f}% | {summary['enh_pure_pct']:.2f}% | {summary['delta_pure_pct']:+.2f}% |\n")
-    f.write(f"| Avg token/örnek | {summary['orig_avg_tokens']:.2f} | {summary['enh_avg_tokens']:.2f} | {summary['enh_avg_tokens']-summary['orig_avg_tokens']:+.2f} |\n")
-    f.write(f"| Avg BPE/örnek | {summary['orig_avg_bpe']:.2f} | {summary['enh_avg_bpe']:.2f} | {summary['enh_avg_bpe']-summary['orig_avg_bpe']:+.2f} |\n")
-    f.write("\n## Paper ile Karşılaştırma\n\n")
-    f.write("| Metrik | Paper (orijinal) | Bizim (orijinal) | NedoTurkishTokenizer |\n")
-    f.write("|--------|-----------------|-----------------|---------------------|\n")
-    f.write(f"| TR% | {paper_tr}% | {summary['orig_tr_pct']:.2f}% | **{summary['enh_tr_pct']:.2f}%** |\n")
-    f.write(f"| Pure% | {paper_pur}% | {summary['orig_pure_pct']:.2f}% | **{summary['enh_pure_pct']:.2f}%** |\n")
-    f.write("\n## Örnek Dağılımı\n\n")
-    f.write(f"- İyileşen: {improved:,} (%{summary['pct_improved']:.2f})\n")
-    f.write(f"- Gerileyen: {regressed:,} (%{summary['pct_regressed']:.2f})\n")
-    f.write(f"- Değişmeyen: {unchanged:,} (%{summary['pct_unchanged']:.2f})\n")
-    f.write("\n## Kategori Bazında TR%\n\n")
-    f.write("| Kategori | TR% | N |\n")
-    f.write("|----------|-----|---|\n")
-    for cat, avg in cat_sorted:
-        n_cat = len(cat_scores[cat])
-        f.write(f"| {cat} | {avg:.2f}% | {n_cat} |\n")
-print(f"  ✓ {report_path}")
-# Per-sample CSV
-csv_path = OUT / "tr_mmlu_per_sample.csv"
-with open(csv_path, "w", newline="", encoding="utf-8") as f:
-    writer = csv.DictWriter(f, fieldnames=per_sample[0].keys())
-    writer.writeheader()
-    writer.writerows(per_sample)
-print(f"  ✓ {csv_path}")
-print("\nTamamlandı.\n")

id_to_token_64k.json DELETED Viewed

The diff for this file is too large to render. See raw diff

nedo_turkish_tokenizer/__init__.py CHANGED Viewed

@@ -1,21 +1,20 @@
-"""
-NedoTurkishTokenizer — Turkish morphological tokenizer.
-TR-MMLU world record: 92%
-Usage:
     from nedo_turkish_tokenizer import NedoTurkishTokenizer
     tok = NedoTurkishTokenizer()
-    tokens = tok("İstanbul'da meeting'e katılamadım")
-    # Each token dict contains:
-    #   token      : str   — token string (with leading space if word-initial)
-    #   token_type : str   — ROOT | SUFFIX | FOREIGN | BPE | PUNCT |
-    #                        NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
-    #   morph_pos  : int   — 0=root/word-initial, 1=first suffix, 2=second...
 """
 from .tokenizer import NedoTurkishTokenizer
 __all__ = ["NedoTurkishTokenizer"]
-__version__ = "1.0.0"

+"""NedoTurkishTokenizer — self-contained Turkish morphological tokenizer.
+Zero external dependencies.  Segments Turkish text into morphologically
+meaningful tokens using a candidate-based segmentation engine with a
+bundled TDK dictionary, suffix heuristics, and domain-aware vocabulary.
+Usage::
     from nedo_turkish_tokenizer import NedoTurkishTokenizer
     tok = NedoTurkishTokenizer()
+    tokens = tok.tokenize("İstanbul'da meeting'e katılamadım")
+    for t in tokens:
+        print(t["token"], t["token_type"], t["morph_pos"])
 """
 from .tokenizer import NedoTurkishTokenizer
 __all__ = ["NedoTurkishTokenizer"]
+__version__ = "2.0.0"

nedo_turkish_tokenizer/{_acronym_dict.py → _acronym_table.py} RENAMED Viewed

@@ -1,9 +1,14 @@
-"""Fix 11: Acronym/abbreviation expansion dictionary."""
 from __future__ import annotations
 ACRONYM_EXPANSIONS: dict[str, str] = {
-    # International organizations
     "NATO":     "Kuzey Atlantik Antlaşması Örgütü",
     "UN":       "Birleşmiş Milletler",
     "UNESCO":   "BM Eğitim, Bilim ve Kültür Örgütü",
@@ -16,7 +21,7 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
     "FIFA":     "Uluslararası Futbol Federasyonları Birliği",
     "IOC":      "Uluslararası Olimpiyat Komitesi",
     "UEFA":     "Avrupa Futbol Birliği",
-    # Turkish institutions
     "TBMM":    "Türkiye Büyük Millet Meclisi",
     "MEB":     "Milli Eğitim Bakanlığı",
     "TDK":     "Türk Dil Kurumu",
@@ -32,12 +37,12 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
     "TÜİK":    "Türkiye İstatistik Kurumu",
     "TÜBİTAK": "Türkiye Bilimsel ve Teknolojik Araştırma Kurumu",
     "ASELSAN":  "Askeri Elektronik Sanayii",
-    # Turkish exams
     "TUS":  "Tıpta Uzmanlık Sınavı",
     "DUS":  "Diş Hekimliğinde Uzmanlık Sınavı",
     "YDUS": "Yabancı Dil Uzmanlık Sınavı",
     "KPSS": "Kamu Personeli Seçme Sınavı",
-    # Medical
     "CMV": "Sitomegalovirüs",  "EBV": "Epstein-Barr Virüsü",
     "VZV": "Varisella-Zoster Virüsü", "HHV": "İnsan Herpes Virüsü",
     "HSV": "Herpes Simplex Virüsü",   "HIV": "İnsan İmmün Yetmezlik Virüsü",
@@ -61,7 +66,7 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
     "SMMM": "Serbest Muhasebeci Mali Müşavir",
     "YMM":  "Yeminli Mali Müşavir",
     "SM":   "Serbest Muhasebeci",
-    # Technology
     "AI":   "Yapay Zeka",        "ML":  "Makine Öğrenmesi",
     "LLM":  "Büyük Dil Modeli",  "NLP": "Doğal Dil İşleme",
     "API":  "Uygulama Programlama Arayüzü",
@@ -73,37 +78,10 @@ ACRONYM_EXPANSIONS: dict[str, str] = {
     "OS":   "İşletim Sistemi",
     "BERT": "Çift Yönlü Kodlayıcı Temsiller",
     "GPT":  "Üretici Önceden Eğitilmiş Dönüştürücü",
-    # Economics
     "OPEC": "Petrol İhraç Eden Ülkeler Örgütü",
     "NAFTA": "Kuzey Amerika Serbest Ticaret Anlaşması",
-    # Sports
     "NBA": "Ulusal Basketbol Birliği",
     "NFL": "Ulusal Futbol Ligi",
 }
-def reclassify_acronyms(tokens: list[dict]) -> list[dict]:
-    """Add ``_expansion`` to known acronyms; promote CAPS ROOTs to ACRONYM."""
-    result: list[dict] = []
-    for tok in tokens:
-        token_upper = tok["token"].strip().upper()
-        expansion = ACRONYM_EXPANSIONS.get(token_upper)
-        if tok["type"] == "ACRONYM":
-            # Already typed as ACRONYM by span detection — add expansion
-            if expansion:
-                result.append({**tok, "_expansion": expansion, "_known_acronym": True})
-            else:
-                result.append(tok)
-        elif tok["type"] == "ROOT" and (tok.get("_acronym") or tok.get("_caps")):
-            # ALL CAPS ROOT that's in the acronym dict → promote to ACRONYM
-            if expansion:
-                result.append({
-                    **tok, "type": "ACRONYM",
-                    "_expansion": expansion, "_known_acronym": True,
-                })
-            else:
-                result.append(tok)
-        else:
-            result.append(tok)
-    return result

+"""Acronym / abbreviation expansion dictionary.
+Maps uppercase acronyms to their Turkish expansions.  Used for:
+- Acronym detection (is an ALL CAPS word a known acronym?)
+- Expansion metadata (``_expansion`` field on ACRONYM tokens)
+"""
 from __future__ import annotations
 ACRONYM_EXPANSIONS: dict[str, str] = {
+    # ── International organizations ──────────────────────────────────────
     "NATO":     "Kuzey Atlantik Antlaşması Örgütü",
     "UN":       "Birleşmiş Milletler",
     "UNESCO":   "BM Eğitim, Bilim ve Kültür Örgütü",
     "FIFA":     "Uluslararası Futbol Federasyonları Birliği",
     "IOC":      "Uluslararası Olimpiyat Komitesi",
     "UEFA":     "Avrupa Futbol Birliği",
+    # ── Turkish institutions ─────────────────────────────────────────────
     "TBMM":    "Türkiye Büyük Millet Meclisi",
     "MEB":     "Milli Eğitim Bakanlığı",
     "TDK":     "Türk Dil Kurumu",
     "TÜİK":    "Türkiye İstatistik Kurumu",
     "TÜBİTAK": "Türkiye Bilimsel ve Teknolojik Araştırma Kurumu",
     "ASELSAN":  "Askeri Elektronik Sanayii",
+    # ── Turkish exams ────────────────────────────────────────────────────
     "TUS":  "Tıpta Uzmanlık Sınavı",
     "DUS":  "Diş Hekimliğinde Uzmanlık Sınavı",
     "YDUS": "Yabancı Dil Uzmanlık Sınavı",
     "KPSS": "Kamu Personeli Seçme Sınavı",
+    # ── Medical ──────────────────────────────────────────────────────────
     "CMV": "Sitomegalovirüs",  "EBV": "Epstein-Barr Virüsü",
     "VZV": "Varisella-Zoster Virüsü", "HHV": "İnsan Herpes Virüsü",
     "HSV": "Herpes Simplex Virüsü",   "HIV": "İnsan İmmün Yetmezlik Virüsü",
     "SMMM": "Serbest Muhasebeci Mali Müşavir",
     "YMM":  "Yeminli Mali Müşavir",
     "SM":   "Serbest Muhasebeci",
+    # ── Technology ───────────────────────────────────────────────────────
     "AI":   "Yapay Zeka",        "ML":  "Makine Öğrenmesi",
     "LLM":  "Büyük Dil Modeli",  "NLP": "Doğal Dil İşleme",
     "API":  "Uygulama Programlama Arayüzü",
     "OS":   "İşletim Sistemi",
     "BERT": "Çift Yönlü Kodlayıcı Temsiller",
     "GPT":  "Üretici Önceden Eğitilmiş Dönüştürücü",
+    # ── Economics ────────────────────────────────────────────────────────
     "OPEC": "Petrol İhraç Eden Ülkeler Örgütü",
     "NAFTA": "Kuzey Amerika Serbest Ticaret Anlaşması",
+    # ── Sports ───────────────────────────────────────────────────────────
     "NBA": "Ulusal Basketbol Birliği",
     "NFL": "Ulusal Futbol Ligi",
 }

nedo_turkish_tokenizer/_allomorph.py DELETED Viewed

@@ -1,46 +0,0 @@
-"""Fix 9: Allomorph canonicalization — map surface forms to morpheme IDs."""
-from __future__ import annotations
-ALLOMORPH_MAP: dict[str, str] = {
-    "lar": "PL",   "ler": "PL",
-    "ı":   "ACC",  "i":   "ACC",  "u":   "ACC",  "ü":   "ACC",
-    "yı":  "ACC",  "yi":  "ACC",  "yu":  "ACC",  "yü":  "ACC",
-    "a":   "DAT",  "e":   "DAT",  "ya":  "DAT",  "ye":  "DAT",
-    "da":  "LOC",  "de":  "LOC",  "ta":  "LOC",  "te":  "LOC",
-    "dan": "ABL",  "den": "ABL",  "tan": "ABL",  "ten": "ABL",
-    "ın":  "GEN",  "in":  "GEN",  "un":  "GEN",  "ün":  "GEN",
-    "nın": "GEN",  "nin": "GEN",  "nun": "GEN",  "nün": "GEN",
-    "la":  "INS",  "le":  "INS",  "yla": "INS",  "yle": "INS",
-    "dı":  "PAST", "di":  "PAST", "du":  "PAST", "dü":  "PAST",
-    "tı":  "PAST", "ti":  "PAST", "tu":  "PAST", "tü":  "PAST",
-    "yor": "PROG",
-    "ar":  "AOR",  "er":  "AOR",
-    "ır":  "AOR",  "ir":  "AOR",  "ur":  "AOR",  "ür":  "AOR",
-    "mış": "EVID", "miş": "EVID", "muş": "EVID", "müş": "EVID",
-    "ma":  "NEG",  "me":  "NEG",
-    "mak": "INF",  "mek": "INF",
-    "ım":  "1SG",  "im":  "1SG",  "um":  "1SG",  "üm":  "1SG",
-    "ın":  "2SG",  "in":  "2SG",  "un":  "2SG",  "ün":  "2SG",
-    "iz":  "1PL",  "ız":  "1PL",  "uz":  "1PL",  "üz":  "1PL",
-    "mı":  "Q",    "mi":  "Q",    "mu":  "Q",    "mü":  "Q",
-    "lı":  "WITH", "li":  "WITH", "lu":  "WITH", "lü":  "WITH",
-    "sız": "WITHOUT","siz": "WITHOUT","suz": "WITHOUT","süz": "WITHOUT",
-    "cı":  "AGT",  "ci":  "AGT",  "cu":  "AGT",  "cü":  "AGT",
-    "çı":  "AGT",  "çi":  "AGT",  "çu":  "AGT",  "çü":  "AGT",
-    "lık": "ABSTR","lik": "ABSTR","luk": "ABSTR","lük": "ABSTR",
-    "sa":  "COND", "se":  "COND",
-    "ıl":  "PASS", "il":  "PASS", "ul":  "PASS", "ül":  "PASS",
-}
-def add_canonical_labels(tokens: list[dict]) -> list[dict]:
-    """Add ``_canonical`` field to SUFFIX tokens (e.g. 'lar'/'ler' → 'PL')."""
-    result: list[dict] = []
-    for tok in tokens:
-        if tok["type"] != "SUFFIX":
-            result.append(tok)
-            continue
-        canonical = ALLOMORPH_MAP.get(tok["token"].strip().lower())
-        result.append({**tok, "_canonical": canonical} if canonical else tok)
-    return result

nedo_turkish_tokenizer/_compound.py DELETED Viewed

@@ -1,76 +0,0 @@
-"""Fix 10: Turkish compound word annotation."""
-from __future__ import annotations
-KNOWN_COMPOUNDS: dict[str, list[str]] = {
-    "başbakan":         ["baş", "bakan"],
-    "cumhurbaşkanı":    ["cumhur", "başkan"],
-    "dışişleri":        ["dış", "iş"],
-    "içişleri":         ["iç", "iş"],
-    "maliye":           ["mal", "iye"],
-    "belediye":         ["beled", "iye"],
-    "ayakkabı":         ["ayak", "kap"],
-    "yelkovan":         ["yel", "kovan"],
-    "saatlik":          ["saat", "lik"],
-    "günlük":           ["gün", "lük"],
-    "yıllık":           ["yıl", "lık"],
-    "aylık":            ["ay", "lık"],
-    "haftalık":         ["hafta", "lık"],
-    "gastrointestinal": ["gastro", "intestinal"],
-    "kardiyovasküler":  ["kardio", "vasküler"],
-    "nöropsikiyatri":   ["nöro", "psikiyatri"],
-    "biyokimya":        ["biyo", "kimya"],
-    "mikrobiyoloji":    ["mikro", "biyoloji"],
-    "farmakoloji":      ["farma", "koloji"],
-    "patoloji":         ["pato", "loji"],
-    "hematoloji":       ["hemato", "loji"],
-    "nefroloji":        ["nefro", "loji"],
-    "kardiyoloji":      ["kardio", "loji"],
-    "radyoloji":        ["radyo", "loji"],
-    "onkoloji":         ["onko", "loji"],
-    "elektromanyetik":  ["elektro", "manyetik"],
-    "termodinamik":     ["termo", "dinamik"],
-    "hidroelektrik":    ["hidro", "elektrik"],
-    "biyoinformatik":   ["biyo", "informatik"],
-    "nanoteknoloji":    ["nano", "teknoloji"],
-    "futbolcu":         ["futbol", "cu"],
-    "basketbolcu":      ["basketbol", "cu"],
-    "voleybolcu":       ["voleybol", "cu"],
-}
-def _decompose_zemberek(word: str, morphology) -> list[str] | None:
-    try:
-        wa = morphology.analyze(word)
-        for sa in wa:
-            morphemes = [str(m) for m in sa.get_morphemes()]
-            roots = [m for m in morphemes if "Noun" in m or "Verb" in m or "Adj" in m]
-            if len(roots) > 1:
-                return roots
-    except Exception:  # noqa: BLE001
-        pass
-    return None
-def add_compound_info(tokens: list[dict], morphology=None) -> list[dict]:
-    """Annotate ROOT tokens that are compound words with ``_compound`` and ``_parts``."""
-    result: list[dict] = []
-    for tok in tokens:
-        if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
-            result.append(tok)
-            continue
-        surface = tok["token"].strip().lower()
-        if morphology is not None:
-            parts = _decompose_zemberek(surface, morphology)
-            if parts and len(parts) > 1:
-                result.append({**tok, "_compound": True, "_parts": parts, "_source": "zemberek"})
-                continue
-        if surface in KNOWN_COMPOUNDS:
-            result.append({**tok, "_compound": True, "_parts": KNOWN_COMPOUNDS[surface], "_source": "manual"})
-        else:
-            result.append(tok)
-    return result

nedo_turkish_tokenizer/_context_aware.py DELETED Viewed

@@ -1,61 +0,0 @@
-"""Fix 12: Context-aware Zemberek disambiguation.
-Uses zemberek-python (pure Python) — no JVM required.
-"""
-from __future__ import annotations
-from ._root_validator import ZEMBEREK_AVAILABLE, _morphology
-AMBIGUOUS_WORDS = {
-    "yüz", "gelir", "yazar", "geçer", "çıkar", "gider",
-    "biter", "düşer", "tutar", "kalır", "gerekir", "uyar",
-    "uçar", "güzel", "büyük", "küçük", "yeni", "eski",
-}
-def annotate_with_context(tokens: list[dict], original_text: str) -> list[dict]:
-    """Enrich ROOT tokens with POS and lemma using Zemberek sentence-level disambiguation."""
-    if not ZEMBEREK_AVAILABLE:
-        return tokens
-    try:
-        sa_result = _morphology.analyze_and_disambiguate(original_text.strip())
-        best_list = sa_result.best_analysis()
-        analyses: dict[str, dict] = {}
-        for sa in best_list:
-            try:
-                sf = (str(sa.get_stem()) + str(sa.get_ending())).lower().strip()
-                if sf not in analyses:
-                    analyses[sf] = {
-                        "lemma":     str(sa.item.lemma),
-                        "pos":       str(sa.item.primary_pos.short_form),
-                        "morphemes": [str(m) for m in sa.get_morphemes()],
-                    }
-            except Exception:  # noqa: BLE001
-                continue
-        result: list[dict] = []
-        for tok in tokens:
-            if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
-                result.append(tok)
-                continue
-            surface = tok["token"].strip().lower()
-            z = analyses.get(surface)
-            if z:
-                result.append({
-                    **tok,
-                    "_pos":           z["pos"],
-                    "_lemma":         z["lemma"],
-                    "_morphemes":     z["morphemes"],
-                    "_disambiguated": surface in AMBIGUOUS_WORDS,
-                })
-            else:
-                result.append(tok)
-        return result
-    except Exception:  # noqa: BLE001
-        return tokens

nedo_turkish_tokenizer/{_medical_vocab.py → _domain_vocab.py} RENAMED Viewed

@@ -1,4 +1,14 @@
-"""Domain vocabulary: medical, sports, tourism roots (Fix 6)."""
 from __future__ import annotations
@@ -136,4 +146,9 @@ TOURISM_ROOTS: dict[str, str] = {
     "delüks": "delüks",
 }
-ALL_DOMAIN_ROOTS: dict[str, str] = {**MEDICAL_ROOTS, **SPORTS_ROOTS, **TOURISM_ROOTS}

+"""Domain vocabulary: medical, sports, tourism roots.
+These are domain-specific terms that a generic Turkish dictionary may not
+contain but that should be recognised as valid ROOT tokens rather than
+left as unknown fragments.
+The domain vocabulary is an **optional support layer** — the core
+segmentation engine works without it.  It is consulted during candidate
+scoring to boost the score of candidates whose root matches a known
+domain term.
+"""
 from __future__ import annotations
     "delüks": "delüks",
 }
+# Combined set of all domain roots (lowercase) for fast lookup
+ALL_DOMAIN_ROOTS: frozenset[str] = frozenset(
+    k.lower()
+    for d in (MEDICAL_ROOTS, SPORTS_ROOTS, TOURISM_ROOTS)
+    for k in d
+)

nedo_turkish_tokenizer/_preprocessor.py DELETED Viewed

@@ -1,246 +0,0 @@
-"""Fix 1: ALL CAPS inflation fix. Fix 2: Apostrophe / code-switching split."""
-from __future__ import annotations
-import re
-from pathlib import Path
-TR_CHARS = set("çğışöüÇĞİŞÖÜ")
-_PROPER_NOUNS: set[str] | None = None
-def _load_proper_nouns() -> set[str]:
-    global _PROPER_NOUNS
-    if _PROPER_NOUNS is not None:
-        return _PROPER_NOUNS
-    path = Path(__file__).parent / "data" / "turkish_proper_nouns.txt"
-    if path.exists():
-        _PROPER_NOUNS = {
-            line.strip().lower()
-            for line in path.read_text(encoding="utf-8").splitlines()
-            if line.strip() and not line.startswith("#")
-        }
-    else:
-        _PROPER_NOUNS = set()
-    return _PROPER_NOUNS
-def _turkish_lower(s: str) -> str:
-    """Turkish-aware lowercase: İ→i, I→ı (not i), then standard lower."""
-    return s.replace("İ", "i").replace("I", "ı").lower()
-TURKISH_SUFFIXES_AFTER_APOSTROPHE = sorted(
-    [
-        "nın","nin","nun","nün","dan","den","tan","ten",
-        "da","de","ta","te","ya","ye","nda","nde",
-        "yı","yi","yu","yü","nı","ni","nu","nü",
-        "lar","ler","lara","lere","ları","leri",
-        "ım","im","um","üm","ın","in","un","ün",
-        "mız","miz","muz","müz","nız","niz","nuz","nüz",
-        "dır","dir","dur","dür","tır","tir","tur","tür",
-        "ki","li","lı","lu","lü","sız","siz","suz","süz",
-        "a","e","ı","i","u","ü",
-    ],
-    key=len,
-    reverse=True,
-)
-_APO_RE  = re.compile(
-    r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
-)
-_CAPS_RE = re.compile(r'\b([A-ZÇĞİÖŞÜ]{2,})\b')
-def _is_turkish_base(word: str) -> bool:
-    """Return True if the word should be treated as Turkish (don't split apostrophe)."""
-    wl = _turkish_lower(word)
-    # Fast path: Turkish-specific characters → definitely Turkish
-    if any(c in TR_CHARS for c in wl):
-        return True
-    # Turkish proper nouns (cities, regions) — not in TDK common-word list
-    if wl in _load_proper_nouns():
-        return True
-    # TDK lookup: if it's in the dictionary it's Turkish (or an accepted loanword)
-    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
-    tdk = load_tdk_words()
-    if tdk and wl in tdk:
-        return True
-    # Zemberek: proper nouns whose lemma contains Turkish chars (İstanbul, İzmir…)
-    try:
-        from ._root_validator import _morphology, ZEMBEREK_AVAILABLE  # noqa: PLC0415
-        if ZEMBEREK_AVAILABLE and _morphology:
-            wa = _morphology.analyze(wl)
-            for sa in wa:
-                lemma = str(sa.item.lemma)
-                if any(c in TR_CHARS for c in lemma):
-                    return True
-    except Exception:  # noqa: BLE001
-        pass
-    # TDK unavailable + Zemberek unavailable: very short words are ambiguous
-    return len(wl) < 4
-# ── Fix 1: ALL CAPS ───────────────────────────────────────────────────────────
-def _fix_all_caps(text: str) -> tuple[str, set]:
-    caps: set[str] = set()
-    def _replace(m: re.Match) -> str:
-        w = m.group(1)
-        caps.add(_turkish_lower(w))
-        return _turkish_lower(w)
-    return _CAPS_RE.sub(_replace, text), caps
-def _restore_caps_tokens(tokens: list[dict], caps: set) -> list[dict]:
-    result: list[dict] = []
-    i = 0
-    while i < len(tokens):
-        tok = tokens[i]
-        raw_low = _turkish_lower(tok["token"].strip())
-        if tok["type"] == "ROOT" and raw_low in caps:
-            result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
-            result.append(tok)
-            i += 1
-            continue
-        if tok["type"] == "BPE" and tok["token"].startswith(" "):
-            combined  = raw_low
-            lookahead = [tok]
-            j = i + 1
-            while j < len(tokens):
-                nt = tokens[j]
-                if not nt["token"].startswith(" "):
-                    combined += _turkish_lower(nt["token"].strip())
-                    lookahead.append(nt)
-                    j += 1
-                    if combined in caps:
-                        break
-                    if len(combined) > 8:
-                        break
-                else:
-                    break
-            if combined in caps:
-                result.append({"token": "<uppercase_word>", "type": "ROOT", "_caps": True})
-                result.append({"token": f" {combined}", "type": "ROOT",
-                                "_acronym": True, "_caps": True})
-                i = j
-                continue
-        result.append(tok)
-        i += 1
-    return result
-# ── Fix 2: Apostrophe split ───────────────────────────────────────────────────
-#
-# Strategy: record (foreign_base, suffix) pairs, replace apostrophe with space.
-# After tokenization, _merge_apostrophe_tokens uses these pairs to find the
-# BPE pieces that form the foreign word and merge them into one FOREIGN ROOT,
-# then marks the following word-initial suffix token as SUFFIX.
-#
-# Old approach used a \ue001 separator — the base tokenizer converts that to
-# '<unknown>' so the separator was never found. Simple-space + pair-list is
-# robust regardless of how the tokenizer handles the input.
-def _split_apostrophe(text: str) -> tuple[str, list[tuple[str, str]]]:
-    """
-    Replace FOREIGN'SUFFIX with 'FOREIGN SUFFIX' (apostrophe → space).
-    Returns (modified_text, [(foreign_base_lower, suffix_lower), ...]).
-    Turkish proper names (İstanbul'da) are left unchanged.
-    """
-    splits: list[tuple[str, str]] = []
-    def _repl(m: re.Match) -> str:
-        base, suffix = m.group(1), m.group(2)
-        if _is_turkish_base(base):
-            return m.group(0)          # leave Turkish names alone
-        sl = suffix.lower()
-        if any(sl == s for s in TURKISH_SUFFIXES_AFTER_APOSTROPHE):
-            splits.append((_turkish_lower(base), sl))
-            return f"{base} {suffix}"  # just drop the apostrophe
-        return m.group(0)
-    return _APO_RE.sub(_repl, text), splits
-def _merge_apostrophe_tokens(
-    tokens: list[dict], apo_splits: list[tuple[str, str]]
-) -> list[dict]:
-    """
-    For each (foreign_base, suffix) pair recorded during _split_apostrophe,
-    find the consecutive BPE/ROOT pieces that together spell foreign_base,
-    merge them into one FOREIGN ROOT token, and mark the next word-initial
-    token whose stripped form == suffix as SUFFIX.
-    """
-    if not apo_splits:
-        return tokens
-    result = list(tokens)
-    for foreign_base, suffix in apo_splits:
-        n = len(result)
-        for j in range(1, n):
-            tok_j = result[j]
-            # Candidate suffix token: word-initial, stripped == suffix
-            if not tok_j["token"].startswith(" "):
-                continue
-            if _turkish_lower(tok_j["token"].strip()) != suffix:
-                continue
-            # Walk back to find pieces of the word before j (no leading space)
-            word_start = j - 1
-            while word_start > 0 and not result[word_start]["token"].startswith(" "):
-                word_start -= 1
-            pieces = result[word_start:j]
-            if not pieces:
-                continue
-            combined = "".join(_turkish_lower(p["token"].strip()) for p in pieces)
-            if combined != foreign_base:
-                continue
-            # Merge pieces into one FOREIGN ROOT
-            merged = pieces[0]["token"]        # keeps leading space
-            for p in pieces[1:]:
-                merged += p["token"].strip()
-            new_root = {"token": merged, "type": "ROOT", "_foreign": True}
-            new_suf  = {**tok_j, "type": "SUFFIX", "_apo_suffix": True}
-            result = (
-                result[:word_start]
-                + [new_root, new_suf]
-                + result[j + 1:]
-            )
-            break   # this pair is handled
-    return result
-# ── Combined pre / post ───────────────────────────────────────────────────────
-def preprocess(text: str) -> tuple[str, set, list]:
-    """Prepare text before base tokenization.
-    Returns:
-        (modified_text, caps_set, apo_splits)
-    """
-    text, caps = _fix_all_caps(text)
-    text, apo_splits = _split_apostrophe(text)
-    return text, caps, apo_splits
-def postprocess(
-    tokens: list[dict], caps: set, apo_splits: list | None = None
-) -> list[dict]:
-    """Fix tokens after base tokenization."""
-    tokens = _restore_caps_tokens(tokens, caps)
-    tokens = _merge_apostrophe_tokens(tokens, apo_splits or [])
-    return tokens

nedo_turkish_tokenizer/_root_validator.py DELETED Viewed

@@ -1,205 +0,0 @@
-"""Zemberek-based root validation and correction (Fix 4).
-Uses zemberek-python (pure Python) — no JVM or JPype required.
-"""
-from __future__ import annotations
-ZEMBEREK_AVAILABLE = False
-_morphology = None
-def _apply_zemberek_patch() -> None:
-    """Fix O(N^2) loading time bug in zemberek-python 0.2.3."""
-    import csv
-    import zemberek.morphology.lexicon.root_lexicon as rl
-    def fast_load_from_resources(resource_path: str):
-        items = list()
-        csv.field_size_limit(100000000)
-        with open(resource_path, 'r', encoding='utf-8') as f:
-            lex = list(csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE))
-            # O(1) dictionary lookup instead of O(N) iteration per reference
-            lex_dict = {line[0]: line for line in lex}
-            for i, line in enumerate(lex):
-                item = rl.DictionaryReader.make_dict_item_from_line(line)
-                if line[7] != 'null':
-                    reference_item_line = lex_dict.get(line[7])
-                    if reference_item_line is not None:
-                        item.set_reference_item(rl.DictionaryReader.make_dict_item_from_line(reference_item_line))
-                items.append(item)
-        return rl.RootLexicon(items)
-    rl.DictionaryReader.load_from_resources = fast_load_from_resources
-def _init_zemberek() -> None:
-    global ZEMBEREK_AVAILABLE, _morphology
-    try:
-        from zemberek import TurkishMorphology  # noqa: PLC0415
-        _apply_zemberek_patch()
-        _morphology = TurkishMorphology.create_with_defaults()
-        ZEMBEREK_AVAILABLE = True
-    except ImportError:
-        print("[NedoTurkishTokenizer] zemberek-python not installed → pip install zemberek-python")
-    except Exception as exc:  # noqa: BLE001
-        print(f"[NedoTurkishTokenizer] Zemberek init failed: {exc}")
-_init_zemberek()
-# ── Zemberek API helpers ──────────────────────────────────────────────────────
-def analyze_word(word: str) -> list[dict]:
-    """Return all Zemberek analyses for a single word."""
-    if not ZEMBEREK_AVAILABLE:
-        return []
-    try:
-        wa = _morphology.analyze(word)
-        return [
-            {
-                "lemma":    str(sa.item.lemma),
-                "pos":      str(sa.item.primary_pos.short_form),
-                "morphemes": [str(m) for m in sa.get_morphemes()],
-                "surface":  str(sa.get_stem()) + str(sa.get_ending()),
-            }
-            for sa in wa
-        ]
-    except Exception:  # noqa: BLE001
-        return []
-def get_root_and_suffixes(word: str) -> dict | None:
-    """Return root + suffix list for a word, or None if unknown."""
-    analyses = analyze_word(word)
-    if not analyses:
-        return None
-    a = analyses[0]
-    return {"root": a["lemma"], "suffixes": a["morphemes"][1:], "pos": a["pos"]}
-# ── Heuristic fallback (no Zemberek) ─────────────────────────────────────────
-_SPURIOUS_SHORT_ROOTS = {"oğ", "gök", "zo", "me", "im", "pro", "go", "da", "al"}
-def _is_spurious_root(root: str, next_tokens: list[dict]) -> bool:
-    if root.strip().lower() not in _SPURIOUS_SHORT_ROOTS:
-        return False
-    return sum(1 for t in next_tokens[:3] if t["type"] == "BPE") >= 2
-# ── Main validation ───────────────────────────────────────────────────────────
-def build_correction_map(
-    original_words: list[str], base_tokenizer
-) -> dict[str, str]:
-    """Build a {tokenizer_root → zemberek_root} correction map."""
-    correction_map: dict[str, str] = {}
-    for word in original_words:
-        w = word.lower().strip("'\".,!?;:()")
-        if not w or len(w) < 3:
-            continue
-        z = get_root_and_suffixes(w)
-        if not z or z["root"] == "UNK":
-            continue
-        z_root = z["root"].lower()
-        try:
-            toks = base_tokenizer.tokenize_text(w)
-            t_root = next(
-                (t["token"].strip().lower() for t in toks if t["type"] == "ROOT"),
-                None,
-            )
-        except Exception:  # noqa: BLE001
-            continue
-        if not t_root or t_root == z_root:
-            continue
-        diff = len(z_root) - len(t_root)
-        if diff < 0 or diff > 4:
-            continue
-        if not z_root.startswith(t_root):
-            continue
-        correction_map[t_root] = z_root
-    return correction_map
-def validate_roots(
-    tokens: list[dict],
-    original_words: list[str],
-    base_tokenizer=None,
-) -> list[dict]:
-    """Apply Zemberek root corrections to the token stream."""
-    if not ZEMBEREK_AVAILABLE:
-        result = []
-        for i, tok in enumerate(tokens):
-            if tok["type"] == "ROOT" and not tok["token"].strip().startswith("<"):
-                if _is_spurious_root(tok["token"], tokens[i + 1 : i + 5]):
-                    tok = {**tok, "_suspicious": True}
-            result.append(tok)
-        return result
-    corr = (
-        build_correction_map(original_words, base_tokenizer)
-        if base_tokenizer is not None
-        else {}
-    )
-    result = []
-    for tok in tokens:
-        if tok["type"] != "ROOT" or tok["token"].strip().startswith("<"):
-            result.append(tok)
-            continue
-        surface = tok["token"].strip().lower()
-        correct = corr.get(surface)
-        if correct and correct != surface:
-            leading = " " if tok["token"].startswith(" ") else ""
-            tok = {
-                **tok,
-                "token":           leading + correct,
-                "_original_token": tok["token"],
-                "_root_corrected": True,
-                "_note":           f"root corrected: '{surface}' → '{correct}'",
-            }
-        result.append(tok)
-    return result
-def disambiguate_sentence(words: list[str]) -> list[dict | None]:
-    """Sentence-level Zemberek disambiguation."""
-    if not ZEMBEREK_AVAILABLE:
-        return [None] * len(words)
-    try:
-        sentence = " ".join(words)
-        sa_result = _morphology.analyze_and_disambiguate(sentence)
-        best = sa_result.best_analysis()
-        out = []
-        for sa in best:
-            try:
-                out.append({
-                    "lemma":     str(sa.item.lemma),
-                    "pos":       str(sa.item.primary_pos.short_form),
-                    "morphemes": [str(m) for m in sa.get_morphemes()],
-                })
-            except Exception:  # noqa: BLE001
-                out.append(None)
-        while len(out) < len(words):
-            out.append(None)
-        return out[: len(words)]
-    except Exception:  # noqa: BLE001
-        return [analyze_word(w)[0] if analyze_word(w) else None for w in words]

nedo_turkish_tokenizer/_suffix_expander.py DELETED Viewed

@@ -1,212 +0,0 @@
-"""Fix 3: BPE → SUFFIX reclassification. Fix 5: Punctuation → PUNCT."""
-from __future__ import annotations
-PUNCT_CHARS = set(
-    "'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~"
-    "\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
-    "\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
-)
-_PUNCT_DIGITS = set("0123456789")
-def _is_punct(token: str) -> bool:
-    s = token.strip()
-    if not s:
-        return False
-    return all(
-        c in PUNCT_CHARS or c in _PUNCT_DIGITS or (ord(c) > 0x02FF and not c.isalpha())
-        for c in s
-    )
-# ── Suffix dictionary (260+ entries) ─────────────────────────────────────────
-EXTENDED_SUFFIX_MAP: dict[str, str] = {
-    # Plural + case
-    "leri": "-PL+ACC",  "ları": "-PL+ACC",
-    "lere": "-PL+DAT",  "lara": "-PL+DAT",
-    "lerin": "-PL+GEN", "ların": "-PL+GEN",
-    "lerde": "-PL+LOC", "larda": "-PL+LOC",
-    "lerden": "-PL+ABL","lardan": "-PL+ABL",
-    "lerle": "-PL+INS", "larla": "-PL+INS",
-    "lerce": "-PL+EQU", "larca": "-PL+EQU",
-    # -yon / loanword suffixes
-    "yon": "-YON",  "iyon": "-YON",  "asyon": "-YON",  "izasyon": "-YON",
-    # Adjective derivation
-    "al": "-ADJ",   "el": "-ADJ",   "ik": "-ADJ",
-    "sal": "-ADJ.TR", "sel": "-ADJ.TR",
-    # 1st/2nd plural possessive
-    "imiz": "-P1PL", "ımız": "-P1PL", "umuz": "-P1PL", "ümüz": "-P1PL",
-    "iniz": "-P2PL", "ınız": "-P2PL", "unuz": "-P2PL", "ünüz": "-P2PL",
-    # Arabic long vowels
-    "\u00e2": "-LONG_A", "\u00ee": "-LONG_I", "\u00fb": "-LONG_U",
-    # Roman numerals
-    "ii": "-ROM",  "iii": "-ROM",  "iv": "-ROM",  "vi": "-ROM",
-    "vii": "-ROM", "viii": "-ROM", "ix": "-ROM",  "xi": "-ROM",
-    "xii": "-ROM", "xiii": "-ROM", "xiv": "-ROM", "xv": "-ROM",
-    # Frequent BPE pieces
-    "eri": "-PL.SFX",  "una": "-P3+DAT",  "iril": "-PASS.SFX",
-    "yan": "-PART.ACT","ren": "-PART.ACT", "ıda": "-LOC.SFX",
-    "maya": "-NEG.INF", "üler": "-PL.SFX", "ıler": "-PL.SFX",
-    "ni": "-ACC.SFX",  "ri": "-PL.SFX",   "lan": "-PASS+NZ",
-    "on": "-YON.SFX",
-    # Possessive + case compounds
-    "ımı": "-P1+ACC",  "imi": "-P1+ACC",  "umu": "-P1+ACC",  "ümü": "-P1+ACC",
-    "ıyla": "-INS.COMP","iyle": "-INS.COMP","uyla": "-INS.COMP","üyle": "-INS.COMP",
-    "kten": "-ABL.COMP","ğından": "-ABL.COMP","ğinden": "-ABL.COMP",
-    "yla": "-COM", "yle": "-COM",
-    # Abstract noun + possessive
-    "liği": "-ABSTR+P3",  "lığı": "-ABSTR+P3",
-    "luğu": "-ABSTR+P3",  "lüğü": "-ABSTR+P3",
-    "liğini": "-ABSTR+P3+ACC", "lığını": "-ABSTR+P3+ACC",
-    # -izm (ideology)
-    "izm": "-ISM",    "izmi": "-ISM+P3",  "izmde": "-ISM+LOC",
-    "izmden": "-ISM+ABL", "izmin": "-ISM+GEN",
-    # Aorist
-    "lir": "-AOR3SG", "lır": "-AOR3SG", "lur": "-AOR3SG", "lür": "-AOR3SG",
-    # 3sg possessive + case
-    "ine": "-P3+DAT",  "ına": "-P3+DAT",  "une": "-P3+DAT",  "üne": "-P3+DAT",
-    "inde": "-P3+LOC", "ında": "-P3+LOC", "unda": "-P3+LOC", "ünde": "-P3+LOC",
-    "ini": "-P3+ACC",  "ını": "-P3+ACC",  "unu": "-P3+ACC",  "ünü": "-P3+ACC",
-    "inden": "-P3+ABL","ından": "-P3+ABL","undan": "-P3+ABL","ünden": "-P3+ABL",
-    # -daki
-    "daki": "-LOC+REL","deki": "-LOC+REL","taki": "-LOC+REL","teki": "-LOC+REL",
-    # Passive + nominalization
-    "lan": "-PASS+NZ", "len": "-PASS+NZ",
-    # Verbal noun
-    "mesi": "-VN3",  "ması": "-VN3",
-    "mesini": "-VN3+ACC",  "masını": "-VN3+ACC",
-    "mesine": "-VN3+DAT",  "masına": "-VN3+DAT",
-    "mesinde": "-VN3+LOC", "masında": "-VN3+LOC",
-    # Genitive + possessive
-    "ının": "-GEN+P", "inin": "-GEN+P", "unun": "-GEN+P", "ünün": "-GEN+P",
-    # Participle
-    "diği": "-PART",  "dığı": "-PART",  "tiği": "-PART",  "tığı": "-PART",
-    "duğu": "-PART",  "düğü": "-PART",  "tuğu": "-PART",  "tüğü": "-PART",
-    "ği": "-PART.SFX","ğı": "-PART.SFX","gu": "-PART.SFX","gü": "-PART.SFX",
-    # Negative verbal noun
-    "mas": "-NEG.VN", "mes": "-NEG.VN",
-    # 2sg imperative
-    "sin": "-IMP2",  "sın": "-IMP2",  "sun": "-IMP2",  "sün": "-IMP2",
-    # Passive short
-    "ıl": "-PASS",  "il": "-PASS",  "ul": "-PASS",  "ül": "-PASS",
-    # Causative + VN
-    "irme": "-CAUS+VN","ırma": "-CAUS+VN","urma": "-CAUS+VN",
-    "ürme": "-CAUS+VN","erme": "-CAUS+VN","arma": "-CAUS+VN",
-    # Accusative
-    "ı": "-ACC", "i": "-ACC", "u": "-ACC", "ü": "-ACC",
-    # Past tense
-    "dım": "-DI1SG","dim": "-DI1SG","dum": "-DI1SG","düm": "-DI1SG",
-    "tım": "-DI1SG","tim": "-DI1SG","tum": "-DI1SG","tüm": "-DI1SG",
-    "dık": "-DI1PL","dik": "-DI1PL","duk": "-DI1PL","dük": "-DI1PL",
-    "tık": "-DI1PL","tik": "-DI1PL","tuk": "-DI1PL","tük": "-DI1PL",
-    "dın": "-DI2SG","din": "-DI2SG","dun": "-DI2SG","dün": "-DI2SG",
-    "tın": "-DI2SG","tin": "-DI2SG","tun": "-DI2SG","tün": "-DI2SG",
-    "d": "-PAST",  "t": "-PAST",
-    # Conditional
-    "sa": "-COND",  "se": "-COND",
-    # Progressive
-    "yor": "-PROG",
-    # Simple past
-    "dı": "-PST", "di": "-PST", "du": "-PST", "dü": "-PST",
-    "tı": "-PST", "ti": "-PST", "tu": "-PST", "tü": "-PST",
-    # Aorist short
-    "ir": "-AOR", "ır": "-AOR", "ur": "-AOR", "ür": "-AOR",
-    "er": "-AOR", "ar": "-AOR",
-    # Evidential past
-    "mış": "-EVID","miş": "-EVID","muş": "-EVID","müş": "-EVID",
-    # Negation
-    "ma": "-NEG",  "me": "-NEG",
-    "lama": "-VN+NEG","leme": "-VN+NEG",
-    # Abilitative
-    "bil": "-ABIL",
-    # Necessitative
-    "malı": "-NECES","meli": "-NECES",
-    # Infinitive
-    "mak": "-INF",  "mek": "-INF",
-    # -ken (while/when)
-    "ken": "-WHEN",
-    # Converb
-    "arak": "-CONV","erek": "-CONV",
-    # With / without
-    "lı": "-WITH",  "li": "-WITH",  "lu": "-WITH",  "lü": "-WITH",
-    # Agentive
-    "cı": "-AGT",  "ci": "-AGT",  "cu": "-AGT",  "cü": "-AGT",
-    "çı": "-AGT",  "çi": "-AGT",  "çu": "-AGT",  "çü": "-AGT",
-    # Abstract noun
-    "lık": "-ABSTR","lik": "-ABSTR","luk": "-ABSTR","lük": "-ABSTR",
-    "lığ": "-ABSTR","liğ": "-ABSTR",
-    # Optative 1pl
-    "elim": "-OPT1PL","alım": "-OPT1PL",
-    # Person suffixes
-    "ım": "-1SG", "im": "-1SG", "um": "-1SG", "üm": "-1SG",
-    "ın": "-2SG", "in": "-2SG", "un": "-2SG", "ün": "-2SG",
-    "iz": "-1PL", "ız": "-1PL", "uz": "-1PL", "üz": "-1PL",
-    "nız": "-2PL","niz": "-2PL","nuz": "-2PL","nüz": "-2PL",
-    # Question
-    "mı": "-Q", "mi": "-Q", "mu": "-Q", "mü": "-Q",
-    # Dative
-    "a": "-DAT",  "e": "-DAT",  "ya": "-DAT",  "ye": "-DAT",
-    # Ablative
-    "dan": "-ABL","den": "-ABL","tan": "-ABL","ten": "-ABL",
-    # Locative
-    "da": "-LOC", "de": "-LOC", "ta": "-LOC", "te": "-LOC",
-    # Plural
-    "lar": "-PL",  "ler": "-PL",
-    # 3sg possessive short
-    "sı": "-P3",  "si": "-P3",  "su": "-P3",  "sü": "-P3",
-    # Genitive
-    "nin": "-GEN","nın": "-GEN","nun": "-GEN","nün": "-GEN",
-    # Instrumental
-    "le": "-INS", "la": "-INS",
-    # Equative
-    "ce": "-EQU","ca": "-EQU","çe": "-EQU","ça": "-EQU",
-    # Glide
-    "y": "-GLIDE",
-}
-_SUFFIX_MAP_SORTED = sorted(
-    EXTENDED_SUFFIX_MAP.items(), key=lambda x: len(x[0]), reverse=True
-)
-def reclassify_bpe_suffixes(tokens: list[dict]) -> list[dict]:
-    """Reclassify BPE tokens: punctuation → PUNCT, word-internal suffixes → SUFFIX."""
-    result: list[dict] = []
-    for tok in tokens:
-        if tok["type"] != "BPE":
-            result.append(tok)
-            continue
-        raw = tok["token"]
-        stripped = raw.strip()
-        if _is_punct(raw):
-            result.append({**tok, "type": "PUNCT", "_punct": True})
-            continue
-        # Only reclassify tokens without a leading space (word-internal)
-        if raw != stripped:
-            result.append(tok)
-            continue
-        prev_ok = bool(result) and result[-1]["type"] in ("ROOT", "SUFFIX", "BPE")
-        if not prev_ok:
-            result.append(tok)
-            continue
-        sl = stripped.lower()
-        label = next((lbl for surf, lbl in _SUFFIX_MAP_SORTED if sl == surf), None)
-        if label:
-            result.append({
-                "token":          raw,
-                "type":           "SUFFIX",
-                "_reclassified":  True,
-                "_suffix_label":  label,
-                **{k: v for k, v in tok.items() if k not in ("token", "type")},
-            })
-        else:
-            result.append(tok)
-    return result

nedo_turkish_tokenizer/_suffix_table.py ADDED Viewed

	@@ -0,0 +1,197 @@

+"""Turkish suffix pattern table (260+ entries).
+Maps surface-form suffixes to morphological labels.  Used by the
+segmentation engine for candidate generation (suffix stripping) and by
+the post-annotation layer for ``_suffix_label`` metadata.
+Suffixes are sorted longest-first at module load time so that the
+candidate generator always tries the most specific match first.
+Design note: some surface forms are ambiguous (e.g. "in" can be GEN or
+2SG).  This table assigns a single canonical label per surface form —
+the most common interpretation in written Turkish.  The candidate scoring
+system resolves segmentation ambiguity via root validation, not via
+suffix-label disambiguation.
+"""
+from __future__ import annotations
+# ── Raw suffix → label mapping ───────────────────────────────────────────────
+# Organised by morphological category for readability.
+# Each surface form must be listed exactly once: a key repeated inside a
+# dict literal is silently overwritten by its last occurrence.
+SUFFIX_MAP: dict[str, str] = {
+    # ── Plural + case ────────────────────────────────────────────────────
+    "leri": "-PL+ACC",   "ları": "-PL+ACC",
+    "lere": "-PL+DAT",   "lara": "-PL+DAT",
+    "lerin": "-PL+GEN",  "ların": "-PL+GEN",
+    "lerde": "-PL+LOC",  "larda": "-PL+LOC",
+    "lerden": "-PL+ABL",  "lardan": "-PL+ABL",
+    "lerle": "-PL+INS",  "larla": "-PL+INS",
+    "lerce": "-PL+EQU",  "larca": "-PL+EQU",
+    # ── Loanword / derivational ──────────────────────────────────────────
+    "yon": "-YON",   "iyon": "-YON",   "asyon": "-YON",   "izasyon": "-YON",
+    # ── Adjective derivation ─────────────────────────────────────────────
+    "sal": "-ADJ.TR",  "sel": "-ADJ.TR",
+    # ── 1st/2nd plural possessive ────────────────────────────────────────
+    "imiz": "-P1PL",  "ımız": "-P1PL",  "umuz": "-P1PL",  "ümüz": "-P1PL",
+    "iniz": "-P2PL",  "ınız": "-P2PL",  "unuz": "-P2PL",  "ünüz": "-P2PL",
+    # ── Possessive + case compounds ──────────────────────────────────────
+    "ımı": "-P1+ACC",   "imi": "-P1+ACC",   "umu": "-P1+ACC",   "ümü": "-P1+ACC",
+    "ıyla": "-INS.COMP", "iyle": "-INS.COMP", "uyla": "-INS.COMP", "üyle": "-INS.COMP",
+    "kten": "-ABL.COMP", "ğından": "-ABL.COMP", "ğinden": "-ABL.COMP",
+    "yla": "-COM",  "yle": "-COM",
+    # ── Abstract noun + possessive ───────────────────────────────────────
+    "liği": "-ABSTR+P3",   "lığı": "-ABSTR+P3",
+    "luğu": "-ABSTR+P3",   "lüğü": "-ABSTR+P3",
+    "liğini": "-ABSTR+P3+ACC",  "lığını": "-ABSTR+P3+ACC",
+    # ── -izm (ideology) ─────────────────────────────────────────────────
+    "izm": "-ISM",     "izmi": "-ISM+P3",   "izmde": "-ISM+LOC",
+    "izmden": "-ISM+ABL",  "izmin": "-ISM+GEN",
+    # ── Aorist ───────────────────────────────────────────────────────────
+    "lir": "-AOR3SG",  "lır": "-AOR3SG",  "lur": "-AOR3SG",  "lür": "-AOR3SG",
+    # ── 3sg possessive + case ────────────────────────────────────────────
+    "ine": "-P3+DAT",   "ına": "-P3+DAT",   "une": "-P3+DAT",   "üne": "-P3+DAT",
+    "inde": "-P3+LOC",  "ında": "-P3+LOC",  "unda": "-P3+LOC",  "ünde": "-P3+LOC",
+    "ini": "-P3+ACC",   "ını": "-P3+ACC",   "unu": "-P3+ACC",   "ünü": "-P3+ACC",
+    "inden": "-P3+ABL", "ından": "-P3+ABL", "undan": "-P3+ABL", "ünden": "-P3+ABL",
+    # ── Locative-relative ────────────────────────────────────────────────
+    "daki": "-LOC+REL",  "deki": "-LOC+REL",  "taki": "-LOC+REL",  "teki": "-LOC+REL",
+    # ── Passive + nominalization ─────────────────────────────────────────
+    "lan": "-PASS+NZ",  "len": "-PASS+NZ",
+    # ── Verbal noun ──────────────────────────────────────────────────────
+    "mesi": "-VN3",      "ması": "-VN3",
+    "mesini": "-VN3+ACC", "masını": "-VN3+ACC",
+    "mesine": "-VN3+DAT", "masına": "-VN3+DAT",
+    "mesinde": "-VN3+LOC", "masında": "-VN3+LOC",
+    # ── Genitive + possessive ────────────────────────────────────────────
+    "ının": "-GEN+P",  "inin": "-GEN+P",  "unun": "-GEN+P",  "ünün": "-GEN+P",
+    # ── Participle ───────────────────────────────────────────────────────
+    "diği": "-PART",   "dığı": "-PART",   "tiği": "-PART",   "tığı": "-PART",
+    "duğu": "-PART",   "düğü": "-PART",   "tuğu": "-PART",   "tüğü": "-PART",
+    "ği": "-PART.SFX",  "ğı": "-PART.SFX",  "gu": "-PART.SFX",  "gü": "-PART.SFX",
+    # ── Negative verbal noun ─────────────────────────────────────────────
+    "mas": "-NEG.VN",  "mes": "-NEG.VN",
+    # ── -sIn: 2sg person marker ──────────────────────────────────────────
+    # These forms were previously listed twice ("-IMP2" here and "-2SG"
+    # under the person suffixes); the later "-2SG" value always won, so
+    # only that label is kept.  The entry stays at this position so the
+    # insertion order of the table (and therefore the relative order of
+    # equal-length entries after sorting) is unchanged.
+    "sin": "-2SG",  "sın": "-2SG",  "sun": "-2SG",  "sün": "-2SG",
+    # ── Passive short ────────────────────────────────────────────────────
+    "ıl": "-PASS",  "il": "-PASS",  "ul": "-PASS",  "ül": "-PASS",
+    # ── Causative + VN ───────────────────────────────────────────────────
+    "irme": "-CAUS+VN", "ırma": "-CAUS+VN", "urma": "-CAUS+VN",
+    "ürme": "-CAUS+VN", "erme": "-CAUS+VN", "arma": "-CAUS+VN",
+    # ── Past tense ───────────────────────────────────────────────────────
+    "dım": "-DI1SG", "dim": "-DI1SG", "dum": "-DI1SG", "düm": "-DI1SG",
+    "tım": "-DI1SG", "tim": "-DI1SG", "tum": "-DI1SG", "tüm": "-DI1SG",
+    "dık": "-DI1PL", "dik": "-DI1PL", "duk": "-DI1PL", "dük": "-DI1PL",
+    "tık": "-DI1PL", "tik": "-DI1PL", "tuk": "-DI1PL", "tük": "-DI1PL",
+    "dın": "-DI2SG", "din": "-DI2SG", "dun": "-DI2SG", "dün": "-DI2SG",
+    "tın": "-DI2SG", "tin": "-DI2SG", "tun": "-DI2SG", "tün": "-DI2SG",
+    # ── Conditional ──────────────────────────────────────────────────────
+    "sa": "-COND",  "se": "-COND",
+    # ── Progressive ──────────────────────────────────────────────────────
+    "iyor": "-PROG",  "ıyor": "-PROG",  "uyor": "-PROG",  "üyor": "-PROG",
+    "yor": "-PROG",
+    # ── Simple past ──────────────────────────────────────────────────────
+    "dı": "-PST",  "di": "-PST",  "du": "-PST",  "dü": "-PST",
+    "tı": "-PST",  "ti": "-PST",  "tu": "-PST",  "tü": "-PST",
+    # ── Aorist short ─────────────────────────────────────────────────────
+    "ir": "-AOR",  "ır": "-AOR",  "ur": "-AOR",  "ür": "-AOR",
+    "er": "-AOR",  "ar": "-AOR",
+    # ── Evidential past ──────────────────────────────────────────────────
+    "mış": "-EVID",  "miş": "-EVID",  "muş": "-EVID",  "müş": "-EVID",
+    # ── Negation ─────────────────────────────────────────────────────────
+    "ma": "-NEG",  "me": "-NEG",
+    "lama": "-VN+NEG",  "leme": "-VN+NEG",
+    "maya": "-NEG.INF",
+    # ── Abilitative ──────────────────────────────────────────────────────
+    "bil": "-ABIL",
+    # ── Necessitative ────────────────────────────────────────────────────
+    "malı": "-NECES",  "meli": "-NECES",
+    # ── Infinitive ───────────────────────────────────────────────────────
+    "mak": "-INF",  "mek": "-INF",
+    # ── -ken (while/when) ────────────────────────────────────────────────
+    "ken": "-WHEN",
+    # ── Converb ──────────────────────────────────────────────────────────
+    "arak": "-CONV",  "erek": "-CONV",
+    # ── With / without ───────────────────────────────────────────────────
+    "lı": "-WITH",   "li": "-WITH",   "lu": "-WITH",   "lü": "-WITH",
+    "sız": "-WITHOUT", "siz": "-WITHOUT", "suz": "-WITHOUT", "süz": "-WITHOUT",
+    # ── Agentive ─────────────────────────────────────────────────────────
+    "cı": "-AGT",  "ci": "-AGT",  "cu": "-AGT",  "cü": "-AGT",
+    "çı": "-AGT",  "çi": "-AGT",  "çu": "-AGT",  "çü": "-AGT",
+    # ── Abstract noun ────────────────────────────────────────────────────
+    "lık": "-ABSTR",  "lik": "-ABSTR",  "luk": "-ABSTR",  "lük": "-ABSTR",
+    "lığ": "-ABSTR",  "liğ": "-ABSTR",
+    # ── Optative 1pl ─────────────────────────────────────────────────────
+    "elim": "-OPT1PL",  "alım": "-OPT1PL",
+    # ── Person suffixes ──────────────────────────────────────────────────
+    # (The 2sg -sIn forms are listed once, earlier in this table.)
+    "ım": "-1SG",  "im": "-1SG",  "um": "-1SG",  "üm": "-1SG",
+    "iz": "-1PL",  "ız": "-1PL",  "uz": "-1PL",  "üz": "-1PL",
+    "nız": "-2PL",  "niz": "-2PL",  "nuz": "-2PL",  "nüz": "-2PL",
+    # ── Question ─────────────────────────────────────────────────────────
+    "mı": "-Q",  "mi": "-Q",  "mu": "-Q",  "mü": "-Q",
+    # ── Accusative ───────────────────────────────────────────────────────
+    "yı": "-ACC",  "yi": "-ACC",  "yu": "-ACC",  "yü": "-ACC",
+    "nı": "-ACC",  "ni": "-ACC",  "nu": "-ACC",  "nü": "-ACC",
+    # ── Dative ───────────────────────────────────────────────────────────
+    "ya": "-DAT",  "ye": "-DAT",
+    "a": "-DAT",   "e": "-DAT",
+    # ── Ablative ─────────────────────────────────────────────────────────
+    "dan": "-ABL",  "den": "-ABL",  "tan": "-ABL",  "ten": "-ABL",
+    # ── Locative ─────────────────────────────────────────────────────────
+    "da": "-LOC",  "de": "-LOC",  "ta": "-LOC",  "te": "-LOC",
+    # ── Plural ───────────────────────────────────────────────────────────
+    "lar": "-PL",  "ler": "-PL",
+    # ── 3sg possessive ───────────────────────────────────────────────────
+    "sı": "-P3",  "si": "-P3",  "su": "-P3",  "sü": "-P3",
+    # ── Genitive ─────────────────────────────────────────────────────────
+    "nin": "-GEN",  "nın": "-GEN",  "nun": "-GEN",  "nün": "-GEN",
+    "ın": "-GEN",   "in": "-GEN",   "un": "-GEN",   "ün": "-GEN",
+    # ── Instrumental ─────────────────────────────────────────────────────
+    "le": "-INS",  "la": "-INS",
+    # ── Equative ─────────────────────────────────────────────────────────
+    "ce": "-EQU",  "ca": "-EQU",  "çe": "-EQU",  "ça": "-EQU",
+    # ── Frequent BPE-origin suffixes ─────────────────────────────────────
+    "eri": "-PL.SFX",  "una": "-P3+DAT",  "iril": "-PASS.SFX",
+    "yan": "-PART.ACT", "ren": "-PART.ACT", "ıda": "-LOC.SFX",
+    "üler": "-PL.SFX",  "ıler": "-PL.SFX",
+    "ri": "-PL.SFX",
+    # ── Single-vowel accusative (used cautiously by the scorer) ──────────
+    "ı": "-ACC",  "i": "-ACC",  "u": "-ACC",  "ü": "-ACC",
+}
+# Single-vowel suffixes: too short and too ambiguous to strip aggressively.
+# Per the module's design, the segmentation engine is expected to apply
+# extra constraints when matching these (e.g. minimum root length of 3,
+# root must be in TDK).
+SHORT_AMBIGUOUS_SUFFIXES: frozenset[str] = frozenset(
+    ("a", "e", "ı", "i", "u", "ü")
+)
+# (surface_form, label) pairs, longest surface form first, so the most
+# specific suffix is tried before any shorter suffix it contains.
+# The sort is stable: entries of equal length keep SUFFIX_MAP's order.
+SUFFIX_ENTRIES: list[tuple[str, str]] = sorted(
+    SUFFIX_MAP.items(), key=lambda entry: -len(entry[0])
+)
+# ── Turkish suffixes that can follow an apostrophe ───────────────────────────
+# Used for apostrophe-based segmentation (e.g. İstanbul'da, meeting'e).
+# Declared in readable groups, then ordered longest-first in place; the
+# sort is stable, so equal-length forms keep the order written here.
+APOSTROPHE_SUFFIXES: list[str] = [
+    "nın", "nin", "nun", "nün", "dan", "den", "tan", "ten",
+    "da", "de", "ta", "te", "ya", "ye", "nda", "nde",
+    "yı", "yi", "yu", "yü", "nı", "ni", "nu", "nü",
+    "lar", "ler", "lara", "lere", "ları", "leri",
+    "ım", "im", "um", "üm", "ın", "in", "un", "ün",
+    "mız", "miz", "muz", "müz", "nız", "niz", "nuz", "nüz",
+    "dır", "dir", "dur", "dür", "tır", "tir", "tur", "tür",
+    "ki", "li", "lı", "lu", "lü", "sız", "siz", "suz", "süz",
+    "inci", "ıncı", "uncu", "üncü", "nci", "ncı",
+    "lık", "lik", "luk", "lük",
+    "a", "e", "ı", "i", "u", "ü",
+]
+APOSTROPHE_SUFFIXES.sort(key=len, reverse=True)

nedo_turkish_tokenizer/_tdk_vocab.py DELETED Viewed

@@ -1,148 +0,0 @@
-"""Fix 7: TDK-based FOREIGN word detection."""
-from __future__ import annotations
-import json
-import os
-from pathlib import Path
-_CACHE_DIR = Path.home() / ".cache" / "nedo_turkish_tokenizer"
-_CACHE_DIR.mkdir(parents=True, exist_ok=True)
-TDK_CACHE_FILE = str(_CACHE_DIR / "tdk_words.txt")
-_BUNDLED_TDK_FILE = Path(__file__).parent / "data" / "tdk_words.txt"
-TR_CHARS = set("çğışöüÇĞİŞÖÜ")
-_TDK_WORDS: set | None = None
-_HF_TDK_URL = (
-    "https://huggingface.co/Ethosoft/NedoTurkishTokenizer/resolve/main"
-    "/nedo_turkish_tokenizer/data/tdk_words.txt"
-)
-def _read_word_file(path: Path) -> set[str]:
-    with path.open(encoding="utf-8") as f:
-        return {line.strip().lower() for line in f if line.strip()}
-def _load_cached_or_bundled_words() -> tuple[set[str] | None, str | None]:
-    candidates = (
-        (Path(TDK_CACHE_FILE), "cache"),
-        (_BUNDLED_TDK_FILE, "package bundle"),
-    )
-    for path, source in candidates:
-        if path.exists():
-            return _read_word_file(path), source
-    return None, None
-def load_tdk_words() -> set:
-    global _TDK_WORDS
-    if _TDK_WORDS is not None:
-        return _TDK_WORDS
-    words, source = _load_cached_or_bundled_words()
-    if words is not None:
-        _TDK_WORDS = words
-        print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {source} [ok]")
-        return _TDK_WORDS
-    if not os.path.exists(TDK_CACHE_FILE):
-        print("[NedoTurkishTokenizer] TDK word list not found - downloading...")
-        words = _download_from_hf() or _download_from_tdk()
-        if not words:
-            _TDK_WORDS = set()
-            return _TDK_WORDS
-    _TDK_WORDS, source = _load_cached_or_bundled_words()
-    if _TDK_WORDS is None:
-        _TDK_WORDS = set()
-        return _TDK_WORDS
-    print(f"[NedoTurkishTokenizer] TDK: {len(_TDK_WORDS):,} words loaded from {source} [ok]")
-    return _TDK_WORDS
-def _download_from_hf() -> list[str]:
-    """Download the bundled TDK word list from the HuggingFace repo."""
-    try:
-        import urllib.request  # noqa: PLC0415
-        with urllib.request.urlopen(_HF_TDK_URL, timeout=30) as resp:
-            content = resp.read().decode("utf-8")
-        words = [w.strip() for w in content.splitlines() if w.strip()]
-        with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
-            f.write("\n".join(words))
-        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from HuggingFace [ok]")
-        return words
-    except Exception as exc:  # noqa: BLE001
-        print(f"[NedoTurkishTokenizer] HuggingFace download failed: {exc} - trying TDK API...")
-        return []
-def _download_from_tdk() -> list[str]:
-    """Fallback: download from the official TDK autocomplete API."""
-    try:
-        import urllib.request  # noqa: PLC0415
-        url = "https://sozluk.gov.tr/autocomplete.json"
-        with urllib.request.urlopen(url, timeout=30) as resp:
-            data = json.loads(resp.read().decode("utf-8"))
-        words = sorted({item.get("madde", "").strip().lower() for item in data if item.get("madde")})
-        with open(TDK_CACHE_FILE, "w", encoding="utf-8") as f:
-            f.write("\n".join(words))
-        print(f"[NedoTurkishTokenizer] TDK: {len(words):,} words downloaded from TDK API [ok]")
-        return words
-    except Exception as exc:  # noqa: BLE001
-        print(f"[NedoTurkishTokenizer] TDK API also failed: {exc}")
-        print("  FOREIGN detection will be disabled for this session.")
-        return []
-def download_tdk_words() -> list[str]:
-    """Public helper: force re-download TDK word list."""
-    return _download_from_hf() or _download_from_tdk()
-def is_foreign_word(word: str) -> bool:
-    w = word.strip().lower()
-    if not w or len(w) < 2:
-        return False
-    if any(c in TR_CHARS for c in w):
-        return False
-    return w not in load_tdk_words()
-def reclassify_foreign_words(tokens: list[dict]) -> list[dict]:
-    """Reclassify word-initial BPE tokens as ROOT if they are foreign words."""
-    tdk = load_tdk_words()
-    if not tdk:
-        return tokens
-    result: list[dict] = []
-    for tok in tokens:
-        if tok["type"] != "BPE":
-            result.append(tok)
-            continue
-        raw = tok["token"]
-        stripped = raw.lstrip()
-        if raw == stripped:   # no leading space → not word-initial
-            result.append(tok)
-            continue
-        if is_foreign_word(stripped):
-            result.append({**tok, "type": "ROOT", "_foreign": True, "_tdk": False})
-        else:
-            result.append(tok)
-    return result

nedo_turkish_tokenizer/apostrophe.py ADDED Viewed

	@@ -0,0 +1,138 @@

+"""Apostrophe-aware segmentation for Turkish text.
+Handles two distinct cases:
+1. **Turkish proper names** — İstanbul'da, Ankara'ya
+   → ROOT(İstanbul) + PUNCT(') + SUFFIX(da)
+2. **Foreign stems with Turkish suffixes** — meeting'e, zoom'da
+   → FOREIGN(meeting) + SUFFIX(e)
+The decision between these two cases uses:
+- Turkish character detection (ç,ğ,ı,ş,ö,ü → Turkish)
+- TDK dictionary lookup
+- Proper noun list
+"""
+from __future__ import annotations
+import re
+from ._suffix_table import APOSTROPHE_SUFFIXES, SUFFIX_MAP
+from .normalization import has_turkish_chars, turkish_lower
+from .resources import load_proper_nouns, load_tdk_words
+# Matches ``base'suffix`` where the apostrophe is ASCII (') or U+2019 (’).
+#   group 1 — base:   two or more ASCII/Turkish letters or digits
+#   group 2 — suffix: one to six ASCII/Turkish letters, ending at a word
+#                     boundary (so longer trailing words do not match)
+_APO_RE = re.compile(
+    r"([A-Za-zÇçĞğİıÖöŞşÜü0-9]{2,})['\u2019]([A-Za-zÇçĞğİıÖöŞşÜü]{1,6})\b"
+)
+def is_turkish_base(word: str) -> bool:
+    """Decide whether *word* should be handled as a Turkish word.
+    The answer selects how ``word'suffix`` is segmented: a Turkish base
+    keeps the apostrophe as a punctuation boundary, anything else is
+    treated as a foreign root followed by a suffix.
+    Checks, in order (first hit wins):
+    1. the lowercased word contains Turkish-specific characters
+    2. it is in the proper-noun list
+    3. it is in the TDK dictionary (when that dictionary is non-empty)
+    4. otherwise, words shorter than 4 characters default to Turkish
+    """
+    lowered = turkish_lower(word)
+    # NOTE(review): if turkish_lower maps ASCII "I" to dotless "ı", a foreign
+    # word containing a capital I would pass the character check — confirm.
+    if has_turkish_chars(lowered) or lowered in load_proper_nouns():
+        return True
+    dictionary = load_tdk_words()
+    in_dictionary = bool(dictionary) and lowered in dictionary
+    # Short words are too ambiguous to call foreign, so they count as Turkish.
+    return in_dictionary or len(lowered) < 4
+def split_apostrophe_words(
+    text: str,
+) -> tuple[str, list[tuple[str, str]]]:
+    """Rewrite ``foreign'suffix`` patterns in *text* as ``foreign suffix``.
+    Every ``base'suffix`` match of ``_APO_RE`` is examined:
+    - Turkish bases (see ``is_turkish_base``) are left untouched, so the
+      apostrophe stays in the text.
+    - Foreign bases whose suffix is one of ``APOSTROPHE_SUFFIXES`` have the
+      apostrophe replaced by a single space and are recorded.
+    - Foreign bases with any other suffix are left untouched.
+    Args:
+        text: Input text to scan.
+    Returns:
+        ``(modified_text, [(foreign_base_lower, suffix_lower), ...])`` with
+        one pair per rewritten match, in match order.
+    """
+    foreign_splits: list[tuple[str, str]] = []
+    def _repl(m: re.Match[str]) -> str:
+        base, suffix = m.group(1), m.group(2)
+        if is_turkish_base(base):
+            return m.group(0)  # Keep apostrophe for Turkish names
+        # Lowercase the suffix the same way as the base.  str.lower() is not
+        # Turkish-aware: it turns "İ" into "i" + combining dot and "I" into
+        # "i", so an upper-case suffix could miss the table or match the
+        # wrong vowel variant.
+        sl = turkish_lower(suffix)
+        if sl not in APOSTROPHE_SUFFIXES:
+            return m.group(0)
+        foreign_splits.append((turkish_lower(base), sl))
+        return f"{base} {suffix}"  # Drop apostrophe → space
+    modified = _APO_RE.sub(_repl, text)
+    return modified, foreign_splits
+def build_apostrophe_tokens(
+    word: str, suffix_str: str, *, is_foreign: bool
+) -> list[dict[str, object]]:
+    """Create token dicts for a word + apostrophe + suffix pattern.
+    Args:
+        word: The base word (before apostrophe).
+        suffix_str: The suffix string (after apostrophe).
+        is_foreign: Whether the base word is foreign.
+    Returns:
+        ``[FOREIGN, SUFFIX]`` for a foreign base, or
+        ``[ROOT, PUNCT("'"), SUFFIX]`` for a Turkish base.  The base token
+        text carries a leading space; the suffix token is flagged with
+        ``_apo_suffix`` and labelled from ``SUFFIX_MAP`` (``"-SFX"`` when
+        the suffix is not in the table).
+    """
+    # Look the label up with Turkish-aware lowercasing: str.lower() maps
+    # "İ" to "i" + combining dot (never found in the table) and "I" to "i".
+    label = SUFFIX_MAP.get(turkish_lower(suffix_str), "-SFX")
+    suffix_token: dict[str, object] = {
+        "token": suffix_str, "token_type": "SUFFIX", "morph_pos": 1,
+        "_apo_suffix": True, "_suffix_label": label,
+    }
+    if is_foreign:
+        # Foreign: FOREIGN(word) + SUFFIX(suffix)
+        return [
+            {
+                "token": f" {word}", "token_type": "FOREIGN", "morph_pos": 0,
+                "_foreign": True,
+            },
+            suffix_token,
+        ]
+    # Turkish: ROOT(word) + PUNCT(') + SUFFIX(suffix)
+    return [
+        {
+            "token": f" {word}", "token_type": "ROOT", "morph_pos": 0,
+        },
+        {
+            "token": "'", "token_type": "PUNCT", "morph_pos": 0,
+            "_punct": True,
+        },
+        suffix_token,
+    ]

nedo_turkish_tokenizer/engine.py ADDED Viewed

	@@ -0,0 +1,157 @@

+"""Tokenization engine — orchestrates the full pipeline.
+This is the central pipeline that ties together all modules:
+1. Text normalization (Unicode, whitespace)
+2. ALL CAPS detection and lowercasing
+3. Special span extraction (URLs, numbers, dates, acronyms, emojis)
+4. Word-level segmentation with candidate generation/selection
+5. Post-annotation (allomorph labels, compound info, acronym expansion)
+6. Number/unit reclassification safety net
+"""
+from __future__ import annotations
+from ._domain_vocab import ALL_DOMAIN_ROOTS
+from .morphology import annotate_acronyms, annotate_canonical, annotate_compounds
+from .normalization import detect_all_caps, normalize_text
+from .resources import load_tdk_words
+from .segmentation import segment_word, split_into_words
+from .special_spans import find_special_spans, make_special_tokens, reclassify_numbers_in_tokens
+class TokenizationEngine:
+    """Pipeline driver behind ``NedoTurkishTokenizer``.
+    The TDK word set and the domain root set are loaded once in the
+    constructor; ``tokenize`` then pushes each text through the same fixed
+    sequence of steps.
+    This class is NOT the public API.  Use ``NedoTurkishTokenizer``
+    instead, which delegates to this engine.
+    """
+    def __init__(self) -> None:
+        self._tdk: set[str] = load_tdk_words()
+        self._domain_roots: frozenset[str] = ALL_DOMAIN_ROOTS
+    def tokenize(self, text: str) -> list[dict[str, object]]:
+        """Tokenize *text* and return the resulting token dicts.
+        Each returned dict has at minimum the keys ``token``,
+        ``token_type`` and ``morph_pos``.  Empty or whitespace-only input
+        yields an empty list.
+        """
+        if not (text and text.strip()):
+            return []
+        # Normalise, then detect/lowercase ALL CAPS words.
+        text, caps_set = detect_all_caps(normalize_text(text))
+        # Walk the special spans left to right; plain text found between
+        # them (and after the last one) goes through word segmentation.
+        spans = find_special_spans(text)
+        collected: list[dict[str, object]] = []
+        cursor = 0
+        for start, end, span_type, original in spans:
+            gap = text[cursor:start] if cursor < start else ""
+            if gap.strip():
+                collected.extend(self._tokenize_segment(gap, caps_set))
+            collected.extend(make_special_tokens(span_type, original))
+            cursor = end
+        tail = text[cursor:]
+        if tail.strip():
+            collected.extend(self._tokenize_segment(tail, caps_set))
+        # Whole-stream passes, applied strictly in this order.  The last
+        # one strips the leading spaces that mark word starts internally
+        # (needed by the morph_pos pass) and are not part of the public
+        # token text.
+        passes = (
+            reclassify_numbers_in_tokens,
+            annotate_canonical,
+            annotate_compounds,
+            annotate_acronyms,
+            _compute_morph_pos,
+            _strip_token_text,
+        )
+        for apply_pass in passes:
+            collected = apply_pass(collected)
+        return collected
+    def _tokenize_segment(
+        self, segment: str, caps_set: frozenset[str]
+    ) -> list[dict[str, object]]:
+        """Segment every word of a plain-text chunk (no special spans)."""
+        pieces: list[dict[str, object]] = []
+        for word in split_into_words(segment):
+            pieces.extend(
+                segment_word(word, self._tdk, self._domain_roots, caps_set)
+            )
+        return pieces
+# ── Helper: compute morph_pos across the full token stream ───────────────────
+def _compute_morph_pos(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Recompute ``morph_pos`` consistently across the token stream.
+    Rules:
+    - Word-initial tokens (leading space, special types, PUNCT) → morph_pos = 0
+    - SUFFIX tokens increment the position counter
+    - Apostrophe suffixes continue from the previous word
+    """
+    result: list[dict[str, object]] = []
+    word_pos = 0
+    for tok in tokens:
+        raw = str(tok["token"])
+        token_type = str(tok["token_type"])
+        is_word_start = raw.startswith(" ") or raw.strip().startswith("<")
+        # Apostrophe suffixes continue the previous word
+        if tok.get("_apo_suffix"):
+            is_word_start = False
+        if is_word_start or token_type in (
+            "NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM", "PUNCT"
+        ):
+            word_pos = 0
+            morph_pos = 0
+        elif token_type == "SUFFIX":
+            word_pos += 1
+            morph_pos = word_pos
+        else:
+            # ROOT or FOREIGN within a word (shouldn't normally happen)
+            word_pos = 0
+            morph_pos = 0
+        result.append({**tok, "morph_pos": morph_pos})
+    return result
+def _strip_token_text(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Remove internal leading whitespace from all token text strings.
+    During pipeline processing, a leading space in ``token`` signals
+    a word-initial token.  Once ``morph_pos`` has been computed, this
+    space is no longer needed and must be stripped so the public API
+    returns clean text.
+    """
+    return [{**tok, "token": str(tok["token"]).lstrip()} for tok in tokens]

nedo_turkish_tokenizer/morphology.py ADDED Viewed

	@@ -0,0 +1,161 @@

+"""Morphology utilities: suffix analysis, allomorph canonicalization, compound annotation.
+This module provides:
+- Suffix stripping and matching against the suffix table
+- Allomorph → canonical morpheme mapping (e.g. "lar"/"ler" → "PL")
+- Compound word detection and decomposition
+- Acronym expansion annotation
+"""
+from __future__ import annotations
+from ._acronym_table import ACRONYM_EXPANSIONS
+from ._suffix_table import SUFFIX_ENTRIES, SUFFIX_MAP
+# ── Allomorph → canonical morpheme mapping ───────────────────────────────────
+# Maps surface-form suffixes to a language-neutral canonical label.
+# Used for the ``_canonical`` metadata field on SUFFIX tokens.
+# Keys are lowercase surface forms; ``get_canonical`` and
+# ``annotate_canonical`` lowercase the text before looking it up.
+# Each surface form maps to exactly one label, so a form that could be read
+# in more than one way only ever receives the label listed here.
+ALLOMORPH_MAP: dict[str, str] = {
+    "lar": "PL",   "ler": "PL",
+    "ı":   "ACC",  "i":   "ACC",  "u":   "ACC",  "ü":   "ACC",
+    "yı":  "ACC",  "yi":  "ACC",  "yu":  "ACC",  "yü":  "ACC",
+    "a":   "DAT",  "e":   "DAT",  "ya":  "DAT",  "ye":  "DAT",
+    "da":  "LOC",  "de":  "LOC",  "ta":  "LOC",  "te":  "LOC",
+    "dan": "ABL",  "den": "ABL",  "tan": "ABL",  "ten": "ABL",
+    "ın":  "GEN",  "in":  "GEN",  "un":  "GEN",  "ün":  "GEN",
+    "nın": "GEN",  "nin": "GEN",  "nun": "GEN",  "nün": "GEN",
+    "la":  "INS",  "le":  "INS",  "yla": "INS",  "yle": "INS",
+    "dı":  "PAST", "di":  "PAST", "du":  "PAST", "dü":  "PAST",
+    "tı":  "PAST", "ti":  "PAST", "tu":  "PAST", "tü":  "PAST",
+    "yor": "PROG", "iyor": "PROG", "ıyor": "PROG", "uyor": "PROG", "üyor": "PROG",
+    "ar":  "AOR",  "er":  "AOR",
+    "ır":  "AOR",  "ir":  "AOR",  "ur":  "AOR",  "ür":  "AOR",
+    "mış": "EVID", "miş": "EVID", "muş": "EVID", "müş": "EVID",
+    "ma":  "NEG",  "me":  "NEG",
+    "mak": "INF",  "mek": "INF",
+    "ım":  "1SG",  "im":  "1SG",  "um":  "1SG",  "üm":  "1SG",
+    "iz":  "1PL",  "ız":  "1PL",  "uz":  "1PL",  "üz":  "1PL",
+    "mı":  "Q",    "mi":  "Q",    "mu":  "Q",    "mü":  "Q",
+    "lı":  "WITH", "li":  "WITH", "lu":  "WITH", "lü":  "WITH",
+    "sız": "WITHOUT", "siz": "WITHOUT", "suz": "WITHOUT", "süz": "WITHOUT",
+    "cı":  "AGT",  "ci":  "AGT",  "cu":  "AGT",  "cü":  "AGT",
+    "çı":  "AGT",  "çi":  "AGT",  "çu":  "AGT",  "çü":  "AGT",
+    "lık": "ABSTR", "lik": "ABSTR", "luk": "ABSTR", "lük": "ABSTR",
+    "sa":  "COND", "se":  "COND",
+    "ıl":  "PASS", "il":  "PASS", "ul":  "PASS", "ül":  "PASS",
+}
+# ── Compound word dictionary ────────────────────────────────────────────────
+# Lowercase whole-word surface form → list of its parts.  ``annotate_compounds``
+# copies the list into the ``_parts`` field of a matching ROOT token (the same
+# list object is shared by every token that matches, so treat it as read-only).
+# NOTE(review): the parts are not always literal substrings of the key
+# (e.g. "ayakkabı" → "kap", "kardiyoloji" → "kardio"), and some entries split
+# off a derivational suffix rather than a second word — presumably
+# intentional; confirm.
+KNOWN_COMPOUNDS: dict[str, list[str]] = {
+    "başbakan":         ["baş", "bakan"],
+    "cumhurbaşkanı":    ["cumhur", "başkan"],
+    "dışişleri":        ["dış", "iş"],
+    "içişleri":         ["iç", "iş"],
+    "maliye":           ["mal", "iye"],
+    "belediye":         ["beled", "iye"],
+    "ayakkabı":         ["ayak", "kap"],
+    "yelkovan":         ["yel", "kovan"],
+    "saatlik":          ["saat", "lik"],
+    "günlük":           ["gün", "lük"],
+    "yıllık":           ["yıl", "lık"],
+    "aylık":            ["ay", "lık"],
+    "haftalık":         ["hafta", "lık"],
+    "gastrointestinal": ["gastro", "intestinal"],
+    "kardiyovasküler":  ["kardio", "vasküler"],
+    "nöropsikiyatri":   ["nöro", "psikiyatri"],
+    "biyokimya":        ["biyo", "kimya"],
+    "mikrobiyoloji":    ["mikro", "biyoloji"],
+    "farmakoloji":      ["farma", "koloji"],
+    "patoloji":         ["pato", "loji"],
+    "hematoloji":       ["hemato", "loji"],
+    "nefroloji":        ["nefro", "loji"],
+    "kardiyoloji":      ["kardio", "loji"],
+    "radyoloji":        ["radyo", "loji"],
+    "onkoloji":         ["onko", "loji"],
+    "elektromanyetik":  ["elektro", "manyetik"],
+    "termodinamik":     ["termo", "dinamik"],
+    "hidroelektrik":    ["hidro", "elektrik"],
+    "biyoinformatik":   ["biyo", "informatik"],
+    "nanoteknoloji":    ["nano", "teknoloji"],
+    "futbolcu":         ["futbol", "cu"],
+    "basketbolcu":      ["basketbol", "cu"],
+    "voleybolcu":       ["voleybol", "cu"],
+}
+# ── Suffix label lookup ─────────────────────────────────────────────────────
+def get_suffix_label(surface: str) -> str | None:
+    """Return the morphological label for a suffix surface form, or None."""
+    return SUFFIX_MAP.get(surface.lower())
+def get_canonical(surface: str) -> str | None:
+    """Return the canonical morpheme label for a suffix, or None."""
+    return ALLOMORPH_MAP.get(surface.lower())
+# ── Post-annotation passes ──────────────────────────────────────────────────
+def annotate_canonical(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Add ``_canonical`` field to SUFFIX tokens (e.g. 'lar' → 'PL')."""
+    result: list[dict[str, object]] = []
+    for tok in tokens:
+        if tok["token_type"] != "SUFFIX":
+            result.append(tok)
+            continue
+        surface = str(tok["token"]).strip().lower()
+        canonical = ALLOMORPH_MAP.get(surface)
+        if canonical:
+            result.append({**tok, "_canonical": canonical})
+        else:
+            result.append(tok)
+    return result
+def annotate_compounds(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Annotate ROOT tokens that are compound words."""
+    result: list[dict[str, object]] = []
+    for tok in tokens:
+        if tok["token_type"] != "ROOT" or str(tok["token"]).strip().startswith("<"):
+            result.append(tok)
+            continue
+        surface = str(tok["token"]).strip().lower()
+        if surface in KNOWN_COMPOUNDS:
+            result.append({
+                **tok,
+                "_compound": True,
+                "_parts": KNOWN_COMPOUNDS[surface],
+            })
+        else:
+            result.append(tok)
+    return result
+def annotate_acronyms(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Add ``_expansion`` to known acronyms; promote CAPS ROOTs to ACRONYM."""
+    result: list[dict[str, object]] = []
+    for tok in tokens:
+        token_upper = str(tok["token"]).strip().upper()
+        expansion = ACRONYM_EXPANSIONS.get(token_upper)
+        if tok["token_type"] == "ACRONYM":
+            if expansion:
+                result.append({**tok, "_expansion": expansion, "_known_acronym": True})
+            else:
+                result.append(tok)
+        elif tok["token_type"] == "ROOT" and (tok.get("_acronym") or tok.get("_caps")):
+            if expansion:
+                result.append({
+                    **tok, "token_type": "ACRONYM",
+                    "_expansion": expansion, "_known_acronym": True,
+                })
+            else:
+                result.append(tok)
+        else:
+            result.append(tok)
+    return result

nedo_turkish_tokenizer/normalization.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""Text normalization utilities for Turkish text.
+Handles:
+- Turkish-aware lowercasing (İ→i, I→ı)
+- Unicode NFC normalization
+- Whitespace cleanup
+- ALL CAPS word detection and lowercasing
+"""
+from __future__ import annotations
+import re
+import unicodedata
+# Turkish-specific characters — presence indicates a Turkish word
+# (both lowercase and uppercase forms are listed; ASCII "I" is not in the set)
+TR_CHARS: frozenset[str] = frozenset("çğışöüÇĞİŞÖÜ")
+# Pattern for detecting ALL CAPS words (≥2 uppercase letters)
+# The run must sit between word boundaries, so capitals directly attached to
+# digits or lowercase letters do not match.  Group 1 is the matched word.
+_CAPS_RE = re.compile(r"\b([A-ZÇĞİÖŞÜ]{2,})\b")
+def turkish_lower(s: str) -> str:
+    """Turkish-aware lowercase: İ→i, I→ı, then standard ``str.lower()``.
+    Standard Python ``str.lower()`` maps both I and İ to 'i', which is
+    wrong for Turkish where I→ı and İ→i.
+    """
+    return s.replace("İ", "i").replace("I", "ı").lower()
+def normalize_text(text: str) -> str:
+    """Apply Unicode NFC normalization and collapse whitespace."""
+    text = unicodedata.normalize("NFC", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+def has_turkish_chars(word: str) -> bool:
+    """Return True if *word* contains Turkish-specific characters (ç,ğ,ı,ş,ö,ü)."""
+    return any(c in TR_CHARS for c in word)
+def detect_all_caps(text: str) -> tuple[str, frozenset[str]]:
+    """Detect ALL CAPS words, lowercase them, and return the modified text.
+    ALL CAPS words like ``İSTANBUL`` cause problems for suffix-based
+    segmentation because the suffix table works on lowercase text.  This
+    function lowercases them in-place and returns a set of the lowered
+    forms so the output tokens can be annotated with ``_caps=True``.
+    Returns:
+        ``(modified_text, frozenset_of_lowered_caps_words)``
+    """
+    caps_collector: set[str] = set()
+    def _replace(m: re.Match) -> str:
+        word = m.group(1)
+        lowered = turkish_lower(word)
+        caps_collector.add(lowered)
+        return lowered
+    modified = _CAPS_RE.sub(_replace, text)
+    return modified, frozenset(caps_collector)

nedo_turkish_tokenizer/resources.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""Static resource loading for the tokenizer.
+Loads bundled data files (TDK dictionary, proper nouns) from the package
+``data/`` directory.  All resources are loaded lazily on first access and
+cached in module-level globals.
+**No network access.  No runtime downloads.  Fully offline.**
+The TDK dictionary contains infinitive verb forms (e.g. "gelmek") but the
+tokenizer needs bare verb stems (e.g. "gel") for suffix stripping.  This
+module automatically derives verb stems from infinitives at load time.
+"""
+from __future__ import annotations
+from pathlib import Path
+# Directory holding the bundled data files, located next to this module.
+_DATA_DIR = Path(__file__).parent / "data"
+# ── TDK Word List ────────────────────────────────────────────────────────────
+# Cache filled by the first ``load_tdk_words`` call; ``None`` means "not loaded yet".
+_TDK_WORDS: set[str] | None = None
+def _derive_verb_stems(raw_words: set[str]) -> set[str]:
+    """Derive bare verb stems from TDK infinitive entries.
+    TDK lists verbs as infinitives ("gelmek", "bakmak").  The tokenizer
+    needs bare stems ("gel", "bak") for suffix stripping.
+    This function strips "-mak"/"-mek" from infinitives and adds the
+    resulting stems to the word set.  Only stems of 2+ characters are
+    added to avoid spurious short matches.
+    """
+    derived: set[str] = set()
+    for word in raw_words:
+        if word.endswith("mak") and len(word) > 4:
+            stem = word[:-3]
+            if len(stem) >= 2:
+                derived.add(stem)
+        elif word.endswith("mek") and len(word) > 4:
+            stem = word[:-3]
+            if len(stem) >= 2:
+                derived.add(stem)
+    return derived
+def load_tdk_words() -> set[str]:
+    """Load the TDK (Türk Dil Kurumu) word list from the bundled data file.
+    Returns a set of lowercase Turkish words including:
+    - Original dictionary entries (nouns, adjectives, adverbs, infinitives)
+    - Derived verb stems (stripped -mak/-mek from infinitives)
+    Used for:
+    - Root validation during suffix stripping (is the remainder a real word?)
+    - Foreign word detection (word absent from TDK → likely foreign)
+    - Turkish-base detection for apostrophe handling
+    """
+    global _TDK_WORDS
+    if _TDK_WORDS is not None:
+        return _TDK_WORDS
+    tdk_path = _DATA_DIR / "tdk_words.txt"
+    if tdk_path.exists():
+        raw_words = {
+            line.strip().lower()
+            for line in tdk_path.read_text(encoding="utf-8").splitlines()
+            if line.strip()
+        }
+        # Derive verb stems from infinitives (gelmek→gel, bakmak→bak)
+        stems = _derive_verb_stems(raw_words)
+        _TDK_WORDS = raw_words | stems
+    else:
+        _TDK_WORDS = set()
+    return _TDK_WORDS
+# ── Proper Nouns ─────────────────────────────────────────────────────────────
+_PROPER_NOUNS: set[str] | None = None
+def load_proper_nouns() -> set[str]:
+    """Load Turkish proper nouns (cities, regions, names) from bundled data.
+    Used in apostrophe handling to distinguish Turkish proper names
+    (İstanbul'da → keep as Turkish ROOT) from foreign words
+    (meeting'e → mark as FOREIGN ROOT).
+    """
+    global _PROPER_NOUNS
+    if _PROPER_NOUNS is not None:
+        return _PROPER_NOUNS
+    path = _DATA_DIR / "turkish_proper_nouns.txt"
+    if path.exists():
+        _PROPER_NOUNS = {
+            line.strip().lower()
+            for line in path.read_text(encoding="utf-8").splitlines()
+            if line.strip() and not line.startswith("#")
+        }
+    else:
+        _PROPER_NOUNS = set()
+    return _PROPER_NOUNS

nedo_turkish_tokenizer/segmentation.py ADDED Viewed

	@@ -0,0 +1,475 @@

+"""Word-level segmentation with candidate generation and selection.
+This is the core of the tokenizer.  For each word it:
+1. Generates multiple segmentation candidates (whole-word ROOT, suffix
+   chains, foreign root, etc.)
+2. Scores each candidate deterministically
+3. Selects the highest-scoring segmentation
+The scoring rules are transparent and tunable:
+- TDK root match gives a large bonus
+- Domain vocabulary match gives a moderate bonus
+- Longer roots are preferred over shorter ones
+- Each recognised suffix adds a small bonus
+- Unknown / unvalidated roots get a low base score
+"""
+from __future__ import annotations
+import re
+from typing import Any
+from ._domain_vocab import ALL_DOMAIN_ROOTS
+from ._suffix_table import (
+    SHORT_AMBIGUOUS_SUFFIXES,
+    SUFFIX_ENTRIES,
+    SUFFIX_MAP,
+)
+from .normalization import has_turkish_chars, turkish_lower
+from .resources import load_proper_nouns, load_tdk_words
+from .types import PUNCT_CHARS, SegmentationCandidate, Token, is_punct_token
+# ── Scoring constants ────────────────────────────────────────────────────────
+# Why these values: TDK_BONUS dominates so that a TDK-validated root almost
+# always wins over an unvalidated one.  SUFFIX_BONUS is small enough that
+# over-segmentation (many tiny suffixes) doesn't beat a valid longer root.
+# All scores are plain sums: root length × _ROOT_LEN_WEIGHT, plus one of the
+# root bonuses/bases below, plus _SUFFIX_BONUS per stripped suffix.
+_TDK_BONUS = 10         # Root found in TDK dictionary
+_DOMAIN_BONUS = 8        # Root found in domain vocabulary
+_SUFFIX_BONUS = 2        # Each recognised suffix
+_ROOT_LEN_WEIGHT = 2     # Per-character bonus for root length (prefer longer roots)
+_WHOLE_WORD_BONUS = 5    # Extra bonus when the *entire* unsplit word is in TDK
+_FOREIGN_BASE = 3        # Base score for foreign root (intentionally low)
+_UNKNOWN_BASE = 1        # Base score for unrecognised root
+_SHORT_ROOT_PENALTY = 4  # Penalty when root is exactly _MIN_ROOT_LEN chars
+_MIN_ROOT_LEN = 2        # Minimum root length for suffix stripping
+_MAX_SUFFIX_DEPTH = 5    # Maximum number of suffixes to strip
+# ── Known-intact words ───────────────────────────────────────────────────────
+# Common Turkish words that *look* like root+suffix but must stay whole.
+# Without this set, "dedi" would split into "de" (TDK conjunction) + "di"
+# (past tense suffix) because both are individually valid.
+#
+# This set covers inflected forms of very short verb stems (de-, ye-) and
+# common discourse particles that happen to end in suffix-like sequences.
+#
+# Entries are lowercase; ``generate_candidates`` checks the Turkish-lowered
+# word against this set and, on a hit, returns a single whole-word ROOT
+# candidate without generating any alternatives.
+KNOWN_INTACT: frozenset[str] = frozenset({
+    # Forms of "demek" (to say) — stem "de" is a TDK conjunction,
+    # causing false splits like de+di, de+miş, de+se, etc.
+    "dedi", "dedim", "dedin", "dedik", "dediniz", "dediler",
+    "demiş", "demişti", "demiştir",
+    "dese", "desem", "desen", "desek",
+    "der", "derim", "dersin", "deriz",
+    "denir", "dendi", "denmiş",
+    # Forms of "yemek" (to eat) — stem "ye" is in TDK
+    "yemiş", "yese", "yesem", "yesen",
+    "yer", "yerim", "yersin", "yeriz",
+    "yenir", "yendi", "yenmiş",
+    # Common particles / conjunctions that end in suffix-like sequences
+    # (most already protected by TDK WHOLE_WORD_BONUS, but double-guarding)
+    "diye", "niye", "nice",
+})
+# ── Punctuation splitting ────────────────────────────────────────────────────
+# Regex to split a word at apostrophes (keeping the apostrophe)
+# Matches the ASCII apostrophe and the right single quotation mark (U+2019).
+_APOSTROPHE_RE = re.compile(r"(['\u2019])")
+# Regex to split leading/trailing punctuation from a word
+# "Punctuation" here means any run of characters that are not ``\w``.
+_LEADING_PUNCT_RE = re.compile(r"^([^\w]+)")
+_TRAILING_PUNCT_RE = re.compile(r"([^\w]+)$")
+def _split_punctuation(word: str) -> list[tuple[str, str]]:
+    """Split a raw word token into (text, type) pairs.
+    Separates leading and trailing punctuation from the core word.
+    For example: ``'"hello,'`` → ``[('"', 'PUNCT'), ('hello', 'WORD'), (',', 'PUNCT')]``
+    """
+    if not word:
+        return []
+    parts: list[tuple[str, str]] = []
+    # Check if the entire token is punctuation
+    if is_punct_token(word):
+        return [(word, "PUNCT")]
+    # Strip leading punctuation
+    lead_m = _LEADING_PUNCT_RE.match(word)
+    if lead_m:
+        for ch in lead_m.group(1):
+            parts.append((ch, "PUNCT"))
+        word = word[lead_m.end():]
+    # Strip trailing punctuation
+    trail_m = _TRAILING_PUNCT_RE.search(word)
+    trailing: list[tuple[str, str]] = []
+    if trail_m:
+        for ch in trail_m.group(1):
+            trailing.append((ch, "PUNCT"))
+        word = word[:trail_m.start()]
+    if word:
+        parts.append((word, "WORD"))
+    parts.extend(trailing)
+    return parts
+# ── Word splitting ───────────────────────────────────────────────────────────
+def split_into_words(text: str) -> list[str]:
+    """Split text into whitespace-delimited word tokens.
+    Preserves the original casing and punctuation within each token.
+    """
+    return text.split()
+# ── Candidate generation ────────────────────────────────────────────────────
+def _generate_suffix_candidates(
+    word_lower: str,
+    tdk: set[str],
+    domain_roots: frozenset[str],
+    depth: int = 0,
+) -> list[SegmentationCandidate]:
+    """Recursively generate segmentation candidates by stripping suffixes.
+    Tries every entry of ``SUFFIX_ENTRIES`` in table order.  For each suffix
+    that ends the word, two things happen independently:
+    - if the remainder is in *tdk* or *domain_roots*, a ROOT + SUFFIX
+      candidate is produced;
+    - the remainder is also passed back into this function (up to
+      ``_MAX_SUFFIX_DEPTH`` levels), and each accepted sub-candidate is
+      extended with the current suffix.
+    Args:
+        word_lower: Lowercased word (or remainder) to analyse.
+        tdk: TDK dictionary set.
+        domain_roots: Domain vocabulary set.
+        depth: Current recursion level; callers start at 0.
+    Returns:
+        Candidates in generation order (not sorted by score).  Tokens are
+        ordered root first, then suffixes from innermost to outermost.
+    """
+    if depth >= _MAX_SUFFIX_DEPTH or len(word_lower) < _MIN_ROOT_LEN:
+        return []
+    candidates: list[SegmentationCandidate] = []
+    for suffix_surface, suffix_label in SUFFIX_ENTRIES:
+        if not word_lower.endswith(suffix_surface):
+            continue
+        remainder = word_lower[: -len(suffix_surface)]
+        if len(remainder) < _MIN_ROOT_LEN:
+            continue
+        # Extra caution for very short / ambiguous suffixes
+        if suffix_surface in SHORT_AMBIGUOUS_SUFFIXES and len(remainder) < 3:
+            continue
+        # One Token object per suffix; it is shared by the direct candidate
+        # and by every extended candidate built in this iteration.
+        suffix_token = Token(
+            text=suffix_surface,
+            token_type="SUFFIX",
+            metadata={"_suffix_label": suffix_label},
+        )
+        # Check if remainder is a valid root
+        root_in_tdk = remainder in tdk
+        root_in_domain = remainder in domain_roots
+        root_score = len(remainder) * _ROOT_LEN_WEIGHT
+        if root_in_tdk:
+            root_score += _TDK_BONUS
+        elif root_in_domain:
+            root_score += _DOMAIN_BONUS
+        else:
+            root_score += _UNKNOWN_BASE
+        # Penalise very short roots: 2-char roots like "de", "ye", "al"
+        # are valid TDK entries but produce many false splits on short
+        # words (e.g. "dedi" → de+di).  The penalty makes it harder for
+        # a 2-char root to beat the whole-word hypothesis.
+        if len(remainder) <= _MIN_ROOT_LEN:
+            root_score -= _SHORT_ROOT_PENALTY
+        if root_in_tdk or root_in_domain:
+            # Valid root found → create single-level candidate
+            # (metadata is only attached when the root is a domain root)
+            root_token = Token(
+                text=remainder,
+                token_type="ROOT",
+                metadata={"_tdk": root_in_tdk, "_domain": root_in_domain} if root_in_domain else {},
+            )
+            total_score = root_score + _SUFFIX_BONUS
+            candidates.append(SegmentationCandidate(
+                tokens=[root_token, suffix_token],
+                score=total_score,
+                source="suffix_chain",
+            ))
+        # Recurse: try stripping more suffixes from the remainder
+        # (this runs whether or not the remainder itself was a valid root)
+        if depth < _MAX_SUFFIX_DEPTH - 1:
+            sub_candidates = _generate_suffix_candidates(
+                remainder, tdk, domain_roots, depth + 1
+            )
+            for sc in sub_candidates:
+                # Only accept recursive results that found a real root
+                # NOTE(review): the threshold grows with the remainder's
+                # length, so a chain with a short root under a long
+                # remainder can be dropped here — confirm this is intended.
+                if sc.score > len(remainder) + _UNKNOWN_BASE:
+                    extended = SegmentationCandidate(
+                        tokens=sc.tokens + [suffix_token],
+                        score=sc.score + _SUFFIX_BONUS,
+                        source="suffix_chain",
+                    )
+                    candidates.append(extended)
+    return candidates
+def generate_candidates(
+    word: str,
+    tdk: set[str],
+    domain_roots: frozenset[str],
+    caps_set: frozenset[str],
+) -> list[SegmentationCandidate]:
+    """Generate all plausible segmentation candidates for a single word.
+    The word is Turkish-lowercased first; all candidate tokens carry the
+    lowered text.  Candidates are: the whole word as ROOT, every suffix
+    chain found by ``_generate_suffix_candidates``, and (for words outside
+    TDK and the proper-noun list that contain no Turkish-specific
+    characters) the whole word as FOREIGN.  Words in ``KNOWN_INTACT``
+    short-circuit to a single whole-word ROOT candidate.
+    Args:
+        word: Word without surrounding punctuation.
+        tdk: TDK dictionary set.
+        domain_roots: Domain vocabulary set.
+        caps_set: Lowered forms of words that were originally ALL CAPS.
+    Returns a list of candidates sorted by score (highest first).
+    """
+    wl = turkish_lower(word)
+    candidates: list[SegmentationCandidate] = []
+    is_caps = wl in caps_set
+    is_tr_chars = has_turkish_chars(wl)
+    # ── Fast path: known-intact words bypass candidate generation ────────
+    # These are common words that look splittable but must stay whole.
+    if wl in KNOWN_INTACT:
+        root_meta_intact: dict[str, Any] = {}
+        if is_caps:
+            root_meta_intact["_caps"] = True
+        return [SegmentationCandidate(
+            tokens=[Token(text=wl, token_type="ROOT", metadata=root_meta_intact)],
+            score=len(wl) * _ROOT_LEN_WEIGHT + _TDK_BONUS + _WHOLE_WORD_BONUS,
+            source="known_intact",
+        )]
+    # ── Candidate 1: whole word as ROOT ──────────────────────────────────
+    in_tdk = wl in tdk
+    in_proper = wl in load_proper_nouns()
+    in_domain = wl in domain_roots
+    whole_score = len(wl) * _ROOT_LEN_WEIGHT
+    if in_tdk or in_proper:
+        # Whole-word TDK/proper-noun match gets an extra bonus to prevent
+        # over-segmenting valid dictionary words like "dünya" into
+        # "dün" + "ya".
+        whole_score += _TDK_BONUS + _WHOLE_WORD_BONUS
+    elif in_domain:
+        whole_score += _DOMAIN_BONUS + _WHOLE_WORD_BONUS
+    else:
+        whole_score += _UNKNOWN_BASE
+    root_meta: dict[str, Any] = {}
+    if is_caps:
+        root_meta["_caps"] = True
+    if in_domain:
+        root_meta["_domain"] = True
+    whole_root = Token(text=wl, token_type="ROOT", metadata=root_meta)
+    candidates.append(SegmentationCandidate(
+        tokens=[whole_root],
+        score=whole_score,
+        source="whole_word",
+    ))
+    # ── Candidate 2+: suffix stripping ───────────────────────────────────
+    suffix_cands = _generate_suffix_candidates(wl, tdk, domain_roots)
+    for sc in suffix_cands:
+        # Propagate caps flag to the root token
+        # (mutates the Token in place; root tokens may be shared between
+        # several suffix-chain candidates)
+        if is_caps and sc.tokens:
+            sc.tokens[0].metadata["_caps"] = True
+        candidates.append(sc)
+    # ── Candidate N: foreign root ────────────────────────────────────────
+    if not in_tdk and not in_proper and not is_tr_chars and len(wl) >= 2:
+        foreign_token = Token(
+            text=wl, token_type="FOREIGN",
+            metadata={"_foreign": True},
+        )
+        # Foreign score uses flat weight 1 (not ROOT_LEN_WEIGHT) so that
+        # valid suffix chains with a TDK root always beat FOREIGN.
+        # NOTE(review): with the current constants the whole-word ROOT
+        # candidate scores at least 2*len + 1, which is >= len + 3 for every
+        # len >= 2, and ties are resolved in favour of the ROOT candidate by
+        # ``select_best_candidate``.  As scored, this FOREIGN candidate looks
+        # unable to win — confirm whether that is intended.
+        foreign_score = _FOREIGN_BASE + len(wl)
+        candidates.append(SegmentationCandidate(
+            tokens=[foreign_token],
+            score=foreign_score,
+            source="foreign",
+        ))
+    # Sort by score descending (highest first)
+    # (the sort is stable, so equal scores keep their generation order)
+    candidates.sort(key=lambda c: c.score, reverse=True)
+    return candidates
+# ── Candidate selection ──────────────────────────────────────────────────────
+def select_best_candidate(
+    candidates: list[SegmentationCandidate],
+) -> SegmentationCandidate:
+    """Select the best segmentation among candidates.
+    Picks the highest-scoring candidate.  Ties are broken by:
+    1. Fewer tokens (less fragmentation)
+    2. Longer root token
+    """
+    if not candidates:
+        # Fallback: should never happen, but safety net
+        return SegmentationCandidate(
+            tokens=[Token(text="", token_type="ROOT")],
+            score=0.0,
+            source="fallback",
+        )
+    if len(candidates) == 1:
+        return candidates[0]
+    best_score = candidates[0].score
+    tied = [c for c in candidates if c.score == best_score]
+    if len(tied) == 1:
+        return tied[0]
+    # Tie-breaking: fewer tokens first; then longer root
+    def _tie_key(c: SegmentationCandidate) -> tuple[int, int]:
+        root_len = max(
+            (len(t.text) for t in c.tokens if t.token_type == "ROOT"),
+            default=0,
+        )
+        return (len(c.tokens), -root_len)
+    tied.sort(key=_tie_key)
+    return tied[0]
+# ── Full word segmentation ───────────────────────────────────────────────────
+def segment_word(
+    word: str,
+    tdk: set[str],
+    domain_roots: frozenset[str],
+    caps_set: frozenset[str],
+) -> list[dict[str, object]]:
+    """Segment a single word into token dicts.
+    This is the main entry point for per-word segmentation.  It handles
+    punctuation splitting, candidate generation, and selection.
+    Args:
+        word: Raw word string (may include surrounding punctuation).
+        tdk: TDK dictionary set.
+        domain_roots: Domain vocabulary set.
+        caps_set: Set of words that were originally ALL CAPS.
+    Returns:
+        List of token dicts ready for inclusion in the output.
+    """
+    parts = _split_punctuation(word)
+    result: list[dict[str, object]] = []
+    is_first = True
+    for text, part_type in parts:
+        if part_type == "PUNCT":
+            prefix = " " if is_first else ""
+            result.append({
+                "token": f"{prefix}{text}",
+                "token_type": "PUNCT",
+                "morph_pos": 0,
+                "_punct": True,
+            })
+            is_first = False
+            continue
+        # part_type == "WORD"
+        # Check for apostrophe within the word
+        if "'" in text or "\u2019" in text:
+            apo_tokens = _segment_apostrophe_word(text, tdk, domain_roots, caps_set)
+            for i, t in enumerate(apo_tokens):
+                if i == 0 and is_first:
+                    t["token"] = f" {t['token'].lstrip()}"
+                result.append(t)
+            is_first = False
+            continue
+        # Standard word segmentation via candidate generation
+        candidates = generate_candidates(text, tdk, domain_roots, caps_set)
+        best = select_best_candidate(candidates)
+        for i, token in enumerate(best.tokens):
+            tok_dict = token.to_dict()
+            # Add leading space to the first token of this word
+            if i == 0 and is_first:
+                tok_dict["token"] = f" {tok_dict['token'].lstrip()}"
+            # Compute morph_pos
+            if i == 0:
+                tok_dict["morph_pos"] = 0
+            else:
+                tok_dict["morph_pos"] = i
+            result.append(tok_dict)
+        is_first = False
+    return result
+def _segment_apostrophe_word(
+    word: str,
+    tdk: set[str],
+    domain_roots: frozenset[str],
+    caps_set: frozenset[str],
+) -> list[dict[str, object]]:
+    """Segment a word containing an apostrophe.
+    Splits at the apostrophe and determines whether the base is Turkish
+    (proper name) or foreign.
+    """
+    from .apostrophe import is_turkish_base  # avoid circular at module level
+    # Find the apostrophe position
+    apo_pos = word.find("'")
+    if apo_pos == -1:
+        apo_pos = word.find("\u2019")
+    if apo_pos == -1:
+        # No apostrophe found (shouldn't happen) — treat as regular word
+        candidates = generate_candidates(word, tdk, domain_roots, caps_set)
+        best = select_best_candidate(candidates)
+        return [t.to_dict() for t in best.tokens]
+    base = word[:apo_pos]
+    suffix = word[apo_pos + 1:]
+    wl = turkish_lower(base)
+    is_caps = wl in caps_set
+    if is_turkish_base(base):
+        # Turkish proper name: ROOT + PUNCT(') + SUFFIX
+        suffix_label = SUFFIX_MAP.get(suffix.lower(), "-SFX")
+        tokens: list[dict[str, object]] = [
+            {
+                "token": base, "token_type": "ROOT", "morph_pos": 0,
+                **( {"_caps": True} if is_caps else {}),
+            },
+            {
+                "token": "'", "token_type": "PUNCT", "morph_pos": 0,
+                "_punct": True,
+            },
+        ]
+        if suffix:
+            tokens.append({
+                "token": suffix, "token_type": "SUFFIX", "morph_pos": 1,
+                "_apo_suffix": True, "_suffix_label": suffix_label,
+            })
+        return tokens
+    else:
+        # Foreign word: FOREIGN + SUFFIX
+        suffix_label = SUFFIX_MAP.get(suffix.lower(), "-SFX")
+        tokens = [
+            {
+                "token": base, "token_type": "FOREIGN", "morph_pos": 0,
+                "_foreign": True,
+            },
+        ]
+        if suffix:
+            tokens.append({
+                "token": suffix, "token_type": "SUFFIX", "morph_pos": 1,
+                "_apo_suffix": True, "_suffix_label": suffix_label,
+            })
+        return tokens

nedo_turkish_tokenizer/{_normalizer.py → special_spans.py} RENAMED Viewed

@@ -1,100 +1,85 @@
-"""Fix 8: Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI).
-Uses a segment-based approach: special tokens are detected and extracted
-*before* the base tokenizer runs, so they never pass through it.
 """
 from __future__ import annotations
 import re
-MONTH_NAMES = {
-    "ocak","şubat","mart","nisan","mayıs","haziran",
-    "temmuz","ağustos","eylül","ekim","kasım","aralık",
-    "january","february","march","april","may","june",
-    "july","august","september","october","november","december",
-}
-UNITS = {
-    "km","m","cm","mm","nm",
-    "kg","g","mg","ton",
-    "sn","dk","sa","ms",
-    "tl","usd","eur","gbp",
-    "kb","mb","gb","tb","pb",
-    "ml","mcg","meq","iu","mmhg","mosm",
-    "hz","mhz","ghz","watt","kw","mw","kcal","cal",
-}
-ROMAN_NUMERALS = {
-    "i","ii","iii","iv","vi","vii","viii","ix",
-    "xi","xii","xiii","xiv","xv","xvi","xvii","xviii","xix","xx",
-}
-# ── Regex patterns ────────────────────────────────────────────────────────────
-URL_RE         = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
-MENTION_RE     = re.compile(r'@[\w\u00C0-\u024F]+')
-HASHTAG_RE     = re.compile(r'#[\w\u00C0-\u024F]+')
-# Turkish suffixes that can follow a number+apostrophe
-_NUM_SUFFIXES = sorted(
-    [
-        "nın","nin","nun","nün","dan","den","tan","ten",
-        "da","de","ta","te","ya","ye","nda","nde",
-        "yı","yi","yu","yü","nı","ni","nu","nü",
-        "lar","ler","lara","lere","ları","leri",
-        "ım","im","um","üm","ın","in","un","ün",
-        "mız","miz","muz","müz","nız","niz","nuz","nüz",
-        "dır","dir","dur","dür","tır","tir","tur","tür",
-        "ki","li","lı","lu","lü","sız","siz","suz","süz",
-        "inci","ıncı","uncu","üncü","nci","ncı",
-        "lık","lik","luk","lük",
-        "a","e","ı","i","u","ü",
-    ],
-    key=len,
-    reverse=True,
-)
-_SUFFIX_ALT = '|'.join(re.escape(s) for s in _NUM_SUFFIXES)
-# Number (or time) followed by apostrophe + Turkish suffix(es)
 NUM_APOSTROPHE_RE = re.compile(
     r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
     re.IGNORECASE,
 )
-DATE_RE        = re.compile(
-    r'\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}'
-    r'|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}'
 )
-CURRENCY_RE    = re.compile(r'[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]')
-NUMBER_RE      = re.compile(
-    r'%\d+[\.,]?\d*'
-    r'|\d{1,3}(?:\.\d{3})+'       # thousands (1.000.000) — before decimal!
-    r'|\d+[\.,]\d+'               # decimal (2.5, 10,5)
-    r'|\d+%'
-    r'|\d+/\d+'
 )
-TIME_RE        = re.compile(r'\d{1,2}:\d{2}(?::\d{2})?')
-PLAIN_NUM_RE   = re.compile(r'\b\d+\b')
-# ── Acronym patterns ─────────────────────────────────────────────────────────
-# Matches standalone uppercase sequences (+ optional trailing digits).
-#   [A-Z]{2,}[0-9]*  → HTML, GPT, CSS3, HTML5, MP3
-#   [A-Z][0-9]+      → F16, H264, A4
-# Does NOT match mixed-case words (ChatGPT) because \b won't fire mid-word.
 ACRONYM_RE = re.compile(
     r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
     r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
 )
-# Acronym followed by apostrophe + Turkish suffix(es): NATO'nun, HTML5'ten
 ACRONYM_APOSTROPHE_RE = re.compile(
     r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
     + _SUFFIX_ALT + r")+\b"
 )
-TEXT_EMOJI_RE  = re.compile(r'[:;=]-?[\)\(\]\[dDpPoO3]|<3')
 UNICODE_EMOJI_RE = re.compile(
     "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
     "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
@@ -103,20 +88,20 @@ UNICODE_EMOJI_RE = re.compile(
     flags=re.UNICODE,
 )
-# Pattern priority: earlier entries win when spans overlap.
-_SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
-    (URL_RE,                 "URL"),
-    (MENTION_RE,             "MENTION"),
-    (HASHTAG_RE,             "HASHTAG"),
-    (DATE_RE,                "DATE"),
-    (CURRENCY_RE,            "UNIT"),
-    (NUM_APOSTROPHE_RE,      "NUM_APO"),
-    (ACRONYM_APOSTROPHE_RE,  "ACRONYM_APO"),
-    (ACRONYM_RE,             "ACRONYM"),
-    (NUMBER_RE,              "NUM"),
-    (TIME_RE,                "NUM"),
-    (PLAIN_NUM_RE,           "NUM"),
-    (UNICODE_EMOJI_RE,       "EMOJI"),
     (TEXT_EMOJI_RE,          "EMOJI"),
 ]
@@ -124,42 +109,34 @@ _SPAN_PATTERNS: list[tuple[re.Pattern, str]] = [
 # ── Acronym vs Turkish word disambiguation ───────────────────────────────────
 def _is_known_turkish_word(word_upper: str) -> bool:
-    """Return True if *word_upper* (ALL CAPS) is a known Turkish word.
-    Checks (in order):
     1. ACRONYM_EXPANSIONS dict → always acronym (return False)
-    2. Same dict without trailing digits (HTML5 → HTML)
-    3. TDK dictionary → Turkish word (return True)
-    4. Proper nouns list → Turkish word (return True)
-    5. Otherwise → treat as acronym (return False)
     """
-    from ._acronym_dict import ACRONYM_EXPANSIONS  # noqa: PLC0415
-    from ._preprocessor import _turkish_lower, _load_proper_nouns  # noqa: PLC0415
-    from ._tdk_vocab import load_tdk_words  # noqa: PLC0415
     # Known acronyms always win
     if word_upper in ACRONYM_EXPANSIONS:
         return False
-    # Also check without trailing digits (HTML5 → HTML)
     base = word_upper.rstrip("0123456789")
     if base and base != word_upper and base in ACRONYM_EXPANSIONS:
         return False
-    wl = _turkish_lower(word_upper)
-    # TDK dictionary: if the lowercase form is a real Turkish word → not acronym
     tdk = load_tdk_words()
     if tdk and wl in tdk:
         return True
-    # Proper nouns (İstanbul, Ankara…)
-    if wl in _load_proper_nouns():
         return True
     return False
-# ── Segment-based API ────────────────────────────────────────────────────────
 def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
     """Find all special-token spans in *text*.
@@ -172,9 +149,8 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
         for m in pattern.finditer(text):
             original = m.group(0)
-            # Acronym filtering: skip if it's actually a Turkish word
             if ttype in ("ACRONYM", "ACRONYM_APO"):
-                # Extract the uppercase base (before apostrophe for APO)
                 if ttype == "ACRONYM_APO":
                     apo = original.find("'")
                     if apo == -1:
@@ -200,28 +176,38 @@ def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
     return result
-def _split_apostrophe_suffixes(suffix_str: str) -> list[dict]:
-    """Split a suffix string (after apostrophe) into individual SUFFIX tokens."""
-    tokens: list[dict] = []
     remaining = suffix_str.lower()
     while remaining:
         matched = False
-        for s in _NUM_SUFFIXES:
             if remaining.startswith(s):
-                tokens.append({"token": s, "type": "SUFFIX", "_apo_suffix": True})
                 remaining = remaining[len(s):]
                 matched = True
                 break
         if not matched:
-            tokens.append({"token": remaining, "type": "SUFFIX", "_apo_suffix": True})
             break
-    return tokens
-def make_special_tokens(span_type: str, original: str) -> list[dict]:
     """Create token dict(s) for a matched special span.
-    ``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX tokens.
     """
     # ── Number + apostrophe + suffix (3'te, 1990'larda) ──────────────────
     if span_type == "NUM_APO":
@@ -229,10 +215,16 @@ def make_special_tokens(span_type: str, original: str) -> list[dict]:
         if apo_pos == -1:
             apo_pos = original.find("\u2019")
         num_part = original[:apo_pos]
-        return [
-            {"token": f" {num_part}", "type": "NUM", "_num": True},
-            *_split_apostrophe_suffixes(original[apo_pos + 1:]),
         ]
     # ── Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ──────────────
     if span_type == "ACRONYM_APO":
@@ -240,43 +232,59 @@ def make_special_tokens(span_type: str, original: str) -> list[dict]:
         if apo_pos == -1:
             apo_pos = original.find("\u2019")
         acr_part = original[:apo_pos]
-        return [
-            {"token": f" {acr_part}", "type": "ACRONYM", "_acronym": True},
-            *_split_apostrophe_suffixes(original[apo_pos + 1:]),
         ]
     # ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
     if span_type == "ACRONYM":
-        return [{"token": f" {original}", "type": "ACRONYM", "_acronym": True}]
     # ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
     return [{
         "token": f" {original}",
-        "type": span_type,
         f"_{span_type.lower()}": True,
     }]
-# ── Safety-net post-pass ─────────────────────────────────────────────────────
-def reclassify_numbers_in_tokens(tokens: list[dict]) -> list[dict]:
-    """Catch remaining number/unit tokens missed by span detection."""
-    result: list[dict] = []
     for tok in tokens:
-        if tok["type"] not in ("BPE", "ROOT"):
             result.append(tok)
             continue
-        raw = tok["token"].strip()
         if NUMBER_RE.fullmatch(raw):
-            result.append({**tok, "type": "NUM", "_num": True})
-        elif raw.lower() in UNITS and tok["type"] == "BPE":
-            result.append({**tok, "type": "UNIT", "_unit": True})
-        elif raw.lower() in ROMAN_NUMERALS and tok["type"] == "BPE":
-            result.append({**tok, "type": "NUM", "_roman": True})
-        elif raw.lower() in MONTH_NAMES and tok["type"] == "BPE":
-            result.append({**tok, "type": "ROOT", "_month": True})
         else:
             result.append(tok)

+"""Special span detection: URLs, numbers, dates, mentions, hashtags, emojis, acronyms.
+Detects non-textual spans in the input text **before** the word-level
+segmentation runs, so they are never mistakenly split by suffix
+stripping.  Returns a sorted, non-overlapping list of spans.
 """
 from __future__ import annotations
 import re
+from ._acronym_table import ACRONYM_EXPANSIONS
+from ._suffix_table import APOSTROPHE_SUFFIXES
+from .normalization import turkish_lower
+from .resources import load_proper_nouns, load_tdk_words
+# ── Static vocabulary sets ───────────────────────────────────────────────────
+MONTH_NAMES: frozenset[str] = frozenset({
+    "ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
+    "temmuz", "ağustos", "eylül", "ekim", "kasım", "aralık",
+    "january", "february", "march", "april", "may", "june",
+    "july", "august", "september", "october", "november", "december",
+})
+UNITS: frozenset[str] = frozenset({
+    "km", "m", "cm", "mm", "nm",
+    "kg", "g", "mg", "ton",
+    "sn", "dk", "sa", "ms",
+    "tl", "usd", "eur", "gbp",
+    "kb", "mb", "gb", "tb", "pb",
+    "ml", "mcg", "meq", "iu", "mmhg", "mosm",
+    "hz", "mhz", "ghz", "watt", "kw", "mw", "kcal", "cal",
+})
+ROMAN_NUMERALS: frozenset[str] = frozenset({
+    "i", "ii", "iii", "iv", "vi", "vii", "viii", "ix",
+    "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx",
+})
+# ── Regex patterns ───────────────────────────────────────────────────────────
+URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)
+MENTION_RE = re.compile(r"@[\w\u00C0-\u024F]+")
+HASHTAG_RE = re.compile(r"#[\w\u00C0-\u024F]+")
+_SUFFIX_ALT = "|".join(re.escape(s) for s in APOSTROPHE_SUFFIXES)
+# Number (including time/decimal forms such as 14:30 or 2,5) + apostrophe + Turkish suffix(es)
 NUM_APOSTROPHE_RE = re.compile(
     r"\d+(?:[.:,]\d+)*['\u2019](?:" + _SUFFIX_ALT + r")+\b",
     re.IGNORECASE,
 )
+DATE_RE = re.compile(
+    r"\d{1,2}[./\-]\d{1,2}[./\-]\d{2,4}"
+    r"|\d{4}[./\-]\d{1,2}[./\-]\d{1,2}"
 )
+CURRENCY_RE = re.compile(r"[$€£¥₺₽]\d+[\.,]?\d*|\d+[\.,]?\d*[$€£¥₺₽]")
+NUMBER_RE = re.compile(
+    r"%\d+[\.,]?\d*"
+    r"|\d{1,3}(?:\.\d{3})+"     # thousands (1.000.000)
+    r"|\d+[\.,]\d+"             # decimal
+    r"|\d+%"
+    r"|\d+/\d+"
 )
+TIME_RE = re.compile(r"\d{1,2}:\d{2}(?::\d{2})?")
+PLAIN_NUM_RE = re.compile(r"\b\d+\b")
+# Acronyms: standalone run of 2+ uppercase letters with optional trailing
+# digits (HTML5, MP3), or a single uppercase letter followed by digits (F16)
 ACRONYM_RE = re.compile(
     r"\b[A-ZÇĞİÖŞÜ]{2,}[0-9]*\b"
     r"|\b[A-ZÇĞİÖŞÜ][0-9]+\b"
 )
+# Acronym + apostrophe + Turkish suffix(es)
 ACRONYM_APOSTROPHE_RE = re.compile(
     r"\b(?:[A-ZÇĞİÖŞÜ]{2,}[0-9]*|[A-ZÇĞİÖŞÜ][0-9]+)['\u2019](?:"
     + _SUFFIX_ALT + r")+\b"
 )
+TEXT_EMOJI_RE = re.compile(r"[:;=]-?[\)\(\]\[dDpPoO3]|<3")
 UNICODE_EMOJI_RE = re.compile(
     "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
     "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF"
     flags=re.UNICODE,
 )
+# Priority order: earlier entries win when spans overlap
+_SPAN_PATTERNS: list[tuple[re.Pattern[str], str]] = [
+    (URL_RE,                "URL"),
+    (MENTION_RE,            "MENTION"),
+    (HASHTAG_RE,            "HASHTAG"),
+    (DATE_RE,               "DATE"),
+    (CURRENCY_RE,           "UNIT"),
+    (NUM_APOSTROPHE_RE,     "NUM_APO"),
+    (ACRONYM_APOSTROPHE_RE, "ACRONYM_APO"),
+    (ACRONYM_RE,            "ACRONYM"),
+    (NUMBER_RE,             "NUM"),
+    (TIME_RE,               "NUM"),
+    (PLAIN_NUM_RE,          "NUM"),
+    (UNICODE_EMOJI_RE,      "EMOJI"),
     (TEXT_EMOJI_RE,          "EMOJI"),
 ]
 # ── Acronym vs Turkish word disambiguation ───────────────────────────────────
 def _is_known_turkish_word(word_upper: str) -> bool:
+    """Return True if *word_upper* (ALL CAPS) is actually a Turkish word.
+    Checks, in order:
     1. ACRONYM_EXPANSIONS dict → always acronym (return False)
+    2. Same dict after stripping trailing digits (HTML5 → HTML) → acronym
+       (return False)
+    3. TDK dictionary → Turkish word (return True)
+    4. Proper nouns → Turkish word (return True)
+    5. Otherwise → treat as acronym (return False)
+    Steps 3 and 4 look up the ``turkish_lower``-ed form of the whole word,
+    trailing digits included.
     """
     # Known acronyms always win
     if word_upper in ACRONYM_EXPANSIONS:
         return False
+    # Retry the acronym lookup without trailing digits; skipped when nothing
+    # was stripped or when the word consisted of digits only.
     base = word_upper.rstrip("0123456789")
     if base and base != word_upper and base in ACRONYM_EXPANSIONS:
         return False
+    wl = turkish_lower(word_upper)
     tdk = load_tdk_words()
+    # A falsy (e.g. empty) TDK word list is skipped instead of being consulted.
     if tdk and wl in tdk:
         return True
+    if wl in load_proper_nouns():
         return True
     return False
+# ── Public API ───────────────────────────────────────────────────────────────
 def find_special_spans(text: str) -> list[tuple[int, int, str, str]]:
     """Find all special-token spans in *text*.
         for m in pattern.finditer(text):
             original = m.group(0)
+            # Acronym filtering: skip if it's actually a common Turkish word
             if ttype in ("ACRONYM", "ACRONYM_APO"):
                 if ttype == "ACRONYM_APO":
                     apo = original.find("'")
                     if apo == -1:
     return result
+def split_apostrophe_suffixes(suffix_str: str) -> list[tuple[str, str]]:
+    """Split a suffix string (after apostrophe) into individual suffix pieces.
+    Returns a list of ``(surface_form, label)`` tuples.
+    """
+    from ._suffix_table import SUFFIX_MAP  # avoid circular at module level
+    pieces: list[tuple[str, str]] = []
     remaining = suffix_str.lower()
     while remaining:
         matched = False
+        for s in APOSTROPHE_SUFFIXES:
             if remaining.startswith(s):
+                label = SUFFIX_MAP.get(s, "-SFX")
+                pieces.append((s, label))
                 remaining = remaining[len(s):]
                 matched = True
                 break
         if not matched:
+            # Unrecognised remainder → emit as a single suffix chunk
+            pieces.append((remaining, "-SFX"))
             break
+    return pieces
+def make_special_tokens(
+    span_type: str, original: str
+) -> list[dict[str, object]]:
     """Create token dict(s) for a matched special span.
+    ``NUM_APO`` and ``ACRONYM_APO`` spans are split into base + SUFFIX
+    tokens.
     """
     # ── Number + apostrophe + suffix (3'te, 1990'larda) ──────────────────
     if span_type == "NUM_APO":
         if apo_pos == -1:
             apo_pos = original.find("\u2019")
         num_part = original[:apo_pos]
+        suffix_pieces = split_apostrophe_suffixes(original[apo_pos + 1:])
+        result: list[dict[str, object]] = [
+            {"token": f" {num_part}", "token_type": "NUM", "morph_pos": 0, "_num": True},
         ]
+        for idx, (surf, label) in enumerate(suffix_pieces, start=1):
+            result.append({
+                "token": surf, "token_type": "SUFFIX", "morph_pos": idx,
+                "_apo_suffix": True, "_suffix_label": label,
+            })
+        return result
     # ── Acronym + apostrophe + suffix (NATO'nun, HTML5'ten) ──────────────
     if span_type == "ACRONYM_APO":
         if apo_pos == -1:
             apo_pos = original.find("\u2019")
         acr_part = original[:apo_pos]
+        suffix_pieces = split_apostrophe_suffixes(original[apo_pos + 1:])
+        expansion = ACRONYM_EXPANSIONS.get(acr_part.upper())
+        meta: dict[str, object] = {"_acronym": True}
+        if expansion:
+            meta["_expansion"] = expansion
+            meta["_known_acronym"] = True
+        result = [
+            {"token": f" {acr_part}", "token_type": "ACRONYM", "morph_pos": 0, **meta},
         ]
+        for idx, (surf, label) in enumerate(suffix_pieces, start=1):
+            result.append({
+                "token": surf, "token_type": "SUFFIX", "morph_pos": idx,
+                "_apo_suffix": True, "_suffix_label": label,
+            })
+        return result
     # ── Plain acronym (HTML5, GPT) ──────────────────────────────────────
     if span_type == "ACRONYM":
+        expansion = ACRONYM_EXPANSIONS.get(original.upper())
+        meta = {"_acronym": True}
+        if expansion:
+            meta["_expansion"] = expansion
+            meta["_known_acronym"] = True
+        return [{"token": f" {original}", "token_type": "ACRONYM", "morph_pos": 0, **meta}]
     # ── Everything else (NUM, DATE, URL, MENTION, HASHTAG, EMOJI, UNIT) ──
     return [{
         "token": f" {original}",
+        "token_type": span_type,
+        "morph_pos": 0,
         f"_{span_type.lower()}": True,
     }]
+def reclassify_numbers_in_tokens(tokens: list[dict[str, object]]) -> list[dict[str, object]]:
+    """Post-pass: catch remaining numbers / units missed by span detection."""
+    result: list[dict[str, object]] = []
     for tok in tokens:
+        tt = tok["token_type"]
+        if tt not in ("ROOT", "FOREIGN"):
             result.append(tok)
             continue
+        raw = str(tok["token"]).strip()
         if NUMBER_RE.fullmatch(raw):
+            result.append({**tok, "token_type": "NUM", "_num": True})
+        elif raw.lower() in UNITS:
+            result.append({**tok, "token_type": "UNIT", "_unit": True})
+        elif raw.lower() in ROMAN_NUMERALS:
+            result.append({**tok, "token_type": "NUM", "_roman": True})
+        elif raw.lower() in MONTH_NAMES:
+            result.append({**tok, "token_type": "ROOT", "_month": True})
         else:
             result.append(tok)

nedo_turkish_tokenizer/tokenizer.py CHANGED Viewed

@@ -1,24 +1,22 @@
-"""
-NedoTurkishTokenizer — production-ready Turkish morphological tokenizer.
-Applies 12 sequential fixes on top of the base turkish-tokenizer:
-  1.  ALL CAPS inflation fix
-  2.  Apostrophe / code-switching split
-  3.  BPE→SUFFIX reclassification
-  4.  Zemberek root validation & correction
-  5.  Punctuation → PUNCT type
-  6.  Domain vocabulary (medical / sports / tourism)
-  7.  TDK-based FOREIGN word detection
-  8.  Special token normalization (NUM, DATE, URL, MENTION, HASHTAG, EMOJI)
-  9.  Allomorph canonicalization
-  10. Compound word decomposition
-  11. Acronym expansion
-  12. Context-aware Zemberek disambiguation
 Output fields per token:
     token       : str  — token string (leading space = word-initial)
-    token_type  : str  — ROOT | SUFFIX | FOREIGN | BPE | PUNCT |
-                         NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI
     morph_pos   : int  — 0=root/word-initial, 1=first suffix, 2=second suffix…
     (+ optional _* metadata fields)
 """
@@ -26,48 +24,14 @@ Output fields per token:
 from __future__ import annotations
 import os
-import re
 import multiprocessing
 from concurrent.futures import ProcessPoolExecutor, as_completed
-from pathlib import Path
-from ._preprocessor import preprocess, postprocess
-from ._suffix_expander import reclassify_bpe_suffixes
-from ._root_validator import validate_roots, ZEMBEREK_AVAILABLE
-from ._medical_vocab import ALL_DOMAIN_ROOTS
-from ._tdk_vocab import reclassify_foreign_words
-from ._normalizer import (
-    find_special_spans,
-    make_special_tokens,
-    reclassify_numbers_in_tokens,
-)
-from ._allomorph import add_canonical_labels
-from ._compound import add_compound_info
-from ._acronym_dict import reclassify_acronyms
-from ._context_aware import annotate_with_context
-try:
-    from ._root_validator import _morphology as _zemb_morphology
-except Exception:
-    _zemb_morphology = None
-_DOMAIN_ROOTS_LOWER = {k.lower() for k in ALL_DOMAIN_ROOTS}
-# ── Token types ───────────────────────────────────────────────────────────────
-_SPECIAL_TYPES = frozenset(
-    ("NUM", "DATE", "UNIT", "URL", "MENTION", "HASHTAG", "EMOJI", "ACRONYM")
-)
-_TYPE_SYM = {
-    "ROOT": "R", "SUFFIX": "S", "FOREIGN": "F", "BPE": "B", "PUNCT": "P",
-    "NUM": "N", "DATE": "D", "UNIT": "U",
-    "URL": "L", "MENTION": "@", "HASHTAG": "#", "EMOJI": "E", "ACRONYM": "A",
-}
-# ── Parallel worker helpers ───────────────────────────────────────────────────
 _worker_tok: "NedoTurkishTokenizer | None" = None
@@ -84,9 +48,12 @@ def _tokenize_one(text: str) -> list[dict]:
 # ══════════════════════════════════════════════════════════════════════════════
 class NedoTurkishTokenizer:
-    """
-    Turkish morphological tokenizer with HuggingFace-compatible interface.
     Example::
@@ -99,83 +66,22 @@ class NedoTurkishTokenizer:
     """
     def __init__(self) -> None:
-        from turkish_tokenizer import TurkishTokenizer  # noqa: PLC0415
-        self._base = TurkishTokenizer()
-        self.zemberek_available = ZEMBEREK_AVAILABLE
-    # ── Public API ────────────────────────────────────────────────────────────
     def __call__(self, text: str) -> list[dict]:
         return self.tokenize(text)
     def tokenize(self, text: str) -> list[dict]:
         """Tokenize a single text string.
-        Returns a list of token dicts, each with:
-            ``token``, ``token_type``, ``morph_pos``, and optional ``_*`` fields.
         """
-        # Fix 8: detect special tokens (NUM, DATE, URL, MENTION, HASHTAG, …)
-        # and split text into segments so they never enter the base tokenizer.
-        spans = find_special_spans(text)
-        tokens: list[dict] = []
-        pos = 0
-        for start, end, ttype, original in spans:
-            # Tokenize normal text before this special span
-            if pos < start:
-                segment = text[pos:start]
-                if segment.strip():
-                    seg_proc, caps, apo = preprocess(segment)
-                    seg_raw = self._base.tokenize_text(seg_proc)
-                    seg_tokens = postprocess(seg_raw, caps, apo)
-                    tokens.extend(seg_tokens)
-            # Insert the special token(s) directly
-            tokens.extend(make_special_tokens(ttype, original))
-            pos = end
-        # Tokenize remaining text after the last special span
-        if pos < len(text):
-            segment = text[pos:]
-            if segment.strip():
-                seg_proc, caps, apo = preprocess(segment)
-                seg_raw = self._base.tokenize_text(seg_proc)
-                seg_tokens = postprocess(seg_raw, caps, apo)
-                tokens.extend(seg_tokens)
-        # Fix 3 + 5: BPE→SUFFIX reclassification + PUNCT
-        tokens = reclassify_bpe_suffixes(tokens)
-        # Fix 8b: remaining numbers / units
-        tokens = reclassify_numbers_in_tokens(tokens)
-        # Fix 6: domain vocabulary (medical / sports / tourism)
-        tokens = _reclassify_domain_roots(tokens, _DOMAIN_ROOTS_LOWER)
-        # Fix 7: TDK FOREIGN detection
-        tokens = reclassify_foreign_words(tokens)
-        # Fix 11: acronym expansions
-        tokens = reclassify_acronyms(tokens)
-        # Fix 9: allomorph canonical labels
-        tokens = add_canonical_labels(tokens)
-        # Fix 10: compound word annotation
-        tokens = add_compound_info(tokens, morphology=_zemb_morphology)
-        # Fix 12: context-aware Zemberek disambiguation
-        tokens = annotate_with_context(tokens, text)
-        # Fix 4: Zemberek root validation & correction
-        tokens = validate_roots(tokens, text.split(), base_tokenizer=self._base)
-        # Add public output fields
-        tokens = _add_output_fields(tokens)
-        return tokens
     def batch_tokenize(
         self,
@@ -187,11 +93,11 @@ class NedoTurkishTokenizer:
         Args:
             texts: List of strings to tokenize.
-            workers: Number of worker processes (None = all CPUs).
-            chunk_size: Below this count, run sequentially to avoid overhead.
         Returns:
-            List of token lists, in the same order as ``texts``.
         """
         if not texts:
             return []
@@ -209,126 +115,30 @@ class NedoTurkishTokenizer:
                 i = futs[fut]
                 try:
                     results[i] = fut.result()
-                except Exception as exc:  # noqa: BLE001
-                    results[i] = self._base.tokenize_text(texts[i])
                     print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}")
         return results  # type: ignore[return-value]
-    # ── HuggingFace-style helpers ─────────────────────────────────────────────
-    @classmethod
-    def from_pretrained(cls, _model_id: str = "Ethosoft/NedoTurkishTokenizer") -> "NedoTurkishTokenizer":
-        """Load tokenizer (rules-based, no weights to download)."""
-        return cls()
-    def save_pretrained(self, save_directory: str) -> None:
-        """Save tokenizer config to a directory (for HF Hub compatibility)."""
-        import json
-        path = Path(save_directory)
-        path.mkdir(parents=True, exist_ok=True)
-        config = {
-            "tokenizer_class": "NedoTurkishTokenizer",
-            "model_type": "nedo-turkish-tokenizer",
-            "version": "1.0.0",
-            "zemberek_available": self.zemberek_available,
-        }
-        (path / "tokenizer_config.json").write_text(
-            json.dumps(config, ensure_ascii=False, indent=2), encoding="utf-8"
-        )
-    # ── Morphological Lattice API ─────────────────────────────────────────────
-    def get_morphological_lattice(self, word: str) -> list[dict]:
-        """Return all possible morphological analyses for *word* as a lattice.
-        Each entry in the returned list is a dict with:
-            ``root``          – the lemma / root form
-            ``suffixes``      – list of surface-form suffixes
-            ``pos``           – abbreviated POS tag (Noun, Verb, Adj, …)
-            ``lexical_form``  – full lexical representation from Zemberek
-        Returns an **empty list** when Zemberek cannot analyse the word
-        (unknown word) or when Zemberek is not available.
-        """
-        if _zemb_morphology is None:
-            return []
-        try:
-            word_analysis = _zemb_morphology.analyze(word)
-            lattice: list[dict] = []
-            for sa in word_analysis:
-                try:
-                    root = str(sa.item.lemma)
-                    pos = str(sa.item.primary_pos.short_form)
-                    lexical_form = str(sa.format_string())
-                    # Build suffix list from morpheme chain (skip the root morpheme)
-                    morphemes = list(sa.get_morphemes())
-                    suffixes = [str(m) for m in morphemes[1:]] if len(morphemes) > 1 else []
-                    lattice.append({
-                        "root": root,
-                        "suffixes": suffixes,
-                        "pos": pos,
-                        "lexical_form": lexical_form,
-                    })
-                except Exception:  # noqa: BLE001
-                    continue
-            return lattice
-        except Exception:  # noqa: BLE001
-            return []
-    def tokenize_lattice(self, text: str) -> dict:
-        """Tokenize *text* and return a morphological lattice for every word.
-        Returns a dict with:
-            ``input``   – the original text
-            ``words``   – list of per-word dicts, each containing
-                          ``word`` (str) and ``lattice`` (list of analyses)
-        Example::
-            tok  = NedoTurkishTokenizer()
-            data = tok.tokenize_lattice("Evin güzel gelir")
-            for w in data["words"]:
-                print(w["word"], "→", len(w["lattice"]), "analysis(es)")
-        """
-        # Split text on whitespace, respecting punctuation
-        words = re.findall(r"\S+", text)
-        result_words: list[dict] = []
-        for w in words:
-            lattice = self.get_morphological_lattice(w)
-            result_words.append({
-                "word": w,
-                "lattice": lattice,
-            })
-        return {
-            "input": text,
-            "words": result_words,
-        }
-    # ── Utility ───────────────────────────────────────────────────────────────
     def stats(self, tokens: list[dict]) -> dict:
         """Compute morphological coverage statistics for a token list."""
         total = len(tokens)
         if total == 0:
-            return {k: 0 for k in ("total", "roots", "suffixes", "foreign",
-                                    "bpe", "punct", "special", "tr_pct", "pure_pct")}
-        roots    = sum(1 for t in tokens if t["token_type"] == "ROOT")
         suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
-        foreign  = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
-        punct    = sum(1 for t in tokens if t["token_type"] == "PUNCT")
-        bpe      = sum(1 for t in tokens if t["token_type"] == "BPE")
-        special  = sum(1 for t in tokens if t["token_type"] in _SPECIAL_TYPES)
-        tr       = roots + suffixes + foreign + punct + special
-        pure     = sum(
             1 for t in tokens
             if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
             and not t["token"].strip().startswith("<")
@@ -338,66 +148,8 @@ class NedoTurkishTokenizer:
             "roots":    roots,
             "suffixes": suffixes,
             "foreign":  foreign,
-            "bpe":      bpe,
             "punct":    punct,
             "special":  special,
             "tr_pct":   round(tr / total * 100, 2),
             "pure_pct": round(pure / total * 100, 2),
         }
-# ── Internal helpers ──────────────────────────────────────────────────────────
-def _reclassify_domain_roots(tokens: list[dict], domain_lower: set) -> list[dict]:
-    result = []
-    for tok in tokens:
-        if tok["type"] != "BPE":
-            result.append(tok)
-            continue
-        raw = tok["token"]
-        if raw == raw.lstrip():   # no leading space → not word-initial
-            result.append(tok)
-            continue
-        if raw.lstrip().lower() in domain_lower:
-            result.append({**tok, "type": "ROOT", "_domain": True})
-        else:
-            result.append(tok)
-    return result
-def _add_output_fields(tokens: list[dict]) -> list[dict]:
-    """Compute token_type and morph_pos and add them to every token."""
-    result = []
-    word_pos = 0
-    for tok in tokens:
-        raw = tok["token"]
-        base_type = tok["type"]
-        stripped = raw.strip()
-        # ── token_type: FOREIGN for foreign ROOTs ─────────────────────────
-        if base_type == "ROOT" and tok.get("_foreign"):
-            token_type = "FOREIGN"
-        else:
-            token_type = base_type
-        # ── morph_pos ─────────────────────────────────────────────────────
-        is_word_start = raw.startswith(" ") or stripped.startswith("<")
-        # Apostrophe suffixes are word-initial in text but continue the word
-        if tok.get("_apo_suffix"):
-            is_word_start = False
-        if is_word_start or base_type in _SPECIAL_TYPES or base_type == "PUNCT":
-            word_pos = 0
-            morph_pos = 0
-        elif base_type == "SUFFIX":
-            word_pos += 1
-            morph_pos = word_pos
-        else:
-            # ROOT or BPE within a word (no leading space)
-            word_pos = 0
-            morph_pos = 0
-        result.append({**tok, "token_type": token_type, "morph_pos": morph_pos})
-    return result

+"""NedoTurkishTokenizer — self-contained Turkish morphological tokenizer.
+A zero-dependency Turkish tokenizer that segments text into
+morphologically meaningful tokens using deterministic heuristics,
+a bundled TDK dictionary, and a candidate-based segmentation engine.
+Usage::
+    from nedo_turkish_tokenizer import NedoTurkishTokenizer
+    tok = NedoTurkishTokenizer()
+    tokens = tok.tokenize("İstanbul'da meeting'e katılamadım")
+    for t in tokens:
+        print(t["token"], t["token_type"], t["morph_pos"])
 Output fields per token:
     token       : str  — token string, no leading whitespace (word-initial tokens have morph_pos == 0)
+    token_type  : str  — ROOT | SUFFIX | FOREIGN | PUNCT |
+                         NUM | DATE | UNIT | URL | MENTION | HASHTAG | EMOJI | ACRONYM
     morph_pos   : int  — 0=root/word-initial, 1=first suffix, 2=second suffix…
     (+ optional _* metadata fields)
 """
 from __future__ import annotations
 import os
 import multiprocessing
 from concurrent.futures import ProcessPoolExecutor, as_completed
+from .engine import TokenizationEngine
+from .types import SPECIAL_TYPES
+# ── Parallel worker helpers ──────────────────────────────────────────────────
 _worker_tok: "NedoTurkishTokenizer | None" = None
 # ══════════════════════════════════════════════════════════════════════════════
 class NedoTurkishTokenizer:
+    """Self-contained Turkish morphological tokenizer.
+    Requires **no external dependencies** — all tokenization logic,
+    dictionaries, and heuristics are bundled within the package.
     Example::
     """
     def __init__(self) -> None:
+        self._engine = TokenizationEngine()
+    # ── Public API ─────────────────────────────────────────────────────────
     def __call__(self, text: str) -> list[dict]:
+        """Shorthand for ``tokenize(text)``."""
         return self.tokenize(text)
     def tokenize(self, text: str) -> list[dict]:
         """Tokenize a single text string.
+        Returns a list of token dicts, each containing at minimum:
+        ``token``, ``token_type``, ``morph_pos``, plus optional
+        ``_*`` metadata fields.
         """
+        return self._engine.tokenize(text)
     def batch_tokenize(
         self,
         Args:
             texts: List of strings to tokenize.
+            workers: Number of worker processes (``None`` = all CPUs).
+            chunk_size: Below this count, run sequentially.
         Returns:
+            List of token lists, in the same order as *texts*.
         """
         if not texts:
             return []
                 i = futs[fut]
                 try:
                     results[i] = fut.result()
+                except Exception as exc:
+                    # Fallback: tokenize in the main process
+                    results[i] = self.tokenize(texts[i])
                     print(f"[NedoTurkishTokenizer] fallback at idx={i}: {exc}")
         return results  # type: ignore[return-value]
+    # ── Statistics ─────────────────────────────────────────────────────────
     def stats(self, tokens: list[dict]) -> dict:
         """Compute morphological coverage statistics for a token list."""
         total = len(tokens)
         if total == 0:
+            return {k: 0 for k in (
+                "total", "roots", "suffixes", "foreign",
+                "punct", "special", "tr_pct", "pure_pct",
+            )}
+        roots = sum(1 for t in tokens if t["token_type"] == "ROOT")
         suffixes = sum(1 for t in tokens if t["token_type"] == "SUFFIX")
+        foreign = sum(1 for t in tokens if t["token_type"] == "FOREIGN")
+        punct = sum(1 for t in tokens if t["token_type"] == "PUNCT")
+        special = sum(1 for t in tokens if t["token_type"] in SPECIAL_TYPES)
+        tr = roots + suffixes + foreign + punct + special
+        pure = sum(
             1 for t in tokens
             if t["token_type"] in ("ROOT", "SUFFIX", "FOREIGN")
             and not t["token"].strip().startswith("<")
             "roots":    roots,
             "suffixes": suffixes,
             "foreign":  foreign,
             "punct":    punct,
             "special":  special,
             "tr_pct":   round(tr / total * 100, 2),
             "pure_pct": round(pure / total * 100, 2),
         }

nedo_turkish_tokenizer/types.py ADDED Viewed

	@@ -0,0 +1,109 @@

+"""Core type definitions for NedoTurkishTokenizer.
+Defines the Token dataclass, SegmentationCandidate for the candidate-based
+segmentation engine, token type constants, and punctuation character sets.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any
+# ── Token type constants ──────────────────────────────────────────────────────
+ROOT = "ROOT"
+SUFFIX = "SUFFIX"
+FOREIGN = "FOREIGN"
+PUNCT = "PUNCT"
+NUM = "NUM"
+DATE = "DATE"
+UNIT = "UNIT"
+URL = "URL"
+MENTION = "MENTION"
+HASHTAG = "HASHTAG"
+EMOJI = "EMOJI"
+ACRONYM = "ACRONYM"
+# Special token types that represent non-textual entities
+SPECIAL_TYPES: frozenset[str] = frozenset(
+    {NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM}
+)
+# All recognized token types
+ALL_TYPES: frozenset[str] = frozenset(
+    {ROOT, SUFFIX, FOREIGN, PUNCT, NUM, DATE, UNIT, URL, MENTION, HASHTAG, EMOJI, ACRONYM}
+)
+# ── Punctuation character set ────────────────────────────────────────────────
+PUNCT_CHARS: frozenset[str] = frozenset(
+    "'?.,;:!-\u2013\u2014()[]{}\"`/\\|@#$%^&*+=<>~"
+    "\u2019\u2018\u201c\u201d\u2032\u00ab\u00bb\u2039\u203a"
+    "\u2022\u2026\u00b7\u00b0\u00b1\u00d7\u00f7"
+)
+# Digits — used alongside PUNCT_CHARS for pure-punctuation detection
+_DIGITS: frozenset[str] = frozenset("0123456789")
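+def is_punct_token(text: str) -> bool:
+    """Return True if *text*, once stripped, is non-empty and every character is
+    a PUNCT_CHARS member, an ASCII digit, or a non-alphabetic code point above
+    U+02FF (e.g. emoji or other symbols).
+    """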
+    stripped = text.strip()
+    if not stripped:
+        return False
+    return all(
+        c in PUNCT_CHARS or c in _DIGITS or (ord(c) > 0x02FF and not c.isalpha())
+        for c in stripped
+    )
+# ── Token dataclass ──────────────────────────────────────────────────────────
+@dataclass
+class Token:
+    """Internal token representation.
+    *text* uses the leading-space convention: a space prefix indicates
+    that this token starts a new word.  Suffixes within a word have
+    no leading space.
+    The *metadata* dict carries optional annotation fields (all prefixed
+    with ``_``), for example ``_caps``, ``_foreign``, ``_canonical``.
+    """
+    text: str
+    token_type: str
+    morph_pos: int = 0
+    metadata: dict[str, Any] = field(default_factory=dict)
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to the public API dict format."""
+        result: dict[str, Any] = {
+            "token": self.text,
+            "token_type": self.token_type,
+            "morph_pos": self.morph_pos,
+        }
+        result.update(self.metadata)
+        return result
+# ── Segmentation candidate ───────────────────────────────────────────────────
+@dataclass
+class SegmentationCandidate:
+    """One possible way to segment a word into tokens.
+    The candidate-generation engine produces multiple candidates per word,
+    then the selection step picks the highest-scoring one.
+    *source* is a short human-readable tag describing the strategy that
+    produced this candidate (e.g. ``"tdk_root"``, ``"suffix_chain"``,
+    ``"foreign"``).
+    """
+    tokens: list[Token]
+    score: float
+    source: str

paper_baseline_check.py DELETED Viewed

@@ -1,106 +0,0 @@
-"""
-paper_baseline_check.py
------------------------
-Paper'ın %90.29 baseline'ını neden biz %75.57 olarak görüyoruz?
-İki metodolojinin farkını somut olarak gösterir.
-Kullanım:
-    cd NedoTurkishTokenizer/
-    python paper_baseline_check.py
-"""
-import os
-from huggingface_hub import login
-from datasets import load_dataset
-from turkish_tokenizer import TurkishTokenizer
-HF_TOKEN = os.environ.get("HF_TOKEN")
-if HF_TOKEN:
-    login(token=HF_TOKEN, add_to_git_credential=False)
-else:
-    print("HF_TOKEN not set; using existing Hugging Face login state if available.")
-base = TurkishTokenizer()
-print("TR-MMLU yükleniyor...")
-ds   = load_dataset("alibayram/turkish_mmlu", split="test", token=HF_TOKEN)
-rows = list(ds)
-print(f"{len(rows)} örnek\n")
-def orig_tr_pct(tokens):
-    total = len(tokens)
-    if total == 0: return 0.0
-    tr = sum(1 for t in tokens if t["type"] in ("ROOT","SUFFIX"))
-    return tr / total * 100
-# ── Metodoloji A: Sadece soru (paper'ın yaptığı) ─────────────────────────────
-a_sum = 0.0
-for row in rows:
-    text = str(row.get("soru") or row.get("question") or "")
-    if not text.strip(): continue
-    toks = base.tokenize_text(text)
-    a_sum += orig_tr_pct(toks)
-a_avg = a_sum / len(rows)
-# ── Metodoloji B: Soru + şıklar (bizim yaptığımız) ───────────────────────────
-b_sum = 0.0
-for row in rows:
-    parts = []
-    for f in ["soru","question"]:
-        if row.get(f):
-            parts.append(str(row[f]))
-            break
-    choices = row.get("secenekler") or []
-    if isinstance(choices, list):
-        parts.extend(str(c) for c in choices)
-    text = " ".join(parts)
-    if not text.strip(): continue
-    toks = base.tokenize_text(text)
-    b_sum += orig_tr_pct(toks)
-b_avg = b_sum / len(rows)
-# ── Metodoloji C: Soru + şıklar + açıklama (tam) ─────────────────────────────
-c_sum = 0.0
-for row in rows:
-    parts = []
-    for f in ["soru","question"]:
-        if row.get(f):
-            parts.append(str(row[f]))
-            break
-    choices = row.get("secenekler") or []
-    if isinstance(choices, list):
-        parts.extend(str(c) for c in choices)
-    if row.get("aciklama"):
-        parts.append(str(row["aciklama"]))
-    text = " ".join(parts)
-    if not text.strip(): continue
-    toks = base.tokenize_text(text)
-    c_sum += orig_tr_pct(toks)
-c_avg = c_sum / len(rows)
-# ── Şıklardaki içerik analizi ─────────────────────────────────────────────────
-# İlk 20 sorunun şıklarına bak
-print("İlk 20 sorunun şık örnekleri:")
-for row in rows[:20]:
-    choices = row.get("secenekler") or []
-    if isinstance(choices, list) and choices:
-        sample = " | ".join(str(c)[:20] for c in choices[:4])
-        print(f"  {sample}")
-print(f"""
-{'='*60}
-METODOLOJİ KARŞILAŞTIRMASI (TürkishTokenizer baseline)
-{'='*60}
-  A) Sadece soru alanı       (paper'ın yöntemi): {a_avg:.2f}%
-  B) Soru + şıklar           (kısmi):            {b_avg:.2f}%
-  C) Soru + şıklar + açıkl.  (bizim yöntemimiz): {c_avg:.2f}%
-  Paper değeri: 90.29%
-  Bizim A değeri: {a_avg:.2f}%  ← paper ile fark: {a_avg-90.29:+.2f}
-  Bizim C değeri: {c_avg:.2f}%  ← biz bunu kullanıyoruz
-  Sonuç: {c_avg:.2f}% vs {a_avg:.2f}% = {c_avg-a_avg:.2f} puan fark
-  Bu fark şıklardaki kısaltmalar ve yabancı terimlerden kaynaklanıyor.
-{'='*60}
-""")

pyproject.toml CHANGED Viewed

@@ -4,31 +4,27 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "nedo-turkish-tokenizer"
-version = "1.0.0"
-description = "Turkish morphological tokenizer — TR-MMLU world record %92"
 readme = "README.md"
 license = { text = "MIT" }
 authors = [{ name = "Ethosoft", email = "info@ethosoft.ai" }]
 requires-python = ">=3.10"
-keywords = ["turkish", "nlp", "tokenizer", "morphology", "huggingface"]
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
     "Natural Language :: Turkish",
 ]
-dependencies = [
-    "turkish-tokenizer>=0.1.0",
-    "zemberek-python>=0.2.3",
-    "requests>=2.28.0",
-]
 [project.optional-dependencies]
-dev = ["pytest", "huggingface_hub"]
 [project.urls]
-Homepage = "https://huggingface.co/Ethosoft/NedoTurkishTokenizer"
 Repository = "https://github.com/ethosoftai/NedoTurkishTokenizer"
 [tool.setuptools.packages.find]

 [project]
 name = "nedo-turkish-tokenizer"
+version = "2.0.0"
+description = "Self-contained Turkish morphological tokenizer with zero external dependencies"
 readme = "README.md"
 license = { text = "MIT" }
 authors = [{ name = "Ethosoft", email = "info@ethosoft.ai" }]
 requires-python = ">=3.10"
+keywords = ["turkish", "nlp", "tokenizer", "morphology", "segmentation"]
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Text Processing :: Linguistic",
     "Natural Language :: Turkish",
 ]
+dependencies = []
 [project.optional-dependencies]
+dev = ["pytest"]
 [project.urls]
 Repository = "https://github.com/ethosoftai/NedoTurkishTokenizer"
 [tool.setuptools.packages.find]

special_tokens_map.json DELETED Viewed

@@ -1,9 +0,0 @@
-{
-  "bos_token": "[BOS]",
-  "cls_token": "[CLS]",
-  "eos_token": "[EOS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
-}

test_lattice.py DELETED Viewed

@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-"""
-Test / demo script for the Morphological Lattice API.
-Shows how ambiguous Turkish words like "evin" and "gelir" produce
-multiple alternative analyses in the lattice.
-"""
-import json
-from nedo_turkish_tokenizer import NedoTurkishTokenizer
-def section(title: str) -> None:
-    print(f"\n{'═' * 60}")
-    print(f"  {title}")
-    print(f"{'═' * 60}")
-def main() -> None:
-    tok = NedoTurkishTokenizer()
-    # ── 1. Tek kelime lattice testi ──────────────────────────────────────
-    section("1) get_morphological_lattice — tek kelime örnekleri")
-    test_words = ["evin", "gelir", "yüz", "çıkar", "koşar"]
-    for word in test_words:
-        lattice = tok.get_morphological_lattice(word)
-        print(f"\n▸ \"{word}\" → {len(lattice)} analiz:")
-        for i, entry in enumerate(lattice):
-            print(f"    [{i}] root={entry['root']:<12} "
-                  f"pos={entry['pos']:<6} "
-                  f"suffixes={entry['suffixes']}")
-            print(f"         lexical_form = {entry['lexical_form']}")
-    # ── 2. Bilinmeyen kelime (boş lattice) ───────────────────────────────
-    section("2) Bilinmeyen / yabancı kelime → boş lattice")
-    unknown_words = ["xyzfoo", "meeting", "blockchain"]
-    for word in unknown_words:
-        lattice = tok.get_morphological_lattice(word)
-        print(f"  \"{word}\" → lattice boş mu? {len(lattice) == 0}  (len={len(lattice)})")
-    # ── 3. tokenize_lattice — cümle bazlı test ──────────────────────────
-    section("3) tokenize_lattice — cümle testi")
-    sentences = [
-        "Evin güzel gelir",
-        "Çocuk okula koşar adım gitti",
-        "Yüz yıllık çınar",
-    ]
-    for sent in sentences:
-        print(f"\n▸ Input: \"{sent}\"")
-        result = tok.tokenize_lattice(sent)
-        for winfo in result["words"]:
-            n = len(winfo["lattice"])
-            print(f"    {winfo['word']:<16} → {n} analiz(ler)")
-            for entry in winfo["lattice"]:
-                print(f"        root={entry['root']:<12} pos={entry['pos']:<6} "
-                      f"suffixes={entry['suffixes']}")
-    # ── 4. JSON çıktı formatı ────────────────────────────────────────────
-    section("4) tokenize_lattice JSON çıktı")
-    data = tok.tokenize_lattice("evin gelir")
-    print(json.dumps(data, ensure_ascii=False, indent=2))
-    print("\n✅ Tüm testler başarıyla tamamlandı.")
-if __name__ == "__main__":
-    main()

tests/test_tdk_vocab.py DELETED Viewed

@@ -1,31 +0,0 @@
-from __future__ import annotations
-import tempfile
-import unittest
-from pathlib import Path
-from unittest import mock
-from nedo_turkish_tokenizer import _tdk_vocab
-class TdkVocabTests(unittest.TestCase):
-    def setUp(self) -> None:
-        self._original_words = _tdk_vocab._TDK_WORDS
-        _tdk_vocab._TDK_WORDS = None
-    def tearDown(self) -> None:
-        _tdk_vocab._TDK_WORDS = self._original_words
-    def test_load_tdk_words_uses_bundled_file_before_network(self) -> None:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            cache_path = str(Path(tmpdir) / "tdk_words.txt")
-            with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
-                with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
-                    with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
-                        words = _tdk_vocab.load_tdk_words()
-        self.assertGreater(len(words), 50_000)
-        self.assertIn("zemberek", words)
-        download_hf.assert_not_called()
-        download_tdk.assert_not_called()

tests/test_tokenizer.py ADDED Viewed

	@@ -0,0 +1,457 @@

+"""Comprehensive regression test suite for NedoTurkishTokenizer.
+Tests the public API and core segmentation with gold-standard examples
+covering: basic Turkish, suffix chains, apostrophes, foreign words,
+acronyms, special spans, ALL CAPS, compound words, and edge cases.
+TOKEN FORMAT CONTRACT:
+    token text does NOT include leading whitespace.
+    Whether a token is word-initial is indicated by morph_pos == 0.
+"""
+from __future__ import annotations
+import unittest
+class TestTokenizerPublicAPI(unittest.TestCase):
+    """Smoke tests for the public API surface."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def test_import_and_instantiate(self) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        t = NedoTurkishTokenizer()
+        self.assertIsNotNone(t)
+    def test_version(self) -> None:
+        from nedo_turkish_tokenizer import __version__
+        self.assertEqual(__version__, "2.0.0")
+    def test_empty_input(self) -> None:
+        self.assertEqual(self.tok.tokenize(""), [])
+        self.assertEqual(self.tok.tokenize("   "), [])
+    def test_callable_shorthand(self) -> None:
+        result = self.tok("Merhaba")
+        self.assertTrue(len(result) > 0)
+    def test_token_dict_fields(self) -> None:
+        tokens = self.tok.tokenize("ev")
+        self.assertTrue(len(tokens) >= 1)
+        t = tokens[0]
+        self.assertIn("token", t)
+        self.assertIn("token_type", t)
+        self.assertIn("morph_pos", t)
+    def test_batch_tokenize(self) -> None:
+        texts = ["ev", "araba", "merhaba"]
+        results = self.tok.batch_tokenize(texts, chunk_size=1000)
+        self.assertEqual(len(results), 3)
+        for r in results:
+            self.assertIsInstance(r, list)
+            self.assertTrue(len(r) >= 1)
+    def test_stats(self) -> None:
+        tokens = self.tok.tokenize("evde oturuyorum")
+        stats = self.tok.stats(tokens)
+        self.assertIn("total", stats)
+        self.assertIn("roots", stats)
+        self.assertIn("suffixes", stats)
+        self.assertIn("tr_pct", stats)
+        self.assertGreater(stats["total"], 0)
+class TestTokenFormat(unittest.TestCase):
+    """Token text must NOT include leading whitespace."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def test_no_leading_space_root(self) -> None:
+        tokens = self.tok.tokenize("merhaba")
+        self.assertEqual(tokens[0]["token"], "merhaba")
+    def test_no_leading_space_suffix(self) -> None:
+        tokens = self.tok.tokenize("evde")
+        for t in tokens:
+            self.assertFalse(
+                t["token"].startswith(" "),
+                f"Token {t['token']!r} has a leading space",
+            )
+    def test_no_leading_space_url(self) -> None:
+        tokens = self.tok.tokenize("https://example.com")
+        self.assertEqual(tokens[0]["token"], "https://example.com")
+    def test_no_leading_space_num(self) -> None:
+        tokens = self.tok.tokenize("%85")
+        self.assertEqual(tokens[0]["token"], "%85")
+    def test_no_leading_space_any_token(self) -> None:
+        """No token in the output should ever start with a space."""
+        text = "İstanbul'da meeting'e katılamadım https://example.com %85"
+        tokens = self.tok.tokenize(text)
+        for t in tokens:
+            self.assertFalse(
+                t["token"].startswith(" "),
+                f"Token {t['token']!r} (type={t['token_type']}) has a leading space",
+            )
+class TestBasicTurkish(unittest.TestCase):
+    """Core Turkish morphology tokenization."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def _roots(self, text: str) -> list[str]:
+        return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "ROOT"]
+    def _types(self, text: str) -> list[str]:
+        return [t["token_type"] for t in self.tok.tokenize(text)]
+    def _suffixes(self, text: str) -> list[str]:
+        return [t["token"] for t in self.tok.tokenize(text) if t["token_type"] == "SUFFIX"]
+    # ── Single words ─────────────────────────────────────────────────────
+    def test_simple_root(self) -> None:
+        tokens = self.tok.tokenize("merhaba")
+        self.assertEqual(tokens[0]["token"], "merhaba")
+        self.assertEqual(tokens[0]["token_type"], "ROOT")
+    def test_whole_word_tdk_preserved(self) -> None:
+        """'dünya' is in TDK — must NOT be split into 'dün' + 'ya'."""
+        roots = self._roots("dünya")
+        self.assertIn("dünya", roots)
+    def test_suffix_loc(self) -> None:
+        tokens = self.tok.tokenize("evde")
+        self.assertEqual(tokens[0]["token"], "ev")
+        self.assertEqual(tokens[0]["token_type"], "ROOT")
+        self.assertEqual(tokens[1]["token"], "de")
+        self.assertEqual(tokens[1]["token_type"], "SUFFIX")
+    def test_suffix_plural_acc(self) -> None:
+        tokens = self.tok.tokenize("kitapları")
+        self.assertEqual(tokens[0]["token"], "kitap")
+        types = [t["token_type"] for t in tokens]
+        self.assertIn("SUFFIX", types)
+    def test_verb_stem_past(self) -> None:
+        """Verb stems derived from infinitives must be found."""
+        roots = self._roots("geldim")
+        self.assertIn("gel", roots)
+    def test_verb_stem_progressive(self) -> None:
+        roots = self._roots("geliyorum")
+        self.assertIn("gel", roots)
+    def test_verb_otur(self) -> None:
+        roots = self._roots("oturuyorum")
+        self.assertIn("otur", roots)
+    def test_katil_root(self) -> None:
+        roots = self._roots("katılamadım")
+        self.assertIn("katıl", roots)
+    def test_longer_root_wins(self) -> None:
+        """'toplantısı' should segment as 'toplantı' + 'sı', not 'toplan' + 'tı' + 'sı'."""
+        roots = self._roots("toplantısı")
+        self.assertIn("toplantı", roots)
+    def test_morph_pos_increments(self) -> None:
+        tokens = self.tok.tokenize("evlerden")
+        suffix_positions = [t["morph_pos"] for t in tokens if t["token_type"] == "SUFFIX"]
+        for i, pos in enumerate(suffix_positions):
+            self.assertGreater(pos, 0, f"Suffix at index {i} should have morph_pos > 0")
+class TestFalseSuffixSplits(unittest.TestCase):
+    """Regression tests: common words that must NOT be over-segmented.
+    These words look like root+suffix but are standalone units.
+    """
+    @classmethod
+    def setUpClass(cls) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def _assert_single_root(self, word: str) -> None:
+        """Assert that *word* tokenizes to exactly one token: a ROOT whose text equals *word*."""
+        tokens = self.tok.tokenize(word)
+        roots = [t for t in tokens if t["token_type"] == "ROOT"]
+        self.assertEqual(
+            len(roots), 1,
+            f"'{word}' should be a single ROOT, got: "
+            f"{[(t['token'], t['token_type']) for t in tokens]}",
+        )
+        self.assertEqual(len(tokens), 1, f"'{word}' should produce 1 token, got {len(tokens)}")
+        self.assertEqual(tokens[0]["token"], word)
+    # ── Forms of "demek" (to say) ────────────────────────────────────────
+    # Stem "de" is a TDK conjunction, causing false splits like de+di.
+    def test_dedi(self) -> None:
+        self._assert_single_root("dedi")
+    def test_dedim(self) -> None:
+        self._assert_single_root("dedim")
+    def test_demis(self) -> None:
+        self._assert_single_root("demiş")
+    def test_denir(self) -> None:
+        self._assert_single_root("denir")
+    def test_dese(self) -> None:
+        self._assert_single_root("dese")
+    # ── Discourse particles / conjunctions ───────────────────────────────
+    # These are in TDK and should be protected by WHOLE_WORD_BONUS.
+    def test_yani(self) -> None:
+        self._assert_single_root("yani")
+    def test_belki(self) -> None:
+        self._assert_single_root("belki")
+    def test_cunku(self) -> None:
+        self._assert_single_root("çünkü")
+    def test_sanki(self) -> None:
+        self._assert_single_root("sanki")
+    # ── "dedi mi" phrase ─────────────────────────────────────────────────
+    def test_dedi_mi(self) -> None:
+        tokens = self.tok.tokenize("dedi mi")
+        roots = [t for t in tokens if t["token_type"] == "ROOT"]
+        self.assertEqual(len(roots), 2, "Both 'dedi' and 'mi' should be roots")
+        root_texts = [t["token"] for t in roots]
+        self.assertIn("dedi", root_texts)
+    # ── TDK-protected words should never be split ────────────────────────
+    def test_bile(self) -> None:
+        self._assert_single_root("bile")
+    def test_daha(self) -> None:
+        self._assert_single_root("daha")
+class TestApostrophe(unittest.TestCase):
+    """Apostrophe handling for Turkish proper names and foreign stems."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def test_turkish_proper_name(self) -> None:
+        """İstanbul'da → ROOT + PUNCT(') + SUFFIX(da)."""
+        tokens = self.tok.tokenize("İstanbul'da")
+        types = [t["token_type"] for t in tokens]
+        self.assertIn("ROOT", types)
+        self.assertIn("PUNCT", types)
+        self.assertIn("SUFFIX", types)
+    def test_foreign_stem(self) -> None:
+        """meeting'e → FOREIGN + SUFFIX."""
+        tokens = self.tok.tokenize("meeting'e")
+        types = [t["token_type"] for t in tokens]
+        self.assertIn("FOREIGN", types)
+        self.assertIn("SUFFIX", types)
+    def test_apostrophe_suffix_label(self) -> None:
+        tokens = self.tok.tokenize("İstanbul'da")
+        suffix_tokens = [t for t in tokens if t["token_type"] == "SUFFIX"]
+        self.assertTrue(len(suffix_tokens) >= 1)
+        self.assertEqual(suffix_tokens[0].get("_suffix_label"), "-LOC")
+class TestSpecialSpans(unittest.TestCase):
+    """URL, date, number, acronym, emoji detection."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def _find_type(self, text: str, ttype: str) -> list[dict]:
+        return [t for t in self.tok.tokenize(text) if t["token_type"] == ttype]
+    def test_url_detection(self) -> None:
+        urls = self._find_type("https://example.com sitesine bak", "URL")
+        self.assertEqual(len(urls), 1)
+        self.assertIn("example.com", urls[0]["token"])
+    def test_date_detection(self) -> None:
+        dates = self._find_type("14.03.2026 tarihinde", "DATE")
+        self.assertEqual(len(dates), 1)
+    def test_number_detection(self) -> None:
+        nums = self._find_type("%85 başarı", "NUM")
+        self.assertEqual(len(nums), 1)
+    def test_acronym_detection(self) -> None:
+        tokens = self.tok.tokenize("NATO güçlü")
+        acr = [t for t in tokens if t["token_type"] == "ACRONYM"]
+        self.assertEqual(len(acr), 1)
+        self.assertTrue(acr[0].get("_expansion"))
+    def test_mention_detection(self) -> None:
+        mentions = self._find_type("@kullanici çok iyi", "MENTION")
+        self.assertEqual(len(mentions), 1)
+    def test_hashtag_detection(self) -> None:
+        tags = self._find_type("#türkiye çok güzel", "HASHTAG")
+        self.assertEqual(len(tags), 1)
+class TestAllCaps(unittest.TestCase):
+    """ALL CAPS word handling."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def test_caps_detected(self) -> None:
+        tokens = self.tok.tokenize("İSTANBUL güzel")
+        istanbul_tok = [t for t in tokens if "istanbul" in t["token"]]
+        self.assertTrue(len(istanbul_tok) >= 1)
+        self.assertTrue(istanbul_tok[0].get("_caps"))
+    def test_caps_lowered(self) -> None:
+        tokens = self.tok.tokenize("İSTANBUL")
+        self.assertEqual(tokens[0]["token"], "istanbul")
+    def test_caps_acronym(self) -> None:
+        """Known acronyms in ALL CAPS should be ACRONYM type."""
+        tokens = self.tok.tokenize("TBMM toplantısı")
+        tbmm = [t for t in tokens if t["token_type"] == "ACRONYM"]
+        self.assertTrue(len(tbmm) >= 1)
+class TestCanonicalLabels(unittest.TestCase):
+    """Allomorph canonicalization metadata."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        # The tokenizer is created once and reused by both tests.
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def _suffix_labels(self, word: str) -> list:
+        """Return the ``_canonical`` value of each SUFFIX token produced for *word*."""
+        pieces = self.tok.tokenize(word)
+        return [p.get("_canonical") for p in pieces if p["token_type"] == "SUFFIX"]
+    def test_loc_canonical(self) -> None:
+        """At least one SUFFIX token of ``evde`` is labelled ``LOC``."""
+        labels = self._suffix_labels("evde")
+        self.assertTrue(any(label == "LOC" for label in labels))
+    def test_pl_canonical(self) -> None:
+        """At least one SUFFIX token of ``evler`` is labelled ``PL``."""
+        labels = self._suffix_labels("evler")
+        self.assertTrue(any(label == "PL" for label in labels))
+class TestCompoundAnnotation(unittest.TestCase):
+    """Compound word detection."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        # One tokenizer instance is built per class and shared through ``cls.tok``.
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def test_known_compound(self) -> None:
+        """A whole-word ROOT ``başbakan`` must carry compound metadata.
+        The check only applies when the first ROOT token is the unsplit word.
+        Otherwise the test is reported as skipped instead of passing without
+        having asserted anything.
+        """
+        tokens = self.tok.tokenize("başbakan")
+        root = [t for t in tokens if t["token_type"] == "ROOT"]
+        if not root or root[0]["token"] != "başbakan":
+            self.skipTest(
+                "first ROOT token is not the unsplit word 'başbakan'; "
+                "compound annotation check not applicable"
+            )
+        self.assertTrue(root[0].get("_compound"))
+        self.assertIn("baş", root[0].get("_parts", []))
+class TestNoDependencies(unittest.TestCase):
+    """Verify no external runtime dependencies are imported."""
+    def test_no_external_imports(self) -> None:
+        """Statically scan the package sources for banned absolute imports.
+        Each ``*.py`` file under the package directory is parsed with ``ast``
+        and every ``import x`` / ``from x import y`` is checked by its
+        top-level module name. Relative imports are ignored because they
+        always resolve inside the package, so a local module that shares a
+        banned name is not an external dependency.
+        """
+        import ast
+        from pathlib import Path
+        pkg_dir = Path(__file__).parent.parent / "nedo_turkish_tokenizer"
+        banned = {"turkish_tokenizer", "zemberek", "requests", "transformers"}
+        # rglob also covers sub-packages; sorted() keeps the scan order stable.
+        py_files = sorted(pkg_dir.rglob("*.py"))
+        # Without this guard a wrong or empty directory would pass vacuously.
+        self.assertTrue(py_files, f"no Python sources found under {pkg_dir}")
+        for py_file in py_files:
+            # Equals the bare file name for files directly in the package.
+            shown = py_file.relative_to(pkg_dir)
+            tree = ast.parse(py_file.read_text(encoding="utf-8"))
+            for node in ast.walk(tree):
+                if isinstance(node, ast.Import):
+                    imported = [alias.name for alias in node.names]
+                elif isinstance(node, ast.ImportFrom) and not node.level and node.module:
+                    # level == 0 means an absolute ``from x import y``.
+                    imported = [node.module]
+                else:
+                    continue
+                for name in imported:
+                    self.assertNotIn(
+                        name.split(".")[0], banned,
+                        f"{shown} imports banned dependency: {name}"
+                    )
+class TestEdgeCases(unittest.TestCase):
+    """Edge cases and regression guards."""
+    @classmethod
+    def setUpClass(cls) -> None:
+        # A single tokenizer is shared by all tests through ``cls.tok``.
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        cls.tok = NedoTurkishTokenizer()
+    def test_punctuation_only(self) -> None:
+        """Every token produced for ``...`` is of type PUNCT."""
+        # NOTE(review): an empty token list also satisfies this check.
+        for entry in self.tok.tokenize("..."):
+            self.assertTrue(entry["token_type"] == "PUNCT")
+    def test_mixed_punctuation(self) -> None:
+        """Quoted text with commas yields both PUNCT and ROOT tokens."""
+        seen = [entry["token_type"] for entry in self.tok.tokenize('"Merhaba," dedi.')]
+        for expected in ("PUNCT", "ROOT"):
+            self.assertIn(expected, seen)
+    def test_unicode_normalized(self) -> None:
+        """Both words come out as ROOT tokens despite the extra spaces."""
+        # NOTE(review): the input exercises whitespace handling only.
+        root_texts = [
+            entry["token"]
+            for entry in self.tok.tokenize("  merhaba   dünya  ")
+            if entry["token_type"] == "ROOT"
+        ]
+        for word in ("merhaba", "dünya"):
+            self.assertIn(word, root_texts)
+    def test_single_char_word(self) -> None:
+        """A one-letter input produces at least one token."""
+        self.assertTrue(len(self.tok.tokenize("a")) >= 1)
+    def test_number_apostrophe_suffix(self) -> None:
+        """3'te, 1990'larda should be NUM + SUFFIX."""
+        # NOTE(review): only the presence of a NUM token is asserted here.
+        num_count = sum(
+            1 for entry in self.tok.tokenize("3'te geldim")
+            if entry["token_type"] == "NUM"
+        )
+        self.assertTrue(num_count >= 1)
+    def test_integration_full_sentence(self) -> None:
+        """Full integration test with mixed content."""
+        sentence = "İstanbul'da meeting'e katılamadım"
+        self.assertTrue(len(self.tok.tokenize(sentence)) > 0)
+        # Verify the critical acceptance criteria with a freshly built tokenizer.
+        from nedo_turkish_tokenizer import NedoTurkishTokenizer
+        fresh = NedoTurkishTokenizer()
+        result = fresh.tokenize(sentence)
+        self.assertIsInstance(result, list)
+        for item in result:
+            self.assertTrue("token" in item and "token_type" in item)
+# Run the unittest command-line runner when this file is executed as a script.
+if __name__ == "__main__":
+    unittest.main()

tests/test_zemberek_integration.py DELETED Viewed

@@ -1,58 +0,0 @@
-from __future__ import annotations
-import tempfile
-import unittest
-from pathlib import Path
-from unittest import mock
-from nedo_turkish_tokenizer import NedoTurkishTokenizer, _tdk_vocab
-from nedo_turkish_tokenizer._root_validator import (
-    ZEMBEREK_AVAILABLE,
-    disambiguate_sentence,
-)
-@unittest.skipUnless(ZEMBEREK_AVAILABLE, "zemberek-python is required for these tests")
-class ZemberekIntegrationTests(unittest.TestCase):
-    def setUp(self) -> None:
-        self._original_words = _tdk_vocab._TDK_WORDS
-        _tdk_vocab._TDK_WORDS = None
-    def tearDown(self) -> None:
-        _tdk_vocab._TDK_WORDS = self._original_words
-    def test_sentence_disambiguation_uses_zemberek_python(self) -> None:
-        analyses = disambiguate_sentence(["Bug\u00fcn", "geldi"])
-        self.assertEqual(2, len(analyses))
-        self.assertEqual("bug\u00fcn", analyses[0]["lemma"])
-        self.assertEqual("gelmek", analyses[1]["lemma"])
-        self.assertEqual("Verb", analyses[1]["pos"])
-    def test_tokenizer_smoke_uses_bundled_tdk_words(self) -> None:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            cache_path = str(Path(tmpdir) / "tdk_words.txt")
-            with mock.patch.object(_tdk_vocab, "TDK_CACHE_FILE", cache_path):
-                with mock.patch.object(_tdk_vocab, "_download_from_hf") as download_hf:
-                    with mock.patch.object(_tdk_vocab, "_download_from_tdk") as download_tdk:
-                        tokenizer = NedoTurkishTokenizer()
-                        tokens = tokenizer.tokenize("Bug\u00fcn \u0130stanbul'a gidiyorum.")
-        self.assertTrue(
-            any(t["token"].strip() == "bug\u00fcn" and t["token_type"] == "ROOT" for t in tokens)
-        )
-        self.assertTrue(any(t["token"] == "'" and t["token_type"] == "PUNCT" for t in tokens))
-        self.assertTrue(
-            any(
-                t["token"].strip() == "a"
-                and t["token_type"] == "SUFFIX"
-                and t["morph_pos"] == 1
-                for t in tokens
-            )
-        )
-        self.assertTrue(
-            any(t["token"].strip() == "gitmek" and t.get("_root_corrected") for t in tokens)
-        )
-        download_hf.assert_not_called()
-        download_tdk.assert_not_called()

tokenization_nedo_turkish.py DELETED Viewed

@@ -1,172 +0,0 @@
-"""
-NedoTurkishTokenizer — HuggingFace AutoTokenizer compatible class.
-Usage:
-    from transformers import AutoTokenizer
-    tok = AutoTokenizer.from_pretrained("Ethosoft/NedoTurkishTokenizer", trust_remote_code=True)
-    out = tok("İstanbul'da meeting'e katılamadım")
-    out["input_ids"]            # hash-stable int IDs of morphological tokens
-    out["attention_mask"]       # all 1s
-    out["token_type_ids"]       # 0=root/other, 1=suffix
-    out["morphological_tokens"] # full morphological dicts (token, token_type, morph_pos, ...)
-"""
-from __future__ import annotations
-import hashlib
-from typing import Any
-from transformers import PreTrainedTokenizer
-# ── Morphological type → token_type_id ───────────────────────────────────────
-_MTYPE_ID = {
-    "ROOT":    0,
-    "FOREIGN": 0,
-    "SUFFIX":  1,
-    "BPE":     2,
-    "PUNCT":   3,
-    "NUM":     4,
-    "DATE":    4,
-    "UNIT":    4,
-    "URL":     5,
-    "MENTION": 5,
-    "HASHTAG": 5,
-    "EMOJI":   5,
-}
-def _stable_hash(s: str) -> int:
-    """MD5-based stable hash that does NOT change between Python runs."""
-    return int(hashlib.md5(s.encode("utf-8")).hexdigest()[:6], 16)
-class NedoTurkishTokenizer(PreTrainedTokenizer):
-    """
-    Turkish morphological tokenizer — HuggingFace compatible.
-    ``input_ids`` are MD5-hash-based stable IDs (not lookup-table vocab IDs).
-    For downstream transformer use, embed by ``token_type_ids`` or learn a
-    projection from the ``morphological_tokens`` metadata.
-    All standard HuggingFace fields are present:
-        input_ids, attention_mask, token_type_ids
-    Extra field:
-        morphological_tokens — list[dict] with token, token_type, morph_pos, ...
-    """
-    vocab_files_names: dict = {}
-    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
-    def __init__(self, **kwargs: Any) -> None:
-        super().__init__(**kwargs)
-        self._morph: "NedoTurkishTokenizer_core | None" = None  # lazy init
-    def _get_morph(self):
-        if self._morph is None:
-            from nedo_turkish_tokenizer import NedoTurkishTokenizer as _Core  # noqa: PLC0415
-            self._morph = _Core()
-        return self._morph
-    # ── PreTrainedTokenizer required interface ────────────────────────────────
-    @property
-    def vocab_size(self) -> int:
-        return 16_777_216   # 2^24 — MD5 hash space
-    def get_vocab(self) -> dict:
-        return {}           # no fixed vocabulary
-    def _tokenize(self, text: str) -> list[str]:
-        """Return token strings from the morphological pipeline."""
-        tokens = self._get_morph().tokenize(text)
-        return [t["token"] for t in tokens]
-    def _convert_token_to_id(self, token: str) -> int:
-        return _stable_hash(token)
-    def _convert_id_to_token(self, index: int) -> str:
-        return ""           # no inverse mapping without a vocab
-    def save_vocabulary(
-        self,
-        save_directory: str,
-        filename_prefix: str | None = None,
-    ) -> tuple:
-        return ()           # no vocab file
-    # ── Main __call__ override ────────────────────────────────────────────────
-    def __call__(
-        self,
-        text: str | list[str],
-        return_morphological_tokens: bool = True,
-        **kwargs: Any,
-    ) -> dict:
-        """
-        Tokenize text and return a dict with standard HuggingFace fields
-        plus ``morphological_tokens``.
-        Args:
-            text: Single string or list of strings.
-            return_morphological_tokens: Include full morphological dicts.
-        Returns:
-            dict with:
-                input_ids            : list[int] or list[list[int]]
-                attention_mask       : list[int] or list[list[int]]
-                token_type_ids       : list[int] or list[list[int]]
-                morphological_tokens : list[dict] or list[list[dict]]
-        """
-        if isinstance(text, list):
-            results = [self._encode_single(t, return_morphological_tokens) for t in text]
-            return {
-                "input_ids":            [r["input_ids"] for r in results],
-                "attention_mask":       [r["attention_mask"] for r in results],
-                "token_type_ids":       [r["token_type_ids"] for r in results],
-                "morphological_tokens": [r["morphological_tokens"] for r in results],
-            }
-        return self._encode_single(text, return_morphological_tokens)
-    def _encode_single(self, text: str, with_morph: bool) -> dict:
-        morph = self._get_morph()
-        tokens = morph.tokenize(text)
-        input_ids   = [_stable_hash(t["token"]) for t in tokens]
-        attn_mask   = [1] * len(tokens)
-        type_ids    = [_MTYPE_ID.get(t["token_type"], 0) for t in tokens]
-        out: dict = {
-            "input_ids":      input_ids,
-            "attention_mask": attn_mask,
-            "token_type_ids": type_ids,
-        }
-        if with_morph:
-            out["morphological_tokens"] = tokens
-        return out
-    # ── Convenience helpers ───────────────────────────────────────────────────
-    def encode(self, text: str, **kwargs) -> list[int]:  # type: ignore[override]
-        return self._encode_single(text, with_morph=False)["input_ids"]
-    def decode(self, token_ids: list[int], **kwargs) -> str:  # type: ignore[override]
-        """Not meaningful without a fixed vocab — returns empty string."""
-        return ""
-    def tokenize(self, text: str, **kwargs) -> list[str]:
-        return self._tokenize(text)
-    def morphological_tokenize(self, text: str) -> list[dict]:
-        """Return full morphological token dicts (main NedoTurkishTokenizer output)."""
-        return self._get_morph().tokenize(text)
-    def batch_tokenize(self, texts: list[str], workers: int | None = None) -> list[list[dict]]:
-        """Parallel morphological tokenization."""
-        return self._get_morph().batch_tokenize(texts, workers=workers)
-    def stats(self, tokens: list[dict]) -> dict:
-        """Compute TR% and other morphological coverage stats."""
-        return self._get_morph().stats(tokens)

tokenizer_config.json DELETED Viewed

@@ -1,12 +0,0 @@
-{
-  "bos_token": "[BOS]",
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "eos_token": "[EOS]",
-  "mask_token": "[MASK]",
-  "model_max_length": 1000000000000000019884624838656,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "tokenizer_class": "NedoPreTrainedTokenizer",
-  "unk_token": "[UNK]"
-}

vocab_64k.json DELETED Viewed

The diff for this file is too large to render. See raw diff