# File size: 8,522 Bytes
#
# 5a95e08

"""
indus_ngram.py — Standalone module for InduNgramModel
======================================================
This file MUST exist so that pickle can import InduNgramModel
when loading ngram_model.pkl in 07_ensemble.py and 08_electra_train.py.

The pickle fix:
  When you save a class with pickle, Python records the module path.
  If the class was defined in __main__ (i.e. inside 06_ngram_model.py
  when run directly), pickle saves it as __main__.InduNgramModel.
  When another script tries to load it, __main__ refers to THAT script,
  which doesn't have InduNgramModel — hence the AttributeError.

  Solution: define the class in THIS standalone module (indus_ngram.py).
  Both 06_ngram_model.py and 07_ensemble.py import from here.
  Pickle records the path as indus_ngram.InduNgramModel — always findable.

Do not rename or move this file.
"""

import math
import pickle
from pathlib import Path
from collections import Counter, defaultdict


class InduNgramModel:
    """
    Kneser-Ney smoothed N-gram LM for Indus Script.

    RTL mode (default, recommended):
      Sequences reversed before training/scoring.
      RTL bigram entropy (3.18) < LTR (3.72) → supports RTL hypothesis.

    Sign roles in RTL reading direction:
      INITIAL  = reading-start sign (data position [-1])
      TERMINAL = reading-end sign   (data position [0])
      MEDIAL   = appears in middle positions

    Sequences are lists of hashable sign ids; ``None`` marks a missing or
    masked sign. Instances are persisted with pickle (``save`` / ``load``),
    so the attribute names set in ``__init__`` are part of the on-disk
    format.
    """

    def __init__(self, rtl=True):
        """Create an empty, untrained model.

        Args:
            rtl: If true, every sequence is reversed before counting and
                scoring so that index 0 is the reading-start sign.
        """
        self.rtl             = rtl
        # Raw n-gram counts, all in reading order.
        self.unigram         = Counter()
        self.bigram          = defaultdict(Counter)   # prev -> Counter(next)
        self.trigram         = defaultdict(Counter)   # (p2, p1) -> Counter(next)
        # How often each sign opens / closes a sequence (reading order).
        self.start_cnt       = Counter()
        self.end_cnt         = Counter()
        self.total_seqs      = 0
        self.total_tokens    = 0
        self.vocab_size      = 0
        # Absolute discount subtracted from every observed n-gram count.
        self.D               = 0.75
        # Calibration statistics of raw scores, filled in by _calibrate().
        self.score_mean      = 0.0
        self.score_std       = 1.0
        self.score_min       = -20.0
        self.score_max       = 0.0
        self._pairwise_acc   = 0.0
        # Continuation counts: number of distinct left contexts per sign.
        self._cont_right     = Counter()
        self._total_bi_types = 0

    def _orient(self, seq):
        """Return *seq* without ``None`` entries, in reading order."""
        cleaned = [t for t in seq if t is not None]
        return cleaned[::-1] if self.rtl else cleaned

    def train(self, sequences):
        """Count n-grams over *sequences* and calibrate the score range.

        Raw counts accumulate across calls; the derived continuation
        statistics are rebuilt from scratch on every call so that training
        twice does not double-count bigram types.

        Args:
            sequences: A sized, sliceable collection of sign sequences.
                Sequences that are empty after dropping ``None`` are skipped.
        """
        mode = "RTL" if self.rtl else "LTR"
        print(f"  Training [{mode}] on {len(sequences):,} sequences...")

        for seq in sequences:
            s = self._orient(seq)
            if not s:
                continue
            self.total_seqs += 1
            self.unigram.update(s)
            self.start_cnt[s[0]] += 1
            self.end_cnt[s[-1]] += 1
            for prev, nxt in zip(s, s[1:]):
                self.bigram[prev][nxt] += 1
            for p2, p1, nxt in zip(s, s[1:], s[2:]):
                self.trigram[(p2, p1)][nxt] += 1

        self.total_tokens = sum(self.unigram.values())
        self.vocab_size   = len(self.unigram)

        # Rebuild (not increment) the continuation counts: each distinct
        # (prev, next) bigram type contributes exactly one to `next`.
        self._cont_right = Counter()
        for followers in self.bigram.values():
            self._cont_right.update(followers.keys())
        self._total_bi_types = sum(self._cont_right.values())

        self._calibrate(sequences)
        print(f"  Vocab      : {self.vocab_size}")
        print(f"  Pairwise   : {self._pairwise_acc*100:.1f}%")
        print(f"  Score range: [{self.score_min:.3f}, {self.score_max:.3f}]")

    def _calibrate(self, sequences):
        """Estimate score statistics from real vs. randomly corrupted data.

        Scores the first 500 usable sequences and a corrupted copy of each,
        then records mean/std/min/max of all scores and the fraction of
        pairs where the real sequence outscored its corruption. If no
        usable sequence exists the previous calibration values are kept.
        """
        import random
        import statistics

        # Private generator: same draws as seeding the global RNG with 42,
        # but without clobbering the caller's random state.
        rng = random.Random(42)
        all_toks   = list(self.unigram)
        end_toks   = list(self.end_cnt)
        start_toks = list(self.start_cnt)

        def corrupt(seq):
            """Return a copy of *seq* damaged in one of four random ways."""
            r = rng.randint(0, 3)
            c = list(seq)
            if r == 0:
                rng.shuffle(c)
            elif r == 1:
                c[0] = rng.choice(end_toks)
            elif r == 2:
                c[-1] = rng.choice(start_toks)
            else:
                for p in rng.sample(range(len(c)), max(1, len(c) // 2)):
                    c[p] = rng.choice(all_toks)
            return c

        # Drop sequences with no real sign: they cannot be corrupted
        # (indexing an empty list) and were ignored during counting anyway.
        sample = [s for s in sequences[:500] if self._orient(s)]
        if not sample:
            return

        good  = [self._raw_score(s) for s in sample]
        bad   = [self._raw_score(corrupt(s)) for s in sample]
        all_s = good + bad          # always >= 2 values, so stdev is defined

        self.score_mean    = statistics.mean(all_s)
        self.score_std     = statistics.stdev(all_s)
        self.score_min     = min(all_s)
        self.score_max     = max(all_s)
        self._pairwise_acc = sum(g > b for g, b in zip(good, bad)) / len(good)

    def _p_uni_kn(self, w):
        """Add-one smoothed continuation probability of *w*."""
        return (self._cont_right[w] + 1) / (self._total_bi_types + self.vocab_size)

    def _p_bi_kn(self, w, given):
        """P(w | given): discounted bigram interpolated with the unigram.

        Falls back to the unigram estimate when *given* was never seen as
        a left context.
        """
        # .get() instead of [] so that scoring never inserts empty entries
        # into the defaultdict (which would grow the model and its pickle).
        followers = self.bigram.get(given)
        gt = sum(followers.values()) if followers else 0
        if gt == 0:
            return self._p_uni_kn(w)
        cnt   = followers.get(w, 0)
        first = max(cnt - self.D, 0) / gt
        lam   = (self.D / gt) * len(followers)
        return first + lam * self._p_uni_kn(w)

    def _p_tri_kn(self, w, a, b):
        """P(w | a, b): discounted trigram interpolated with the bigram.

        Falls back to the bigram estimate when the context (a, b) was
        never observed.
        """
        followers = self.trigram.get((a, b))
        gt = sum(followers.values()) if followers else 0
        if gt == 0:
            return self._p_bi_kn(w, b)
        cnt   = followers.get(w, 0)
        first = max(cnt - self.D, 0) / gt
        lam   = (self.D / gt) * len(followers)
        return first + lam * self._p_bi_kn(w, b)

    def _p_initial(self, w):
        """Smoothed probability that *w* is the reading-start sign."""
        return (self.start_cnt[w] + 0.1) / (self.total_seqs + 0.1 * self.vocab_size)

    def _p_terminal(self, w):
        """Smoothed probability that *w* is the reading-end sign."""
        return (self.end_cnt[w] + 0.1) / (self.total_seqs + 0.1 * self.vocab_size)

    def _raw_score(self, seq):
        """Return the length-normalised log-probability of *seq*.

        The score sums the log of the initial-sign and terminal-sign
        probabilities plus one n-gram term per sign, divided by
        ``len + 2``. Sequences with no real sign (empty, or ``None`` only)
        get ``score_min``.
        """
        if not seq:
            return self.score_min
        s = self._orient(seq)
        if not s:
            # Only None placeholders: nothing to score.
            return self.score_min

        eps = 1e-10     # keeps log() finite if a probability underflows to 0
        lp  = math.log(self._p_initial(s[0]) + eps)
        lp += math.log(self._p_terminal(s[-1]) + eps)
        for i, w in enumerate(s):
            if i == 0:
                p = self._p_uni_kn(w)
            elif i == 1:
                p = self._p_bi_kn(w, s[i - 1])
            else:
                p = self._p_tri_kn(w, s[i - 2], s[i - 1])
            lp += math.log(p + eps)
        return lp / (len(s) + 2)

    def validity_score(self, seq):
        """Map the raw score of *seq* into [0.02, 0.98].

        The raw score is min-max normalised with the calibrated range and
        then clamped.
        """
        raw  = self._raw_score(seq)
        norm = (raw - self.score_min) / (self.score_max - self.score_min + 1e-10)
        return float(max(0.02, min(0.98, norm)))

    def predict_masked(self, seq_with_none, top_k=10):
        """Rank candidate signs for every ``None`` slot in a sequence.

        Each slot is scored from up to two preceding signs in reading
        order; a slot at the reading start / end is additionally weighted
        by the initial / terminal sign distribution.

        Args:
            seq_with_none: Sequence in data order, ``None`` at masked slots.
            top_k: Number of candidates returned per slot.

        Returns:
            Dict mapping each masked data-order index to a list of
            ``{"id", "prob", "rank"}`` dicts, best first. ``prob`` is
            normalised over the best ``3 * top_k`` candidates.
        """
        n = len(seq_with_none)
        # Keep the None placeholders here (unlike _orient) so that reading
        # positions stay aligned with data positions; otherwise a mask at
        # the reading end falls off the list and is never predicted.
        oriented = list(seq_with_none)
        if self.rtl:
            oriented.reverse()

        results = {}
        for orig_pos, tok in enumerate(seq_with_none):
            if tok is not None:
                continue
            ort_pos = (n - 1 - orig_pos) if self.rtl else orig_pos

            # A neighbouring slot may itself be masked, in which case it
            # is None and the context is shortened accordingly.
            prev  = oriented[ort_pos - 1] if ort_pos > 0 else None
            prev2 = oriented[ort_pos - 2] if ort_pos > 1 else None

            cands = []
            for cand in self.unigram:
                if prev2 is not None and prev is not None:
                    p = self._p_tri_kn(cand, prev2, prev)
                elif prev is not None:
                    p = self._p_bi_kn(cand, prev)
                else:
                    p = self._p_uni_kn(cand)

                # Positional prior, scaled by vocab size and floored at 0.01.
                if ort_pos == 0:
                    p *= max(self._p_initial(cand) * self.vocab_size, 0.01)
                elif ort_pos == n - 1:
                    p *= max(self._p_terminal(cand) * self.vocab_size, 0.01)

                cands.append((cand, p))

            cands.sort(key=lambda x: -x[1])
            total = sum(p for _, p in cands[:top_k * 3]) or 1
            results[orig_pos] = [
                {"id": c, "prob": p / total, "rank": i + 1}
                for i, (c, p) in enumerate(cands[:top_k])
            ]

        return results

    def sign_role(self, sign_id):
        """Positional role in reading direction.

        Returns "INITIAL" or "TERMINAL" when the sign opens / closes more
        than 5% of sequences and does so at least twice as often as the
        opposite role, "MEDIAL" when it occurs more than 5 times overall,
        and "RARE" otherwise.
        """
        init_p = self.start_cnt[sign_id] / (self.total_seqs + 1)
        term_p = self.end_cnt[sign_id]   / (self.total_seqs + 1)
        if init_p > 0.05 and init_p > term_p * 2:
            return "INITIAL"
        if term_p > 0.05 and term_p > init_p * 2:
            return "TERMINAL"
        if self.unigram[sign_id] > 5:
            return "MEDIAL"
        return "RARE"

    def save(self, path):
        """Pickle the model to *path*, creating parent directories."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        with open(path, "wb") as f:
            pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"  Saved → {path}")

    @staticmethod
    def load(path):
        """Unpickle and return the object stored at *path*.

        WARNING: unpickling can execute arbitrary code. Only load model
        files that you created yourself or otherwise trust.
        """
        with open(path, "rb") as f:
            return pickle.load(f)