"""Self-contained model class for binomial-marks-1.

Distributed alongside the weights on HuggingFace Hub so anyone can do:

    from transformers import AutoTokenizer, AutoModel
    tok   = AutoTokenizer.from_pretrained("BinomialTechnologies/binomial-marks-1")
    model = AutoModel.from_pretrained("BinomialTechnologies/binomial-marks-1",
                                       trust_remote_code=True)

This file imports only from `transformers` + `torch` — no project-internal
dependencies.

Architecture:
    ModernBERT-large encoder (with optional YaRN RoPE extension to 16k)
        ↓ (CLS + masked mean pool concatenated)
        ↓ (3 × MLP heads)
    23 outputs:
        10 × topic_mentioned (binary classification, sigmoid → BCE loss)
        10 × topic_score     (regression in [-2, +2] after clamp at inference)
         3 × tone_score      (regression in [1, 5] after clamp at inference)
"""

from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import ModelOutput

# Relative import — HF's `trust_remote_code` loader bundles sibling .py
# files together and resolves these without the symbol being "installed".
from .configuration_marks import MarksConfig, TOPICS, TONES


# ---------------------------------------------------------------------------
# YaRN RoPE extension — per-dim ramp; applied after model load
# ---------------------------------------------------------------------------

def _yarn_inv_freq(
    head_dim: int,
    base: float,
    scale: float,
    original_max_position: int,
    beta_fast: float = 32.0,
    beta_slow: float = 1.0,
    device=None,
    dtype=torch.float32,
) -> torch.Tensor:
    if scale <= 1.0:
        return 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device, dtype=dtype) / head_dim))
    inv_freq_extrap = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device, dtype=dtype) / head_dim))
    inv_freq_interp = inv_freq_extrap / scale
    wavelengths = 2.0 * math.pi / inv_freq_extrap
    L = original_max_position
    ramp = (L / wavelengths - beta_slow) / (beta_fast - beta_slow)
    ramp = ramp.clamp(0.0, 1.0)
    return inv_freq_interp * (1.0 - ramp) + inv_freq_extrap * ramp


def _apply_yarn_to_modernbert(encoder, new_max_position: int,
                               original_max_position: int = 8192,
                               beta_fast: float = 32.0, beta_slow: float = 1.0):
    if new_max_position == original_max_position:
        return
    scale = new_max_position / original_max_position
    cfg = encoder.config
    head_dim = cfg.hidden_size // cfg.num_attention_heads
    global_base = float(getattr(cfg, "global_rope_theta", getattr(cfg, "rope_theta", 10000.0)))

    rotary_modules = [
        m for _, m in encoder.named_modules()
        if m.__class__.__name__ == "ModernBertRotaryEmbedding"
    ]
    for mod in rotary_modules:
        full_buf = getattr(mod, "full_attention_inv_freq", None)
        if full_buf is None or full_buf.numel() != head_dim // 2:
            continue
        new_inv = _yarn_inv_freq(
            head_dim=head_dim, base=global_base, scale=scale,
            original_max_position=original_max_position,
            beta_fast=beta_fast, beta_slow=beta_slow,
            device=full_buf.device, dtype=full_buf.dtype,
        )
        full_buf.data.copy_(new_inv)


# ---------------------------------------------------------------------------
# Output dataclass
# ---------------------------------------------------------------------------

@dataclass
class MarksOutput(ModelOutput):
    loss: Optional[torch.Tensor] = None
    loss_components: Optional[dict] = None
    topic_mentioned_logits: Optional[torch.Tensor] = None   # (B, 10)
    topic_score: Optional[torch.Tensor] = None              # (B, 10)
    tone_score: Optional[torch.Tensor] = None               # (B,  3)


# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------

class MarksMultiHead(PreTrainedModel):
    """Multi-head ModernBERT-large fine-tuned for earnings-call NLP scoring.

    23 outputs per call:
      * topic_mentioned (binary, 10 dims)
      * topic_score     (regression in [-2, +2], 10 dims)
      * tone_score      (regression in [1, 5], 3 dims)
    """

    config_class = MarksConfig
    base_model_prefix = "encoder"
    supports_gradient_checkpointing = True

    def __init__(self, config: MarksConfig):
        super().__init__(config)
        self.n_topics = len(config.topics)
        self.n_tones  = len(config.tones)

        # Encoder — built from config (so we don't redownload base weights;
        # weights come from this repo's safetensors).
        if config.encoder_config:
            enc_cfg = AutoConfig.from_dict(config.encoder_config) if hasattr(AutoConfig, "from_dict") else AutoConfig.for_model(**config.encoder_config)
        else:
            enc_cfg = AutoConfig.from_pretrained(config.encoder_name_or_path)

        # Override the encoder ctx to the trained value (16384 for our v1).
        enc_cfg.max_position_embeddings = config.max_position_embeddings

        # Initialize encoder with config-only constructor (random init); the
        # PreTrainedModel.from_pretrained caller will restore real weights
        # from this repo's safetensors.
        self.encoder = AutoModel.from_config(enc_cfg)
        H = enc_cfg.hidden_size

        # Head input is CLS + mean pool concatenated → 2H.
        head_in = 2 * H
        head_hidden = H // config.head_dim_ratio

        def _mlp(out_dim: int) -> nn.Sequential:
            return nn.Sequential(
                nn.Linear(head_in, head_hidden),
                nn.GELU(),
                nn.Dropout(config.dropout),
                nn.Linear(head_hidden, out_dim),
            )

        self.dropout = nn.Dropout(config.dropout)
        self.head_topic_mentioned = _mlp(self.n_topics)
        self.head_topic_score     = _mlp(self.n_topics)
        self.head_tone_score      = _mlp(self.n_tones)

        # Loss weights (used only if labels are passed for fine-tuning).
        self._loss_weights = config.loss_weights

        # Apply YaRN to encoder (idempotent if max_position == native).
        if config.marks_rope_strategy == "yarn":
            _apply_yarn_to_modernbert(
                self.encoder,
                new_max_position=config.max_position_embeddings,
                original_max_position=config.original_max_position,
            )
        # NTK is applied inside encoder config; nothing to do here.

        self.post_init()

    # -------------------------------------------------------------------------
    # Forward
    # -------------------------------------------------------------------------
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        topic_mentioned: Optional[torch.Tensor] = None,
        topic_score:     Optional[torch.Tensor] = None,
        tone_score:      Optional[torch.Tensor] = None,
        **kwargs,
    ) -> MarksOutput:

        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = out.last_hidden_state                      # (B, T, H)

        cls = last_hidden[:, 0]                                  # (B, H)
        m = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        mean_pool = (last_hidden * m).sum(1) / m.sum(1).clamp(min=1.0)  # (B, H)
        pooled = self.dropout(torch.cat([cls, mean_pool], dim=-1))      # (B, 2H)

        tm_logits = self.head_topic_mentioned(pooled)
        ts_pred   = self.head_topic_score(pooled)
        tn_pred   = self.head_tone_score(pooled)

        loss, components = None, {}
        if topic_mentioned is not None:
            tm_logits_fp = tm_logits.float()
            ts_pred_fp = ts_pred.float()
            tn_pred_fp = tn_pred.float()
            tm_t = topic_mentioned.float()
            ts_t = topic_score.float()
            tn_t = tone_score.float()

            l_tm = F.binary_cross_entropy_with_logits(tm_logits_fp, tm_t)
            l_ts = F.mse_loss(ts_pred_fp, ts_t)
            l_tn = F.mse_loss(tn_pred_fp, tn_t)
            components = {
                "topic_mentioned": l_tm.detach(),
                "topic_score":     l_ts.detach(),
                "tone_scores":     l_tn.detach(),
            }
            w = self._loss_weights
            loss = (
                w["topic_mentioned"] * l_tm
                + w["topic_score"]   * l_ts
                + w["tone_scores"]   * l_tn
            )

        return MarksOutput(
            loss=loss,
            loss_components=components or None,
            topic_mentioned_logits=tm_logits,
            topic_score=ts_pred,
            tone_score=tn_pred,
        )

    # -------------------------------------------------------------------------
    # Convenience predict
    # -------------------------------------------------------------------------
    @torch.no_grad()
    def predict(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        mention_threshold: float = 0.5,
    ) -> dict:
        """Run a forward pass and return clamped + masked predictions.

        Returns a dict with:
          topic_mentioned       (B, 10) hard 0/1
          topic_mentioned_prob  (B, 10) sigmoid confidence
          topic_score           (B, 10) clamped to [-2, +2], zeroed where mentioned=0
          tone_score            (B,  3) clamped to [1, 5]
        """
        out = self.forward(input_ids=input_ids, attention_mask=attention_mask)
        prob = torch.sigmoid(out.topic_mentioned_logits)
        mentioned = (prob >= mention_threshold).float()
        ts_lo, ts_hi = self.config.topic_score_range
        tn_lo, tn_hi = self.config.tone_score_range
        ts = out.topic_score.clamp(ts_lo, ts_hi) * mentioned
        tn = out.tone_score.clamp(tn_lo, tn_hi)
        return {
            "topic_mentioned":      mentioned,
            "topic_mentioned_prob": prob,
            "topic_score":          ts,
            "tone_score":           tn,
        }

    # -------------------------------------------------------------------------
    # Gradient checkpointing — delegate to encoder
    # -------------------------------------------------------------------------
    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        if hasattr(self.encoder, "gradient_checkpointing_enable"):
            self.encoder.gradient_checkpointing_enable(
                gradient_checkpointing_kwargs=gradient_checkpointing_kwargs
            )

    def gradient_checkpointing_disable(self):
        if hasattr(self.encoder, "gradient_checkpointing_disable"):
            self.encoder.gradient_checkpointing_disable()