"""Self-contained model class for binomial-marks-1.

Distributed alongside the weights on HuggingFace Hub so anyone can do:

    from transformers import AutoTokenizer, AutoModel

    tok = AutoTokenizer.from_pretrained("BinomialTechnologies/binomial-marks-1")
    model = AutoModel.from_pretrained("BinomialTechnologies/binomial-marks-1",
                                      trust_remote_code=True)

This file imports only from `transformers` + `torch` — no project-internal
dependencies.

Architecture:

    ModernBERT-large encoder (with optional YaRN RoPE extension to 16k)
      ↓ (CLS + masked mean pool, concatenated)
      ↓ (3 × MLP heads)
    23 outputs:
      10 × topic_mentioned  (binary classification, sigmoid → BCE loss)
      10 × topic_score      (regression in [-2, +2] after clamp at inference)
       3 × tone_score       (regression in [1, 5] after clamp at inference)
"""

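# End-to-end usage sketch (illustrative only: `tok` and `model` follow the
# docstring example above, and the tokenizer kwargs shown are just one
# reasonable choice for long transcripts, not a fixed recipe):
#
#     batch = tok(["... earnings call transcript ..."], return_tensors="pt",
#                 truncation=True, padding=True)
#     preds = model.predict(batch["input_ids"], batch["attention_mask"])
#     preds["topic_score"]  # (B, 10), clamped to [-2, +2], zeroed where not mentioned
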
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.modeling_outputs import ModelOutput

from .configuration_marks import MarksConfig, TOPICS, TONES


def _yarn_inv_freq(
    head_dim: int,
    base: float,
    scale: float,
    original_max_position: int,
    beta_fast: float = 32.0,
    beta_slow: float = 1.0,
    device=None,
    dtype=torch.float32,
) -> torch.Tensor:
    """YaRN-style inverse RoPE frequencies for a context extended by `scale`."""
    inv_freq_extrap = 1.0 / (base ** (torch.arange(0, head_dim, 2, device=device, dtype=dtype) / head_dim))
    if scale <= 1.0:
        # No extension requested: plain RoPE frequencies.
        return inv_freq_extrap
    inv_freq_interp = inv_freq_extrap / scale
    wavelengths = 2.0 * math.pi / inv_freq_extrap
    L = original_max_position
    # Blend per dimension according to how many rotations it completes within the
    # original context (L / wavelength): dims with >= beta_fast rotations keep the
    # original frequency, dims with <= beta_slow rotations are fully interpolated.
    ramp = (L / wavelengths - beta_slow) / (beta_fast - beta_slow)
    ramp = ramp.clamp(0.0, 1.0)
    return inv_freq_interp * (1.0 - ramp) + inv_freq_extrap * ramp


def _apply_yarn_to_modernbert(encoder, new_max_position: int,
                              original_max_position: int = 8192,
                              beta_fast: float = 32.0, beta_slow: float = 1.0):
    """Overwrite the encoder's full-attention rotary buffers with YaRN frequencies."""
    if new_max_position == original_max_position:
        return
    scale = new_max_position / original_max_position
    cfg = encoder.config
    head_dim = cfg.hidden_size // cfg.num_attention_heads
    global_base = float(getattr(cfg, "global_rope_theta", getattr(cfg, "rope_theta", 10000.0)))

    rotary_modules = [
        m for _, m in encoder.named_modules()
        if m.__class__.__name__ == "ModernBertRotaryEmbedding"
    ]
    for mod in rotary_modules:
        full_buf = getattr(mod, "full_attention_inv_freq", None)
        if full_buf is None or full_buf.numel() != head_dim // 2:
            continue
        new_inv = _yarn_inv_freq(
            head_dim=head_dim, base=global_base, scale=scale,
            original_max_position=original_max_position,
            beta_fast=beta_fast, beta_slow=beta_slow,
            device=full_buf.device, dtype=full_buf.dtype,
        )
        full_buf.data.copy_(new_inv)


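# The YaRN patch is applied automatically in MarksMultiHead.__init__ when
# config.marks_rope_strategy == "yarn". A manual call on an already-built model
# would look roughly like this (sketch; 16384 is just an example target length):
#
#     _apply_yarn_to_modernbert(model.encoder, new_max_position=16384,
#                               original_max_position=8192)
#
# With scale = 16384 / 8192 = 2.0, fully-interpolated dimensions get their rotary
# frequency halved, while dimensions already completing >= beta_fast rotations at
# 8192 tokens keep their original frequency.
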
@dataclass
class MarksOutput(ModelOutput):
    """Output container for MarksMultiHead; loss fields are None at inference."""

    loss: Optional[torch.Tensor] = None
    loss_components: Optional[dict] = None
    topic_mentioned_logits: Optional[torch.Tensor] = None
    topic_score: Optional[torch.Tensor] = None
    tone_score: Optional[torch.Tensor] = None


class MarksMultiHead(PreTrainedModel):
    """Multi-head ModernBERT-large fine-tuned for earnings-call NLP scoring.

    23 outputs per call:
        * topic_mentioned (binary, 10 dims)
        * topic_score     (regression in [-2, +2], 10 dims)
        * tone_score      (regression in [1, 5], 3 dims)
    """

    config_class = MarksConfig
    base_model_prefix = "encoder"
    supports_gradient_checkpointing = True

    def __init__(self, config: MarksConfig):
        super().__init__(config)
        self.n_topics = len(config.topics)
        self.n_tones = len(config.tones)

        # Rebuild the encoder config: prefer the dict snapshot stored inside
        # MarksConfig (must carry a "model_type" key); otherwise resolve it from
        # the referenced checkpoint name or path.
        if config.encoder_config:
            enc_cfg = AutoConfig.for_model(**config.encoder_config)
        else:
            enc_cfg = AutoConfig.from_pretrained(config.encoder_name_or_path)

        enc_cfg.max_position_embeddings = config.max_position_embeddings

        self.encoder = AutoModel.from_config(enc_cfg)
        H = enc_cfg.hidden_size

        # Heads see [CLS ; masked mean pool], hence 2 * H inputs.
        head_in = 2 * H
        head_hidden = H // config.head_dim_ratio

        def _mlp(out_dim: int) -> nn.Sequential:
            return nn.Sequential(
                nn.Linear(head_in, head_hidden),
                nn.GELU(),
                nn.Dropout(config.dropout),
                nn.Linear(head_hidden, out_dim),
            )

        self.dropout = nn.Dropout(config.dropout)
        self.head_topic_mentioned = _mlp(self.n_topics)
        self.head_topic_score = _mlp(self.n_topics)
        self.head_tone_score = _mlp(self.n_tones)

        self._loss_weights = config.loss_weights

        # Optionally stretch RoPE to the configured context length via YaRN.
        if config.marks_rope_strategy == "yarn":
            _apply_yarn_to_modernbert(
                self.encoder,
                new_max_position=config.max_position_embeddings,
                original_max_position=config.original_max_position,
            )

        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        topic_mentioned: Optional[torch.Tensor] = None,
        topic_score: Optional[torch.Tensor] = None,
        tone_score: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> MarksOutput:
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = out.last_hidden_state

        # Pool: [CLS] token plus an attention-masked mean over all tokens.
        cls = last_hidden[:, 0]
        m = attention_mask.unsqueeze(-1).to(last_hidden.dtype)
        mean_pool = (last_hidden * m).sum(1) / m.sum(1).clamp(min=1.0)
        pooled = self.dropout(torch.cat([cls, mean_pool], dim=-1))

        tm_logits = self.head_topic_mentioned(pooled)
        ts_pred = self.head_topic_score(pooled)
        tn_pred = self.head_tone_score(pooled)

        # Training: compute the weighted multi-task loss in float32 for stability.
        loss, components = None, {}
        if topic_mentioned is not None:
            tm_logits_fp = tm_logits.float()
            ts_pred_fp = ts_pred.float()
            tn_pred_fp = tn_pred.float()
            tm_t = topic_mentioned.float()
            ts_t = topic_score.float()
            tn_t = tone_score.float()

            l_tm = F.binary_cross_entropy_with_logits(tm_logits_fp, tm_t)
            l_ts = F.mse_loss(ts_pred_fp, ts_t)
            l_tn = F.mse_loss(tn_pred_fp, tn_t)
            components = {
                "topic_mentioned": l_tm.detach(),
                "topic_score": l_ts.detach(),
                "tone_scores": l_tn.detach(),
            }
            w = self._loss_weights
            loss = (
                w["topic_mentioned"] * l_tm
                + w["topic_score"] * l_ts
                + w["tone_scores"] * l_tn
            )

        return MarksOutput(
            loss=loss,
            loss_components=components or None,
            topic_mentioned_logits=tm_logits,
            topic_score=ts_pred,
            tone_score=tn_pred,
        )

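    # Expected label shapes when training (labels are cast to float internally):
    #     topic_mentioned: (B, 10) with 0/1 entries -> BCE-with-logits
    #     topic_score:     (B, 10)                  -> MSE
    #     tone_score:      (B, 3)                   -> MSE
    # Per-task losses are combined with the weights stored in config.loss_weights.
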
    @torch.no_grad()
    def predict(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        mention_threshold: float = 0.5,
    ) -> dict:
        """Run a forward pass and return clamped + masked predictions.

        Returns a dict with:
            topic_mentioned       (B, 10) hard 0/1
            topic_mentioned_prob  (B, 10) sigmoid confidence
            topic_score           (B, 10) clamped to [-2, +2], zeroed where mentioned == 0
            tone_score            (B, 3)  clamped to [1, 5]
        """
        out = self.forward(input_ids=input_ids, attention_mask=attention_mask)
        prob = torch.sigmoid(out.topic_mentioned_logits)
        mentioned = (prob >= mention_threshold).float()
        ts_lo, ts_hi = self.config.topic_score_range
        tn_lo, tn_hi = self.config.tone_score_range
        # Topic scores only make sense for topics the model believes are mentioned.
        ts = out.topic_score.clamp(ts_lo, ts_hi) * mentioned
        tn = out.tone_score.clamp(tn_lo, tn_hi)
        return {
            "topic_mentioned": mentioned,
            "topic_mentioned_prob": prob,
            "topic_score": ts,
            "tone_score": tn,
        }

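    # Mapping predictions back to names is left to the caller; one possible
    # pattern (sketch, using the topic/tone name lists carried by the config):
    #
    #     preds = model.predict(batch["input_ids"], batch["attention_mask"])
    #     topic_scores = dict(zip(model.config.topics, preds["topic_score"][0].tolist()))
    #     tone_scores = dict(zip(model.config.tones, preds["tone_score"][0].tolist()))
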
    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        # Delegate activation checkpointing to the wrapped encoder when supported.
        if hasattr(self.encoder, "gradient_checkpointing_enable"):
            self.encoder.gradient_checkpointing_enable(
                gradient_checkpointing_kwargs=gradient_checkpointing_kwargs
            )

    def gradient_checkpointing_disable(self):
        if hasattr(self.encoder, "gradient_checkpointing_disable"):
            self.encoder.gradient_checkpointing_disable()