dummy9996 committed on
Commit
67362f9
·
verified ·
1 Parent(s): d66848e
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ main_fig.jpg filter=lfs diff=lfs merge=lfs -text
anyup.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AnyUp – flattened into a single module for HuggingFace trust_remote_code compatibility.
3
+
4
+ Original package structure:
5
+ anyup/layers/convolutions.py → ResBlock
6
+ anyup/layers/feature_unification.py → LearnedFeatureUnification
7
+ anyup/layers/positional_encoding.py → RoPE (AnyUp-internal)
8
+ anyup/layers/attention/attention_masking.py → window2d, compute_attention_mask, get_attention_mask_mod
9
+ anyup/layers/attention/chunked_attention.py → FlexCrossAttention, CrossAttentionBlock
10
+ anyup/model.py → AnyUp
11
+ """
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ import einops as E
17
+ from typing import Tuple
18
+ from functools import lru_cache
19
+ from torch.nn.attention.flex_attention import flex_attention
20
+ from torch.distributed.tensor import DTensor, distribute_tensor
21
+
22
# Compiled once at import; dynamic=True lets a single compiled graph serve all
# feature/image resolutions without recompilation.
compiled_flex_attn_prefill = torch.compile(flex_attention, dynamic=True)
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # ResBlock (from layers/convolutions.py)
26
+ # ---------------------------------------------------------------------------
27
+
28
class ResBlock(nn.Module):
    """Pre-activation residual block: (norm -> act -> conv) twice, plus skip.

    The skip path is a 1x1 convolution whenever the channel counts differ
    (or when ``use_conv_shortcut`` forces it); otherwise it is the identity.
    When ``norm_fn`` is None the norm slots are ``nn.Identity``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        num_groups=8,
        pad_mode="zeros",
        norm_fn=None,
        activation_fn=nn.SiLU,
        use_conv_shortcut=False,
    ):
        super().__init__()
        if norm_fn:
            make_norm = lambda ch: norm_fn(num_groups, ch)
        else:
            make_norm = lambda ch: nn.Identity()
        pad = kernel_size // 2

        def same_conv(cin, cout):
            # "Same"-size convolution; bias is disabled throughout the block.
            return nn.Conv2d(
                cin,
                cout,
                kernel_size,
                padding=pad,
                padding_mode=pad_mode,
                bias=False,
            )

        self.block = nn.Sequential(
            make_norm(in_channels),
            activation_fn(),
            same_conv(in_channels, out_channels),
            make_norm(out_channels),
            activation_fn(),
            same_conv(out_channels, out_channels),
        )
        if use_conv_shortcut or in_channels != out_channels:
            self.shortcut = nn.Conv2d(
                in_channels, out_channels, 1, bias=False, padding_mode=pad_mode
            )
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        """Residual sum of the conv branch and the (possibly projected) input."""
        return self.shortcut(x) + self.block(x)
73
+
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # LearnedFeatureUnification (from layers/feature_unification.py)
77
+ # ---------------------------------------------------------------------------
78
+
79
class LearnedFeatureUnification(nn.Module):
    """Map feature maps with an arbitrary channel count ``c`` onto a fixed
    number of learned spatial basis filters.

    Output shape is ``(b, out_channels, h, w)`` regardless of ``c``: each
    input channel is convolved with every basis filter, a softmax is taken
    across dim 1 of the reshaped response, and the result is averaged over
    the input channels.

    Note: ``init_gaussian_derivatives`` is accepted but currently unused —
    the basis is always initialised with ``randn``.
    """

    def __init__(
        self,
        out_channels: int,
        kernel_size: int = 3,
        init_gaussian_derivatives: bool = False,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        # One learnable (kernel_size x kernel_size) filter per output channel,
        # applied depthwise to every input channel in forward().
        self.basis = nn.Parameter(
            torch.randn(out_channels, 1, kernel_size, kernel_size)
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        b, c, h, w = features.shape
        # NOTE(review): conv2d with groups=c yields channels laid out
        # group-major as (c, out_channels), but the view below reads them as
        # (out_channels, c) — a fixed scrambling of (filter, channel) pairs.
        # Presumably benign because the basis is learned under exactly this
        # layout, but confirm before "fixing" (a change would invalidate
        # trained checkpoints).
        x = self._depthwise_conv(features, self.basis, self.kernel_size).view(
            b, self.out_channels, c, h, w
        )
        attn = F.softmax(x, dim=1)  # normalise across dim 1 of the view
        return attn.mean(dim=2)  # average over the remaining channel axis

    @staticmethod
    def _depthwise_conv(feats, basis, k):
        # Apply every basis filter to every input channel (groups=c), with
        # zero padding renormalised at the borders.
        b, c, h, w = feats.shape
        p = k // 2
        x = F.pad(feats, (p, p, p, p), value=0)
        x = F.conv2d(x, basis.repeat(c, 1, 1, 1), groups=c)
        # Count how much of each kernel window falls inside the image so the
        # zero-padded border responses can be rescaled to full-window scale.
        mask = torch.ones(1, 1, h, w, dtype=x.dtype, device=x.device)
        denom = F.conv2d(
            F.pad(mask, (p, p, p, p), value=0),
            torch.ones(1, 1, k, k, device=x.device, dtype=x.dtype),
        )
        return x / denom
113
+
114
+
115
+ # ---------------------------------------------------------------------------
116
+ # RoPE (from layers/positional_encoding.py) – AnyUp-internal, separate from
117
+ # the main model's 3D RoPE
118
+ # ---------------------------------------------------------------------------
119
+
120
+ def _rotate_half(x):
121
+ x1, x2 = x.chunk(2, dim=-1)
122
+ return torch.cat((-x2, x1), dim=-1)
123
+
124
+
125
class AnyUpRoPE(nn.Module):
    """2-D rotary positional embedding used inside AnyUp.

    ``freqs`` is a (2, dim) table mapping normalised (row, col) coordinates to
    rotation angles: the first half of the channels carries row frequencies,
    the second half column frequencies. It is stored as a Parameter so it
    travels with checkpoints, but its values are computed deterministically by
    ``_device_weight_init``.
    """

    def __init__(
        self,
        dim: int,
        theta: int = 100,
    ):
        super().__init__()
        self.dim = dim
        self.theta = theta
        # Uninitialised until _device_weight_init() is called.
        self.freqs = nn.Parameter(torch.empty(2, self.dim))

    def _device_weight_init(self):
        # Resolve target device/dtype, unwrapping the local shard if the
        # parameter has been distributed as a DTensor.
        if isinstance(self.freqs, DTensor):
            target_device = self.freqs.to_local().device
            target_dtype = self.freqs.to_local().dtype
        else:
            target_device = self.freqs.device
            target_dtype = self.freqs.dtype

        # Geometric frequency ladder theta^0 .. theta^-1 over dim // 4 bands,
        # duplicated so the rotate-half pairing sees matching frequencies.
        freqs_1d = self.theta ** torch.linspace(
            0, -1, self.dim // 4, device=target_device, dtype=target_dtype
        )
        freqs_1d = torch.cat([freqs_1d, freqs_1d])
        freqs_2d = torch.zeros(2, self.dim, device=target_device, dtype=target_dtype)
        freqs_2d[0, : self.dim // 2] = freqs_1d  # row 0: row-coordinate freqs
        freqs_2d[1, -self.dim // 2 :] = freqs_1d  # row 1: col-coordinate freqs
        freqs_2d.mul_(2 * torch.pi)

        with torch.no_grad():
            if isinstance(self.freqs, DTensor):
                # Re-distribute the freshly computed table with the same mesh
                # and placements before copying into the local shard.
                dist_freqs = distribute_tensor(
                    freqs_2d, self.freqs.device_mesh, placements=self.freqs.placements
                )
                self.freqs.to_local().copy_(dist_freqs.to_local())
            else:
                self.freqs.copy_(freqs_2d)

    def forward(self, x: torch.Tensor, coords: torch.Tensor) -> torch.Tensor:
        """Rotate ``x`` by the angles implied by ``coords`` (last dim = 2)."""
        angle = coords @ self.freqs
        return x * angle.cos() + _rotate_half(x) * angle.sin()
165
+
166
+
167
+ # ---------------------------------------------------------------------------
168
+ # Attention masking (from layers/attention/attention_masking.py)
169
+ # ---------------------------------------------------------------------------
170
+
171
+ def window2d(
172
+ low_res: int | Tuple[int, int],
173
+ high_res: int | Tuple[int, int],
174
+ ratio: float,
175
+ *,
176
+ device: str = "cpu",
177
+ ) -> torch.Tensor:
178
+ """Calculate the lower and upper bounds of row and col for each pixel/position"""
179
+ if isinstance(high_res, int):
180
+ H = W = high_res
181
+ else:
182
+ H, W = high_res
183
+ if isinstance(low_res, int):
184
+ Lh = Lw = low_res
185
+ else:
186
+ Lh, Lw = low_res
187
+
188
+ r_pos = (torch.arange(H, device=device, dtype=torch.float32) + 0.5) / H
189
+ c_pos = (torch.arange(W, device=device, dtype=torch.float32) + 0.5) / W
190
+ pos_r, pos_c = torch.meshgrid(r_pos, c_pos, indexing="ij")
191
+
192
+ r_lo = (pos_r - ratio).clamp(0.0, 1.0)
193
+ r_hi = (pos_r + ratio).clamp(0.0, 1.0)
194
+ c_lo = (pos_c - ratio).clamp(0.0, 1.0)
195
+ c_hi = (pos_c + ratio).clamp(0.0, 1.0)
196
+
197
+ r0 = (r_lo * Lh).floor().long()
198
+ r1 = (r_hi * Lh).ceil().long()
199
+ c0 = (c_lo * Lw).floor().long()
200
+ c1 = (c_hi * Lw).ceil().long()
201
+
202
+ return torch.stack([r0, r1, c0, c1], dim=2)
203
+
204
+
205
@lru_cache
def compute_attention_mask(
    high_res_h, high_res_w, low_res_h, low_res_w, window_size_ratio, device="cpu"
):
    """Dense boolean mask for windowed cross-attention.

    Entry ``[q, kv]`` is True where attention is *disallowed*, i.e. low-res
    position ``kv`` lies outside the window of high-res query ``q``. Cached
    because the result depends only on the resolutions and the ratio.
    """
    bounds = window2d(
        low_res=(low_res_h, low_res_w),
        high_res=(high_res_h, high_res_w),
        ratio=window_size_ratio,
        device=device,
    )

    n_q = high_res_h * high_res_w
    r_lo, r_hi, c_lo, c_hi = (bounds[..., i].reshape(n_q, 1) for i in range(4))

    kv_rows = torch.arange(low_res_h, device=device)
    kv_cols = torch.arange(low_res_w, device=device)
    in_rows = (kv_rows >= r_lo) & (kv_rows < r_hi)
    in_cols = (kv_cols >= c_lo) & (kv_cols < c_hi)

    allowed = (
        (in_rows.unsqueeze(2) & in_cols.unsqueeze(1))
        .reshape(n_q, low_res_h * low_res_w)
        .to(dtype=torch.bool)
    )

    # Invert: True marks masked-out positions.
    return ~allowed
236
+
237
+
238
def get_attention_mask_mod(
    high_res_h, high_res_w, low_res_h, low_res_w, window_size_ratio=0.1, device="cpu"
):
    """Windowed cross-attention expressed as a FlexAttention ``mask_mod``.

    Same windowing as :func:`compute_attention_mask`, but returned as a
    callable ``(b, h, q_idx, kv_idx) -> bool`` suitable for
    ``create_block_mask``.
    """
    bounds = window2d(
        low_res=(low_res_h, low_res_w),
        high_res=(high_res_h, high_res_w),
        ratio=window_size_ratio,
        device=device,
    )

    row_lo, row_hi, col_lo, col_hi = (bounds[..., i] for i in range(4))
    hi_w, lo_w = high_res_w, low_res_w

    def _mask_mod(b_idx, h_idx, q_idx, kv_idx):
        # Unflatten the 1-D token indices back to 2-D grid coordinates.
        qr, qc = q_idx // hi_w, q_idx % hi_w
        kr, kc = kv_idx // lo_w, kv_idx % lo_w
        return (
            (kr >= row_lo[qr, qc])
            & (kr < row_hi[qr, qc])
            & (kc >= col_lo[qr, qc])
            & (kc < col_hi[qr, qc])
        )

    return _mask_mod
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Cross-attention (from layers/attention/chunked_attention.py)
274
+ # ---------------------------------------------------------------------------
275
+
276
class AttentionWrapper(nn.Module):
    """Holds a fused qkv projection but applies only the q and k slices.

    The value tensor is passed through untouched — the ``w_v``/``b_v`` thirds
    of the fused parameters are present (presumably for checkpoint layout
    compatibility) but never applied. Parameters are created with
    ``torch.empty`` and are expected to be filled from a checkpoint.
    """

    def __init__(self, qk_dim: int):
        super().__init__()
        self.in_proj_weight = nn.Parameter(torch.empty([qk_dim * 3, qk_dim]))
        self.in_proj_bias = nn.Parameter(torch.empty([qk_dim * 3]))

    def forward(self, x_q, x_k, x_v):
        weights = self.in_proj_weight.chunk(3, dim=0)
        biases = self.in_proj_bias.chunk(3)
        # Project only q and k; the value slice of the parameters is unused.
        projected = [x @ w.T + b for x, w, b in zip((x_q, x_k), weights, biases)]
        return projected[0], projected[1], x_v
288
+
289
+
290
class FlexCrossAttention(nn.Module):
    """Multi-head cross-attention executed through compiled FlexAttention.

    Queries and keys are RMS-normalised and linearly projected; the value
    tensor is used as-is (``AttentionWrapper`` returns it unprojected, and it
    is re-derived from ``value`` below in any case).
    """

    def __init__(self, qk_dim: int, num_heads: int, **kwargs):
        super().__init__()
        self.dim = qk_dim
        self.num_head = num_heads
        self.norm_q = nn.RMSNorm(qk_dim)
        self.norm_k = nn.RMSNorm(qk_dim)
        self.attention = AttentionWrapper(qk_dim)

    def forward(self, query, key, value, mask=None, **kwargs):
        # query: (b, HW, c) high-res tokens; key/value: (b, hw, c) low-res.
        x_q = self.norm_q(query)
        x_k = self.norm_k(key)
        x_q, x_k, x_v = self.attention(x_q, x_k, value)
        x_q = E.rearrange(x_q, "b HW (h d) -> b h HW d", h=self.num_head)
        x_k = E.rearrange(x_k, "b hw (h d) -> b h hw d", h=self.num_head)

        # The x_v returned by the wrapper is discarded; heads are split
        # directly from the raw value tensor (equivalent, since the wrapper
        # does not project values).
        x_v = E.rearrange(value, "b hw (h d) -> b h hw d", h=self.num_head)
        output = compiled_flex_attn_prefill(x_q, x_k, x_v, block_mask=mask)
        output = E.rearrange(output, "b h hw d -> b hw (h d)")

        return output
311
+
312
+
313
class CrossAttentionBlock(nn.Module):
    """Spatial cross-attention: high-res queries attend to low-res keys/values.

    A 3x3 convolution mixes the query features locally before attention.
    Inputs and outputs are in (b, c, h, w) layout; the attention itself runs
    on flattened (b, h*w, c) token sequences.
    """

    def __init__(
        self,
        qk_dim,
        num_heads,
        window_ratio: float = 0.1,
        **kwargs,
    ):
        super().__init__()
        self.cross_attn = FlexCrossAttention(qk_dim, num_heads)
        self.window_ratio = window_ratio
        self.conv2d = nn.Conv2d(
            qk_dim, qk_dim, kernel_size=3, stride=1, padding=1, bias=False
        )

    def forward(self, q, k, v, block_mask, **kwargs):
        batch, _, height, width = q.shape

        tokens = lambda t: E.rearrange(t, "b c h w -> b (h w) c")
        q_tokens = tokens(self.conv2d(q))
        out = self.cross_attn(q_tokens, tokens(k), tokens(v), mask=block_mask)
        # Restore the spatial layout of the attended features.
        return E.rearrange(out, "b (h w) c -> b c h w", h=height, w=width)
338
+
339
+
340
+ # ---------------------------------------------------------------------------
341
+ # AnyUp (from model.py)
342
+ # ---------------------------------------------------------------------------
343
+
344
# ImageNet normalisation statistics, shaped (1, 3, 1, 1) so they broadcast
# over (b, c, h, w) image batches.
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).reshape(1, 3, 1, 1)
346
+
347
+
348
def create_coordinate(h, w, start=0.0, end=1.0, device=None, dtype=None):
    """Return a normalised (row, col) coordinate grid flattened to (1, h*w, 2)."""
    rows = torch.linspace(start, end, h, device=device, dtype=dtype)
    cols = torch.linspace(start, end, w, device=device, dtype=dtype)
    grid_r, grid_c = torch.meshgrid(rows, cols, indexing="ij")
    return torch.stack((grid_r, grid_c), dim=-1).reshape(1, h * w, 2)
353
+
354
+
355
class AnyUp(nn.Module):
    """Attention-based feature upsampler.

    Lifts low-resolution backbone features to (by default) image resolution:
    the image is encoded into high-res queries and low-res keys, the
    AnyUp-internal 2-D RoPE injects position into the encoded image, and a
    cross-attention block (masked by the caller-provided ``attn_mask``)
    gathers the low-res features for every output pixel.
    """

    def __init__(
        self,
        input_dim=3,
        qk_dim=128,
        kernel_size=1,
        kernel_size_lfu=5,
        window_ratio=0.1,
        num_heads=4,
        init_gaussian_derivatives=False,
        **kwargs,
    ):
        super().__init__()
        self.qk_dim = qk_dim
        self.window_ratio = window_ratio
        # ResBlock settings shared by every encoder tower built below.
        self._rb_args = dict(
            kernel_size=1,
            num_groups=8,
            pad_mode="reflect",
            norm_fn=nn.GroupNorm,
            activation_fn=nn.SiLU,
        )

        self.image_encoder = self._make_encoder(input_dim, kernel_size)
        self.key_encoder = self._make_encoder(qk_dim, 1)
        self.query_encoder = self._make_encoder(qk_dim, 1)
        # Channel-count-agnostic encoder for the backbone features: its first
        # layer is a LearnedFeatureUnification instead of a fixed-width conv.
        self.key_features_encoder = self._make_encoder(
            None,
            1,
            first_layer_k=kernel_size_lfu,
            init_gaussian_derivatives=init_gaussian_derivatives,
        )

        self.cross_decode = CrossAttentionBlock(
            qk_dim=qk_dim, num_heads=num_heads, window_ratio=window_ratio
        )
        # Fuses image keys with encoded backbone features (hence 2 * qk_dim in).
        self.aggregation = self._make_encoder(2 * qk_dim, 3)

        self.rope = AnyUpRoPE(qk_dim)
        self.rope._device_weight_init()

        self._compiled_encoders = False

    def compile(self, *, mode: str | None = None, dynamic: bool = True):
        """Wrap all encoder towers in ``torch.compile`` (idempotent).

        NOTE(review): this shadows ``nn.Module.compile`` with a different
        signature — confirm no caller relies on the base-class behaviour.
        """
        if self._compiled_encoders:
            return self
        self.image_encoder = torch.compile(self.image_encoder, dynamic=dynamic, mode=mode)
        self.key_encoder = torch.compile(self.key_encoder, dynamic=dynamic, mode=mode)
        self.query_encoder = torch.compile(self.query_encoder, dynamic=dynamic, mode=mode)
        self.key_features_encoder = torch.compile(
            self.key_features_encoder, dynamic=dynamic, mode=mode
        )
        self.aggregation = torch.compile(self.aggregation, dynamic=dynamic, mode=mode)
        self._compiled_encoders = True
        return self

    def _make_encoder(
        self, in_ch, k, layers=2, first_layer_k=0, init_gaussian_derivatives=False
    ):
        # First layer: a plain conv when first_layer_k == 0, otherwise an LFU
        # that accepts any number of input channels (``in_ch`` is unused then).
        pre = (
            nn.Conv2d(
                in_ch,
                self.qk_dim,
                k,
                padding=k // 2,
                padding_mode="reflect",
                bias=False,
            )
            if first_layer_k == 0
            else LearnedFeatureUnification(
                self.qk_dim,
                first_layer_k,
                init_gaussian_derivatives=init_gaussian_derivatives,
            )
        )
        blocks = [
            ResBlock(self.qk_dim, self.qk_dim, **self._rb_args) for _ in range(layers)
        ]
        return nn.Sequential(pre, *blocks)

    def upsample(
        self, enc_img, feats, attn_mask, out_size, vis_attn=False, q_chunk_size=None
    ):
        """Cross-attend high-res queries to low-res keys over ``feats`` values."""
        b, c, h, w = feats.shape

        # Queries at the target resolution; keys at the feature resolution.
        q = F.adaptive_avg_pool2d(self.query_encoder(enc_img), output_size=out_size)
        k = F.adaptive_avg_pool2d(self.key_encoder(enc_img), output_size=(h, w))
        k = torch.cat([k, self.key_features_encoder(F.normalize(feats, dim=1))], dim=1)
        k = self.aggregation(k)
        v = feats  # values are the raw backbone features

        result = self.cross_decode(
            q, k, v, attn_mask, vis_attn=vis_attn, q_chunk_size=q_chunk_size
        )
        return result

    def forward(
        self,
        images,
        features,
        attn_mask,
        output_size=None,
        vis_attn=False,
        q_chunk_size=None,
    ):
        """Upsample ``features`` to ``output_size`` (default: image resolution).

        ``images`` are assumed to arrive in [-1, 1]; they are remapped to
        [0, 1] and then ImageNet-normalised below — TODO confirm with callers.
        """
        output_size = output_size if output_size is not None else images.shape[-2:]
        images = images * 0.5 + 0.5
        images = (images - IMAGENET_MEAN.to(images)) / IMAGENET_STD.to(images)
        images = images.to(features)
        enc = self.image_encoder(images)
        h = enc.shape[-2]
        # Apply 2-D RoPE over the flattened (h*w) token sequence, then restore
        # the spatial (b, c, h, w) layout.
        coords = create_coordinate(h, enc.shape[-1], device=enc.device, dtype=enc.dtype)
        enc = enc.permute(0, 2, 3, 1).view(enc.shape[0], -1, enc.shape[1])
        enc = self.rope(enc, coords)
        enc = enc.view(enc.shape[0], h, -1, enc.shape[-1]).permute(0, 3, 1, 2)

        result = self.upsample(
            enc,
            features,
            attn_mask,
            output_size,
            vis_attn=vis_attn,
            q_chunk_size=q_chunk_size,
        )
        return result
attention.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import Tensor as T
3
+ from torch.nn.attention.flex_attention import (
4
+ BlockMask,
5
+ _mask_mod_signature,
6
+ and_masks,
7
+ create_block_mask,
8
+ flex_attention,
9
+ or_masks,
10
+ )
11
+
12
+ # ---------------------------------------------------------------------------
13
+ # Two compiled variants of flex_attention
14
+ # ---------------------------------------------------------------------------
15
+ # _decode: fullgraph=True, static shapes.
16
+ # Used for decode steps (S_q == 1) where shapes are fixed and
17
+ # the call will be captured inside a CUDA graph. fullgraph=True
18
+ # avoids graph breaks that would corrupt the capture.
19
+ #
20
+ # _prefill: dynamic=True, symbolic shapes.
21
+ # Used for prefill steps (S_q > 1) where the sequence length
22
+ # varies per image. dynamic=True lets one compiled graph handle
23
+ # all lengths without recompilation. Prefill is never inside a
24
+ # CUDA graph, so symbolic shape guards are fine.
25
+ compiled_flex_attn_decode = torch.compile(flex_attention, fullgraph=True)
26
+ compiled_flex_attn_prefill = torch.compile(flex_attention, dynamic=True)
27
+
28
+
29
+ def offset_mask_mod(mask_mod: _mask_mod_signature, offset: int):
30
+ """Get a mask mod function with an offset applied to the query positions."""
31
+
32
+ def _mask_mod(b, h, q, kv):
33
+ return mask_mod(b, h, q + offset, kv)
34
+
35
+ return _mask_mod
36
+
37
+
38
+ def get_causal_mask_mod() -> _mask_mod_signature:
39
+ """Causal mask that prevents attention to future tokens."""
40
+
41
+ def _causal_mask(b: T, h: T, q_idx: T, kv_idx: T) -> T:
42
+ return q_idx >= kv_idx
43
+
44
+ return _causal_mask
45
+
46
+
47
+ def get_document_mask_mod(batch: T, eos_id: int) -> _mask_mod_signature:
48
+ """Creates a document mask that prevents attention across document boundaries.
49
+
50
+ Args:
51
+ batch: Input batch tensor with shape [b, s, h, d]
52
+ eos_id: End-of-sequence token ID that marks document boundaries
53
+
54
+ Returns:
55
+ A mask modifier function that implements document-level masking.
56
+ """
57
+ # batch is [b, s, h, d] shape
58
+ eos_mask = batch == eos_id
59
+ eos_mask[:, -1] = True
60
+ cumulative_mask = torch.cumsum(torch.where(eos_mask, 1, 0), dim=1)
61
+ sequence_indices = torch.zeros_like(cumulative_mask, dtype=torch.int32)
62
+ sequence_indices[:, 1:] = cumulative_mask[:, :-1]
63
+
64
+ def document_mask(b: T, h: T, q_idx: T, kv_idx: T) -> T:
65
+ return sequence_indices[b, q_idx] == sequence_indices[b, kv_idx]
66
+
67
+ return document_mask
68
+
69
+
70
+ def get_non_left_pad_mask_mod(batch: T, pad_id: int) -> _mask_mod_signature:
71
+ """Prevent model from attending to the left-padded token required for correct batch inference."""
72
+
73
+ non_pad_mask_id = torch.cumsum(batch != pad_id, dim=1)
74
+
75
+ # Left-most pad tokens have cumulative id == 0.
76
+ def mask_mod(b, h, q_idx, kv_idx):
77
+ return non_pad_mask_id[b, kv_idx] > 0
78
+
79
+ return mask_mod
80
+
81
+
82
+ def get_image_prefix_mask_mod(
83
+ batch: T, soi_id: int, eoi_id: int
84
+ ) -> _mask_mod_signature:
85
+ # batch is [b, s, h, d] shape
86
+ soi_mask = batch == soi_id
87
+ eoi_mask = batch == eoi_id
88
+ acc_soi_mask = torch.cumsum(soi_mask, dim=1)
89
+ acc_eoi_mask = torch.cumsum(eoi_mask, dim=1)
90
+ # Get every tokens between two soi_id and eoi_id exclusive of eoi_id
91
+ img_mask = (acc_soi_mask - acc_eoi_mask) > 0
92
+
93
+ # Create a tensor that assigns each token to its image number
94
+ # Each image starts with SOI token, so we can use acc_soi_mask to track image numbers
95
+ img_indices = acc_soi_mask * img_mask
96
+
97
+ def image_prefix_mask_mod(b, h, q_idx, kv_idx):
98
+ # Check if both tokens are image tokens and belong to the same image
99
+ is_img_tokens = img_mask[b, q_idx] & img_mask[b, kv_idx]
100
+ is_same_image = img_indices[b, q_idx] == img_indices[b, kv_idx]
101
+ return is_img_tokens & is_same_image
102
+
103
+ return image_prefix_mask_mod
104
+
105
+
106
# Compiled once at import time; dynamic=True handles varying mask sizes.
# Note: can't use mode = 'reduce-overhead' here because it uses internal CUDA
# graph trees on private streams, causing manual capture to record empty graphs
_compiled_create_block_mask = torch.compile(
    create_block_mask, dynamic=True
)


@torch.inference_mode()
def create_attention_mask(*args, **kwargs) -> BlockMask:
    """
    Thin pass-through to the compiled ``create_block_mask``.

    NOTE: We compile this for performance/memory reasons in large masks. To reduce
    recompiles due to grad_mode flips, we always run mask creation under inference_mode.
    """
    return _compiled_create_block_mask(*args, **kwargs)
118
+
119
+
120
def create_batch_attention_mask(
    input_batch: T,
    *,
    pad_token_id: int,
    eos_token_id: int,
    soi_token_id: int,
    eoi_token_id: int,
    max_len: int | None = None,
) -> BlockMask:
    """Build the combined FlexAttention BlockMask for the batch engine.

    The base mask is causal AND same-document AND not-left-pad; image tokens
    additionally receive bidirectional attention within their own image span
    (OR-ed on top of the base mask).

    Args:
        input_batch: Token-id tensor of shape ``[B, S]``.
        max_len: Side length of the mask; defaults to ``S`` when falsy.
    """
    batch_size, seq_len = input_batch.size()
    base_mod = and_masks(
        get_causal_mask_mod(),
        get_document_mask_mod(input_batch, eos_token_id),
        get_non_left_pad_mask_mod(input_batch, pad_token_id),
    )
    image_mod = get_image_prefix_mask_mod(
        batch=input_batch,
        soi_id=soi_token_id,
        eoi_id=eoi_token_id,
    )
    combined = or_masks(image_mod, base_mod)
    side = max_len or seq_len
    return create_attention_mask(combined, batch_size, None, side, side)
config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "FalconPerceptionForSegmentation"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_falcon_perception.FalconPerceptionConfig",
7
+ "AutoModelForCausalLM": "modeling_falcon_perception.FalconPerceptionForSegmentation"
8
+ },
9
+ "model_type": "falcon_perception",
10
+ "torch_dtype": "float32",
11
+ "dim": 1024,
12
+ "n_layers": 28,
13
+ "n_heads": 16,
14
+ "head_dim": 128,
15
+ "n_kv_heads": 8,
16
+ "vocab_size": 65536,
17
+ "ffn_dim": 3072,
18
+ "norm_eps": 1e-05,
19
+ "max_seq_len": 8192,
20
+ "rope_theta": 10000,
21
+ "channel_size": 3,
22
+ "spatial_patch_size": 16,
23
+ "temporal_patch_size": 1,
24
+ "do_segmentation": true,
25
+ "segm_out_dim": 256,
26
+ "num_segm_layers": 3,
27
+ "coord_enc_dim": 512,
28
+ "coord_dec_dim": 8192,
29
+ "coord_out_dim": 2048,
30
+ "coord_token_id": 240,
31
+ "size_enc_dim": 512,
32
+ "size_dec_dim": 8192,
33
+ "size_out_dim": 2048,
34
+ "size_token_id": 241,
35
+ "seg_token_id": 262,
36
+ "eos_id": 11,
37
+ "img_id": 227,
38
+ "image_cls_token_id": 244,
39
+ "image_reg_1_token_id": 245,
40
+ "image_reg_2_token_id": 246,
41
+ "image_reg_3_token_id": 247,
42
+ "image_reg_4_token_id": 248,
43
+ "img_end_id": 230
44
+ }
configuration_falcon_perception.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
class FalconPerceptionConfig(PretrainedConfig):
    """HuggingFace configuration for FalconPerception.

    Stores every constructor argument verbatim as an attribute and then
    delegates remaining kwargs to ``PretrainedConfig``. The defaults mirror
    the shipped ``config.json``.
    """

    model_type = "falcon_perception"

    def __init__(
        self,
        # --- transformer backbone ---
        dim: int = 1024,
        n_layers: int = 28,
        n_heads: int = 16,
        head_dim: int = 128,
        n_kv_heads: int = 8,
        vocab_size: int = 65536,
        ffn_dim: int = 3072,
        norm_eps: float = 1e-5,
        max_seq_len: int = 8192,
        rope_theta: int = 10000,
        # --- vision patchification ---
        channel_size: int = 3,
        spatial_patch_size: int = 16,
        temporal_patch_size: int = 1,
        # --- segmentation head ---
        do_segmentation: bool = True,
        segm_out_dim: int = 256,
        num_segm_layers: int = 3,
        # --- coord head dims / token id ---
        coord_enc_dim: int = 512,
        coord_dec_dim: int = 8192,
        coord_out_dim: int = 2048,
        coord_token_id: int = 240,
        # --- size head dims / token id ---
        size_enc_dim: int = 512,
        size_dec_dim: int = 8192,
        size_out_dim: int = 2048,
        size_token_id: int = 241,
        # --- special token ids ---
        seg_token_id: int = 262,
        eos_id: int = 11,
        img_id: int = 227,
        image_cls_token_id: int = 244,
        image_reg_1_token_id: int = 245,
        image_reg_2_token_id: int = 246,
        image_reg_3_token_id: int = 247,
        image_reg_4_token_id: int = 248,
        img_end_id: int = 230,
        **kwargs,
    ):
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.ffn_dim = ffn_dim
        self.norm_eps = norm_eps
        self.max_seq_len = max_seq_len
        self.rope_theta = rope_theta
        self.channel_size = channel_size
        self.spatial_patch_size = spatial_patch_size
        self.temporal_patch_size = temporal_patch_size
        self.do_segmentation = do_segmentation
        self.segm_out_dim = segm_out_dim
        self.num_segm_layers = num_segm_layers
        self.coord_enc_dim = coord_enc_dim
        self.coord_dec_dim = coord_dec_dim
        self.coord_out_dim = coord_out_dim
        self.coord_token_id = coord_token_id
        self.size_enc_dim = size_enc_dim
        self.size_dec_dim = size_dec_dim
        self.size_out_dim = size_out_dim
        self.size_token_id = size_token_id
        self.seg_token_id = seg_token_id
        self.eos_id = eos_id
        self.img_id = img_id
        self.image_cls_token_id = image_cls_token_id
        self.image_reg_1_token_id = image_reg_1_token_id
        self.image_reg_2_token_id = image_reg_2_token_id
        self.image_reg_3_token_id = image_reg_3_token_id
        self.image_reg_4_token_id = image_reg_4_token_id
        self.img_end_id = img_end_id
        super().__init__(**kwargs)
main_fig.jpg ADDED

Git LFS Details

  • SHA256: a25a745799ac3cf2967620af6936fa35cb3534b73ff93e42acd37adb67a03c34
  • Pointer size: 132 Bytes
  • Size of remote file: 1.48 MB
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32b342ee3cb22d05a380b26aac2ddaaee0d7479093c2007ac5618d8d19f5272e
3
+ size 632397880
model_args.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "channel_size": 3,
3
+ "coord_dec_dim": 8192,
4
+ "coord_enc_dim": 512,
5
+ "coord_out_dim": 2048,
6
+ "coord_token_id": 240,
7
+ "dim": 1024,
8
+ "eos_id": 11,
9
+ "ffn_dim": 3072,
10
+ "head_dim": 128,
11
+ "image_cls_token_id": 244,
12
+ "image_reg_1_token_id": 245,
13
+ "image_reg_2_token_id": 246,
14
+ "image_reg_3_token_id": 247,
15
+ "image_reg_4_token_id": 248,
16
+ "img_end_id": 230,
17
+ "img_id": 227,
18
+ "img_row_sep_id": 228,
19
+ "img_start_id": 229,
20
+ "max_seq_len": 8192,
21
+ "n_heads": 16,
22
+ "n_kv_heads": 8,
23
+ "n_layers": 28,
24
+ "norm_eps": 1e-05,
25
+ "num_segm_layers": 3,
26
+ "perception_heads": true,
27
+ "rope_theta": 10000,
28
+ "seg_token_id": 262,
29
+ "segm_out_dim": 256,
30
+ "size_dec_dim": 8192,
31
+ "size_enc_dim": 512,
32
+ "size_out_dim": 2048,
33
+ "size_token_id": 241,
34
+ "spatial_patch_size": 16,
35
+ "temporal_patch_size": 1,
36
+ "vocab_size": 65536
37
+ }
modeling_falcon_perception.py ADDED
@@ -0,0 +1,935 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from pathlib import Path
3
+
4
+ import einops as E
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import triton
9
+ import triton.language as tl
10
+ from PIL import Image
11
+ from pycocotools import mask as mask_utils
12
+ from torch import Tensor as T
13
+ from torch import nn
14
+ from torch.nn.attention.flex_attention import (
15
+ AuxRequest,
16
+ BlockMask,
17
+ )
18
+ from transformers import AutoTokenizer, PreTrainedModel
19
+
20
+ from .anyup import AnyUp, get_attention_mask_mod as get_upsampler_attn_mask_mod
21
+ from .attention import (
22
+ compiled_flex_attn_decode,
23
+ compiled_flex_attn_prefill,
24
+ create_attention_mask,
25
+ create_batch_attention_mask,
26
+ offset_mask_mod,
27
+ )
28
+ from .configuration_falcon_perception import FalconPerceptionConfig
29
+ from .processing_falcon_perception import load_image, process_batch
30
+ from .rope import (
31
+ apply_3d_rotary_emb,
32
+ apply_golden_freqs_cis_to_visual_pos,
33
+ precompute_freqs_cis,
34
+ )
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Sub-modules: Heads
39
+ # ---------------------------------------------------------------------------
40
+
41
class FourierEncoder(nn.Module):
    """Fourier-feature encoder: linear embed -> [cos, sin] -> linear transform."""

    def __init__(self, in_dim: int, feat_dim: int, out_dim: int):
        super().__init__()
        # Embed to feat_dim // 2 phases; cos/sin doubles back to feat_dim.
        self.embed = nn.Linear(in_dim, feat_dim // 2, bias=False)
        self.transform = nn.Linear(feat_dim, out_dim, bias=False)

    def forward(self, x):
        phase = self.embed(x) * (2 * math.pi)
        fourier = torch.cat((phase.cos(), phase.sin()), dim=-1)
        return self.transform(fourier)
51
+
52
+
53
class BboxDecoder(nn.Module):
    """Two-layer bias-free MLP head with a squared-ReLU nonlinearity."""

    def __init__(self, in_dim: int, hidden_dim: int, out_dim: int) -> None:
        super().__init__()
        self.w1 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, out_dim, bias=False)

    def forward(self, x: T) -> T:
        # relu(w1 x)^2, then the output projection.
        hidden = F.relu(self.w1(x))
        return self.w2(hidden * hidden)
61
+
62
+
63
class SegmDecoder(nn.Module):
    """Stack of squared-ReLU hidden layers followed by a bias-free pixel projection.

    With ``num_layers == 1`` there are no hidden layers and the decoder reduces
    to a single linear projection.
    """

    def __init__(self, in_dim: int, out_dim: int, num_layers: int) -> None:
        super().__init__()
        hidden = [nn.Linear(in_dim, in_dim) for _ in range(num_layers - 1)]
        self.layers = nn.ModuleList(hidden)
        self.pixel_layer = nn.Linear(in_dim, out_dim, bias=False)

    def forward(self, x) -> torch.Tensor:
        for hidden_layer in self.layers:
            activated = F.relu(hidden_layer(x))
            x = activated * activated
        return self.pixel_layer(x)
73
+
74
+
75
+ # ---------------------------------------------------------------------------
76
+ # Sub-modules: Attention
77
+ # ---------------------------------------------------------------------------
78
+
79
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each KV head ``n_rep`` times along the head axis (GQA expansion).

    Input/output layout is (batch, seq, heads, head_dim); the identity case
    returns the input tensor unchanged.
    """
    if n_rep == 1:
        return x
    batch, seq, heads, dim = x.shape
    expanded = x[:, :, :, None, :].expand(batch, seq, heads, n_rep, dim)
    return expanded.reshape(batch, seq, heads * n_rep, dim)
84
+
85
+
86
class Attention(nn.Module):
    """Grouped-query attention with fused QKV, QK RMS-norm, and learned attention sinks.

    The flex-attention kernel is chosen by query length (single-token decode vs.
    prefill), and the attention output is rescaled per head by a sigmoid of
    (lse - sink logit) before the output projection.
    """

    def __init__(self, config: FalconPerceptionConfig, layer_id: int):
        super().__init__()
        self.layer_id = layer_id
        # Fall back to full multi-head attention when n_kv_heads is unset/0.
        self.n_kv_heads = config.n_kv_heads or config.n_heads
        self.n_rep = config.n_heads // self.n_kv_heads
        self.head_dim = config.head_dim or config.dim // config.n_heads
        self.q_dim = config.n_heads * self.head_dim
        self.kv_dim = self.n_kv_heads * self.head_dim

        # Single fused projection producing Q, K and V slices.
        self.wqkv = nn.Linear(config.dim, self.q_dim + 2 * self.kv_dim, bias=False)
        self.wo = nn.Linear(config.n_heads * self.head_dim, config.dim, bias=False)
        # Per-head "sink" logits; initialized elsewhere (loaded from checkpoint).
        self.sinks = nn.Parameter(torch.empty((config.n_heads,)))

    def _pre_attention_qkv(self, x) -> tuple[T, T, T]:
        """Pre-norm the input, project to QKV, apply QK-norm, and expand KV heads."""
        qkv = self.wqkv(F.rms_norm(x, (x.size(-1),)))
        xq, xk, xv = qkv.split([self.q_dim, self.kv_dim, self.kv_dim], dim=-1)
        xq = E.rearrange(xq, "b s (h d) -> b s h d", d=self.head_dim)
        xk = E.rearrange(xk, "b s (h d) -> b s h d", d=self.head_dim)
        xv = E.rearrange(xv, "b s (h d) -> b s h d", d=self.head_dim)
        # QK-norm: per-head RMS normalization of queries and keys.
        xq = F.rms_norm(xq, (xq.size(-1),))
        xk = F.rms_norm(xk, (xk.size(-1),))
        # Expand KV heads to match the number of query heads (GQA).
        xk = repeat_kv(xk, n_rep=self.n_rep)
        xv = repeat_kv(xv, n_rep=self.n_rep)
        return xq, xk, xv

    def _post_attention(self, output: T, lse: T) -> T:
        """Apply the attention-sink rescaling and the output projection.

        `lse` is the log-sum-exp over attention logits returned by flex
        attention; sigmoid(lse - sink) downweights heads whose total attention
        mass is dominated by the learned sink.
        """
        sinks_BHS = self.sinks.view(1, -1, 1)
        sink_scale = torch.sigmoid(lse - sinks_BHS)
        output = (output * sink_scale.unsqueeze(-1)).to(output.dtype)
        # (b, h, s, d) -> (b, s, h*d)
        output = output.permute(0, 2, 1, 3).contiguous().flatten(2)
        return self.wo(output)

    def compile_attention(self, *, dynamic: bool = True, mode: str = "default"):
        """torch.compile the pre/post attention stages (kernel itself is precompiled)."""
        self._pre_attention_qkv = torch.compile(self._pre_attention_qkv, dynamic=dynamic, mode=mode)
        self._post_attention = torch.compile(self._post_attention, dynamic=dynamic, mode=mode)

    def forward(
        self, x: T, attention_masks: BlockMask, freqs_cis: T,
        freqs_cis_2d: T | None = None, pos_hw: T | None = None,
        kv_cache=None, input_pos=None, batch_idx=None,
        flex_attn_kernel_options=None,
    ):
        # NOTE(review): flex_attn_kernel_options is accepted but unused here — confirm intentional.
        xq, xk, xv = self._pre_attention_qkv(x)
        # 3D RoPE: temporal freqs_cis plus optional 2D (h, w) visual freqs.
        xq, xk = apply_3d_rotary_emb(xq, xk, freqs_cis, freqs_cis_2d, pos_hw)
        xq = E.rearrange(xq, "b s h d -> b h s d")
        xk = E.rearrange(xk, "b s h d -> b h s d")
        xv = E.rearrange(xv, "b s h d -> b h s d")
        # Append this step's K/V to the cache and attend over the full prefix.
        xk, xv = kv_cache.insert_kv(self.layer_id, xk, xv, input_pos=input_pos, batch_idx=batch_idx)
        # Single-query step -> decode kernel; otherwise prefill kernel.
        flex_fn = compiled_flex_attn_decode if xq.shape[2] == 1 else compiled_flex_attn_prefill
        output, aux_output = flex_fn(xq, xk, xv, block_mask=attention_masks, return_aux=AuxRequest(lse=True))
        return self._post_attention(output, aux_output.lse)
138
+
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Sub-modules: FeedForward
142
+ # ---------------------------------------------------------------------------
143
+
144
@triton.jit
def _squared_relu_gate_kernel(
    packed_ptr, out_ptr, n_rows, n_cols,
    in_row_stride, in_col_stride, out_row_stride, out_col_stride,
    BLOCK_SIZE: tl.constexpr,
):
    # Fused gate activation: out[r, c] = relu(gate)^2 * up, where the input
    # interleaves gate/up along the last dim (gate at even columns 2c,
    # up at odd columns 2c+1). One program handles BLOCK_SIZE flat elements.
    pid = tl.program_id(0)
    n_elements = n_rows * n_cols
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements  # guard the ragged tail block
    rows = offsets // n_cols
    cols = offsets % n_cols
    # Strided addresses for the interleaved gate/up pair and the output slot.
    gate_idx = rows * in_row_stride + (2 * cols) * in_col_stride
    up_idx = rows * in_row_stride + (2 * cols + 1) * in_col_stride
    out_idx = rows * out_row_stride + cols * out_col_stride
    gate = tl.load(packed_ptr + gate_idx, mask=mask)
    up = tl.load(packed_ptr + up_idx, mask=mask)
    gate = tl.where(gate > 0, gate, 0.0)  # ReLU
    out = gate * gate * up  # squared-ReLU gate times the up projection
    tl.store(out_ptr + out_idx, out, mask=mask)
164
+
165
+
166
def squared_relu_gate(packed: T, hidden_dim: int) -> T:
    """Fused squared-ReLU gate: ``out = relu(gate)^2 * up``.

    ``packed`` interleaves gate/up values along its last dimension (gate at
    even columns, up at odd columns); the result keeps the leading dims of
    ``packed`` and has ``hidden_dim`` trailing columns.
    """
    flat = packed.flatten(0, -2)
    rows = flat.shape[0]
    cols = hidden_dim
    result = torch.empty((rows, cols), device=packed.device, dtype=packed.dtype)
    total = rows * cols
    grid = lambda meta: (triton.cdiv(total, meta["BLOCK_SIZE"]),)
    _squared_relu_gate_kernel[grid](
        flat, result, rows, cols,
        flat.stride(0), flat.stride(1),
        result.stride(0), result.stride(1),
        BLOCK_SIZE=1024,
    )
    return result.view(*packed.shape[:-1], hidden_dim)
180
+
181
+
182
class FeedForward(nn.Module):
    """Pre-norm gated MLP whose gate/up weights are interleaved in one projection.

    The fused ``w13`` linear emits gate and up values interleaved along the
    last dim; ``squared_relu_gate`` combines them before the down projection.
    """

    def __init__(self, dim: int, hidden_dim: int):
        super().__init__()
        self.w13 = nn.Linear(dim, 2 * hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.hidden_dim = hidden_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = F.rms_norm(x, (x.size(-1),))
        packed = self.w13(normed)
        gated = squared_relu_gate(packed, self.hidden_dim)
        return self.w2(gated)
193
+
194
+
195
+ # ---------------------------------------------------------------------------
196
+ # Sub-modules: TransformerBlock
197
+ # ---------------------------------------------------------------------------
198
+
199
class TransformerBlock(nn.Module):
    """Standard residual transformer block: attention followed by feed-forward."""

    def __init__(self, layer_id: int, config: FalconPerceptionConfig):
        super().__init__()
        self.attention = Attention(config, layer_id)
        self.feed_forward = FeedForward(config.dim, config.ffn_dim)

    def compile(self, *, dynamic: bool = True, mode: str = "default"):
        """torch.compile the feed-forward and the attention pre/post stages."""
        self.feed_forward = torch.compile(self.feed_forward, dynamic=dynamic, mode=mode)
        self.attention.compile_attention(dynamic=dynamic, mode=mode)
        return self

    def forward(
        self, x: T, freqs_cis: T, freqs_cis_2d: T | None = None,
        pos_hw: T | None = None, attention_masks=None, kv_cache=None,
        input_pos=None, batch_idx=None, flex_attn_kernel_options=None,
    ):
        batch, seq, dim = x.shape
        attn_out = self.attention(
            x, freqs_cis=freqs_cis, freqs_cis_2d=freqs_cis_2d, pos_hw=pos_hw,
            attention_masks=attention_masks, kv_cache=kv_cache,
            input_pos=input_pos, batch_idx=batch_idx,
            flex_attn_kernel_options=flex_attn_kernel_options,
        )
        hidden = x + attn_out
        hidden = hidden + self.feed_forward(hidden)
        return hidden.reshape(batch, seq, dim)
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # KV Cache
228
+ # ---------------------------------------------------------------------------
229
+
230
class KVCache:
    """Lazily-allocated key/value cache for incremental transformer decoding.

    Storage layout is (layers, k/v, batch, heads, seq, head_dim), allocated on
    the first ``insert_kv`` call so dtype/device match the incoming tensors.
    ``pos`` tracks the filled sequence length and only advances after the last
    layer has written its step.
    """

    def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, num_layers):
        self.kv_shape = (num_layers, 2, max_batch_size, n_heads, max_seq_length, head_dim)
        self.kv_cache = None
        self.pos = 0
        self.pos_t: T | None = None

    def reset(self):
        """Rewind the cache without freeing the backing storage."""
        self.pos = 0
        self.pos_t = None

    def get_pos(self):
        """Return the number of positions currently filled."""
        return self.pos

    def set_pos_t(self, pos_t):
        """Set the per-sample RoPE position tensor used during decode."""
        self.pos_t = pos_t

    def increment_and_get_pos_t(self):
        """Advance the RoPE position tensor in place by one step and return it."""
        assert self.pos_t is not None
        self.pos_t += 1
        return self.pos_t

    def insert_kv(self, layer_id: int, k: T, v: T, **kwargs):
        """Append k/v (b, h, t, d) for one layer; return views over the filled prefix."""
        del kwargs  # input_pos / batch_idx are accepted for API parity, unused here
        assert self.pos_t is not None
        if self.kv_cache is None:
            self.kv_cache = torch.empty(self.kv_shape, dtype=k.dtype, device=k.device)
        added = k.size(2)
        start, end = self.pos, self.pos + added
        self.kv_cache[layer_id, 0, :, :, start:end] = k
        self.kv_cache[layer_id, 1, :, :, start:end] = v
        keys = self.kv_cache[layer_id, 0, :, :, :end]
        values = self.kv_cache[layer_id, 1, :, :, :end]
        # Only the final layer's write commits the new position, so every
        # layer in the same step sees the same `start`.
        if layer_id == self.kv_cache.size(0) - 1:
            self.pos = end
        return keys, values
266
+
267
+
268
+ # ---------------------------------------------------------------------------
269
+ # Sampling
270
+ # ---------------------------------------------------------------------------
271
+
272
@torch.inference_mode()
def sample_next_token(logits, rng, temperature=0.0, top_k=None):
    """Sample the next token id from `logits` (B, V); returns shape (B, 1).

    ``temperature == 0.0`` means greedy argmax; otherwise softmax sampling,
    restricted to the ``top_k`` highest logits when given.
    """
    assert temperature >= 0.0
    if temperature == 0.0:
        # Greedy decoding.
        return torch.argmax(logits, dim=-1, keepdim=True)
    if top_k is None:
        probs = F.softmax(logits / temperature, dim=-1)
        return torch.multinomial(probs, num_samples=1, generator=rng)
    # Top-k: sample within the k best logits, then map back to vocab ids.
    k = min(top_k, logits.size(-1))
    top_vals, top_idx = torch.topk(logits, k, dim=-1)
    probs = F.softmax(top_vals / temperature, dim=-1)
    choice = torch.multinomial(probs, num_samples=1, generator=rng)
    return top_idx.gather(1, choice)
287
+
288
+
289
+ # ---------------------------------------------------------------------------
290
+ # Main Model
291
+ # ---------------------------------------------------------------------------
292
+
293
class FalconPerceptionForSegmentation(PreTrainedModel):
    """Falcon perception model: autoregressive decoder with coordinate, size and
    segmentation heads, plus an AnyUp feature upsampler for mask prediction.
    """

    config_class = FalconPerceptionConfig
    _no_split_modules = ["TransformerBlock"]

    def __init__(self, config: FalconPerceptionConfig):
        super().__init__(config)
        # One image patch spans temporal_patch_size * spatial_patch_size^2 pixels per channel.
        img_in_dim = config.temporal_patch_size * config.spatial_patch_size ** 2 * config.channel_size
        self.img_projector = nn.Linear(img_in_dim, config.dim, bias=False)
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)

        # ModuleDict keyed by stringified layer id (keeps checkpoint key names stable).
        self.layers = nn.ModuleDict()
        for layer_id in range(config.n_layers):
            self.layers[str(layer_id)] = TransformerBlock(layer_id, config)

        self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps)
        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)

        # Continuous heads: Fourier encoders inject (x, y) / (h, w) values into the
        # token stream; BboxDecoders read them back out as binned logits.
        self.coord_encoder = FourierEncoder(2, config.coord_enc_dim, config.dim)
        self.coord_decoder = BboxDecoder(config.dim, config.coord_dec_dim, config.coord_out_dim)
        self.size_encoder = FourierEncoder(2, config.size_enc_dim, config.dim)
        self.size_decoder = BboxDecoder(config.dim, config.size_dec_dim, config.size_out_dim)

        if config.do_segmentation:
            self.itok_upsampler = AnyUp()
            self.proj_segm = SegmDecoder(config.dim, config.segm_out_dim, config.num_segm_layers)
            self.conv_segm = nn.Conv2d(config.dim, config.segm_out_dim, kernel_size=3, padding=1)

        rope_dim = config.head_dim // 2
        freqs_cis = precompute_freqs_cis(rope_dim, config.max_seq_len, config.rope_theta)
        # "Golden" 2D frequencies are learned/loaded from the checkpoint (persistent).
        freqs_cis_golden = torch.empty((config.n_heads, rope_dim // 2, 2), dtype=torch.float)
        self.register_buffer("freqs_cis", freqs_cis, persistent=False)
        self.register_buffer("freqs_cis_golden", freqs_cis_golden, persistent=True)

        self._weights_fused = False
        self._is_compiled = False

        self.post_init()

    # -- Weight management ---------------------------------------------------

    def _ensure_device_buffers(self):
        """Recompute non-persistent buffers that HF meta-device loading may discard."""
        if self._weights_fused:
            return
        device = self.tok_embeddings.weight.device
        c = self.config
        rope_dim = c.head_dim // 2
        freqs_cis = precompute_freqs_cis(rope_dim, c.max_seq_len, c.rope_theta).to(device)
        self.register_buffer("freqs_cis", freqs_cis, persistent=False)
        if self.freqs_cis_golden.device != device:
            self.freqs_cis_golden = self.freqs_cis_golden.to(device)
        self._weights_fused = True

    def compile_model(self):
        """torch.compile all per-layer and head modules (idempotent)."""
        if self._is_compiled:
            return
        # cudagraphs interacts badly with the varying shapes used here.
        torch._inductor.config.triton.cudagraphs = False
        for layer in self.layers.values():
            layer.compile(dynamic=True, mode="default")
        self.coord_encoder = torch.compile(self.coord_encoder, dynamic=True, mode="default")
        self.coord_decoder = torch.compile(self.coord_decoder, dynamic=True, mode="default")
        self.size_encoder = torch.compile(self.size_encoder, dynamic=True, mode="default")
        self.size_decoder = torch.compile(self.size_decoder, dynamic=True, mode="default")
        if self.config.do_segmentation:
            self.itok_upsampler.compile(mode="default", dynamic=True)
        self._is_compiled = True

    # -- Tokenizer -----------------------------------------------------------

    def _get_tokenizer(self):
        """Lazily load the tokenizer from the model path and cache it on self."""
        if not hasattr(self, "_tokenizer"):
            import os
            path = self.config._name_or_path
            is_local = os.path.exists(path)
            self._tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=is_local, trust_remote_code=True)
            # Mirror special tokens as attributes (<name> and <name>_id) for convenience.
            for token_name, token in self._tokenizer.special_tokens_map.items():
                if isinstance(token, str):
                    setattr(self._tokenizer, token_name, token)
                    setattr(
                        self._tokenizer, token_name + "_id",
                        self._tokenizer.convert_tokens_to_ids(token),
                    )
        return self._tokenizer

    # -- Attention mask ------------------------------------------------------

    def get_attention_mask(self, input_batch: T, max_len: int | None = None):
        """Build the batched BlockMask for the decoder over `input_batch` tokens."""
        return create_batch_attention_mask(
            input_batch,
            pad_token_id=self._pad_token_id,
            eos_token_id=self.config.eos_id,
            soi_token_id=self.config.image_cls_token_id,
            eoi_token_id=self.config.img_end_id,
            max_len=max_len,
        )

    def get_upsampler_attn_mask(self, H, W, h, w, device):
        """Cross-attention mask from high-res pixel queries (H*W) to low-res features (h*w)."""
        return create_attention_mask(
            get_upsampler_attn_mask_mod(H, W, h, w, device=device),
            B=None, H=None, Q_LEN=H * W, KV_LEN=h * w,
        )

    # -- Embedding helpers ---------------------------------------------------

    def _scatter_img_tokens_with_projector(self, h_BSD, pixel_patches_NLC, pixel_masks_NTHW, tokens_BS):
        """Project valid image patches and scatter them into `<img>` token slots."""
        B, S, D = h_BSD.shape
        # A patch is valid if any pixel inside it is unmasked.
        pixel_patch_mask = E.reduce(
            pixel_masks_NTHW,
            "n (t pt) (h ph) (w pw) -> (n t h w)",
            reduction="any",
            pt=self.config.temporal_patch_size,
            ph=self.config.spatial_patch_size,
            pw=self.config.spatial_patch_size,
        )
        pixel_patches_flat = E.rearrange(pixel_patches_NLC, "n p c -> (n p) c")
        valid_patches = pixel_patches_flat[pixel_patch_mask]
        valid_feats = self.img_projector(valid_patches)
        img_mask_h_BSD = E.repeat(tokens_BS == self.config.img_id, "b s -> b s d", d=D)
        # The number of projected patches must match the number of <img> tokens.
        assert valid_feats.numel() == img_mask_h_BSD.sum()
        return torch.masked_scatter(h_BSD, img_mask_h_BSD, valid_feats)

    def _encode_coords(self, h_BSD: T, tokens_BS: T, all_xy: T):
        """Replace coord-token embeddings with Fourier-encoded (x, y) values."""
        coord_tokens_mask = tokens_BS == self.config.coord_token_id
        if all_xy.numel() == 0:
            return h_BSD
        coord_tokens = self.coord_encoder(all_xy.reshape(-1, 2))
        # torch.where path when there is exactly one coord per batch row (decode step);
        # masked_scatter_ otherwise. NOTE(review): the shape[0] equality test is a
        # heuristic — confirm it cannot be satisfied accidentally during prefill.
        if coord_tokens.shape[0] == h_BSD.shape[0]:
            h_BSD = torch.where(
                coord_tokens_mask.unsqueeze(-1),
                coord_tokens.view(h_BSD.shape[0], -1, h_BSD.shape[-1]),
                h_BSD,
            )
        else:
            h_BSD = h_BSD.masked_scatter_(coord_tokens_mask.unsqueeze(-1), coord_tokens)
        return h_BSD

    def _encode_sizes(self, h_BSD, tokens_BS, all_hw: T):
        """Replace size-token embeddings with Fourier-encoded (h, w) values."""
        size_tokens_mask = tokens_BS == self.config.size_token_id
        if all_hw.numel() == 0:
            return h_BSD
        size_tokens = self.size_encoder(all_hw.reshape(-1, 2))
        if size_tokens.shape[0] == h_BSD.shape[0]:
            h_BSD = torch.where(
                size_tokens_mask.unsqueeze(-1),
                size_tokens.view(h_BSD.shape[0], -1, h_BSD.shape[-1]),
                h_BSD,
            )
        else:
            h_BSD = h_BSD.masked_scatter_(size_tokens_mask.unsqueeze(-1), size_tokens)
        return h_BSD

    def decode_coords(self, h_BSD, labels):
        """Decode hidden states at coord-token positions into (K, 2, num_bins) logits."""
        B, S, D = h_BSD.shape
        coord_masks = labels == self.config.coord_token_id
        coord_tokens = torch.masked_select(h_BSD, coord_masks.unsqueeze(-1))
        coord_logits = self.coord_decoder(coord_tokens.reshape(-1, D))
        return E.rearrange(coord_logits, "b (two dim) -> b two dim", two=2)

    def decode_sizes(self, h_BSD, labels):
        """Decode hidden states at size-token positions into (K, 2, num_bins) logits."""
        B, S, D = h_BSD.shape
        size_masks = labels == self.config.size_token_id
        size_tokens = torch.masked_select(h_BSD, size_masks.unsqueeze(-1))
        size_logits = self.size_decoder(size_tokens.reshape(-1, D))
        return E.rearrange(size_logits, "b (two dim) -> b two dim", two=2)

    def process_sizes(self, logits):
        """Map binned size logits to fractional sizes via log2-spaced bins in (1/num_bins, 1]."""
        num_bins = logits.shape[-1]
        pred = torch.argmax(logits, dim=-1).float() / (num_bins - 1)
        min_size = torch.log2(torch.tensor(1 / num_bins))
        max_size = 0.0
        pred = pred * (max_size - min_size) + min_size
        return torch.pow(2.0, pred)

    # -- Segmentation -------------------------------------------------------

    def gather_img_tokens(self, h_BSD: T, tokens_BS: T, itok_masks_NTHW: T):
        """Gather hidden states at `<img>` tokens back into their (n, t, h, w, d) grid."""
        B, S, D = h_BSD.shape
        itok_masks_BSD = E.repeat(tokens_BS == self.config.img_id, "b s -> b s d", d=D)
        itok_flatten = torch.masked_select(h_BSD, itok_masks_BSD)
        itok_masks_NTHWD = E.repeat(itok_masks_NTHW, "n t h w -> n t h w d", d=D)
        itok_NTHWD = torch.zeros_like(itok_masks_NTHWD, dtype=h_BSD.dtype, device=h_BSD.device)
        itok_NTHWD = itok_NTHWD.masked_scatter_(itok_masks_NTHWD, itok_flatten)
        return itok_NTHWD

    def upsample_img_features(self, h_BSD: T, tokens_BS: T, pixel_values_NTHWC: T, pixel_mask_NTHW: T):
        """Upsample low-res image-token features to pixel resolution with AnyUp.

        Returns per-image high-res feature maps used as the dot-product basis
        for segmentation masks. Images are processed one at a time to bound memory.
        """
        device = h_BSD.device
        c = self.config
        itok_masks_NTHW = E.reduce(
            pixel_mask_NTHW,
            "n (t pt) (h ph) (w pw) -> n t h w",
            reduction="any",
            pt=c.temporal_patch_size, ph=c.spatial_patch_size, pw=c.spatial_patch_size,
        )
        N, _, h, w = itok_masks_NTHW.shape
        _, _, H, W = pixel_mask_NTHW.shape
        # Single-frame assumption: the "1" in the rearrange requires t == 1.
        images = E.rearrange(pixel_values_NTHWC, "n 1 h w c -> n c h w")
        lr_img_features = self.gather_img_tokens(h_BSD, tokens_BS, itok_masks_NTHW)
        lr_img_features = E.rearrange(lr_img_features, "n 1 h w d -> n d h w")
        lr_img_features = self.conv_segm(lr_img_features)

        upsampler_attn_mask = self.get_upsampler_attn_mask(H, W, h, w, device=device)
        hr_parts = []
        for i in range(N):
            hr_i = self.itok_upsampler(
                images=images[i:i + 1], features=lr_img_features[i:i + 1], attn_mask=upsampler_attn_mask,
            )
            hr_parts.append(hr_i)
        return torch.cat(hr_parts, dim=0) if N > 1 else hr_parts[0]

    @staticmethod
    def _mask_to_coco_rle(binary_masks: torch.Tensor) -> list[dict]:
        """Encode (C, H, W) binary masks as COCO RLE dicts, skipping empty masks.

        RLE is computed in column-major order (the "c h w -> c (w h)" rearrange)
        as COCO requires; runs are derived from change points in the flattened mask.
        """
        C, H, W = binary_masks.shape
        has_any = E.reduce(binary_masks, "c h w -> c", reduction="any")
        binary_col = E.rearrange(binary_masks, "c h w -> c (w h)")
        diffs = binary_col[:, 1:] != binary_col[:, :-1]
        nz = torch.nonzero(diffs, as_tuple=False)
        first_vals = binary_col[:, 0]
        nz_cpu = nz.cpu().numpy()
        has_any_cpu = has_any.cpu().numpy()
        first_vals_cpu = first_vals.cpu().numpy()
        # Free the GPU intermediates before the CPU-side loop.
        del diffs, nz, binary_col, first_vals, has_any
        N_px = H * W
        if nz_cpu.shape[0] > 0:
            mask_ids = nz_cpu[:, 0]
            change_cols = nz_cpu[:, 1]
            # Group change points by mask index (nz rows are sorted by mask id).
            uniq, grp_starts = np.unique(mask_ids, return_index=True)
            grp_ends = np.append(grp_starts[1:], len(mask_ids))
            mask_to_grp = {int(m): (int(gs), int(ge)) for m, gs, ge in zip(uniq, grp_starts, grp_ends)}
        else:
            change_cols = np.array([], dtype=np.intp)
            mask_to_grp = {}
        results = []
        for i in range(C):
            if not has_any_cpu[i]:
                continue
            if i in mask_to_grp:
                gs, ge = mask_to_grp[i]
                cidx = change_cols[gs:ge]
            else:
                cidx = np.array([], dtype=np.intp)
            # Run starts: position 0 plus one past each change point.
            num_runs = len(cidx) + 1
            starts = np.empty(num_runs, dtype=np.intp)
            starts[0] = 0
            if len(cidx) > 0:
                starts[1:] = cidx + 1
            counts = np.empty(num_runs, dtype=np.uint32)
            if num_runs > 1:
                counts[:-1] = np.diff(starts)
            counts[-1] = N_px - starts[-1]
            # COCO RLE must begin with a background run; prepend 0 if the mask starts on.
            if first_vals_cpu[i]:
                counts = np.concatenate([[0], counts])
            rle = {"counts": counts.tolist(), "size": [H, W]}
            rle = mask_utils.frPyObjects(rle, H, W)
            rle["counts"] = rle["counts"].decode("utf-8")
            results.append(rle)
        return results

    # -- Core forward --------------------------------------------------------

    def forward(
        self,
        tokens: T,
        attention_mask: BlockMask,
        kv_cache,
        rope_pos_t: T | None = None,
        rope_pos_hw: T | None = None,
        pixel_values: T | None = None,
        pixel_mask: T | None = None,
        coord_xy: T | None = None,
        size_hw: T | None = None,
    ):
        """Run one decoder pass (prefill when S > 1, single-token decode when S == 1).

        Returns (vocab logits, final hidden states); the KV cache is updated in place.
        """
        B, S = tokens.size()
        c = self.config
        block_mask = attention_mask

        T_pos = kv_cache.get_pos()
        is_prefill = S != 1

        if is_prefill:
            assert rope_pos_t is not None and rope_pos_hw is not None
            pos_t = rope_pos_t[:, T_pos:T_pos + S].long()
            # Remember the last temporal position so decode can increment from it.
            kv_cache.pos_t = pos_t[:, -1:]
            freqs_cis = self.freqs_cis[pos_t]
            rope_pos_hw = rope_pos_hw[:, T_pos:T_pos + S]
            freqs_cis_golden = apply_golden_freqs_cis_to_visual_pos(self.freqs_cis_golden, rope_pos_hw)
            block_mask.seq_lengths = (S, S)
        else:
            pos_t = kv_cache.increment_and_get_pos_t()
            freqs_cis = self.freqs_cis[pos_t]
            freqs_cis_golden = None
            # Slice the precomputed full-sequence BlockMask down to the current row,
            # and offset the mask_mod so it indexes absolute positions.
            block_idx = T_pos // block_mask.BLOCK_SIZE[0]
            block_mask = block_mask[:, :, block_idx]
            block_mask.seq_lengths = (S, T_pos + S)
            block_mask.mask_mod = offset_mask_mod(attention_mask.mask_mod, offset=T_pos)

        h_BSD = self.tok_embeddings(tokens)

        coord_xy = coord_xy if coord_xy is not None else h_BSD.new_empty(0)
        size_hw = size_hw if size_hw is not None else h_BSD.new_empty(0)
        h_BSD = self._encode_coords(h_BSD, tokens, coord_xy)
        h_BSD = self._encode_sizes(h_BSD, tokens, size_hw)

        if pixel_values is not None:
            assert pixel_mask is not None
            pixel_values = pixel_values.to(self.dtype)
            pixel_mask = pixel_mask.to(self.dtype)
            pixel_patches_NLC = E.rearrange(
                pixel_values,
                "n (t pt) (h ph) (w pw) c -> n (t h w) (pt ph pw c)",
                pt=c.temporal_patch_size, ph=c.spatial_patch_size, pw=c.spatial_patch_size,
            )
            h_BSD = self._scatter_img_tokens_with_projector(h_BSD, pixel_patches_NLC, pixel_mask, tokens)

        for layer in self.layers.values():
            h_BSD = layer(
                h_BSD, freqs_cis=freqs_cis, freqs_cis_2d=freqs_cis_golden,
                pos_hw=rope_pos_hw, attention_masks=block_mask, kv_cache=kv_cache,
            )

        h_BSD = self.norm(h_BSD)
        logits_BSV = self.output(h_BSD)
        return logits_BSV, h_BSD

    # -- Main API: generate --------------------------------------------------

    @torch.inference_mode()
    def generate(
        self,
        images,
        queries,
        max_new_tokens: int = 2048,
        temperature: float = 0.0,
        top_k: int | None = None,
        min_dimension: int = 256,
        max_dimension: int = 1024,
        compile: bool = True,
        seed: int | None = 42,
        segm_threshold: float = 0.5,
    ) -> list[list[dict]]:
        """
        Segment objects in images matching the given queries.

        Args:
            images: Single PIL Image (or path/URL) or list of them.
            queries: Single query string or list of query strings (one per image).
            max_new_tokens: Maximum generation steps.
            temperature: Sampling temperature (0.0 = greedy).
            top_k: Top-k sampling (None = disabled).
            min_dimension: Min image side after resize.
            max_dimension: Max image side after resize.
            compile: Whether to torch.compile on first call.
            seed: Random seed for reproducibility (None = non-deterministic).
            segm_threshold: Sigmoid threshold for binary mask.

        Returns:
            List (per image) of lists (per detection) of dicts::

                {
                    "xy": {"x": float, "y": float},
                    "hw": {"h": float, "w": float},
                    "mask_rle": {"counts": str, "size": [H, W]},
                }
        """
        self._ensure_device_buffers()
        if compile:
            self.compile_model()

        # Normalize inputs
        if isinstance(images, (str, Path, Image.Image)):
            images = [images]
        if isinstance(queries, str):
            queries = [queries]
        assert len(images) == len(queries), "Must provide one query per image"

        device = self.device
        tokenizer = self._get_tokenizer()
        self._pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
        stop_token_ids = [self.config.eos_id, tokenizer.convert_tokens_to_ids("<|end_of_query|>")]

        # Store original image sizes for mask resizing
        pil_images = [load_image(img).convert("RGB") for img in images]
        original_sizes = [(img.height, img.width) for img in pil_images]

        # Build prompts
        image_prompt_pairs = [
            (img, f"<|image|>Segment these expressions in the image:<|start_of_query|>{q}<|REF_SEG|>")
            for img, q in zip(pil_images, queries)
        ]

        # Preprocess
        batch_inputs = process_batch(
            tokenizer, self.config, image_prompt_pairs,
            max_length=4096, min_dimension=min_dimension, max_dimension=max_dimension,
        )
        batch_inputs = {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in batch_inputs.items()}

        tokens = batch_inputs["tokens"]
        B, L = tokens.size()
        # Round the total sequence length up to the flex-attention block size.
        block_size = 128
        S = (L + max_new_tokens + block_size - 1) // block_size * block_size
        assert S <= self.config.max_seq_len

        rng = torch.Generator(device).manual_seed(seed) if seed is not None else None

        kv_cache = KVCache(
            max_batch_size=B, max_seq_length=S, n_heads=self.config.n_heads,
            head_dim=self.config.head_dim, num_layers=self.config.n_layers,
        )

        padded_tokens = torch.full((B, S), self._pad_token_id, dtype=tokens.dtype, device=device)
        padded_tokens[:, :L] = tokens

        attention_mask = self.get_attention_mask(padded_tokens, max_len=S)

        # Empty coord/size placeholders for the prefill pass.
        all_xy, all_hw = self._extract_coords([[]])
        coord_xy = all_xy.to(device=device, dtype=self.dtype)
        size_hw_t = all_hw.to(device=device, dtype=self.dtype)

        # Prefill
        logits_BSV, h_BSD = self.forward(
            tokens=tokens, rope_pos_t=batch_inputs["pos_t"], rope_pos_hw=batch_inputs["pos_hw"],
            attention_mask=attention_mask, kv_cache=kv_cache,
            pixel_values=batch_inputs["pixel_values"], pixel_mask=batch_inputs["pixel_mask"],
            coord_xy=coord_xy, size_hw=size_hw_t,
        )

        hr_img_features = self.upsample_img_features(
            h_BSD, tokens, batch_inputs["pixel_values"], batch_inputs["pixel_mask"],
        )

        aux_output_B = [[] for _ in range(B)]
        stop_ids = torch.tensor(stop_token_ids).to(device)
        should_stop_B = torch.full((B,), False, dtype=torch.bool, device=device)

        # Decode loop
        while not torch.all(should_stop_B) and (pos := kv_cache.get_pos()) < S:
            tokens_B1 = sample_next_token(logits_BSV[:, -1], rng, temperature, top_k)

            # Finished rows keep emitting pad so the batch stays in lockstep.
            if torch.any(should_stop_B):
                tokens_B1 = tokens_B1.clone()
                tokens_B1[should_stop_B, :] = self._pad_token_id
            padded_tokens[:, pos] = tokens_B1[:, -1]

            # Decode coords (with deduplication to avoid repeating the same location)
            coord_logits = self.decode_coords(h_BSD[:, -1:], tokens_B1)
            sample_w_coord = torch.where(tokens_B1 == self.config.coord_token_id)[0]

            num_bins = coord_logits.size(-1)
            coord_repeat_threshold = 0.01  # coords within 1% of image size are considered duplicates
            max_coord_attempts = 100
            xy_b2 = torch.zeros(B, 2, device=device, dtype=self.dtype)

            for i, b in enumerate(sample_w_coord.tolist()):
                logits_b = coord_logits[i].clone()  # (2, num_bins)
                existing_coords = [
                    item for item in aux_output_B[b]
                    if isinstance(item, dict) and "x" in item and "y" in item
                ]
                pred_x, pred_y = 0.0, 0.0
                for _ in range(max_coord_attempts):
                    pred_bins = torch.argmax(logits_b, dim=-1)  # (2,)
                    pred_x = pred_bins[0].item() / (num_bins - 1)
                    pred_y = pred_bins[1].item() / (num_bins - 1)
                    is_repeat = any(
                        abs(ec["x"] - pred_x) < coord_repeat_threshold
                        and abs(ec["y"] - pred_y) < coord_repeat_threshold
                        for ec in existing_coords
                    )
                    if not is_repeat:
                        break
                    # Knock out the duplicate bins and retry with the next-best ones.
                    logits_b[0, pred_bins[0]] = float("-inf")
                    logits_b[1, pred_bins[1]] = float("-inf")
                xy_b2[b, 0] = pred_x
                xy_b2[b, 1] = pred_y
                aux_output_B[b].append({"x": pred_x, "y": pred_y})

            # Decode sizes
            size_logits = self.decode_sizes(h_BSD[:, -1:], tokens_B1)
            hw_b2 = self.process_sizes(size_logits)
            size_preds = [{"h": hw[0].item(), "w": hw[1].item()} for hw in hw_b2]
            sample_w_size = torch.where(tokens_B1 == self.config.size_token_id)[0]
            for i, b in enumerate(sample_w_size.tolist()):
                aux_output_B[b].append(size_preds[i])

            # Decode segmentation
            sample_w_segm = torch.where(tokens_B1 == self.config.seg_token_id)[0]
            segm_tokens = h_BSD[sample_w_segm, -1, :]
            segm_tokens = self.proj_segm(segm_tokens)
            # Mask logits are the dot product of the segm token with each HR pixel feature.
            segm_masks = torch.einsum("kdhw,kd->khw", hr_img_features[sample_w_segm], segm_tokens)
            for i, b in enumerate(sample_w_segm):
                aux_output_B[b].append(segm_masks[i])

            # Next step
            # NOTE(review): xy_b2 is (B, 2) per batch row, while hw_b2 is shaped per
            # size-token occurrence — confirm this aligns with _encode_sizes' expectations
            # when a step emits no (or multiple) size tokens.
            logits_BSV, h_BSD = self.forward(
                tokens=tokens_B1, attention_mask=attention_mask,
                coord_xy=xy_b2.to(self.dtype), size_hw=hw_b2.to(self.dtype), kv_cache=kv_cache,
            )

            hit_stop_B = torch.isin(tokens_B1, stop_ids).any(dim=-1)
            should_stop_B = should_stop_B.logical_or(hit_stop_B)

        # Post-process: convert aux outputs to structured results with RLE masks
        pixel_mask_batch = batch_inputs["pixel_mask"][:, 0]  # (B, H, W)
        results = []
        for b in range(B):
            dets = self._postprocess_aux(
                aux_output_B[b], pixel_mask_batch[b], original_sizes[b], segm_threshold,
            )
            results.append(dets)

        return results

    # -- Post-processing helpers ---------------------------------------------

    def _extract_coords(self, coords_BO: list[list]):
        """Flatten per-sample coord dicts into (x/y...) and (h/w...) value tensors."""
        all_xy, all_hw = [], []
        for coords_O in coords_BO:
            if not coords_O:
                continue
            for coords in coords_O:
                for k, v in coords.items():
                    if k.startswith(("x", "y")):
                        all_xy.append(v)
                    elif k.startswith(("h", "w")):
                        all_hw.append(v)
        return torch.tensor(all_xy), torch.tensor(all_hw)

    @staticmethod
    def _mask_nms(
        binary_masks: list[torch.Tensor],
        iou_threshold: float = 0.6,
        nms_max_side: int = 256,
    ) -> list[int]:
        """
        Fast vectorised mask NMS on binary (H, W) tensors.

        Returns the list of kept indices ordered by descending mask score.
        The IoU matrix is computed via a single batched matmul; suppression
        uses one GPU boolean op per kept mask — no .item() in the inner loop.
        """
        N = len(binary_masks)
        if N <= 1:
            return list(range(N))

        device = binary_masks[0].device
        base_h, base_w = binary_masks[0].shape
        # Downscale to at most nms_max_side on the long edge for a cheap IoU.
        scale = min(1.0, nms_max_side / max(base_h, base_w))
        th = max(1, int(round(base_h * scale)))
        tw = max(1, int(round(base_w * scale)))

        resized = []
        for m in binary_masks:
            m = m.float()
            if m.shape != (th, tw):
                m = F.interpolate(
                    m[None, None], size=(th, tw), mode="bilinear", align_corners=False
                ).squeeze()
            resized.append(m)

        binary = torch.stack(resized)  # (N, th, tw)
        flat = binary.view(N, -1)      # (N, th*tw)
        areas = flat.sum(dim=1)        # (N,)
        scores = areas                 # larger mask = higher priority
        intersection = flat @ flat.T   # (N, N)
        union = areas[:, None] + areas[None, :] - intersection
        iou = intersection / union.clamp(min=1)

        order = scores.argsort(descending=True)
        suppressed = torch.zeros(N, dtype=torch.bool, device=device)
        keep = []
        for idx in order.tolist():
            if suppressed[idx]:
                continue
            keep.append(idx)
            # A mask always overlaps itself with IoU 1 > threshold, so it also
            # marks itself suppressed — harmless since it was already kept.
            suppressed |= iou[idx] > iou_threshold

        return keep

    def _postprocess_aux(
        self,
        aux_list: list,
        pixel_mask_hw: T,
        orig_hw: tuple[int, int],
        threshold: float,
        nms_iou_threshold: float = 0.6,
    ) -> list[dict]:
        """Convert raw aux outputs into structured detections with RLE masks."""
        orig_h, orig_w = orig_hw

        # Find active image region from pixel mask
        nonzero = torch.nonzero(pixel_mask_hw, as_tuple=False)
        if len(nonzero) > 0:
            min_h, min_w = nonzero.min(dim=0)[0]
            max_h, max_w = nonzero.max(dim=0)[0]
            act_h = (max_h - min_h + 1).item()
            act_w = (max_w - min_w + 1).item()
        else:
            min_h = min_w = 0
            act_h = act_w = None

        # Group into triplets: coord, size, mask — build binary masks first
        candidates = []
        step = 3  # coord, size, mask
        for i in range(0, len(aux_list), step):
            if i + 2 >= len(aux_list):
                break
            xy = aux_list[i]
            hw = aux_list[i + 1]
            mask_logits = aux_list[i + 2]
            if not isinstance(mask_logits, torch.Tensor):
                continue

            # Crop to active region
            if act_h is not None and act_w is not None:
                mask_logits = mask_logits[min_h:min_h + act_h, min_w:min_w + act_w]

            # Resize to original image size
            mask_logits = mask_logits.unsqueeze(0).unsqueeze(0).float()
            mask_logits = F.interpolate(mask_logits, size=(orig_h, orig_w), mode="bilinear", align_corners=False)
            mask_logits = mask_logits.squeeze(0).squeeze(0)

            # Threshold
            binary_mask = (torch.sigmoid(mask_logits) > threshold).bool()
            candidates.append({"xy": xy, "hw": hw, "binary_mask": binary_mask})

        if not candidates:
            return []

        # NMS on binary masks before RLE encoding
        keep_indices = self._mask_nms(
            [c["binary_mask"] for c in candidates],
            iou_threshold=nms_iou_threshold,
        )
        candidates = [candidates[i] for i in keep_indices]

        # Encode survivors as COCO RLE
        detections = []
        for c in candidates:
            rle_list = self._mask_to_coco_rle(c["binary_mask"].unsqueeze(0))
            mask_rle = rle_list[0] if rle_list else {"counts": "", "size": [orig_h, orig_w]}
            detections.append({"xy": c["xy"], "hw": c["hw"], "mask_rle": mask_rle})

        return detections
processing_falcon_perception.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import math
3
+
4
+ import einops as E
5
+ import numpy as np
6
+ import requests
7
+ import torch
8
+ from PIL import Image
9
+ from transformers.image_processing_utils import BaseImageProcessor
10
+ from transformers.image_transforms import convert_to_rgb, resize
11
+ from transformers.image_utils import (
12
+ ImageInput,
13
+ get_image_size,
14
+ infer_channel_dimension_format,
15
+ to_numpy_array,
16
+ valid_images,
17
+ validate_preprocess_arguments,
18
+ )
19
+
20
+ IMAGE_MEAN = [0.5, 0.5, 0.5]
21
+ IMAGE_STD = [0.5, 0.5, 0.5]
22
+
23
+
24
def load_image(image):
    """Normalize *image* into a ``PIL.Image.Image`` (or ``None``).

    Accepts an already-loaded PIL image, a URL, a local path, a ``.npy``
    path holding encoded image bytes, a numpy bytes scalar, or a numpy
    array. Returns ``None`` only when the input is ``None``.
    """
    if image is None:
        return None
    if isinstance(image, Image.Image):
        return image
    if isinstance(image, str):
        if image.startswith(("http://", "https://")):
            resp = requests.get(image, timeout=10)
            resp.raise_for_status()
            return Image.open(io.BytesIO(resp.content))
        if image.endswith(".npy"):
            # .npy file that stores encoded image bytes — decode via BytesIO.
            byte_stream = io.BytesIO(np.load(image))
            return Image.open(byte_stream)
        return Image.open(image)
    if isinstance(image, np.bytes_):
        return Image.open(io.BytesIO(image))
    if isinstance(image, np.ndarray):
        return Image.fromarray(image)
    raise TypeError(f"Unknown image format {image}")
43
+
44
+
45
def load_images(images_input, min_dimension: int, max_dimension: int):
    """Load every entry of *images_input* and clamp each to the size bounds.

    ``None`` input yields an empty list.
    """
    if images_input is None:
        return []
    loaded = []
    for entry in images_input:
        picture = load_image(entry)
        loaded.append(resize_image_if_necessary(picture, min_dimension, max_dimension))
    return loaded
53
+
54
+
55
def resize_image_if_necessary(
    image,
    shortest_dimension=224,
    longest_dimension=896,
):
    """Rescale *image* so both sides land inside the allowed range.

    Images whose width and height already lie within
    [shortest_dimension, longest_dimension] are returned untouched.
    Otherwise the constraining side is pinned (shortest_dimension when the
    image is too small, longest_dimension when too large) and the other side
    follows the aspect ratio, truncated to int.
    """
    width, height = image.size
    ratio = width / height  # width / height

    within_bounds = (
        shortest_dimension <= width <= longest_dimension
        and shortest_dimension <= height <= longest_dimension
    )
    if within_bounds:
        return image

    portrait = width < height
    too_small = width < shortest_dimension or height < shortest_dimension
    # Pin the smaller side: up to the floor when too small, otherwise to the cap.
    pinned = shortest_dimension if too_small else longest_dimension
    if portrait:
        target_w = pinned
        target_h = int(target_w / ratio)
    else:
        target_h = pinned
        target_w = int(target_h * ratio)

    # Scaling one side may push the other past the cap; clamp and recompute.
    if target_w > longest_dimension:
        target_w = longest_dimension
        target_h = int(target_w / ratio)
    if target_h > longest_dimension:
        target_h = longest_dimension
        target_w = int(target_h * ratio)

    return image.resize((target_w, target_h))
94
+
95
+
96
def smart_resize(
    image,
    factor: int,
    resample,
    input_data_format,
    min_pixels: int = 56 * 56,
    max_pixels: int = 14 * 14 * 4 * 1280,
):
    """Resize *image* so both sides are multiples of *factor* and the total
    pixel count falls within [min_pixels, max_pixels], roughly preserving
    the aspect ratio.

    Raises:
        ValueError: if either side is smaller than *factor*, or the aspect
            ratio exceeds 200.
    """
    height, width = get_image_size(image, channel_dim=input_data_format)
    if height < factor or width < factor:
        raise ValueError(f"{height=} or {width=} must be larger than {factor=}")
    if max(height, width) / min(height, width) > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )
    # Round each side to the nearest multiple of `factor`.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        # Too many pixels: shrink both sides by sqrt of the overshoot,
        # flooring so the result stays under the cap.
        beta = np.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: grow both sides by sqrt of the shortfall, ceiling
        # so the result clears the floor.
        beta = np.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    image = resize(
        image,
        size=(h_bar, w_bar),
        resample=resample,
        input_data_format=input_data_format,
    )
    return image
128
+
129
+
130
class ImageProcessor(BaseImageProcessor):
    """Image preprocessor: optional RGB conversion, patch-aligned smart
    resize, rescale to [0, 1], channel normalization, and batching of
    variable-sized images with an explicit validity mask."""

    def __init__(
        self,
        patch_size,
        merge_size,
        do_resize: bool = True,
        resample: Image.Resampling = Image.Resampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: float | list[float] | None = None,
        image_std: float | list[float] | None = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean or IMAGE_MEAN
        self.image_std = image_std or IMAGE_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb
        validate_preprocess_arguments(
            rescale_factor=self.rescale_factor,
            do_normalize=self.do_normalize,
            image_mean=self.image_mean,
            image_std=self.image_std,
            do_resize=self.do_resize,
            size=self.size,
            resample=self.resample,
        )

    def _preprocess(self, image: ImageInput, do_rescale=None, do_normalize=None):
        """Preprocess a single image.

        `do_rescale` / `do_normalize` override the instance defaults when not
        None; an explicit ``False`` disables the corresponding step.
        """
        if self.do_convert_rgb:
            image = convert_to_rgb(image)
        image = to_numpy_array(image)
        input_data_format = infer_channel_dimension_format(image)
        if self.do_resize:
            image = smart_resize(
                image,
                factor=self.patch_size * self.merge_size,
                resample=self.resample,
                input_data_format=input_data_format,
                min_pixels=self.min_pixels,
                max_pixels=self.max_pixels,
            )
        # BUG FIX: the previous `do_rescale or self.do_rescale` made an
        # explicit `do_rescale=False` override impossible when the instance
        # default was True (same for do_normalize). Treat None as "use the
        # instance default" instead.
        effective_rescale = self.do_rescale if do_rescale is None else do_rescale
        effective_normalize = self.do_normalize if do_normalize is None else do_normalize
        if effective_rescale:
            image = self.rescale(image, scale=self.rescale_factor, input_data_format=input_data_format)
        if effective_normalize:
            image = self.normalize(
                image=image, mean=self.image_mean, std=self.image_std,
                input_data_format=input_data_format,
            )
        return image

    def preprocess(self, images: list[ImageInput] | None, do_rescale=None, do_normalize=None, **kwargs):
        """Preprocess a list of images.

        Returns one array per image with a leading singleton frame axis,
        so single images batch uniformly with videos. ``None`` yields [].
        """
        del kwargs
        if images is None:
            return []
        images = [item for item in images if item is not None]
        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )
        pixel_values = []
        for image in images:
            processed_image = self._preprocess(image, do_rescale, do_normalize)
            # Add a leading temporal/frame axis: (H, W, C) -> (1, H, W, C).
            processed_image = processed_image[None, ...]
            pixel_values.append(processed_image)
        return pixel_values

    def batch_images_with_mask(self, pixel_values, max_image_height, max_image_width):
        """Zero-pad variable-sized (T, H, W, 3) images to a common shape.

        Returns a dict with stacked `pixel_values` and a matching long-typed
        `padding_mask` (1 = real pixels, 0 = padding), or None when there is
        nothing to batch.
        """
        if pixel_values is None:
            return None
        pixel_values = [item for item in pixel_values if item is not None and len(item) != 0]
        if len(pixel_values) == 0:
            return None
        pixel_values = [torch.from_numpy(img) for img in pixel_values]
        max_temporal = max(img.shape[0] for img in pixel_values)

        def pad_image_and_mask(img):
            # F.pad pads last-dim-first: (C, W, H, T); channels stay untouched.
            time_steps, height, width, channels = img.shape
            if channels != 3:
                raise ValueError(f"Expected 3-channel RGB images, got {channels} channels.")
            padding = (0, 0, 0, max_image_width - width, 0, max_image_height - height, 0, max_temporal - time_steps)
            padded_image = torch.nn.functional.pad(img, padding)
            mask = torch.zeros((max_temporal, max_image_height, max_image_width), dtype=torch.long)
            mask[:time_steps, :height, :width] = 1
            return padded_image, mask

        padded_pixel_values, padding_masks = zip(*[pad_image_and_mask(img) for img in pixel_values])
        padded_pixel_values = torch.stack(list(padded_pixel_values))
        padding_masks = torch.stack(list(padding_masks))
        return {"pixel_values": padded_pixel_values, "padding_mask": padding_masks}
234
+
235
+
236
+ # ---------------------------------------------------------------------------
237
+ # Positional encoding helpers
238
+ # ---------------------------------------------------------------------------
239
+
240
def _compute_image_spatial_positions(
    pixel_mask_THW: torch.Tensor,
    spatial_patch_size: int,
    temporal_patch_size: int = 1,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Compute normalized (h, w) coordinates for each patch of one image.

    The pixel-level validity mask is pooled down to the patch grid, the
    active extent in patches is measured, and coordinates are laid out on an
    aspect-ratio-normalized grid (xlim * ylim == 1, so the coordinate box
    always spans unit area).

    Returns:
        (hpos, wpos): two flattened float tensors, one value per grid cell,
        in row-major order.
    """
    # Pool the pixel mask to the patch grid: a patch is active if ANY of its
    # covered pixels is active.
    mask_thw = E.reduce(
        pixel_mask_THW,
        "(t tp) (h hp) (w wp) -> t h w",
        reduction="any",
        tp=temporal_patch_size,
        hp=spatial_patch_size,
        wp=spatial_patch_size,
    )
    # Active width = max per-row count of active patches; height analogously.
    width = E.reduce(mask_thw.sum(dim=-1).int(), "t h -> ", reduction="max")
    height = E.reduce(mask_thw.sum(dim=-2).int(), "t w -> ", reduction="max")
    xlim = torch.sqrt(width / height)
    ylim = torch.sqrt(height / width)
    # NOTE(review): `xlim`/`ylim` are 0-dim tensors; torch.linspace accepting
    # tensor endpoints depends on the torch version — confirm against the
    # minimum supported release.
    xpos = torch.linspace(-xlim, xlim, int(width))
    ypos = torch.linspace(-ylim, ylim, int(height))
    wpos, hpos = torch.meshgrid(xpos, ypos, indexing="xy")
    return hpos.flatten(), wpos.flatten()
261
+
262
+
263
+ def _get_image_token_masks(tokens, config):
264
+ spatial_mask = tokens == config.img_id
265
+ no_increase_mask = (
266
+ spatial_mask
267
+ | (tokens == config.image_reg_1_token_id)
268
+ | (tokens == config.image_reg_2_token_id)
269
+ | (tokens == config.image_reg_3_token_id)
270
+ | (tokens == config.image_reg_4_token_id)
271
+ | (tokens == config.img_end_id)
272
+ )
273
+ return spatial_mask, no_increase_mask
274
+
275
+
276
def get_pos_thw(
    tokens: torch.Tensor,
    pixel_masks_NTHW: torch.Tensor,
    config,
    spatial_patch_size: int,
    temporal_patch_size: int = 1,
    pad_token_id: int | None = None,
):
    """Build temporal and 2D spatial position tensors for a token batch.

    Args:
        tokens: (B, S) token ids.
        pixel_masks_NTHW: (N, T, H, W) per-image pixel validity masks, in the
            same order the images' spatial tokens appear in `tokens`.
        config: carries the image special-token ids.
        spatial_patch_size / temporal_patch_size: patching granularity.
        pad_token_id: padding token id (required, keyword).

    Returns:
        tpos (B, S) long: cumulative temporal index; image spatial, register
            and image-end tokens do not advance it, pad positions are 0.
        hw_pos (B, S, 2) float: (h, w) coordinates for spatial image tokens,
            NaN everywhere else.
    """
    assert pad_token_id is not None
    assert tokens.ndim == 2
    assert pixel_masks_NTHW.ndim == 4

    spatial_img_token_mask_BS, no_increase_idx_img_token_mask_BS = _get_image_token_masks(tokens, config)

    # One (h, w) coordinate list per image, concatenated in image order so
    # they line up with the flattened spatial-token mask below.
    hpos_parts, wpos_parts = [], []
    for i in range(pixel_masks_NTHW.shape[0]):
        h, w = _compute_image_spatial_positions(pixel_masks_NTHW[i], spatial_patch_size, temporal_patch_size)
        hpos_parts.append(h)
        wpos_parts.append(w)

    hpos_N = torch.cat(hpos_parts) if hpos_parts else torch.empty(0)
    wpos_N = torch.cat(wpos_parts) if wpos_parts else torch.empty(0)

    # Sanity check: exactly one coordinate pair per spatial image token.
    expected_tokens = spatial_img_token_mask_BS.sum().item()
    actual_tokens = hpos_N.numel()
    assert actual_tokens == expected_tokens, (
        f"Mismatch between spatial image tokens ({expected_tokens}) and generated positions ({actual_tokens})."
    )

    # Scatter coordinates into (B, S); non-image positions stay NaN, which
    # downstream RoPE code uses to identify non-image tokens.
    hpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
    wpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
    hpos_BS = hpos_BS.masked_scatter_(spatial_img_token_mask_BS, hpos_N)
    wpos_BS = wpos_BS.masked_scatter_(spatial_img_token_mask_BS, wpos_N)

    # Temporal index: each token advances by 1 unless flagged non-advancing;
    # cumsum then shifts to a 0-based index, pads forced to 0.
    tpos_BS = torch.ones_like(tokens, dtype=torch.float, device=tokens.device)
    tpos_BS[no_increase_idx_img_token_mask_BS] = 0
    tpos_BS = torch.cumsum(tpos_BS, dim=1) - 1
    tpos_BS[tokens == pad_token_id] = 0

    hw_pos_BS2 = torch.stack([hpos_BS, wpos_BS], dim=-1)
    return tpos_BS.long(), hw_pos_BS2
317
+
318
+
319
def calculate_image_tokens(image, patch_size, merge_size):
    """Number of image tokens the model will see after patching and merging."""
    height, width = get_image_size(image)
    # Each final token covers a (patch_size * merge_size)-pixel square.
    cell = patch_size * merge_size
    return int((height * width) / (cell * cell))
322
+
323
+
324
def tokenize_inputs(prompt, images, tokenizer, config, patch_size, merge_size, max_length):
    """Tokenize *prompt*, expanding each image placeholder into its token block.

    The prompt is split on the image placeholder token; between consecutive
    text chunks an image block (cls token + 4 register tokens + N spatial
    tokens + end token) is inserted. An image block that would push the
    sequence to *max_length* or beyond is dropped together with its image.

    Returns:
        (input_ids LongTensor, list of images actually kept).
    """
    img_reg_ids = [
        config.image_reg_1_token_id,
        config.image_reg_2_token_id,
        config.image_reg_3_token_id,
        config.image_reg_4_token_id,
    ]

    if images is not None and len(images) > 0:
        image_token_counts = [calculate_image_tokens(image, patch_size, merge_size) for image in images]
    else:
        image_token_counts = []

    image_token = tokenizer.convert_ids_to_tokens(config.img_id)
    prompt_chunks = [tokenizer.encode(chunk) for chunk in prompt.split(image_token)]

    def insert_separator(X, sep):
        # Interleave text chunks with image blocks: [c0, s0, c1, s1, ...],
        # dropping the trailing separator.
        return [ele for sublist in zip(X, sep) for ele in sublist][:-1]

    input_ids = []
    offset = 0
    # If the tokenizer prepends BOS to every encoded chunk, keep it once up
    # front and strip it from each text chunk below via `offset`.
    bos_id = getattr(tokenizer, "bos_token_id", None)
    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and bos_id is not None and prompt_chunks[0][0] == bos_id:
        offset = 1
        input_ids.append(prompt_chunks[0][0])

    separators = []
    for count in image_token_counts:
        tokens = [config.img_id] * count
        image_block = [config.image_cls_token_id, *img_reg_ids, *tokens, config.img_end_id]
        separators.append(image_block)

    # zip() truncates to the shorter list; duplicate the last block so that
    # k images interleave with k+1 text chunks (the duplicate is discarded
    # by insert_separator's trailing [:-1]).
    if len(separators) != 0 and len(separators) != len(prompt_chunks):
        separators.append(separators[-1])

    selected_images = []
    if len(separators) == 0:
        # Text-only prompt: no interleaving needed.
        input_ids = prompt_chunks[0]
    else:
        for index, x in enumerate(insert_separator(prompt_chunks, separators)):
            if index % 2 != 0:
                # Odd positions are image blocks; keep only if it still fits.
                if (len(input_ids) + len(x)) < max_length:
                    input_ids.extend(x)
                    selected_images.append(images[index // 2])
            elif index % 2 == 0:
                # Even positions are text chunks (strip the duplicated BOS).
                input_ids.extend(x[offset:])

    input_ids = torch.LongTensor(input_ids)
    return input_ids, selected_images
373
+
374
+
375
def process_batch(
    tokenizer,
    config,
    image_prompt_pairs,
    max_length,
    min_dimension,
    max_dimension,
    patch_size=16,
    merge_size=1,
):
    """
    Process a batch of (image, prompt) pairs into model-ready tensors.

    Uses LEFT PADDING so batched generation with a causal model stays
    aligned on the last token of every sequence.

    Returns:
        dict with left-padded `tokens`, batched `pixel_values` and
        `pixel_mask`, temporal (`pos_t`) and spatial (`pos_hw`) positions,
        and `pad_token_id`.

    Raises:
        ValueError: if no valid image survives preprocessing.
    """
    all_input_ids = []
    all_selected_images = []
    processor_local = ImageProcessor(patch_size, merge_size)

    for img_input, prompt in image_prompt_pairs:
        img = load_image(img_input)
        if img is not None:
            img = resize_image_if_necessary(img, min_dimension, max_dimension)
        # BUG FIX: was `[img] if img else []` — load_image can return None,
        # so test for None explicitly instead of relying on object truthiness.
        images = processor_local.preprocess(images=[img] if img is not None else [])
        input_ids, selected_images = tokenize_inputs(
            prompt, images, tokenizer, config, patch_size, merge_size, max_length,
        )
        all_input_ids.append(input_ids)
        all_selected_images.extend(selected_images)

    pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
    # NOTE(review): `padding_side="left"` requires a recent torch release —
    # confirm against the minimum supported version.
    padded_input_ids = torch.nn.utils.rnn.pad_sequence(
        all_input_ids, batch_first=True, padding_value=pad_token_id, padding_side="left",
    )

    processed = processor_local.batch_images_with_mask(all_selected_images, max_dimension, max_dimension)
    if processed is None:
        # Explicit error instead of a bare assert (asserts vanish under -O).
        raise ValueError("process_batch requires at least one valid image in the batch.")

    # NOTE(review): positions are computed with patch_size only; confirm
    # merge_size > 1 is accounted for upstream.
    pos_t, pos_hw = get_pos_thw(
        padded_input_ids, processed["padding_mask"], config, patch_size, pad_token_id=pad_token_id,
    )

    return {
        "tokens": padded_input_ids,
        "pixel_values": processed["pixel_values"],
        "pixel_mask": processed["padding_mask"],
        "pos_t": pos_t,
        "pos_hw": pos_hw,
        "pad_token_id": pad_token_id,
    }
rope.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import einops as E
2
+ import torch
3
+
4
+
5
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
    """Build the complex rotary-embedding table.

    Each entry is exp(i * position * freq) where the frequencies follow the
    standard RoPE geometric progression 1 / theta**(2k / dim).

    Args:
        dim: Head dimension; the table carries dim // 2 frequencies.
        end: Number of positions to precompute.
        theta: Base of the frequency progression. Defaults to 10000.0.

    Returns:
        torch.Tensor: complex64 tensor of shape (end, dim // 2) with
        unit-modulus entries.
    """
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freqs = 1.0 / (theta ** exponents)
    positions = torch.arange(end, device=inv_freqs.device)
    angles = torch.outer(positions, inv_freqs).float()
    # polar(1, angle) places every entry on the unit circle (complex64).
    return torch.polar(torch.ones_like(angles), angles)  # [S, D//2]
26
+
27
+
28
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Apply 1D rotary position embedding to query and key tensors.

    `freqs_cis` must already be gathered per position, shape (B, S, D//2);
    it is broadcast over the heads axis. Outputs keep the input dtypes.
    """
    # Pair up the last dim into complex numbers: (..., D) -> (..., D//2) complex.
    q_complex = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    k_complex = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    assert freqs_cis.ndim == 3, (
        "Freqs_cis must be indexed by position ids already and has shape (B,S,D)"
    )
    # Insert a singleton heads axis so the rotation broadcasts over heads.
    rotation = E.rearrange(freqs_cis, "b s d -> b s 1 d")
    q_rotated = torch.view_as_real(q_complex * rotation).flatten(3)
    k_rotated = torch.view_as_real(k_complex * rotation).flatten(3)
    return q_rotated.type_as(xq), k_rotated.type_as(xk)
43
+
44
+
45
+ ###### 2D golden rope
46
+ """
47
+ Dimension key:
48
+ B: batch size
49
+ S: number of tokens per sample, Seqlen
50
+ T: Number of selected Tokens
51
+ P: pos_dim
52
+ h: n_heads
53
+ d: head_dim
54
+ F: num_freqs == head_dim // 2
55
+ """
56
+
57
+
58
def apply_golden_freqs_cis_to_visual_pos(freqs_hFP, pos_BSP) -> torch.Tensor:
    """
    Precompute complex rotation factors for every image token in the batch.

    Args:
        freqs_hFP: frequency table of shape (n_heads, num_freqs, pos_dim).
        pos_BSP: (B, S, pos_dim) positions; rows containing NaN mark
            non-image tokens and are skipped.

    Returns:
        Complex tensor of shape (T, n_heads, num_freqs), one row per image
        token (a token whose position row contains no NaN).

    This function is applied once per input batch, and the cached
    freqs_cis is passed through to all layers.
    Safe for Torch-Inductor because it never uses boolean indexing on a symbolic tensor.
    """
    # 1. Boolean mask → integer indices (no unbacked shapes)
    img_mask_BS = E.reduce(~torch.isnan(pos_BSP), 'b s p -> b s', reduction='all')
    idx_b, idx_s = torch.nonzero(img_mask_BS, as_tuple=True)  # each shape: (N,)

    # 2. Gather the positional tensor for those tokens
    pos_tP = pos_BSP[idx_b, idx_s].float()  # (N, p)

    # 3. Project positions onto the frequency table → angles θ
    theta_thF = torch.einsum("tp,hfp->thf", pos_tP, freqs_hFP.float())  # (t, h, f)

    # 4. Convert to complex numbers on the unit circle
    freqs_cis_thF = torch.polar(torch.ones_like(theta_thF), theta_thF)
    return freqs_cis_thF
77
+
78
+
79
def apply_golden_rotary_emb(input_BShd, freqs_cis_thF, pos_BSP) -> torch.Tensor:
    """
    Rotate *only* the image tokens in `input_BShd` by the precomputed complex
    factors `freqs_cis_thF` (one row per image token, see
    `apply_golden_freqs_cis_to_visual_pos`); non-image tokens pass through
    unchanged. No boolean indexing, so it is safe for Torch-Inductor.
    """
    # Image tokens are those whose (B, S) position row has no NaNs.
    img_mask_BS = E.reduce(~torch.isnan(pos_BSP), 'b s p -> b s', reduction='all')
    idx_b, idx_s = torch.nonzero(img_mask_BS, as_tuple=True)  # (N,)

    input_thd = input_BShd[idx_b, idx_s].float()  # (N, h, d)
    # Even/odd feature pairs act as the real/imaginary components.
    x_even = input_thd[..., 0::2]  # (N, h, F)
    x_odd = input_thd[..., 1::2]  # (N, h, F)

    cos_thF = freqs_cis_thF.real
    sin_thF = freqs_cis_thF.imag

    # (a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    rot_even = x_even * cos_thF - x_odd * sin_thF
    rot_odd = x_even * sin_thF + x_odd * cos_thF

    # Re-interleave rotated pairs back into the feature dimension.
    output_real = torch.empty_like(input_thd)
    output_real[..., 0::2] = rot_even
    output_real[..., 1::2] = rot_odd
    output_real = output_real.type_as(input_BShd)

    # Scatter rotated image-token rows over a copy of the input.
    output_BShd = input_BShd.clone()
    output_BShd[idx_b, idx_s] = output_real

    return output_BShd
107
+
108
+
109
def apply_3d_rotary_emb(
    xq: torch.Tensor,  # (B, S, H, D)
    xk: torch.Tensor,  # (B, S, H, D)
    freqs_cis: torch.Tensor,
    freqs_cis_2d: torch.Tensor | None,
    pos_hw: torch.Tensor | None,  # (B, S, 2); NaN rows mark non-image tokens
) -> tuple[torch.Tensor, torch.Tensor]:
    """Apply temporal (1D) RoPE to the first half of the head dimension and
    2D "golden" RoPE to the second half.

    The 2D rotation is skipped entirely when either `freqs_cis_2d` or
    `pos_hw` is None, leaving the spatial half of the features unrotated.
    Outputs keep the input dtypes.
    """
    # Split the head dim: first half carries temporal phase, second spatial.
    xq_t, xq_hw = xq.chunk(chunks=2, dim=-1)
    xk_t, xk_hw = xk.chunk(chunks=2, dim=-1)
    # (Removed an unused `B, S, H, D = xq.shape` unpack — dead code.)

    xq_t, xk_t = apply_rotary_emb(xq_t, xk_t, freqs_cis)
    if freqs_cis_2d is not None and pos_hw is not None:
        xq_hw = apply_golden_rotary_emb(xq_hw, freqs_cis_2d, pos_hw)
        xk_hw = apply_golden_rotary_emb(xk_hw, freqs_cis_2d, pos_hw)

    xq_out = torch.concat([xq_t, xq_hw], dim=-1).type_as(xq)
    xk_out = torch.concat([xk_t, xk_hw], dim=-1).type_as(xk)
    return xq_out, xk_out
special_tokens_map.json ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "absence_token": "<|absence|>",
3
+ "additional_special_tokens": [
4
+ "<|pad|>",
5
+ ">>ABSTRACT<<",
6
+ ">>INTRODUCTION<<",
7
+ ">>SUMMARY<<",
8
+ ">>COMMENT<<",
9
+ ">>ANSWER<<",
10
+ ">>QUESTION<<",
11
+ ">>DOMAIN<<",
12
+ ">>PREFIX<<",
13
+ ">>SUFFIX<<",
14
+ ">>MIDDLE<<",
15
+ "<|finetune_right_pad_id|>",
16
+ "<|start_header_id|>",
17
+ "<|end_header_id|>",
18
+ "<|eom_id|>",
19
+ "<|eot_id|>",
20
+ "<|begin_of_text|>",
21
+ ">>TITLE<<",
22
+ "<tool_response>",
23
+ "</tool_response>",
24
+ "<tool_call>",
25
+ "</tool_call>",
26
+ "<schema>",
27
+ "</schema>",
28
+ "<scratch_pad>",
29
+ "</scratch_pad>",
30
+ "<thinking>",
31
+ "</thinking>",
32
+ "<explanation>",
33
+ "</explanation>",
34
+ "<file_sep>",
35
+ "<repo_name>",
36
+ ">>UNUSED_119<<",
37
+ ">>UNUSED_120<<",
38
+ "<|image|>",
39
+ "<|image_row_sep|>",
40
+ "<|start_of_image|>",
41
+ "<|end_of_image|>",
42
+ "<|start_of_video|>",
43
+ "<|end_of_video|>",
44
+ "<|frame_sep|>",
45
+ "<|start_of_turn|>",
46
+ "<|end_of_turn|>",
47
+ "<|start_of_diffusion_query|>",
48
+ "<|end_of_diffusion_query|>",
49
+ "<|diffusion_query|>",
50
+ "<|object|>",
51
+ "<|coord|>",
52
+ "<|size|>",
53
+ "<|perceive|>",
54
+ "<|image_mask_token|>",
55
+ "<|image_cls|>",
56
+ "<|image_reg_1|>",
57
+ "<|image_reg_2|>",
58
+ "<|image_reg_3|>",
59
+ "<|image_reg_4|>",
60
+ "<|image_reg_5|>",
61
+ "<|image_reg_6|>",
62
+ "<|image_reg_7|>",
63
+ "<|image_reg_8|>",
64
+ "<|DET|>",
65
+ "<|POINTING|>",
66
+ "<|OCR_GROUNDING|>",
67
+ "<|OCR_DOC_PARSER|>",
68
+ "<|OCR_PLAIN|>",
69
+ "<|REF_SEG|>",
70
+ "<|POINT_REF_SEG|>",
71
+ "<|CAPTION|>",
72
+ "<|DETAILED_CAPTION|>",
73
+ "<|seg|>",
74
+ "<|end_of_query|>",
75
+ "<|start_of_query|>",
76
+ "<|task_sep|>",
77
+ "<|SEMANTIC_SEG_TASK|>",
78
+ "<|semantic_seg|>",
79
+ "<|presence|>",
80
+ "<|absence|>",
81
+ ">>UNUSED_258<<",
82
+ ">>UNUSED_259<<",
83
+ ">>UNUSED_260<<",
84
+ ">>UNUSED_261<<",
85
+ ">>UNUSED_262<<",
86
+ ">>UNUSED_263<<",
87
+ ">>UNUSED_264<<",
88
+ ">>UNUSED_265<<",
89
+ ">>UNUSED_266<<",
90
+ ">>UNUSED_267<<",
91
+ ">>UNUSED_268<<",
92
+ ">>UNUSED_269<<",
93
+ ">>UNUSED_270<<",
94
+ ">>UNUSED_271<<",
95
+ ">>UNUSED_272<<",
96
+ ">>UNUSED_273<<",
97
+ ">>UNUSED_274<<",
98
+ ">>UNUSED_275<<",
99
+ ">>UNUSED_276<<",
100
+ ">>UNUSED_277<<",
101
+ ">>UNUSED_278<<",
102
+ ">>UNUSED_279<<",
103
+ ">>UNUSED_280<<",
104
+ ">>UNUSED_281<<",
105
+ ">>UNUSED_282<<",
106
+ ">>UNUSED_283<<",
107
+ ">>UNUSED_284<<",
108
+ ">>UNUSED_285<<",
109
+ ">>UNUSED_286<<",
110
+ ">>UNUSED_287<<",
111
+ ">>UNUSED_288<<",
112
+ ">>UNUSED_289<<",
113
+ ">>UNUSED_290<<",
114
+ ">>UNUSED_291<<",
115
+ ">>UNUSED_292<<",
116
+ ">>UNUSED_293<<",
117
+ ">>UNUSED_294<<",
118
+ ">>UNUSED_295<<",
119
+ ">>UNUSED_296<<",
120
+ ">>UNUSED_297<<",
121
+ ">>UNUSED_298<<",
122
+ ">>UNUSED_299<<",
123
+ ">>UNUSED_300<<",
124
+ ">>UNUSED_301<<",
125
+ ">>UNUSED_302<<",
126
+ ">>UNUSED_303<<",
127
+ ">>UNUSED_304<<",
128
+ ">>UNUSED_305<<",
129
+ ">>UNUSED_306<<",
130
+ ">>UNUSED_307<<",
131
+ ">>UNUSED_308<<",
132
+ ">>UNUSED_309<<",
133
+ ">>UNUSED_310<<",
134
+ ">>UNUSED_311<<",
135
+ ">>UNUSED_312<<",
136
+ ">>UNUSED_313<<",
137
+ ">>UNUSED_314<<",
138
+ ">>UNUSED_315<<",
139
+ ">>UNUSED_316<<",
140
+ ">>UNUSED_317<<",
141
+ ">>UNUSED_318<<",
142
+ ">>UNUSED_319<<",
143
+ ">>UNUSED_320<<",
144
+ ">>UNUSED_321<<",
145
+ ">>UNUSED_322<<",
146
+ ">>UNUSED_323<<",
147
+ ">>UNUSED_324<<",
148
+ ">>UNUSED_325<<",
149
+ ">>UNUSED_326<<",
150
+ ">>UNUSED_327<<",
151
+ ">>UNUSED_328<<",
152
+ ">>UNUSED_329<<",
153
+ ">>UNUSED_330<<",
154
+ ">>UNUSED_331<<",
155
+ ">>UNUSED_332<<",
156
+ ">>UNUSED_333<<",
157
+ ">>UNUSED_334<<",
158
+ ">>UNUSED_335<<",
159
+ ">>UNUSED_336<<",
160
+ ">>UNUSED_337<<",
161
+ ">>UNUSED_338<<",
162
+ ">>UNUSED_339<<",
163
+ ">>UNUSED_340<<",
164
+ ">>UNUSED_341<<",
165
+ ">>UNUSED_342<<",
166
+ ">>UNUSED_343<<",
167
+ ">>UNUSED_344<<",
168
+ ">>UNUSED_345<<",
169
+ ">>UNUSED_346<<",
170
+ ">>UNUSED_347<<",
171
+ ">>UNUSED_348<<",
172
+ ">>UNUSED_349<<",
173
+ ">>UNUSED_350<<",
174
+ ">>UNUSED_351<<",
175
+ ">>UNUSED_352<<",
176
+ ">>UNUSED_353<<",
177
+ ">>UNUSED_354<<",
178
+ ">>UNUSED_355<<",
179
+ ">>UNUSED_356<<",
180
+ ">>UNUSED_357<<",
181
+ ">>UNUSED_358<<",
182
+ ">>UNUSED_359<<",
183
+ ">>UNUSED_360<<",
184
+ ">>UNUSED_361<<",
185
+ ">>UNUSED_362<<",
186
+ ">>UNUSED_363<<",
187
+ ">>UNUSED_364<<",
188
+ ">>UNUSED_365<<",
189
+ ">>UNUSED_366<<",
190
+ ">>UNUSED_367<<",
191
+ ">>UNUSED_368<<",
192
+ ">>UNUSED_369<<",
193
+ ">>UNUSED_370<<",
194
+ ">>UNUSED_371<<",
195
+ ">>UNUSED_372<<",
196
+ ">>UNUSED_373<<",
197
+ ">>UNUSED_374<<",
198
+ ">>UNUSED_375<<",
199
+ ">>UNUSED_376<<",
200
+ ">>UNUSED_377<<",
201
+ ">>UNUSED_378<<",
202
+ ">>UNUSED_379<<",
203
+ ">>UNUSED_380<<",
204
+ ">>UNUSED_381<<",
205
+ ">>UNUSED_382<<",
206
+ ">>UNUSED_383<<",
207
+ ">>UNUSED_384<<",
208
+ ">>UNUSED_385<<",
209
+ ">>UNUSED_386<<",
210
+ ">>UNUSED_387<<",
211
+ ">>UNUSED_388<<",
212
+ ">>UNUSED_389<<",
213
+ ">>UNUSED_390<<",
214
+ ">>UNUSED_391<<",
215
+ ">>UNUSED_392<<",
216
+ ">>UNUSED_393<<",
217
+ ">>UNUSED_394<<",
218
+ ">>UNUSED_395<<",
219
+ ">>UNUSED_396<<",
220
+ ">>UNUSED_397<<",
221
+ ">>UNUSED_398<<",
222
+ ">>UNUSED_399<<",
223
+ ">>UNUSED_400<<",
224
+ ">>UNUSED_401<<",
225
+ ">>UNUSED_402<<",
226
+ ">>UNUSED_403<<",
227
+ ">>UNUSED_404<<",
228
+ ">>UNUSED_405<<",
229
+ ">>UNUSED_406<<",
230
+ ">>UNUSED_407<<",
231
+ ">>UNUSED_408<<",
232
+ ">>UNUSED_409<<",
233
+ ">>UNUSED_410<<",
234
+ ">>UNUSED_411<<",
235
+ ">>UNUSED_412<<",
236
+ ">>UNUSED_413<<",
237
+ ">>UNUSED_414<<",
238
+ ">>UNUSED_415<<",
239
+ ">>UNUSED_416<<",
240
+ ">>UNUSED_417<<",
241
+ ">>UNUSED_418<<",
242
+ ">>UNUSED_419<<",
243
+ ">>UNUSED_420<<",
244
+ ">>UNUSED_421<<",
245
+ ">>UNUSED_422<<",
246
+ ">>UNUSED_423<<",
247
+ ">>UNUSED_424<<",
248
+ ">>UNUSED_425<<",
249
+ ">>UNUSED_426<<",
250
+ ">>UNUSED_427<<",
251
+ ">>UNUSED_428<<",
252
+ ">>UNUSED_429<<",
253
+ ">>UNUSED_430<<",
254
+ ">>UNUSED_431<<",
255
+ ">>UNUSED_432<<",
256
+ ">>UNUSED_433<<",
257
+ ">>UNUSED_434<<",
258
+ ">>UNUSED_435<<",
259
+ ">>UNUSED_436<<",
260
+ ">>UNUSED_437<<",
261
+ ">>UNUSED_438<<",
262
+ ">>UNUSED_439<<",
263
+ ">>UNUSED_440<<",
264
+ ">>UNUSED_441<<",
265
+ ">>UNUSED_442<<",
266
+ ">>UNUSED_443<<",
267
+ ">>UNUSED_444<<",
268
+ ">>UNUSED_445<<",
269
+ ">>UNUSED_446<<",
270
+ ">>UNUSED_447<<",
271
+ ">>UNUSED_448<<",
272
+ ">>UNUSED_449<<",
273
+ ">>UNUSED_450<<",
274
+ ">>UNUSED_451<<",
275
+ ">>UNUSED_452<<",
276
+ ">>UNUSED_453<<",
277
+ ">>UNUSED_454<<",
278
+ ">>UNUSED_455<<",
279
+ ">>UNUSED_456<<",
280
+ ">>UNUSED_457<<",
281
+ ">>UNUSED_458<<",
282
+ ">>UNUSED_459<<",
283
+ ">>UNUSED_460<<",
284
+ ">>UNUSED_461<<",
285
+ ">>UNUSED_462<<",
286
+ ">>UNUSED_463<<",
287
+ ">>UNUSED_464<<",
288
+ ">>UNUSED_465<<",
289
+ ">>UNUSED_466<<",
290
+ ">>UNUSED_467<<",
291
+ ">>UNUSED_468<<",
292
+ ">>UNUSED_469<<",
293
+ ">>UNUSED_470<<",
294
+ ">>UNUSED_471<<",
295
+ ">>UNUSED_472<<",
296
+ ">>UNUSED_473<<",
297
+ ">>UNUSED_474<<",
298
+ ">>UNUSED_475<<",
299
+ ">>UNUSED_476<<",
300
+ ">>UNUSED_477<<",
301
+ ">>UNUSED_478<<",
302
+ ">>UNUSED_479<<",
303
+ ">>UNUSED_480<<",
304
+ ">>UNUSED_481<<",
305
+ ">>UNUSED_482<<",
306
+ ">>UNUSED_483<<",
307
+ ">>UNUSED_484<<",
308
+ ">>UNUSED_485<<",
309
+ ">>UNUSED_486<<",
310
+ ">>UNUSED_487<<",
311
+ ">>UNUSED_488<<",
312
+ ">>UNUSED_489<<",
313
+ ">>UNUSED_490<<",
314
+ ">>UNUSED_491<<",
315
+ ">>UNUSED_492<<",
316
+ ">>UNUSED_493<<",
317
+ ">>UNUSED_494<<",
318
+ ">>UNUSED_495<<",
319
+ ">>UNUSED_496<<",
320
+ ">>UNUSED_497<<",
321
+ ">>UNUSED_498<<",
322
+ ">>UNUSED_499<<",
323
+ ">>UNUSED_500<<",
324
+ ">>UNUSED_501<<",
325
+ ">>UNUSED_502<<",
326
+ ">>UNUSED_503<<",
327
+ ">>UNUSED_504<<",
328
+ ">>UNUSED_505<<",
329
+ ">>UNUSED_506<<",
330
+ ">>UNUSED_507<<",
331
+ ">>UNUSED_508<<",
332
+ ">>UNUSED_509<<",
333
+ ">>UNUSED_510<<",
334
+ ">>UNUSED_511<<"
335
+ ],
336
+ "caption_token": "<|CAPTION|>",
337
+ "coord_token": "<|coord|>",
338
+ "det_token": "<|DET|>",
339
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
340
+ "diffusion_query_token": "<|diffusion_query|>",
341
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
342
+ "end_of_image_token": "<|end_of_image|>",
343
+ "end_of_query_token": "<|end_of_query|>",
344
+ "end_of_turn_token": "<|end_of_turn|>",
345
+ "end_of_video_token": "<|end_of_video|>",
346
+ "eos_token": "<|end_of_text|>",
347
+ "frame_sep_token": "<|frame_sep|>",
348
+ "image_cls_token": "<|image_cls|>",
349
+ "image_mask_token": "<|image_mask_token|>",
350
+ "image_reg_1_token": "<|image_reg_1|>",
351
+ "image_reg_2_token": "<|image_reg_2|>",
352
+ "image_reg_3_token": "<|image_reg_3|>",
353
+ "image_reg_4_token": "<|image_reg_4|>",
354
+ "image_reg_5_token": "<|image_reg_5|>",
355
+ "image_reg_6_token": "<|image_reg_6|>",
356
+ "image_reg_7_token": "<|image_reg_7|>",
357
+ "image_reg_8_token": "<|image_reg_8|>",
358
+ "image_row_sep_token": "<|image_row_sep|>",
359
+ "image_token": "<|image|>",
360
+ "object_token": "<|object|>",
361
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
362
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
363
+ "ocr_plain_token": "<|OCR_PLAIN|>",
364
+ "pad_token": "<|pad|>",
365
+ "perceive_token": "<|perceive|>",
366
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
367
+ "pointing_token": "<|POINTING|>",
368
+ "presence_token": "<|presence|>",
369
+ "ref_seg_token": "<|REF_SEG|>",
370
+ "seg_token": "<|seg|>",
371
+ "semantic_seg_task_token": "<|SEMANTIC_SEG_TASK|>",
372
+ "semantic_seg_token": "<|semantic_seg|>",
373
+ "size_token": "<|size|>",
374
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
375
+ "start_of_image_token": "<|start_of_image|>",
376
+ "start_of_query_token": "<|start_of_query|>",
377
+ "start_of_turn_token": "<|start_of_turn|>",
378
+ "start_of_video_token": "<|start_of_video|>",
379
+ "task_sep_token": "<|task_sep|>"
380
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "absence_token": "<|absence|>",
3
+ "backend": "tokenizers",
4
+ "caption_token": "<|CAPTION|>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "coord_token": "<|coord|>",
7
+ "det_token": "<|DET|>",
8
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
9
+ "diffusion_query_token": "<|diffusion_query|>",
10
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
11
+ "end_of_image_token": "<|end_of_image|>",
12
+ "end_of_query_token": "<|end_of_query|>",
13
+ "end_of_turn_token": "<|end_of_turn|>",
14
+ "end_of_video_token": "<|end_of_video|>",
15
+ "eos_token": "<|end_of_text|>",
16
+ "frame_sep_token": "<|frame_sep|>",
17
+ "image_cls_token": "<|image_cls|>",
18
+ "image_mask_token": "<|image_mask_token|>",
19
+ "image_reg_1_token": "<|image_reg_1|>",
20
+ "image_reg_2_token": "<|image_reg_2|>",
21
+ "image_reg_3_token": "<|image_reg_3|>",
22
+ "image_reg_4_token": "<|image_reg_4|>",
23
+ "image_reg_5_token": "<|image_reg_5|>",
24
+ "image_reg_6_token": "<|image_reg_6|>",
25
+ "image_reg_7_token": "<|image_reg_7|>",
26
+ "image_reg_8_token": "<|image_reg_8|>",
27
+ "image_row_sep_token": "<|image_row_sep|>",
28
+ "image_token": "<|image|>",
29
+ "is_local": true,
30
+ "model_input_names": [
31
+ "input_ids",
32
+ "attention_mask"
33
+ ],
34
+ "model_max_length": 1000000000000000019884624838656,
35
+ "model_specific_special_tokens": {
36
+ "absence_token": "<|absence|>",
37
+ "caption_token": "<|CAPTION|>",
38
+ "coord_token": "<|coord|>",
39
+ "det_token": "<|DET|>",
40
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
41
+ "diffusion_query_token": "<|diffusion_query|>",
42
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
43
+ "end_of_image_token": "<|end_of_image|>",
44
+ "end_of_query_token": "<|end_of_query|>",
45
+ "end_of_turn_token": "<|end_of_turn|>",
46
+ "end_of_video_token": "<|end_of_video|>",
47
+ "frame_sep_token": "<|frame_sep|>",
48
+ "image_cls_token": "<|image_cls|>",
49
+ "image_mask_token": "<|image_mask_token|>",
50
+ "image_reg_1_token": "<|image_reg_1|>",
51
+ "image_reg_2_token": "<|image_reg_2|>",
52
+ "image_reg_3_token": "<|image_reg_3|>",
53
+ "image_reg_4_token": "<|image_reg_4|>",
54
+ "image_reg_5_token": "<|image_reg_5|>",
55
+ "image_reg_6_token": "<|image_reg_6|>",
56
+ "image_reg_7_token": "<|image_reg_7|>",
57
+ "image_reg_8_token": "<|image_reg_8|>",
58
+ "image_row_sep_token": "<|image_row_sep|>",
59
+ "image_token": "<|image|>",
60
+ "object_token": "<|object|>",
61
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
62
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
63
+ "ocr_plain_token": "<|OCR_PLAIN|>",
64
+ "pad_token": "<|pad|>",
65
+ "perceive_token": "<|perceive|>",
66
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
67
+ "pointing_token": "<|POINTING|>",
68
+ "presence_token": "<|presence|>",
69
+ "ref_seg_token": "<|REF_SEG|>",
70
+ "seg_token": "<|seg|>",
71
+ "semantic_seg_task_token": "<|SEMANTIC_SEG_TASK|>",
72
+ "semantic_seg_token": "<|semantic_seg|>",
73
+ "size_token": "<|size|>",
74
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
75
+ "start_of_image_token": "<|start_of_image|>",
76
+ "start_of_query_token": "<|start_of_query|>",
77
+ "start_of_turn_token": "<|start_of_turn|>",
78
+ "start_of_video_token": "<|start_of_video|>",
79
+ "task_sep_token": "<|task_sep|>"
80
+ },
81
+ "object_token": "<|object|>",
82
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
83
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
84
+ "ocr_plain_token": "<|OCR_PLAIN|>",
85
+ "pad_token": "<|pad|>",
86
+ "perceive_token": "<|perceive|>",
87
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
88
+ "pointing_token": "<|POINTING|>",
89
+ "presence_token": "<|presence|>",
90
+ "ref_seg_token": "<|REF_SEG|>",
91
+ "seg_token": "<|seg|>",
92
+ "semantic_seg_task_token": "<|SEMANTIC_SEG_TASK|>",
93
+ "semantic_seg_token": "<|semantic_seg|>",
94
+ "size_token": "<|size|>",
95
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
96
+ "start_of_image_token": "<|start_of_image|>",
97
+ "start_of_query_token": "<|start_of_query|>",
98
+ "start_of_turn_token": "<|start_of_turn|>",
99
+ "start_of_video_token": "<|start_of_video|>",
100
+ "task_sep_token": "<|task_sep|>",
101
+ "tokenizer_class": "TokenizersBackend"
102
+ }