Initial upload of simplicityprevails from local project
- README.md +83 -0
- core/__init__.py +0 -0
- core/vision_encoder/__init__.py +0 -0
- core/vision_encoder/bpe_simple_vocab_16e6.txt.gz +3 -0
- core/vision_encoder/config.py +261 -0
- core/vision_encoder/pe.py +761 -0
- core/vision_encoder/rope.py +347 -0
- core/vision_encoder/tokenizer.py +342 -0
- core/vision_encoder/transforms.py +31 -0
- models.py +331 -0
- test_vfm_baselines.py +153 -0
- weights/dinov2lin0.pth +3 -0
- weights/dinov3lin0.pth +3 -0
- weights/metaclip2lin0.pth +3 -0
- weights/metacliplin0.pth +3 -0
- weights/pelin0.pth +3 -0
- weights/siglip2lin0.pth +3 -0
- weights/sigliplin0.pth +3 -0
README.md
ADDED

@@ -0,0 +1,83 @@
# VFM Baselines Release

This directory contains the 7 vision foundation model baselines used in the paper:

- `MetaCLIP-Linear`
- `MetaCLIP2-Linear`
- `SigLIP-Linear`
- `SigLIP2-Linear`
- `PE-CLIP-Linear`
- `DINOv2-Linear`
- `DINOv3-Linear`

## Contents

- `models.py`: unified model-loading code for all 7 baselines
- `test_vfm_baselines.py`: unified evaluation script
- `weights/`: released checkpoints
- `core/vision_encoder/`: vendored PE vision encoder code required by `PE-CLIP-Linear`

## Model Names

The unified loader and test script accept these names:

- `metacliplin`
- `metaclip2lin`
- `sigliplin`
- `siglip2lin`
- `pelin`
- `dinov2lin`
- `dinov3lin`

The paper names such as `MetaCLIP-Linear` and `DINOv3-Linear` are also accepted.
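The two naming schemes map one-to-one. As a reference, a minimal sketch of how such aliasing could be implemented; the table below is inferred from the two lists in this README, not copied from `models.py`:

```python
# Hypothetical alias table: paper-style names -> loader names.
# Inferred from the README lists; the real table in models.py may differ.
PAPER_TO_LOADER = {
    "MetaCLIP-Linear": "metacliplin",
    "MetaCLIP2-Linear": "metaclip2lin",
    "SigLIP-Linear": "sigliplin",
    "SigLIP2-Linear": "siglip2lin",
    "PE-CLIP-Linear": "pelin",
    "DINOv2-Linear": "dinov2lin",
    "DINOv3-Linear": "dinov3lin",
}


def normalize_model_name(name: str) -> str:
    """Accept either a loader name or a paper name; return the loader name."""
    if name in PAPER_TO_LOADER.values():
        return name
    if name in PAPER_TO_LOADER:
        return PAPER_TO_LOADER[name]
    raise ValueError(f"Unknown model name: {name}")
```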

## Usage

Evaluate a single model:

```bash
python test_vfm_baselines.py \
    --model sigliplin \
    --real-dir /path/to/0_real \
    --fake-dir /path/to/1_fake \
    --max-samples 100
```

Evaluate all 7 models:

```bash
python test_vfm_baselines.py \
    --model all \
    --real-dir /path/to/0_real \
    --fake-dir /path/to/1_fake \
    --max-samples 100
```

Optional arguments:

- `--checkpoint`: override the default checkpoint for single-model evaluation
- `--batch-size`: batch size for evaluation
- `--num-workers`: dataloader workers
- `--device`: explicit device such as `cuda:0` or `cpu`
- `--save-json`: save results to a JSON file
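As a sanity check for reported numbers, ROC-AUC over per-image "fake" scores can be computed in a few lines of plain Python. This is an illustrative sketch only, not the script's actual implementation (`test_vfm_baselines.py` may use scikit-learn instead):

```python
def roc_auc(labels, scores):
    """ROC-AUC via the Mann-Whitney U statistic.

    labels: 1 for fake, 0 for real; scores: higher means more likely fake.
    Equivalent to the probability a random fake outranks a random real,
    with ties counted as half.
    """
    pos = [s for s, y in zip(scores, labels) if y == 1]
    neg = [s for s, y in zip(scores, labels) if y == 0]
    wins = sum((p > n) + 0.5 * (p == n) for p in pos for n in neg)
    return wins / (len(pos) * len(neg))


print(roc_auc([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8]))  # 0.75
```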

## Dependencies

The release code expects these Python packages:

- `torch`
- `torchvision`
- `transformers`
- `scikit-learn`
- `Pillow`
- `timm`
- `einops`
- `ftfy`
- `regex`
- `huggingface_hub`
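A few of these pip names differ from their import names (`Pillow` imports as `PIL`, `scikit-learn` as `sklearn`). A small stdlib-only check can confirm everything is importable before running the evaluation; this helper is an optional convenience, not part of the release code:

```python
from importlib.util import find_spec

# pip name -> import name for the packages listed above
IMPORT_NAMES = {
    "torch": "torch",
    "torchvision": "torchvision",
    "transformers": "transformers",
    "scikit-learn": "sklearn",
    "Pillow": "PIL",
    "timm": "timm",
    "einops": "einops",
    "ftfy": "ftfy",
    "regex": "regex",
    "huggingface_hub": "huggingface_hub",
}


def missing_packages(import_names=IMPORT_NAMES) -> list:
    """Return the pip names whose import module cannot be found."""
    return [pip for pip, mod in import_names.items() if find_spec(mod) is None]


if __name__ == "__main__":
    print(missing_packages() or "all dependencies available")
```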

## Notes

- The CLIP-family and DINO-family baselines instantiate the backbone from Hugging Face model configs and then load the released checkpoint.
- `PE-CLIP-Linear` uses the vendored `core/vision_encoder` code in this directory.
- The checkpoints in `weights/` are arranged locally for packaging convenience. For public release, they can be uploaded under the same filenames.
core/__init__.py
ADDED
File without changes

core/vision_encoder/__init__.py
ADDED
File without changes
core/vision_encoder/bpe_simple_vocab_16e6.txt.gz
ADDED

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
size 1356917
core/vision_encoder/config.py
ADDED

@@ -0,0 +1,261 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

"""
Include all available vision encoder configurations.
"""

from dataclasses import dataclass, replace

from functools import partial
from typing import Callable, Optional, Sequence, Tuple, List

from huggingface_hub import hf_hub_download


def fetch_pe_checkpoint(name: str, path: Optional[str] = None):
    path = path or f"hf://facebook/{name}:{name}.pt"

    if path.startswith("hf://"):
        # Load from huggingface
        path = path[len("hf://"):]
        repo, file = path.split(":")

        return hf_hub_download(repo_id=repo, filename=file)
    else:
        return path

@dataclass
class PEConfig:
    """ Vision Tower Config. """
    patch_size: int
    width: int
    layers: int
    heads: int
    mlp_ratio: float
    output_dim: Optional[int]

    ls_init_value: Optional[float] = None
    drop_path: float = 0.0

    image_size: int = 224
    use_abs_posemb: bool = True
    use_cls_token: bool = False
    use_rope2d: bool = True

    pool_type: str = "attn"
    attn_pooler_heads: int = 8

    use_ln_pre: bool = True
    use_ln_post: bool = True

@dataclass
class PETextConfig:
    """ Text Tower Config. """
    context_length: int
    width: int
    heads: int
    layers: int

    output_dim: int

    mlp_ratio: float = 4.0
    vocab_size: int = 49408


PE_VISION_CONFIG = {}
PE_TEXT_CONFIG = {}


#########################################
#                PE CORE                #
#########################################

PE_VISION_CONFIG["PE-Core-G14-448"] = PEConfig(
    image_size=448,
    patch_size=14,
    width=1536,
    layers=50,
    heads=16,
    mlp_ratio=8960 / 1536,
    pool_type="attn",
    output_dim=1280,
    use_cls_token=False,
)
PE_TEXT_CONFIG["PE-Core-G14-448"] = PETextConfig(
    context_length=72,
    width=1280,
    heads=20,
    layers=24,
    output_dim=1280,
)


PE_VISION_CONFIG["PE-Core-L14-336"] = PEConfig(
    image_size=336,
    patch_size=14,
    width=1024,
    layers=24,
    heads=16,
    mlp_ratio=4.0,
    pool_type="attn",
    output_dim=1024,
    use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-L14-336"] = PETextConfig(
    context_length=32,
    width=1024,
    heads=16,
    layers=24,
    output_dim=1024,
)


PE_VISION_CONFIG["PE-Core-B16-224"] = PEConfig(
    image_size=224,
    patch_size=16,
    width=768,
    layers=12,
    heads=12,
    mlp_ratio=4.0,
    pool_type="attn",
    output_dim=1024,
    use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-B16-224"] = PE_TEXT_CONFIG["PE-Core-L14-336"]


PE_VISION_CONFIG["PE-Core-S16-384"] = PEConfig(
    image_size=384,
    patch_size=16,
    width=384,
    layers=12,
    heads=6,
    mlp_ratio=4.0,
    pool_type="attn",
    output_dim=512,
    use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-S16-384"] = PETextConfig(
    context_length=32,
    width=512,
    heads=8,
    layers=12,
    output_dim=512,
)


PE_VISION_CONFIG["PE-Core-T16-384"] = PEConfig(
    image_size=384,
    patch_size=16,
    width=192,
    layers=12,
    heads=3,
    mlp_ratio=4.0,
    pool_type="attn",
    output_dim=512,
    use_cls_token=True,
)
PE_TEXT_CONFIG["PE-Core-T16-384"] = PE_TEXT_CONFIG["PE-Core-S16-384"]


#########################################
#                PE Lang                #
#########################################

PE_VISION_CONFIG["PE-Lang-G14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-G14-448"],
    image_size=448,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
    ls_init_value=0.1,
    layers=47,
)

PE_VISION_CONFIG["PE-Lang-L14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-L14-336"],
    image_size=448,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
    ls_init_value=0.1,
    layers=23,
)


# Stage 2 checkpoints for PLM-8B and PLM-3B respectively. Pretrained with tiling.
# Use these checkpoints if you're building a model that uses tiling downstream!
PE_VISION_CONFIG["PE-Lang-G14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-G14-448"]
PE_VISION_CONFIG["PE-Lang-L14-448-Tiling"] = PE_VISION_CONFIG["PE-Lang-L14-448"]


#########################################
#              PE Spatial               #
#########################################

PE_VISION_CONFIG["PE-Spatial-G14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-G14-448"],
    image_size=448,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
    ls_init_value=0.1,
)

# No layerscale on the smaller spatial models
PE_VISION_CONFIG["PE-Spatial-L14-448"] = replace(
    PE_VISION_CONFIG["PE-Core-L14-336"],
    image_size=448,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)


PE_VISION_CONFIG["PE-Spatial-B16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-B16-224"],
    image_size=512,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)


PE_VISION_CONFIG["PE-Spatial-S16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-S16-384"],
    image_size=512,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)


PE_VISION_CONFIG["PE-Spatial-T16-512"] = replace(
    PE_VISION_CONFIG["PE-Core-T16-384"],
    image_size=512,
    pool_type="none",
    use_ln_post=False,
    output_dim=None,
)
core/vision_encoder/pe.py
ADDED

@@ -0,0 +1,761 @@
import copy
import math
import random
from collections import OrderedDict
from dataclasses import asdict
from functools import partial
from logging import getLogger
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, Literal

import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from timm.layers import DropPath
from torch import nn
from torch.nn import functional as F
from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
from torch.nn.parameter import Parameter
from torch.utils.checkpoint import checkpoint

from core.vision_encoder.rope import Rope2D
from core.vision_encoder.config import PEConfig, PETextConfig, PE_VISION_CONFIG, PE_TEXT_CONFIG, fetch_pe_checkpoint


logger = getLogger()


class LayerScale(nn.Module):
    def __init__(self, dim, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.dim = dim
        self.init_values = init_values

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma

    def init_tensors(self):
        self.gamma = nn.Parameter(self.init_values * torch.ones(self.dim))


class AttentionPooling(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        num_probe: int = 1,
        mlp_ratio: int = 4,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
    ):
        super().__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads

        assert (
            self.embed_dim % num_heads == 0
        ), "embed_dim must be divisible by num_heads"

        self.probe = nn.Parameter(torch.randn(1, num_probe, self.embed_dim))
        self.attn = nn.MultiheadAttention(
            self.embed_dim, self.num_heads, batch_first=True
        )

        self.layernorm = norm_layer(embed_dim)
        self.mlp_width = int(embed_dim * mlp_ratio)
        self.mlp = nn.Sequential(
            OrderedDict(
                [
                    ("c_fc", nn.Linear(self.embed_dim, self.mlp_width)),
                    ("gelu", act_layer()),
                    ("c_proj", nn.Linear(self.mlp_width, self.embed_dim)),
                ]
            )
        )

    def forward(self, x: torch.Tensor):
        batch, _, _ = x.shape

        q = self.probe.repeat((batch, 1, 1)).to(x.dtype)
        x = self.attn(q, x, x, need_weights=False)[0]
        x = x + self.mlp(self.layernorm(x))

        return x

class SelfAttention(nn.Module):
    r"""
    Implements sequence packed attention and RoPE
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        rope: Optional[nn.Module] = None,
    ):
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim

        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"

        # To make this compatible with nn.MultiheadAttention
        self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
        self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)

        self.rope = rope
        self.scale = self.head_dim ** (-0.5)

    def init_tensors(self):
        xavier_uniform_(self.in_proj_weight)
        constant_(self.in_proj_bias, 0.0)
        constant_(self.out_proj.bias, 0.0)

    def forward(self, x, attn_mask=None):
        batch, seq, embed_dim = x.shape
        proj = F.linear(x, self.in_proj_weight, self.in_proj_bias)

        # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk()
        proj = (
            proj.unflatten(-1, (3, embed_dim))
            .unsqueeze(0)
            .transpose(0, -2)
            .squeeze(-2)
            .contiguous()
        )
        q, k, v = proj[0], proj[1], proj[2]

        # Split heads: (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        q = rearrange(q, "b s (h d) -> b h s d", h=self.num_heads)
        k = rearrange(k, "b s (h d) -> b h s d", h=self.num_heads)
        v = rearrange(v, "b s (h d) -> b h s d", h=self.num_heads)

        if self.rope:
            q, k = self.rope(q, k)

        attn = F.scaled_dot_product_attention(
            q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False, scale=self.scale
        )
        attn = rearrange(attn, "b h s d -> b s (h d)")

        return F.linear(attn, self.out_proj.weight, self.out_proj.bias)

class ResidualAttentionBlock(nn.Module):
    def __init__(
        self,
        d_model: int,
        n_head: int,
        mlp_ratio: float = 4.0,
        ls_init_value: float = None,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        drop_path: float = 0.0,
        rope: Optional[nn.Module] = None,
    ):
        super().__init__()

        if rope:
            self.attn = SelfAttention(d_model, n_head, rope=rope)
        else:
            self.attn = nn.MultiheadAttention(d_model, n_head, batch_first=True)

        self.ls_1 = (
            LayerScale(d_model, ls_init_value)
            if ls_init_value is not None
            else nn.Identity()
        )
        self.ls_2 = (
            LayerScale(d_model, ls_init_value)
            if ls_init_value is not None
            else nn.Identity()
        )

        self.ln_1 = norm_layer(d_model)
        self.ln_2 = norm_layer(d_model)

        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        mlp_width = int(d_model * mlp_ratio)
        self.mlp = nn.Sequential(
            OrderedDict(
                [
                    ("c_fc", nn.Linear(d_model, mlp_width)),
                    ("gelu", act_layer()),
                    ("c_proj", nn.Linear(mlp_width, d_model)),
                ]
            )
        )

    def _call_attn(
        self,
        q_x: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
    ):

        if attn_mask is not None:
            # Leave boolean masks as is
            if not attn_mask.dtype == torch.bool:
                attn_mask = attn_mask.to(q_x.dtype)

        if isinstance(self.attn, SelfAttention):
            return self.attn(q_x, attn_mask=attn_mask)
        else:
            return self.attn(q_x, q_x, q_x, attn_mask=attn_mask, need_weights=False)[0]

    def forward(
        self,
        x: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
    ):
        x = x + self.drop_path1(
            self.ls_1(self._call_attn(self.ln_1(x), attn_mask=attn_mask))
        )
        x = x + self.drop_path2(self.ls_2(self.mlp(self.ln_2(x))))
        return x

class Transformer(nn.Module):
    def __init__(
        self,
        width: int,
        layers: int,
        heads: int,
        mlp_ratio: float = 4.0,
        ls_init_value: float = None,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = nn.LayerNorm,
        drop_path: float = 0.0,
        rope: Optional[nn.Module] = None,
    ):
        super().__init__()
        self.width = width
        self.layers = layers
        self.grad_checkpointing = False

        self.resblocks = nn.ModuleList(
            [
                ResidualAttentionBlock(
                    width,
                    heads,
                    mlp_ratio,
                    ls_init_value=ls_init_value,
                    act_layer=act_layer,
                    norm_layer=norm_layer,
                    drop_path=drop_path,
                    rope=rope,
                )
                for _ in range(layers)
            ]
        )

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def truncate(self, layer_idx: int):
        """ Delete layers so the last layer is the given layer index. """
        self.layers = ((self.layers + layer_idx) % self.layers) + 1
        self.resblocks = nn.ModuleList(self.resblocks[: self.layers])

    def forward(
        self,
        x: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        layer_idx: int = -1,
    ):
        stop_idx = (self.layers + layer_idx) % self.layers

        for i, r in enumerate(self.resblocks):
            if self.grad_checkpointing and not torch.jit.is_scripting():
                # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
                x = checkpoint(r, x, attn_mask)
            else:
                x = r(x, attn_mask=attn_mask)

            if i == stop_idx:
                break

        return x

class VisionTransformer(nn.Module):
    def __init__(
        self,
        patch_size: int,
        width: int,
        layers: int,
        heads: int,
        mlp_ratio: float,
        act_layer: Callable = nn.GELU,
        norm_layer: Callable = partial(nn.LayerNorm, eps=1e-5),
        use_ln_pre: bool = True,
        use_ln_post: bool = True,
        ls_init_value: float = None,
        drop_path: float = 0.0,
        image_size: int = 448,  # Pretrain image size only; you can pass in any image size
        use_abs_posemb: bool = True,
        use_rope2d: bool = True,
        use_cls_token: bool = False,
        output_dim: Optional[int] = 1280,
        attn_pooler_heads: int = 8,
        pool_type: Literal["attn", "tok", "avg", "none"] = "attn",
    ):
        super().__init__()
        assert pool_type in ("attn", "tok", "avg", "none")
        self.pool_type = pool_type
        self.patch_size = patch_size

        self.output_dim = output_dim or width
        self.proj_dim = output_dim
        self.heads = heads
        self.width = width
        self.layers = layers

        self.use_abs_posemb = use_abs_posemb
        self.use_cls_token = use_cls_token
        self.use_rope2d = use_rope2d
        self.image_size = image_size

        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
        )
        self.rope = (
            Rope2D(
                dim=width // heads,
                use_cls_token=self.use_cls_token,
            )
            if self.use_rope2d
            else None
        )

        self.ln_pre = norm_layer(width) if use_ln_pre else nn.Identity()
        self.ln_post = norm_layer(self.width) if use_ln_post else nn.Identity()

        self.transformer = Transformer(
            width,
            layers,
            heads,
            mlp_ratio,
            ls_init_value=ls_init_value,
            act_layer=act_layer,
            norm_layer=norm_layer,
            drop_path=drop_path,
            rope=self.rope,
        )

        if pool_type == "attn":
            self.attn_pool = AttentionPooling(
                embed_dim=width,
                num_heads=attn_pooler_heads,
                act_layer=act_layer,
                norm_layer=norm_layer,
            )
        else:
            self.attn_pool = None

        self.init_tensors()

    def init_tensors(self):
        def init_submodule_tensors(module):
            for name, child in module.named_children():
                if hasattr(child, "init_tensors"):
                    logger.debug(f"Initializing tensors for submodule: {name}")
                    child.init_tensors()
                init_submodule_tensors(child)

        init_submodule_tensors(self)
        if self.rope is not None:
            self.rope.init_tensors()

        # class embeddings and positional embeddings
        init_scale = self.width**-0.5

        if self.use_cls_token:
            self.class_embedding = nn.Parameter(init_scale * torch.randn(self.width))

        if self.use_abs_posemb:
            self.posemb_grid_size = self.image_size // self.patch_size
            self.positional_embedding = nn.Parameter(
                init_scale
                * torch.randn(
                    int(self.use_cls_token) + self.posemb_grid_size**2, self.width
                )
            )

        if self.proj_dim is not None:
            self.proj = nn.Parameter(
                init_scale * torch.randn(self.width, self.proj_dim)
            )

    def load_ckpt(self, ckpt_path: str, verbose: bool = True):
        _sd = torch.load(ckpt_path, weights_only=True)
        if "state_dict" in _sd:
            _sd = _sd["state_dict"]
        elif "weights" in _sd:
            _sd = _sd["weights"]

        # for backwards compatibility
        _sd = {k.replace("module.", ""): v for k, v in _sd.items()}
        if any(k.startswith("visual.") for k in _sd):
            _sd = {k.replace("visual.", ""): v for k, v in _sd.items() if "visual" in k}

        m, u = self.load_state_dict(_sd, strict=False)

        if verbose or (m or u):
            logger.info(f"Missing keys for loading vision encoder: {m}")
            logger.info(f"Unexpected keys for loading vision encoder: {u}")
            print(f"Missing keys for loading vision encoder: {m}")
            print(f"Unexpected keys for loading vision encoder: {u}")

    def truncate(self, layer_idx: int):
        """ Delete layers so the last layer is the given layer index. """
        self.transformer.truncate(layer_idx)
        self.layers = self.transformer.layers

    @classmethod
    def from_config(
        cls,
        name: str,
        pretrained: bool = False,
        checkpoint_path: Optional[str] = None,
        **kwdargs,
    ):
        if name not in PE_VISION_CONFIG:
            raise RuntimeError(f"{name} not found in configs.")

        args = asdict(PE_VISION_CONFIG[name])
        args.update(kwdargs)

        model = cls(**args)
        if pretrained:
            model.load_ckpt(fetch_pe_checkpoint(name, checkpoint_path))
|
| 451 |
+
|
| 452 |
+
return model
|
| 453 |
+
|
| 454 |
+
@classmethod
|
| 455 |
+
def available_configs(cls):
|
| 456 |
+
return list(PE_VISION_CONFIG.keys())
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
@torch.jit.ignore
|
| 460 |
+
def set_grad_checkpointing(self, enable=True):
|
| 461 |
+
self.transformer.set_grad_checkpointing(enable=enable)
|
| 462 |
+
|
| 463 |
+
def _sample_abs_posemb(self, grid_h: int, grid_w: int):
|
| 464 |
+
"""Interpolates the absolute position embedding if necessary."""
|
| 465 |
+
if self.posemb_grid_size == grid_h and self.posemb_grid_size == grid_w:
|
| 466 |
+
return self.positional_embedding[None, ...]
|
| 467 |
+
|
| 468 |
+
pos_embed = self.positional_embedding
|
| 469 |
+
if self.use_cls_token:
|
| 470 |
+
cls_token_embed, pos_embed = pos_embed[:1], pos_embed[1:]
|
| 471 |
+
|
| 472 |
+
pos_embed = (
|
| 473 |
+
pos_embed.reshape(1, self.posemb_grid_size, self.posemb_grid_size, -1)
|
| 474 |
+
.permute(0, 3, 1, 2)
|
| 475 |
+
.contiguous()
|
| 476 |
+
)
|
| 477 |
+
pos_embed = F.interpolate(
|
| 478 |
+
pos_embed, size=(grid_h, grid_w), mode="bilinear", align_corners=False
|
| 479 |
+
)
|
| 480 |
+
pos_embed = pos_embed.permute(0, 2, 3, 1).reshape(-1, self.width).contiguous()
|
| 481 |
+
|
| 482 |
+
if self.use_cls_token:
|
| 483 |
+
pos_embed = torch.cat([cls_token_embed, pos_embed], dim=0)
|
| 484 |
+
|
| 485 |
+
return pos_embed[None, ...]
|
| 486 |
+
|
| 487 |
+
def _pool(self, x: torch.Tensor):
|
| 488 |
+
if self.pool_type == "tok":
|
| 489 |
+
return x[:, 0]
|
| 490 |
+
elif self.pool_type == "avg":
|
| 491 |
+
return x.mean(dim=1)
|
| 492 |
+
elif self.pool_type == "attn":
|
| 493 |
+
return self.attn_pool(x).squeeze(1)
|
| 494 |
+
elif self.pool_type == "none":
|
| 495 |
+
return x
|
| 496 |
+
else:
|
| 497 |
+
raise NotImplementedError
|
| 498 |
+
|
| 499 |
+
def forward_features(
|
| 500 |
+
self,
|
| 501 |
+
x: torch.Tensor,
|
| 502 |
+
norm: bool = False,
|
| 503 |
+
layer_idx: int = -1,
|
| 504 |
+
strip_cls_token: bool = False
|
| 505 |
+
):
|
| 506 |
+
batch, _, h, w = x.shape
|
| 507 |
+
grid_h, grid_w = h // self.patch_size, w // self.patch_size
|
| 508 |
+
|
| 509 |
+
x = self.conv1(x)
|
| 510 |
+
x = x.permute(0, 2, 3, 1).reshape(batch, -1, self.width)
|
| 511 |
+
|
| 512 |
+
if self.use_cls_token:
|
| 513 |
+
x = torch.cat(
|
| 514 |
+
[self.class_embedding.view(1, 1, -1).expand(batch, -1, -1), x],
|
| 515 |
+
dim=1,
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
if self.use_abs_posemb:
|
| 519 |
+
x = x + self._sample_abs_posemb(grid_h, grid_w)
|
| 520 |
+
|
| 521 |
+
if self.use_rope2d:
|
| 522 |
+
self.rope.update_grid(x.device, grid_h, grid_w)
|
| 523 |
+
|
| 524 |
+
x = self.ln_pre(x)
|
| 525 |
+
x = self.transformer(x, layer_idx=layer_idx)
|
| 526 |
+
|
| 527 |
+
if norm:
|
| 528 |
+
x = self.ln_post(x)
|
| 529 |
+
|
| 530 |
+
if strip_cls_token and self.use_cls_token:
|
| 531 |
+
x = x[:, 1:, :]
|
| 532 |
+
|
| 533 |
+
return x
|
| 534 |
+
|
| 535 |
+
def forward(self, x: torch.Tensor, **kwargs):
|
| 536 |
+
x = self.forward_features(x, norm=True, **kwargs)
|
| 537 |
+
x = self._pool(x)
|
| 538 |
+
|
| 539 |
+
if self.proj_dim is not None:
|
| 540 |
+
x = x @ self.proj
|
| 541 |
+
|
| 542 |
+
return x
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
class TextTransformer(nn.Module):
|
| 553 |
+
def __init__(
|
| 554 |
+
self,
|
| 555 |
+
context_length: int = 72,
|
| 556 |
+
vocab_size: int = 49408,
|
| 557 |
+
width: int = 512,
|
| 558 |
+
heads: int = 8,
|
| 559 |
+
layers: int = 12,
|
| 560 |
+
mlp_ratio: float = 4.0,
|
| 561 |
+
ls_init_value: float = None,
|
| 562 |
+
output_dim: int = 1280,
|
| 563 |
+
no_causal_mask: bool = False,
|
| 564 |
+
pad_id: int = 0,
|
| 565 |
+
pool_type: str = "argmax",
|
| 566 |
+
proj_bias: bool = False,
|
| 567 |
+
act_layer: Callable = nn.GELU,
|
| 568 |
+
norm_layer: Callable = partial(nn.LayerNorm, eps=1e-5),
|
| 569 |
+
output_tokens: bool = False,
|
| 570 |
+
use_ln_post: bool = True,
|
| 571 |
+
):
|
| 572 |
+
super().__init__()
|
| 573 |
+
assert pool_type in ("first", "last", "argmax", "none")
|
| 574 |
+
self.pool_type = pool_type
|
| 575 |
+
self.output_tokens = output_tokens
|
| 576 |
+
self.num_pos = self.context_length = context_length
|
| 577 |
+
self.vocab_size = vocab_size
|
| 578 |
+
self.width = width
|
| 579 |
+
self.output_dim = output_dim
|
| 580 |
+
self.heads = heads
|
| 581 |
+
self.pad_id = pad_id
|
| 582 |
+
self.layers = layers
|
| 583 |
+
|
| 584 |
+
self.token_embedding = nn.Embedding(vocab_size, width)
|
| 585 |
+
self.positional_embedding = nn.Parameter(torch.empty(self.num_pos, width))
|
| 586 |
+
|
| 587 |
+
self.transformer = Transformer(
|
| 588 |
+
width=width,
|
| 589 |
+
layers=layers,
|
| 590 |
+
heads=heads,
|
| 591 |
+
mlp_ratio=mlp_ratio,
|
| 592 |
+
ls_init_value=ls_init_value,
|
| 593 |
+
act_layer=act_layer,
|
| 594 |
+
norm_layer=norm_layer,
|
| 595 |
+
)
|
| 596 |
+
|
| 597 |
+
self.ln_final = norm_layer(width) if use_ln_post else nn.Identity()
|
| 598 |
+
|
| 599 |
+
if no_causal_mask:
|
| 600 |
+
self.attn_mask = None
|
| 601 |
+
else:
|
| 602 |
+
self.register_buffer(
|
| 603 |
+
"attn_mask", self.build_causal_mask(), persistent=False
|
| 604 |
+
)
|
| 605 |
+
|
| 606 |
+
if pool_type == "attn" or pool_type == "attn_eos":
|
| 607 |
+
self.attn_pool = AttentionPooling(
|
| 608 |
+
embed_dim=width,
|
| 609 |
+
num_heads=heads,
|
| 610 |
+
act_layer=act_layer,
|
| 611 |
+
norm_layer=norm_layer,
|
| 612 |
+
)
|
| 613 |
+
else: # argmax
|
| 614 |
+
self.attn_pool = None
|
| 615 |
+
|
| 616 |
+
if proj_bias:
|
| 617 |
+
self.text_projection = nn.Linear(width, output_dim)
|
| 618 |
+
else:
|
| 619 |
+
self.text_projection = nn.Parameter(torch.empty(width, output_dim))
|
| 620 |
+
|
| 621 |
+
def build_causal_mask(self):
|
| 622 |
+
# lazily create causal attention mask, with full attention between the tokens
|
| 623 |
+
# pytorch uses additive attention mask; fill with -inf
|
| 624 |
+
mask = torch.empty(self.num_pos, self.num_pos)
|
| 625 |
+
mask.fill_(float("-inf"))
|
| 626 |
+
mask.triu_(1) # zero out the lower diagonal
|
| 627 |
+
return mask
|
| 628 |
+
|
| 629 |
+
def load_ckpt(self, ckpt_path: str, verbose: bool = True):
|
| 630 |
+
_sd = torch.load(ckpt_path, weights_only=True)
|
| 631 |
+
if "state_dict" in _sd:
|
| 632 |
+
_sd = _sd["state_dict"]
|
| 633 |
+
elif "weights" in _sd:
|
| 634 |
+
_sd = _sd["weights"]
|
| 635 |
+
|
| 636 |
+
_sd = {k.replace("module.", ""): v for k, v in _sd.items()}
|
| 637 |
+
|
| 638 |
+
m, u = self.load_state_dict(_sd, strict=False)
|
| 639 |
+
|
| 640 |
+
if verbose or (m or u):
|
| 641 |
+
logger.info(f"Missing keys for loading model: {m}")
|
| 642 |
+
logger.info(f"Unexpected keys for loading model: {u}")
|
| 643 |
+
print(f"Missing keys for loading model: {m}")
|
| 644 |
+
print(f"Unexpected keys for loading model: {u}")
|
| 645 |
+
|
| 646 |
+
def build_cls_mask(self, text):
|
| 647 |
+
cls_mask = (text != self.pad_id).unsqueeze(1)
|
| 648 |
+
cls_mask = F.pad(cls_mask, (1, 0, cls_mask.shape[2], 0), value=True)
|
| 649 |
+
additive_mask = torch.empty(cls_mask.shape, device=cls_mask.device)
|
| 650 |
+
additive_mask.fill_(0)
|
| 651 |
+
additive_mask.masked_fill_(~cls_mask, float("-inf"))
|
| 652 |
+
additive_mask = torch.repeat_interleave(additive_mask, self.heads, 0)
|
| 653 |
+
return additive_mask
|
| 654 |
+
|
| 655 |
+
def text_global_pool(
|
| 656 |
+
self, x, text: Optional[torch.Tensor] = None, pool_type: str = "argmax"
|
| 657 |
+
):
|
| 658 |
+
if pool_type == "first":
|
| 659 |
+
pooled, tokens = x[:, 0], x[:, 1:]
|
| 660 |
+
elif pool_type == "last":
|
| 661 |
+
pooled, tokens = x[:, -1], x[:, :-1]
|
| 662 |
+
elif pool_type == "argmax":
|
| 663 |
+
# take features from the eot embedding (eot_token is the highest number in each sequence)
|
| 664 |
+
assert text is not None
|
| 665 |
+
pooled, tokens = x[torch.arange(x.shape[0]), text.argmax(dim=-1)], x
|
| 666 |
+
else:
|
| 667 |
+
pooled = tokens = x
|
| 668 |
+
|
| 669 |
+
return pooled, tokens
|
| 670 |
+
|
| 671 |
+
def forward(self, text):
|
| 672 |
+
seq_len = text.shape[1]
|
| 673 |
+
x = self.token_embedding(
|
| 674 |
+
text
|
| 675 |
+
)
|
| 676 |
+
attn_mask = self.attn_mask
|
| 677 |
+
if attn_mask is not None:
|
| 678 |
+
attn_mask = attn_mask[:seq_len, :seq_len]
|
| 679 |
+
|
| 680 |
+
x = x + self.positional_embedding[:seq_len]
|
| 681 |
+
x = self.transformer(x, attn_mask=attn_mask)
|
| 682 |
+
|
| 683 |
+
x = self.ln_final(x)
|
| 684 |
+
pooled, tokens = self.text_global_pool(x, text, pool_type=self.pool_type)
|
| 685 |
+
|
| 686 |
+
if self.text_projection is not None:
|
| 687 |
+
if isinstance(self.text_projection, nn.Linear):
|
| 688 |
+
pooled = self.text_projection(pooled)
|
| 689 |
+
else:
|
| 690 |
+
pooled = pooled @ self.text_projection
|
| 691 |
+
|
| 692 |
+
if self.output_tokens:
|
| 693 |
+
return pooled, tokens
|
| 694 |
+
|
| 695 |
+
return pooled
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
|
| 700 |
+
class CLIP(TextTransformer):
|
| 701 |
+
def __init__(
|
| 702 |
+
self,
|
| 703 |
+
vision_cfg: PEConfig,
|
| 704 |
+
text_cfg: PETextConfig,
|
| 705 |
+
init_logit_scale: float = np.log(1 / 0.07)
|
| 706 |
+
):
|
| 707 |
+
super(CLIP, self).__init__(**asdict(text_cfg))
|
| 708 |
+
self.visual = VisionTransformer(**asdict(vision_cfg))
|
| 709 |
+
self.image_size = self.visual.image_size # For ease of use
|
| 710 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * init_logit_scale)
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
def encode_image(self, image, normalize: bool = False):
|
| 714 |
+
x = self.visual(image)
|
| 715 |
+
return F.normalize(x, dim=-1) if normalize else x
|
| 716 |
+
|
| 717 |
+
def encode_video(self, video, normalize: bool = False): # b n c h w
|
| 718 |
+
b, n, c, h, w = video.shape
|
| 719 |
+
frms = video.reshape(b * n, c, h, w)
|
| 720 |
+
frm_feats = self.encode_image(frms, normalize=normalize)
|
| 721 |
+
video_feats = frm_feats.reshape(b, n, -1)
|
| 722 |
+
video_feats = video_feats.mean(dim=1)
|
| 723 |
+
return video_feats
|
| 724 |
+
|
| 725 |
+
def encode_text(self, text, normalize: bool = False):
|
| 726 |
+
x = super().forward(text)
|
| 727 |
+
return F.normalize(x, dim=-1) if normalize else x
|
| 728 |
+
|
| 729 |
+
def forward(
|
| 730 |
+
self,
|
| 731 |
+
image: Optional[torch.Tensor] = None,
|
| 732 |
+
text: Optional[torch.Tensor] = None,
|
| 733 |
+
):
|
| 734 |
+
image_features = (
|
| 735 |
+
self.encode_image(image, normalize=True) if image is not None else None
|
| 736 |
+
)
|
| 737 |
+
text_features = (
|
| 738 |
+
self.encode_text(text, normalize=True) if text is not None else None
|
| 739 |
+
)
|
| 740 |
+
return image_features, text_features, self.logit_scale.exp()
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
@classmethod
|
| 744 |
+
def from_config(
|
| 745 |
+
cls,
|
| 746 |
+
name: str,
|
| 747 |
+
pretrained: bool = False,
|
| 748 |
+
checkpoint_path: Optional[str] = None # To load your own
|
| 749 |
+
):
|
| 750 |
+
if name not in PE_VISION_CONFIG or name not in PE_TEXT_CONFIG:
|
| 751 |
+
raise RuntimeError(f"{name} not found in configs.")
|
| 752 |
+
|
| 753 |
+
model = cls(PE_VISION_CONFIG[name], PE_TEXT_CONFIG[name])
|
| 754 |
+
if pretrained:
|
| 755 |
+
model.load_ckpt(fetch_pe_checkpoint(name, checkpoint_path))
|
| 756 |
+
|
| 757 |
+
return model
|
| 758 |
+
|
| 759 |
+
@classmethod
|
| 760 |
+
def available_configs(cls):
|
| 761 |
+
return [k for k in PE_VISION_CONFIG if k in PE_TEXT_CONFIG]
|
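`build_causal_mask` in the text encoder fills an additive attention mask with `-inf` strictly above the diagonal, so each position can attend only to itself and earlier tokens. A minimal pure-Python sketch of the same pattern (no torch; the size 4 is illustrative):

```python
def build_causal_mask(n):
    # additive attention mask: 0.0 on and below the diagonal,
    # -inf strictly above it, so position i attends only to j <= i
    neg_inf = float("-inf")
    return [[0.0 if j <= i else neg_inf for j in range(n)] for i in range(n)]

mask = build_causal_mask(4)
# row 0 can only see position 0; the last row sees every position
print(mask[0])  # [0.0, -inf, -inf, -inf]
print(mask[3])  # [0.0, 0.0, 0.0, 0.0]
```

Adding this mask to the attention logits before the softmax zeroes the probability of attending to future tokens, which is the same effect `triu_(1)` over a `-inf`-filled tensor achieves above.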
core/vision_encoder/rope.py (ADDED, +347 lines)
```python
from math import log, pi
from typing import Literal, Optional, Union

import torch
from einops import rearrange, repeat
from torch import Tensor, broadcast_tensors, einsum, nn
from torch.amp import autocast
from torch.nn import Module, ModuleList

# helper functions


def exists(val):
    return val is not None


def default(val, d):
    return val if exists(val) else d


# broadcat, as tortoise-tts was using it


def broadcat(tensors, dim=-1):
    broadcasted_tensors = broadcast_tensors(*tensors)
    return torch.cat(broadcasted_tensors, dim=dim)


# rotary embedding helper functions


def rotate_half(x):
    x = rearrange(x, "... (d r) -> ... d r", r=2)
    x1, x2 = x.unbind(dim=-1)
    x = torch.stack((-x2, x1), dim=-1)
    return rearrange(x, "... d r -> ... (d r)")


@autocast("cuda", enabled=False)
def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2):
    dtype = t.dtype

    if t.ndim == 3:
        seq_len = t.shape[seq_dim]
        freqs = freqs[-seq_len:]

    rot_dim = freqs.shape[-1]
    end_index = start_index + rot_dim

    assert (
        rot_dim <= t.shape[-1]
    ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"

    t_left, t, t_right = (
        t[..., :start_index],
        t[..., start_index:end_index],
        t[..., end_index:],
    )
    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
    out = torch.cat((t_left, t, t_right), dim=-1)

    return out.type(dtype)


# learned rotation helpers


def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
    if exists(freq_ranges):
        rotations = einsum("..., f -> ... f", rotations, freq_ranges)
        rotations = rearrange(rotations, "... r f -> ... (r f)")

    rotations = repeat(rotations, "... n -> ... (n r)", r=2)
    return apply_rotary_emb(rotations, t, start_index=start_index)


# classes


class RotaryEmbedding(Module):
    def __init__(
        self,
        dim,
        custom_freqs: Optional[Tensor] = None,
        freqs_for: Union[
            Literal["lang"], Literal["pixel"], Literal["constant"]
        ] = "lang",
        theta=10000,
        max_freq=10,
        num_freqs=1,
        learned_freq=False,
        use_xpos=False,
        xpos_scale_base=512,
        interpolate_factor=1.0,
        theta_rescale_factor=1.0,
        seq_before_head_dim=False,
        cache_if_possible=True,
    ):
        super().__init__()
        # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
        # has some connection to NTK literature
        # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/

        theta *= theta_rescale_factor ** (dim / (dim - 2))

        self.freqs_for = freqs_for

        if exists(custom_freqs):
            freqs = custom_freqs
        elif freqs_for == "lang":
            freqs = 1.0 / (
                theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
            )
        elif freqs_for == "pixel":
            freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
        elif freqs_for == "constant":
            freqs = torch.ones(num_freqs).float()

        self.cache_if_possible = cache_if_possible

        self.tmp_store("cached_freqs", None)
        self.tmp_store("cached_scales", None)

        self.freqs = nn.Parameter(freqs, requires_grad=learned_freq)

        self.learned_freq = learned_freq

        # dummy for device

        self.tmp_store("dummy", torch.tensor(0))

        # default sequence dimension

        self.seq_before_head_dim = seq_before_head_dim
        self.default_seq_dim = -3 if seq_before_head_dim else -2

        # interpolation factors

        assert interpolate_factor >= 1.0
        self.interpolate_factor = interpolate_factor

        # xpos

        self.use_xpos = use_xpos
        if not use_xpos:
            self.tmp_store("scale", None)
            return

        scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)

        self.scale_base = xpos_scale_base
        self.tmp_store("scale", scale)

        # add apply_rotary_emb as static method

        self.apply_rotary_emb = staticmethod(apply_rotary_emb)

    @property
    def device(self):
        return self.dummy.device

    def tmp_store(self, key, value):
        self.register_buffer(key, value, persistent=False)

    def get_seq_pos(self, seq_len, device, dtype, offset=0):
        return (
            torch.arange(seq_len, device=device, dtype=dtype) + offset
        ) / self.interpolate_factor

    def rotate_queries_or_keys(self, t, seq_dim=None, offset=0):
        seq_dim = default(seq_dim, self.default_seq_dim)

        assert (
            not self.use_xpos
        ), "you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings"

        device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]

        freqs = self.forward(
            self.get_seq_pos(seq_len, device=device, dtype=dtype, offset=offset),
            seq_len=seq_len,
            offset=offset,
        )

        if seq_dim == -3:
            freqs = rearrange(freqs, "n d -> n 1 d")

        return apply_rotary_emb(freqs, t, seq_dim=seq_dim)

    def rotate_queries_with_cached_keys(self, q, k, seq_dim=None, offset=0):
        seq_dim = default(seq_dim, self.default_seq_dim)

        q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
        assert q_len <= k_len

        rotated_q = self.rotate_queries_or_keys(
            q, seq_dim=seq_dim, offset=k_len - q_len + offset
        )
        rotated_k = self.rotate_queries_or_keys(k, seq_dim=seq_dim, offset=offset)

        rotated_q = rotated_q.type(q.dtype)
        rotated_k = rotated_k.type(k.dtype)

        return rotated_q, rotated_k

    def rotate_queries_and_keys(self, q, k, seq_dim=None):
        seq_dim = default(seq_dim, self.default_seq_dim)

        assert self.use_xpos
        device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]

        seq = self.get_seq_pos(seq_len, dtype=dtype, device=device)

        freqs = self.forward(seq, seq_len=seq_len)
        scale = self.get_scale(seq, seq_len=seq_len).to(dtype)

        if seq_dim == -3:
            freqs = rearrange(freqs, "n d -> n 1 d")
            scale = rearrange(scale, "n d -> n 1 d")

        rotated_q = apply_rotary_emb(freqs, q, scale=scale, seq_dim=seq_dim)
        rotated_k = apply_rotary_emb(freqs, k, scale=scale**-1, seq_dim=seq_dim)

        rotated_q = rotated_q.type(q.dtype)
        rotated_k = rotated_k.type(k.dtype)

        return rotated_q, rotated_k

    def get_scale(self, t: Tensor, seq_len: Optional[int] = None, offset=0):
        assert self.use_xpos

        should_cache = self.cache_if_possible and exists(seq_len)

        if (
            should_cache
            and exists(self.cached_scales)
            and (seq_len + offset) <= self.cached_scales.shape[0]
        ):
            return self.cached_scales[offset : (offset + seq_len)]

        scale = 1.0
        if self.use_xpos:
            power = (t - len(t) // 2) / self.scale_base
            scale = self.scale ** rearrange(power, "n -> n 1")
            scale = torch.cat((scale, scale), dim=-1)

        if should_cache:
            self.tmp_store("cached_scales", scale)

        return scale

    def get_axial_freqs(self, *dims):
        Colon = slice(None)
        all_freqs = []

        for ind, dim in enumerate(dims):
            if self.freqs_for == "pixel":
                pos = torch.linspace(-1, 1, steps=dim, device=self.device)
            else:
                pos = torch.arange(dim, device=self.device)

            freqs = self.forward(pos, seq_len=dim)

            all_axis = [None] * len(dims)
            all_axis[ind] = Colon

            new_axis_slice = (Ellipsis, *all_axis, Colon)
            all_freqs.append(freqs[new_axis_slice])

        all_freqs = broadcast_tensors(*all_freqs)
        return torch.cat(all_freqs, dim=-1)

    @autocast("cuda", enabled=False)
    def forward(self, t: Tensor, seq_len=None, offset=0):
        should_cache = (
            self.cache_if_possible
            and not self.learned_freq
            and exists(seq_len)
            and self.freqs_for != "pixel"
        )

        if (
            should_cache
            and exists(self.cached_freqs)
            and (offset + seq_len) <= self.cached_freqs.shape[0]
        ):
            return self.cached_freqs[offset : (offset + seq_len)].detach()

        freqs = self.freqs

        freqs = einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
        freqs = repeat(freqs, "... n -> ... (n r)", r=2)

        if should_cache:
            self.tmp_store("cached_freqs", freqs.detach())

        return freqs


class Rope2D:
    """Helper class to apply RoPE2D as well as interpolate on the fly."""

    def __init__(self, dim, use_cls_token=False):
        self.dim = dim
        self.use_cls_token = use_cls_token
        self.grid_size = None
        self.freq = None

    def init_tensors(self):
        self.rope = RotaryEmbedding(self.dim // 2)

    def update_grid(self, device, grid_h, grid_w):
        if self.grid_size != (grid_h, grid_w):
            self.grid_size = (grid_h, grid_w)

            self.rope = self.rope.to(device)

            if self.use_cls_token:
                # +1 to leave space for the cls token to be (0, 0)
                grid_y_range = torch.arange(grid_h, device=device) + 1
                grid_x_range = torch.arange(grid_w, device=device) + 1
            else:
                grid_y_range = torch.arange(grid_h, device=device)
                grid_x_range = torch.arange(grid_w, device=device)

            freqs_y = self.rope(grid_y_range)[:, None].expand(grid_h, grid_w, -1)
            freqs_x = self.rope(grid_x_range)[None, :].expand(grid_h, grid_w, -1)
            freq = torch.cat([freqs_x, freqs_y], dim=-1).reshape(grid_h * grid_w, -1)

            if self.use_cls_token:
                freq = torch.cat(
                    [torch.zeros(1, freq.shape[-1], device=device), freq], dim=0
                )

            self.freq = freq[None, ...]

        self.freq = self.freq.to(device)

    def __call__(self, q, k):
        # batch, heads, seq, dim = q.shape
        q = apply_rotary_emb(self.freq[:, None, :, :], q)
        k = apply_rotary_emb(self.freq[:, None, :, :], k)

        return q, k
```
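`apply_rotary_emb` rotates each (even, odd) feature pair by a position-dependent angle, with per-pair frequencies `1 / theta^(2i/dim)` in the `freqs_for="lang"` case. A self-contained sketch of that rotation on plain floats (the vector, `pos`, and `theta` are illustrative); since it is a pure rotation, the vector's norm is preserved:

```python
import math

def rope_freqs(dim, theta=10000.0):
    # one frequency per feature pair: 1 / theta^(2i/dim)
    return [1.0 / theta ** (2 * i / dim) for i in range(dim // 2)]

def rotate(vec, pos, freqs):
    # rotate each (x1, x2) pair by angle pos * freq, matching
    # t * cos + rotate_half(t) * sin on interleaved pairs
    out = []
    for i, f in enumerate(freqs):
        x1, x2 = vec[2 * i], vec[2 * i + 1]
        c, s = math.cos(pos * f), math.sin(pos * f)
        out += [x1 * c - x2 * s, x1 * s + x2 * c]
    return out

v = [1.0, 0.0, 0.5, -0.5]
freqs = rope_freqs(len(v))
r = rotate(v, pos=3, freqs=freqs)

norm = lambda u: math.sqrt(sum(x * x for x in u))
assert abs(norm(v) - norm(r)) < 1e-9  # rotation never changes the norm
```

`Rope2D` above applies exactly this idea along both grid axes, concatenating x- and y-axis frequencies so each patch's rotation encodes its 2D position.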
core/vision_encoder/tokenizer.py (ADDED, +342 lines)
```python
""" CLIP tokenizer

Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
"""

import gzip
import html
import os
import random
import string
from functools import lru_cache, partial
from typing import Callable, List, Optional, Union

import ftfy
import regex as re
import torch

# https://stackoverflow.com/q/62691279
os.environ["TOKENIZERS_PARALLELISM"] = "false"

DEFAULT_CONTEXT_LENGTH = 77  # default context length for OpenAI CLIP


@lru_cache()
def default_bpe():
    return os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz"
    )


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text


def _clean_canonicalize(x):
    # basic, remove whitespace, remove punctuation, lower case
    return canonicalize_text(basic_clean(x))


def _clean_lower(x):
    # basic, remove whitespace, lower case
    return whitespace_clean(basic_clean(x)).lower()


def _clean_whitespace(x):
    # basic, remove whitespace
    return whitespace_clean(basic_clean(x))


def get_clean_fn(type: str):
    if type == "canonicalize":
        return _clean_canonicalize
    elif type == "lower":
        return _clean_lower
    elif type == "whitespace":
        return _clean_whitespace
    else:
        assert False, f"Invalid clean function ({type})."


def canonicalize_text(text, *, keep_punctuation_exact_string=None):
    """Returns canonicalized `text` (lowercase and punctuation removed).

    From: https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94

    Args:
      text: string to be canonicalized.
      keep_punctuation_exact_string: If provided, then this exact string is kept.
        For example providing '{}' will keep any occurrences of '{}' (but will
        still remove '{' and '}' that appear separately).
    """
    text = text.replace("_", " ")
    if keep_punctuation_exact_string:
        text = keep_punctuation_exact_string.join(
            part.translate(str.maketrans("", "", string.punctuation))
            for part in text.split(keep_punctuation_exact_string)
        )
    else:
        text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    return text.strip()


class SimpleTokenizer(object):
    def __init__(
        self,
        bpe_path: str = default_bpe(),
        additional_special_tokens: Optional[List[str]] = None,
        context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH,
        clean: str = "lower",
        reduction_mask: str = "",
    ):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
        merges = merges[1 : 49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + "</w>" for v in vocab]
        for merge in merges:
            vocab.append("".join(merge))
        special_tokens = ["<start_of_text>", "<end_of_text>"]
        if additional_special_tokens:
            special_tokens += additional_special_tokens
        vocab.extend(special_tokens)
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {t: t for t in special_tokens}
        special = "|".join(special_tokens)
        self.pat = re.compile(
            special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )
        self.vocab_size = len(self.encoder)
        self.all_special_ids = [self.encoder[t] for t in special_tokens]
        self.sot_token_id = self.all_special_ids[0]
        self.eot_token_id = self.all_special_ids[1]
        self.context_length = context_length
        self.clean_fn = get_clean_fn(clean)
        self.reduction_fn = (
            get_reduction_mask_fn(reduction_mask) if reduction_mask else None
        )

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:  # `first` no longer occurs; keep the tail as-is
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = self.clean_fn(text)
        for token in re.findall(self.pat, text):
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(
                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
            )
        return bpe_tokens

    def decode(self, tokens):
        text = "".join([self.decoder[token] for token in tokens])
        text = (
            bytearray([self.byte_decoder[c] for c in text])
            .decode("utf-8", errors="replace")
            .replace("</w>", " ")
        )
        return text

    def __call__(
        self, texts: Union[str, List[str]], context_length: Optional[int] = None
    ) -> torch.LongTensor:
        """Returns the tokenized representation of given input string(s)

        Parameters
        ----------
        texts : Union[str, List[str]]
            An input string or a list of input strings to tokenize
        context_length : int
            The context length to use; all CLIP models use 77 as the context length

        Returns
        -------
        A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
        """
        if isinstance(texts, str):
            texts = [texts]

        context_length = context_length or self.context_length
        assert context_length, "Please set a valid context length"

        if self.reduction_fn is not None:
            # use reduction strategy for tokenize if set, otherwise default to truncation below
            return self.reduction_fn(
                texts,
                context_length=context_length,
                sot_token_id=self.sot_token_id,
                eot_token_id=self.eot_token_id,
                encode_fn=self.encode,
            )

        all_tokens = [
            [self.sot_token_id] + self.encode(text) + [self.eot_token_id]
            for text in texts
        ]
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

        for i, tokens in enumerate(all_tokens):
            if len(tokens) > context_length:
                tokens = tokens[:context_length]  # Truncate
                tokens[-1] = self.eot_token_id
            result[i, : len(tokens)] = torch.tensor(tokens)

        return result


def random_mask_tokenize(
    texts: Union[str, List[str]],
    context_length: int,
    sot_token_id: int,
    eot_token_id: int,
    encode_fn: Callable,
    shuffle: bool = False,
):
    all_tokens = [encode_fn(text) for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        tokens = torch.tensor(tokens)
        num_tokens = len(tokens)
        if num_tokens > context_length - 2:  # 2 for sot and eot token
            num_keep = context_length - 2
            indices = torch.randperm(len(tokens))
            indices = indices[:num_keep]
            if not shuffle:
                indices = indices.msort()
            tokens = tokens[indices]
            num_tokens = num_keep
        result[i, 0] = sot_token_id
        result[i, 1 : num_tokens + 1] = tokens
        result[i, num_tokens + 1] = eot_token_id

    return result


def simple_mask_tokenize(
    texts: Union[str, List[str]],
    context_length: int,
    sot_token_id: int,
    eot_token_id: int,
    encode_fn: Callable,
):
    all_tokens = [encode_fn(text) for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        num_tokens = len(tokens)
        if num_tokens > context_length - 2:  # 2 for sot and eot token
            num_keep = context_length - 2
            start_index = random.randint(0, num_tokens - num_keep)  # high is incl
            tokens = tokens[start_index : start_index + num_keep]
        tokens = [sot_token_id] + tokens + [eot_token_id]
        result[i, : len(tokens)] = torch.tensor(tokens)

    return result


def get_reduction_mask_fn(type: str):
    """Choose strategy for dropping (masking) tokens to achieve target context length"""
    assert type in ("simple", "random", "shuffle")
    if type == "simple":
        return simple_mask_tokenize  # randomly select block [start:end]
    elif type == "random":
        return random_mask_tokenize  # randomly drop tokens (keep order)
    elif type == "shuffle":
        return partial(
            random_mask_tokenize, shuffle=True
        )  # randomly drop tokens (shuffle order)
```
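The `bytes_to_unicode` table above is what makes the BPE fully reversible: every one of the 256 byte values is mapped to a distinct printable unicode character, so no input text can ever produce an unknown token. As an illustration (the function is duplicated here so the sketch is self-contained, independent of the module above), the construction yields a bijection over all bytes:

```python
def bytes_to_unicode():
    # Printable byte values keep their own character; the remaining bytes
    # are shifted into the 256+ range so every byte maps to a visible,
    # distinct character instead of whitespace/control characters.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, [chr(n) for n in cs]))


table = bytes_to_unicode()
# Covers all 256 bytes, with no two bytes sharing a character,
# so decode() can invert the mapping exactly.
assert len(table) == 256
assert len(set(table.values())) == 256
```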
core/vision_encoder/transforms.py (added, 31 lines)
```python
import torchvision.transforms as T

from core.vision_encoder.tokenizer import SimpleTokenizer


def get_image_transform(
    image_size: int,
    center_crop: bool = False,
    interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR,  # We used bilinear during training
):
    if center_crop:
        crop = [
            T.Resize(image_size, interpolation=interpolation),
            T.CenterCrop(image_size),
        ]
    else:
        # "Squash": most versatile
        crop = [T.Resize((image_size, image_size), interpolation=interpolation)]

    return T.Compose(crop + [
        T.Lambda(lambda x: x.convert("RGB")),
        T.ToTensor(),
        T.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5], inplace=True),
    ])


def get_text_tokenizer(context_length: int):
    return SimpleTokenizer(context_length=context_length)
```
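The mean/std of 0.5 used by the PE transform maps `ToTensor`'s `[0, 1]` pixel range onto `[-1, 1]`, rather than the ImageNet statistics the other baselines use. A one-line sketch of the per-channel arithmetic `T.Normalize` applies:

```python
def normalize(pixel: float, mean: float = 0.5, std: float = 0.5) -> float:
    # Same per-channel arithmetic T.Normalize applies after ToTensor
    # has scaled pixel values into [0, 1].
    return (pixel - mean) / std


assert normalize(0.0) == -1.0  # black maps to -1
assert normalize(1.0) == 1.0   # white maps to +1
assert normalize(0.5) == 0.0   # mid-gray maps to 0
```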
models.py (added, 331 lines)
```python
"""Minimal model-loading code for the 7 VFM baselines in the paper."""

from __future__ import annotations

import sys
from pathlib import Path
from typing import Callable

import torch
import torch.nn as nn
from torchvision import transforms
from transformers import AutoConfig, AutoImageProcessor, AutoModel

ROOT = Path(__file__).resolve().parent
WEIGHTS_DIR = ROOT / "weights"

MODEL_SPECS = {
    "metacliplin": {
        "paper_name": "MetaCLIP-Linear",
        "checkpoint": "metacliplin0.pth",
        "hf_model": "facebook/metaclip-h14-fullcc2.5b",
        "feature_dim": 1280,
        "image_size": 224,
        "pooler_output": True,
    },
    "metaclip2lin": {
        "paper_name": "MetaCLIP2-Linear",
        "checkpoint": "metaclip2lin0.pth",
        "hf_model": "facebook/metaclip-2-worldwide-giant",
        "feature_dim": 1280,
        "image_size": 224,
        "pooler_output": True,
    },
    "sigliplin": {
        "paper_name": "SigLIP-Linear",
        "checkpoint": "sigliplin0.pth",
        "hf_model": "google/siglip-large-patch16-384",
        "feature_dim": 1024,
        "image_size": 384,
        "pooler_output": True,
    },
    "siglip2lin": {
        "paper_name": "SigLIP2-Linear",
        "checkpoint": "siglip2lin0.pth",
        "hf_model": "google/siglip2-giant-opt-patch16-384",
        "feature_dim": 1536,
        "image_size": 384,
        "pooler_output": True,
    },
    "pelin": {
        "paper_name": "PE-CLIP-Linear",
        "checkpoint": "pelin0.pth",
        "feature_dim": 1024,
        "image_size": 336,
        "pooler_output": False,
    },
    "dinov2lin": {
        "paper_name": "DINOv2-Linear",
        "checkpoint": "dinov2lin0.pth",
        "feature_dim": 1024,
        "pooler_output": False,
    },
    "dinov3lin": {
        "paper_name": "DINOv3-Linear",
        "checkpoint": "dinov3lin0.pth",
        "hf_model": "facebook/dinov3-vit7b16-pretrain-lvd1689m",
        "feature_dim": 4096,
        "pooler_output": False,
    },
}

ALIASES = {
    "MetaCLIP-Linear": "metacliplin",
    "MetaCLIP2-Linear": "metaclip2lin",
    "SigLIP-Linear": "sigliplin",
    "SigLIP2-Linear": "siglip2lin",
    "PE-CLIP-Linear": "pelin",
    "DINOv2-Linear": "dinov2lin",
    "DINOv3-Linear": "dinov3lin",
}


def canonical_model_name(name: str) -> str:
    if name in MODEL_SPECS:
        return name
    if name in ALIASES:
        return ALIASES[name]
    raise KeyError(f"Unknown model: {name}")


def default_checkpoint_path(model_name: str) -> Path:
    model_name = canonical_model_name(model_name)
    return WEIGHTS_DIR / MODEL_SPECS[model_name]["checkpoint"]


def _resolve_device(device: str | torch.device | None = None) -> torch.device:
    if device is None:
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return torch.device(device)


def _load_checkpoint(checkpoint_path: str | Path) -> dict:
    checkpoint = torch.load(str(checkpoint_path), map_location="cpu", weights_only=False)
    if isinstance(checkpoint, dict):
        for key in ("state_dict", "model", "model_state_dict"):
            if key in checkpoint and isinstance(checkpoint[key], dict):
                checkpoint = checkpoint[key]
                break

    normalized = {}
    for key, value in checkpoint.items():
        normalized[key[7:] if key.startswith("module.") else key] = value
    return normalized


def _infer_feature_dim(state_dict: dict, default_dim: int) -> int:
    head_weight = state_dict.get("head.weight")
    if isinstance(head_weight, torch.Tensor) and head_weight.ndim == 2:
        return int(head_weight.shape[1])
    return default_dim


def _load_image_processor(model_name: str):
    try:
        return AutoImageProcessor.from_pretrained(model_name, local_files_only=True)
    except Exception:
        try:
            return AutoImageProcessor.from_pretrained(model_name)
        except Exception:
            return None


def _load_backbone(model_name: str):
    try:
        return AutoModel.from_pretrained(model_name, local_files_only=True)
    except Exception:
        config = AutoConfig.from_pretrained(model_name)
        return AutoModel.from_config(config)


class _PoolerLinearModel(nn.Module):
    def __init__(self, backbone: nn.Module, feature_dim: int):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(feature_dim, 2)

    def forward(self, x):
        with torch.no_grad():
            outputs = self.backbone(x)
            features = outputs.pooler_output.float()
        return self.head(features)


class _ClsTokenLinearModel(nn.Module):
    def __init__(self, backbone: nn.Module, feature_dim: int):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(feature_dim, 2)

    def forward(self, x):
        with torch.no_grad():
            outputs = self.backbone(x)
            features = outputs.last_hidden_state[:, 0].float()
        return self.head(features)


class _PELinearModel(nn.Module):
    def __init__(self, backbone: nn.Module, feature_dim: int):
        super().__init__()
        self.backbone = backbone
        self.head = nn.Linear(feature_dim, 2)

    def forward(self, x):
        with torch.no_grad():
            features = self.backbone(x)
            if isinstance(features, torch.Tensor):
                features = features.float()
        return self.head(features)


def _finalize_model(model: nn.Module, state_dict: dict, device=None) -> nn.Module:
    model.load_state_dict(state_dict, strict=False)
    model.to(_resolve_device(device))
    model.eval()
    return model


def _build_clip_transform(image_size: int, image_processor=None):
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    if image_processor is not None:
        mean = getattr(image_processor, "image_mean", mean)
        std = getattr(image_processor, "image_std", std)
    return transforms.Compose(
        [
            transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std),
        ]
    )


def _build_dino_transform():
    return transforms.Compose(
        [
            transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )


def load_metacliplin(checkpoint_path: str | Path | None = None, device=None):
    spec = MODEL_SPECS["metacliplin"]
    checkpoint_path = checkpoint_path or default_checkpoint_path("metacliplin")
    state_dict = _load_checkpoint(checkpoint_path)
    feature_dim = _infer_feature_dim(state_dict, spec["feature_dim"])
    image_processor = _load_image_processor(spec["hf_model"])
    backbone = _load_backbone(spec["hf_model"])
    model = _PoolerLinearModel(backbone.vision_model, feature_dim)
    model = _finalize_model(model, state_dict, device=device)
    return model, _build_clip_transform(spec["image_size"], image_processor)


def load_metaclip2lin(checkpoint_path: str | Path | None = None, device=None):
    spec = MODEL_SPECS["metaclip2lin"]
    checkpoint_path = checkpoint_path or default_checkpoint_path("metaclip2lin")
    state_dict = _load_checkpoint(checkpoint_path)
    feature_dim = _infer_feature_dim(state_dict, spec["feature_dim"])
    image_processor = _load_image_processor(spec["hf_model"])
    backbone = _load_backbone(spec["hf_model"])
    model = _PoolerLinearModel(backbone.vision_model, feature_dim)
    model = _finalize_model(model, state_dict, device=device)
    return model, _build_clip_transform(spec["image_size"], image_processor)


def load_sigliplin(checkpoint_path: str | Path | None = None, device=None):
    spec = MODEL_SPECS["sigliplin"]
    checkpoint_path = checkpoint_path or default_checkpoint_path("sigliplin")
    state_dict = _load_checkpoint(checkpoint_path)
    feature_dim = _infer_feature_dim(state_dict, spec["feature_dim"])
    image_processor = _load_image_processor(spec["hf_model"])
    backbone = _load_backbone(spec["hf_model"])
    model = _PoolerLinearModel(backbone.vision_model, feature_dim)
    model = _finalize_model(model, state_dict, device=device)
    return model, _build_clip_transform(spec["image_size"], image_processor)


def load_siglip2lin(checkpoint_path: str | Path | None = None, device=None):
    spec = MODEL_SPECS["siglip2lin"]
    checkpoint_path = checkpoint_path or default_checkpoint_path("siglip2lin")
    state_dict = _load_checkpoint(checkpoint_path)
    feature_dim = _infer_feature_dim(state_dict, spec["feature_dim"])
    image_processor = _load_image_processor(spec["hf_model"])
    backbone = _load_backbone(spec["hf_model"])
    model = _PoolerLinearModel(backbone.vision_model, feature_dim)
    model = _finalize_model(model, state_dict, device=device)
    return model, _build_clip_transform(spec["image_size"], image_processor)


def load_dinov2lin(checkpoint_path: str | Path | None = None, device=None):
    checkpoint_path = checkpoint_path or default_checkpoint_path("dinov2lin")
    state_dict = _load_checkpoint(checkpoint_path)
    feature_dim = _infer_feature_dim(state_dict, MODEL_SPECS["dinov2lin"]["feature_dim"])
    if feature_dim == 1536:
        candidates = ["facebook/dinov2-giant", "facebook/dinov2-large"]
    elif feature_dim == 1024:
        candidates = ["facebook/dinov2-large", "facebook/dinov2-base"]
    elif feature_dim == 768:
        candidates = ["facebook/dinov2-base", "facebook/dinov2-small"]
    else:
        candidates = ["facebook/dinov2-large"]

    last_error = None
    backbone = None
    for candidate in candidates:
        try:
            backbone = _load_backbone(candidate)
            break
        except Exception as exc:
            last_error = exc
    if backbone is None:
        raise RuntimeError(f"Failed to load DINOv2 backbone: {last_error}")

    model = _ClsTokenLinearModel(backbone, feature_dim)
    model = _finalize_model(model, state_dict, device=device)
    return model, _build_dino_transform()


def load_dinov3lin(checkpoint_path: str | Path | None = None, device=None):
    checkpoint_path = checkpoint_path or default_checkpoint_path("dinov3lin")
    state_dict = _load_checkpoint(checkpoint_path)
    feature_dim = _infer_feature_dim(state_dict, MODEL_SPECS["dinov3lin"]["feature_dim"])
    backbone = _load_backbone(MODEL_SPECS["dinov3lin"]["hf_model"])
    model = _ClsTokenLinearModel(backbone, feature_dim)
    model = _finalize_model(model, state_dict, device=device)
    return model, _build_dino_transform()


def load_pelin(checkpoint_path: str | Path | None = None, device=None):
    checkpoint_path = checkpoint_path or default_checkpoint_path("pelin")
    if str(ROOT) not in sys.path:
        sys.path.insert(0, str(ROOT))

    import core.vision_encoder.pe as pe
    import core.vision_encoder.transforms as pe_transforms

    state_dict = _load_checkpoint(checkpoint_path)
    feature_dim = _infer_feature_dim(state_dict, MODEL_SPECS["pelin"]["feature_dim"])
    clip_model = pe.CLIP.from_config("PE-Core-L14-336", pretrained=False)
    model = _PELinearModel(clip_model.visual, feature_dim)
    model = _finalize_model(model, state_dict, device=device)
    return model, pe_transforms.get_image_transform(MODEL_SPECS["pelin"]["image_size"])


LOADERS: dict[str, Callable] = {
    "metacliplin": load_metacliplin,
    "metaclip2lin": load_metaclip2lin,
    "sigliplin": load_sigliplin,
    "siglip2lin": load_siglip2lin,
    "pelin": load_pelin,
    "dinov2lin": load_dinov2lin,
    "dinov3lin": load_dinov3lin,
}


def load_model(model_name: str, checkpoint_path: str | Path | None = None, device=None):
    model_name = canonical_model_name(model_name)
    return LOADERS[model_name](checkpoint_path=checkpoint_path, device=device)
```
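`load_model` accepts either the short internal key or the paper-facing name from the README, resolving both through `canonical_model_name`. A trimmed, self-contained sketch of that lookup (the dicts are abbreviated here for illustration):

```python
MODEL_SPECS = {"metacliplin": {}, "dinov2lin": {}}  # abbreviated
ALIASES = {"MetaCLIP-Linear": "metacliplin", "DINOv2-Linear": "dinov2lin"}


def canonical_model_name(name: str) -> str:
    # Short keys pass through; paper names resolve via the alias table;
    # anything else is rejected up front.
    if name in MODEL_SPECS:
        return name
    if name in ALIASES:
        return ALIASES[name]
    raise KeyError(f"Unknown model: {name}")


assert canonical_model_name("DINOv2-Linear") == "dinov2lin"
assert canonical_model_name("dinov2lin") == "dinov2lin"
```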
test_vfm_baselines.py (added, 153 lines)
#!/usr/bin/env python3
"""Unified evaluation script for the 7 VFM baselines."""

from __future__ import annotations

import argparse
import json
from pathlib import Path

import numpy as np
import torch
from PIL import Image
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score
from torch.utils.data import DataLoader, Dataset

from models import LOADERS, MODEL_SPECS, canonical_model_name, default_checkpoint_path, load_model

IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".JPG", ".JPEG", ".PNG")


class BinaryFolderDataset(Dataset):
    def __init__(self, real_dir: str, fake_dir: str, transform, max_samples: int | None = None):
        self.transform = transform
        real_paths = self._get_image_files(real_dir)
        fake_paths = self._get_image_files(fake_dir)
        if max_samples is not None:
            real_paths = real_paths[:max_samples]
            fake_paths = fake_paths[:max_samples]
        self.image_paths = real_paths + fake_paths
        self.labels = [0] * len(real_paths) + [1] * len(fake_paths)

    @staticmethod
    def _get_image_files(folder: str):
        folder = Path(folder)
        images = []
        for extension in IMAGE_EXTENSIONS:
            images.extend(folder.rglob(f"*{extension}"))
        return sorted(images)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        image_path = self.image_paths[index]
        image = Image.open(image_path).convert("RGB")
        return self.transform(image), self.labels[index], str(image_path)


def evaluate(model, transform, real_dir: str, fake_dir: str, batch_size: int, num_workers: int, max_samples: int | None):
    dataset = BinaryFolderDataset(real_dir, fake_dir, transform, max_samples=max_samples)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
    )

    device = next(model.parameters()).device
    y_true = []
    y_prob = []
    y_pred = []
    paths = []

    with torch.no_grad():
        for images, labels, batch_paths in dataloader:
            images = images.to(device)
            logits = model(images)
            probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
            preds = (probs > 0.5).astype(int)

            y_true.extend(labels.numpy().tolist())
            y_prob.extend(probs.tolist())
            y_pred.extend(preds.tolist())
            paths.extend(batch_paths)

    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = np.asarray(y_pred)

    metrics = {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "real_accuracy": float(accuracy_score(y_true[y_true == 0], y_pred[y_true == 0])),
        "fake_accuracy": float(accuracy_score(y_true[y_true == 1], y_pred[y_true == 1])),
    }
    if len(np.unique(y_true)) > 1:
        metrics["auc"] = float(roc_auc_score(y_true, y_prob))
        metrics["ap"] = float(average_precision_score(y_true, y_prob))

    samples = [
        {
            "path": path,
            "label": int(label),
            "prob_fake": float(prob),
            "pred": int(pred),
        }
        for path, label, prob, pred in zip(paths, y_true, y_prob, y_pred)
    ]
    return {"metrics": metrics, "samples": samples}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="all", help="One of: all, metacliplin, metaclip2lin, sigliplin, siglip2lin, pelin, dinov2lin, dinov3lin")
    parser.add_argument("--real-dir", required=True)
    parser.add_argument("--fake-dir", required=True)
    parser.add_argument("--checkpoint", default=None, help="Optional explicit checkpoint path for single-model evaluation")
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument("--num-workers", type=int, default=4)
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--device", default=None)
    parser.add_argument("--save-json", default=None)
    args = parser.parse_args()

    model_names = list(LOADERS.keys()) if args.model == "all" else [canonical_model_name(args.model)]
    results = {}

    for model_name in model_names:
        checkpoint = args.checkpoint if args.model != "all" and args.checkpoint else default_checkpoint_path(model_name)
        checkpoint = Path(checkpoint)
        try:
            checkpoint_for_output = str(checkpoint.relative_to(Path(__file__).resolve().parent))
        except ValueError:
            checkpoint_for_output = str(checkpoint)
        model, transform = load_model(model_name, checkpoint_path=checkpoint, device=args.device)
        result = evaluate(
            model=model,
            transform=transform,
            real_dir=args.real_dir,
            fake_dir=args.fake_dir,
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            max_samples=args.max_samples,
        )
        results[model_name] = {
            "paper_name": MODEL_SPECS[model_name]["paper_name"],
            "checkpoint": checkpoint_for_output,
            **result,
        }

        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    output = json.dumps(results, indent=2, ensure_ascii=False)
    print(output)

    if args.save_json:
        Path(args.save_json).write_text(output + "\n", encoding="utf-8")


if __name__ == "__main__":
    main()
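The metrics block in `evaluate` computes overall, per-class (real/fake) accuracy, AUC, and average precision from the fake-class probabilities. As a quick sanity check, here is a minimal sketch of that computation on synthetic labels and probabilities (not real model outputs); it mirrors the script's use of `sklearn.metrics` and the 0.5 decision threshold:

```python
import numpy as np
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score

# Synthetic example: 0 = real, 1 = fake; probs are P(fake) per image.
y_true = np.array([0, 0, 0, 1, 1, 1])
y_prob = np.array([0.1, 0.4, 0.6, 0.7, 0.9, 0.3])
y_pred = (y_prob > 0.5).astype(int)  # same 0.5 threshold as the script

metrics = {
    "accuracy": float(accuracy_score(y_true, y_pred)),
    # Per-class accuracy: restrict both arrays to one ground-truth class.
    "real_accuracy": float(accuracy_score(y_true[y_true == 0], y_pred[y_true == 0])),
    "fake_accuracy": float(accuracy_score(y_true[y_true == 1], y_pred[y_true == 1])),
    # Threshold-free metrics use the raw probabilities, not the 0/1 predictions.
    "auc": float(roc_auc_score(y_true, y_prob)),
    "ap": float(average_precision_score(y_true, y_prob)),
}
print(metrics)
```

Note that `auc`/`ap` are only emitted by the script when both classes are present (`len(np.unique(y_true)) > 1`), since `roc_auc_score` raises on single-class input.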
weights/dinov2lin0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3c8604c137ad296d9f6bbd239d03e792cca36b2503eb03cebc5ccb5abf740ebe
size 4546228799

weights/dinov3lin0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58e35c23fc4e6a279dadedac8191b4a409a760fbdc43837af9f8541a6f7b2fb9
size 26864441175

weights/metaclip2lin0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e609f38a74ad280abe48dd0dc0111ef113f5f1cd8c4a3337a8346a22afbc5258
size 3685870062

weights/metacliplin0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2deed4e9d96cb5f27df0579c57028e9b855162f3daf7326ec402b580e244194c
size 1261744353

weights/pelin0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d871b813db9d1cf335ba0cde1701c72baf7bc80369238d881c55a676a59b24ff
size 1268731407

weights/siglip2lin0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c6971aab31d8ff4c061f4ae330b49f38c3f45a96bfb99e756f65af72e8f3f3b7
size 2327586086

weights/sigliplin0.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eaf932895087718f284d880fccbc565cfafbed7b2a6c12b67a346e6c878c8ab3
size 632730704
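The `weights/*.pth` entries above are git-LFS pointer files: each records the checkpoint's SHA-256 (`oid`) and byte size rather than the weights themselves, so the real files must be fetched with `git lfs pull`. After downloading, those two fields are enough to verify integrity. The helper below is an illustrative sketch, not part of the release; `verify_checkpoint` is a hypothetical name:

```python
import hashlib
from pathlib import Path


def verify_checkpoint(path: str, expected_sha256: str, expected_size: int,
                      chunk_bytes: int = 1 << 20) -> bool:
    """Check a downloaded weight file against its git-lfs pointer (oid + size)."""
    p = Path(path)
    # Cheap size check first; skips hashing multi-GB files that are truncated.
    if p.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with p.open("rb") as fh:
        # Stream in chunks so large checkpoints are not loaded into memory at once.
        for chunk in iter(lambda: fh.read(chunk_bytes), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_sha256
```

For example, `verify_checkpoint("weights/sigliplin0.pth", "eaf93289...", 632730704)` should return `True` for an intact download.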