Update ip_adapter.py

ip_adapter.py  CHANGED  (+198 -209)
@@ -1,21 +1,53 @@
 """
-WAN
-[… 16 more lines of the previous module docstring …]
+WAN IP-Adapter – zero-shot face conditioning via T5 cross-attention injection.
+
+Strategy
+────────
+Instead of patching WAN's self-attention blocks (which requires trained K/V
+projections that don't exist for WAN), we inject face identity through the
+cross-attention pathway that WAN already uses for text conditioning.
+
+Pipeline
+1. SigLIP2 so400m (1152-d patch tokens)
+   → TimeResampler (SD3.5 trained weights, 8 queries → 1024-d)
+   → proj_face nn.Linear(1024 → 4096, xavier_uniform init)
+   = 8 face tokens in T5 space (1, 8, 4096)
+
+2. These are appended to the T5 prompt_embeds before each pipe() call.
+   WAN's cross-attention naturally attends to all tokens in encoder_hidden_states,
+   so no transformer surgery is needed.
+
+3. For CFG (guidance_scale > 1), zeros are appended to the negative embeds
+   so the unconditional branch is face-neutral, not anti-face.
+
+Why this works zero-shot
+────────────────────────
+The TimeResampler is trained (SD3.5 weights) and produces semantically
+structured 1024-d tokens. The random proj_face (xavier_uniform) is a
+fixed linear map – it preserves the relative geometry of the resampler
+space, so the same face always maps to the same region of T5 space and
+similar faces map to nearby regions. WAN's cross-attention sees consistent
+identity tokens for consistent faces.
+
+Usage in app.py
+───────────────
+Init (once, inside _init_pipeline):
+    ip_adapter = WanIPAdapter(pipe, device=pipe.device, dtype=torch.bfloat16)
+
+Per-generation (inside run_inference, before pipe()):
+    prompt_embeds, neg_embeds, prompt_mask, neg_mask = ip_adapter.encode_prompt(
+        face_image=face_ref_image,      # PIL Image or None
+        prompt=effective_prompt,
+        negative_prompt=negative_prompt,
+        ip_scale=ip_scale,              # 0.0 – 1.0
+    )
+    result = pipe(
+        ...
+        prompt_embeds=prompt_embeds,
+        negative_prompt_embeds=neg_embeds,
+        prompt_attention_mask=prompt_mask,
+        negative_prompt_attention_mask=neg_mask,
+    )
 """
 
 from __future__ import annotations
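The docstring added above is the core of this change. As an illustrative sketch (not part of ip_adapter.py), the concatenation it describes can be reproduced stand-alone with random stand-in tensors and an arbitrary text length:

    import torch

    T5_DIM, NUM_FACE_TOKENS = 4096, 8
    seq_len = 128                                            # arbitrary stand-in text length

    prompt_embeds = torch.randn(1, seq_len, T5_DIM)          # stand-in for T5 text embeds
    prompt_mask = torch.ones(1, seq_len, dtype=torch.long)
    face_tokens = torch.randn(1, NUM_FACE_TOKENS, T5_DIM)    # stand-in for proj_face output

    ip_scale = 0.6
    prompt_embeds = torch.cat([prompt_embeds, face_tokens * ip_scale], dim=1)
    prompt_mask = torch.cat(
        [prompt_mask, torch.ones(1, NUM_FACE_TOKENS, dtype=torch.long)], dim=1
    )

    # Negative branch gets zero tokens so CFG stays face-neutral, not anti-face.
    neg_embeds = torch.randn(1, seq_len, T5_DIM)
    neg_embeds = torch.cat([neg_embeds, torch.zeros(1, NUM_FACE_TOKENS, T5_DIM)], dim=1)

    print(prompt_embeds.shape)   # torch.Size([1, 136, 4096])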
@@ -27,20 +59,12 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from einops import rearrange
 from huggingface_hub import hf_hub_download
 from PIL import Image
 from transformers import AutoProcessor, SiglipVisionModel
 
 
-# ──
-
-def _reshape(t: torch.Tensor, heads: int) -> torch.Tensor:
-    b, n, d = t.shape
-    return t.reshape(b, n, heads, d // heads).transpose(1, 2)
-
-
-# ── Perceiver / TimeResampler (matches SD3.5 ip-adapter.bin image_proj.*) ─────
+# ── Perceiver resampler (unchanged from original – SD3.5 weights load here) ───
 
 class _FeedForward(nn.Module):
     def __init__(self, dim: int, mult: int = 4):
@@ -56,16 +80,21 @@ class _FeedForward(nn.Module):
         return self.net(x)
 
 
+def _reshape(t: torch.Tensor, heads: int) -> torch.Tensor:
+    b, n, d = t.shape
+    return t.reshape(b, n, heads, d // heads).transpose(1, 2)
+
+
 class _PerceiverAttention(nn.Module):
     def __init__(self, *, dim: int, dim_head: int = 64, heads: int = 8):
         super().__init__()
-        self.heads
-        inner
-        self.norm1
-        self.norm2
-        self.to_q
-        self.to_kv
-        self.to_out
+        self.heads = heads
+        inner = dim_head * heads
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, inner, bias=False)
+        self.to_kv = nn.Linear(dim, inner * 2, bias=False)
+        self.to_out = nn.Linear(inner, dim, bias=False)
 
     def forward(self, x: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
         x = self.norm1(x)
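The relocated _reshape helper splits the channel dimension into attention heads. A quick shape check (illustrative values only, not part of the file):

    import torch

    def _reshape(t: torch.Tensor, heads: int) -> torch.Tensor:
        b, n, d = t.shape
        return t.reshape(b, n, heads, d // heads).transpose(1, 2)

    x = torch.randn(2, 17, 1024)            # (batch, tokens, dim)
    print(_reshape(x, heads=16).shape)      # torch.Size([2, 16, 17, 64]) = (batch, heads, tokens, dim_head)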
@@ -80,10 +109,10 @@ class _PerceiverAttention(nn.Module):
 
 
 class TimeResampler(nn.Module):
-    """
-    [… previous class docstring, 3 lines …]
+    """
+    Perceiver resampler – architecture matches image_proj.* in
+    InstantX/SD3.5-Large-IP-Adapter ip-adapter.bin so weights load cleanly.
+    Output: (batch, num_queries=8, output_dim=1024)
     """
 
     def __init__(
@@ -93,7 +122,7 @@ class TimeResampler(nn.Module):
         dim_head: int = 64,
         heads: int = 16,
         num_queries: int = 8,
-        embedding_dim: int = 1152,  # SigLIP2 so400m
+        embedding_dim: int = 1152,  # SigLIP2 so400m hidden size
         output_dim: int = 1024,
         ff_mult: int = 4,
         timestep_in_dim: int = 320,
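For reference, the 1152 above is the hidden size of the SigLIP so400m vision tower named in the file's _VISION_MODEL constant. A quick, hedged shape check (downloads the checkpoint on first run; the patch-token count N depends on the processor's image size):

    import torch
    from PIL import Image
    from transformers import AutoProcessor, SiglipVisionModel

    proc = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384")
    model = SiglipVisionModel.from_pretrained("google/siglip-so400m-patch14-384").eval()

    img = Image.new("RGB", (384, 384), "gray")     # placeholder image
    inputs = proc(images=img, return_tensors="pt")
    with torch.no_grad():
        feats = model(**inputs).last_hidden_state
    print(feats.shape)   # (1, N, 1152) – N patch tokens, hidden size 1152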
@@ -111,20 +140,16 @@ class TimeResampler(nn.Module):
             nn.ModuleList([
                 _PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
                 _FeedForward(dim=dim, mult=ff_mult),
-                nn.Sequential(nn.SiLU(), nn.Linear(dim, 4 * dim)),
+                nn.Sequential(nn.SiLU(), nn.Linear(dim, 4 * dim)),
             ])
             for _ in range(depth)
         ])
         self.proj_out = nn.Linear(dim, output_dim)
         self.norm_out = nn.LayerNorm(output_dim)
 
-    def forward(
-        self,
-        x: torch.Tensor,
-        timestep: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, x: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor:
         t = self.time_proj(timestep.flatten()).to(x.dtype)
-        t_emb = self.t_emb(t)
+        t_emb = self.t_emb(t)
         latents = self.latents.expand(x.size(0), -1, -1).clone()
         x = self.proj_in(x)
         for attn, ff, adaln in self.layers:
@@ -133,96 +158,30 @@ class TimeResampler(nn.Module):
             latents = attn(x, latents)
             latents = latents * (1 + c_mlp[:, None]) + s_mlp[:, None]
             latents = ff(latents) + latents
-
-        return latents, t_emb
-
-
-# ── Per-block attention processor ──────────────────────────────────────────────
-
-class WanIPAttnProcessor:
-    """Wraps an existing Attention processor and adds IP face KV injection.
-
-    The IP keys/values are initialised from the model's own to_k / to_v weights
-    (zero-shot reference-attention), so no separate IP training is needed.
-    Conditioned frames attend to the face tokens in every self-attention block.
-    """
-
-    def __init__(
-        self,
-        original_processor,
-        to_k_ip: nn.Linear,
-        to_v_ip: nn.Linear,
-        norm_k_ip: Optional[nn.Module] = None,
-        norm_v_ip: Optional[nn.Module] = None,
-        scale: float = 1.0,
-    ):
-        self.original = original_processor
-        self.to_k_ip = to_k_ip
-        self.to_v_ip = to_v_ip
-        self.norm_k_ip = norm_k_ip
-        self.norm_v_ip = norm_v_ip
-        self.scale = scale
-        # Set before each pipeline call; cleared after.
-        self.ip_hidden_states: Optional[torch.Tensor] = None
-
-    def __call__(self, attn, hidden_states, *args, **kwargs):
-        out = self.original(attn, hidden_states, *args, **kwargs)
-
-        if self.ip_hidden_states is None or self.scale == 0:
-            return out
-
-        hs = self.ip_hidden_states
-        h = attn.heads
-        # Compute Q from hidden_states (re-use the model's normalised projection)
-        q = attn.to_q(hidden_states)
-        norm_q = getattr(attn, "norm_q", None)
-        if norm_q is not None:
-            q = norm_q(q)
-
-        # Compute IP K / V
-        k_ip = self.to_k_ip(hs)
-        v_ip = self.to_v_ip(hs)
-        if self.norm_k_ip is not None:
-            k_ip = self.norm_k_ip(k_ip)
-        if self.norm_v_ip is not None:
-            v_ip = self.norm_v_ip(v_ip)
-
-        q = _reshape(q, h)
-        k_ip = _reshape(k_ip, h)
-        v_ip = _reshape(v_ip, h)
-
-        ip_attn = F.scaled_dot_product_attention(q, k_ip, v_ip)
-        inner_dim = getattr(attn, "inner_dim", q.shape[-1] * h)
-        ip_attn = ip_attn.transpose(1, 2).reshape(
-            hidden_states.shape[0], -1, inner_dim
-        )
-        ip_attn = attn.to_out[0](ip_attn)
-        if len(attn.to_out) > 1:
-            ip_attn = attn.to_out[1](ip_attn)
-
-        return out + ip_attn * self.scale
+        return self.norm_out(self.proj_out(latents))  # (B, 8, 1024)
 
 
 # ── Main class ──────────────────────────────────────────────────────────────────
 
 class WanIPAdapter:
-    """
-
-    Usage inside _init_pipeline():
-        ip_adapter = WanIPAdapter(pipe, device=pipe.device, dtype=torch.bfloat16)
+    """
+    Zero-shot face conditioning for WAN I2V via T5 cross-attention injection.
 
-
-
-        emb = ip_adapter.encode(face_ref, timestep=500)
-        ip_adapter.set_hidden_states(emb, scale=ip_scale)
-        result = pipe(...)
-        ip_adapter.clear_hidden_states()
+    No transformer patching. Face tokens are appended to prompt_embeds and
+    WAN's existing cross-attention handles the rest.
     """
 
     _IP_ADAPTER_REPO = "InstantX/SD3.5-Large-IP-Adapter"
     _IP_ADAPTER_FILE = "ip-adapter.bin"
     _VISION_MODEL = "google/siglip-so400m-patch14-384"
 
+    # WAN transformer cross-attention dim (text_dim in WanTransformer3DModel)
+    _T5_DIM = 4096
+    # TimeResampler output dim
+    _RESAMPLER_DIM = 1024
+    # Number of face tokens appended to the T5 sequence
+    _NUM_FACE_TOKENS = 8
+
     def __init__(
         self,
         pipe,
@@ -236,8 +195,8 @@ class WanIPAdapter:
 
         self._load_vision_encoder()
         self._load_resampler(cache_dir)
-        self.
-        print("[IP-Adapter] ready")
+        self._build_proj_face()
+        print("[IP-Adapter] ready – T5-concat mode, no transformer patching")
 
     # ── setup ──────────────────────────────────────────────────────────────────
 
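One detail worth calling out before the final hunk below: the reworked _load_resampler keeps only the image_proj.* entries of the SD3.5 checkpoint and strips that prefix before load_state_dict. On a toy state dict (illustrative only), the pattern behaves like this:

    state = {
        "image_proj.latents": "…",
        "image_proj.proj_in.weight": "…",
        "ip_adapter.0.to_k_ip.weight": "…",
    }
    img_proj = {k[len("image_proj."):]: v for k, v in state.items() if k.startswith("image_proj.")}
    print(sorted(img_proj))   # ['latents', 'proj_in.weight']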
@@ -245,112 +204,142 @@
         print("[IP-Adapter] loading SigLIP vision encoder…")
         self.vis_proc = AutoProcessor.from_pretrained(self._VISION_MODEL)
         self.vis_model = SiglipVisionModel.from_pretrained(
-            self._VISION_MODEL, torch_dtype=self.dtype
+            self._VISION_MODEL, torch_dtype=self.dtype,
         ).to(self.device)
         self.vis_model.eval()
         print("[IP-Adapter] SigLIP loaded")
 
     def _load_resampler(self, cache_dir: str):
-        print("[IP-Adapter] loading TimeResampler
+        print("[IP-Adapter] loading TimeResampler (SD3.5 ip-adapter.bin)…")
         ckpt = hf_hub_download(
             repo_id=self._IP_ADAPTER_REPO,
             filename=self._IP_ADAPTER_FILE,
             local_dir=cache_dir,
         )
         state = torch.load(ckpt, map_location="cpu", weights_only=True)
-
-        # Detect checkpoint key prefix (ip-adapter.bin uses "image_proj.*")
-        prefix = "image_proj"
         img_proj = {
-            k[len(
+            k[len("image_proj."):]: v
             for k, v in state.items()
-            if k.startswith(
+            if k.startswith("image_proj.")
         }
-
         self.resampler = TimeResampler().to(self.device, self.dtype)
-        missing,
+        missing, _ = self.resampler.load_state_dict(img_proj, strict=False)
         if missing:
             print(f"[IP-Adapter] resampler missing keys ({len(missing)}): {missing[:4]}…")
+        self.resampler.eval()
         print("[IP-Adapter] resampler loaded")
 
-    def
-        """
-        [… 2 docstring lines …]
-        for name, mod in transformer.named_modules():
-            if not (hasattr(mod, "processor") and hasattr(mod, "to_k")):
-                continue
-
-            # Build IP projections mirroring the model's own K/V projections
-            to_k_ip = nn.Linear(
-                self.resampler.proj_out.out_features,
-                mod.to_k.out_features,
-                bias=False,
-            ).to(self.device, self.dtype)
-            to_v_ip = nn.Linear(
-                self.resampler.proj_out.out_features,
-                mod.to_v.out_features,
-                bias=False,
-            ).to(self.device, self.dtype)
-
-            # Zero-shot init: copy model's own projection weights then scale down
-            # so the initial IP signal is small but directionally meaningful.
-            k_w = mod.to_k.weight.data
-            v_w = mod.to_v.weight.data
-            out_f, in_f = to_k_ip.weight.shape
-            # in_f = resampler output (1024); in_f may differ from k_w.shape[1]
-            # → just use kaiming init if shapes differ
-            if in_f == k_w.shape[1]:
-                to_k_ip.weight.data.copy_(k_w[:out_f] * 0.01)
-                to_v_ip.weight.data.copy_(v_w[:out_f] * 0.01)
-            else:
-                nn.init.kaiming_uniform_(to_k_ip.weight, a=math.sqrt(5))
-                nn.init.kaiming_uniform_(to_v_ip.weight, a=math.sqrt(5))
-                to_k_ip.weight.data *= 0.01
-                to_v_ip.weight.data *= 0.01
-
-            # Clone existing norms if present
-            norm_k = mod.norm_k.__class__(mod.norm_k.normalized_shape[0]) \
-                if hasattr(mod, "norm_k") and mod.norm_k is not None else None
-            norm_v = mod.norm_v.__class__(mod.norm_v.normalized_shape[0]) \
-                if hasattr(mod, "norm_v") and mod.norm_v is not None else None
-            if norm_k is not None:
-                norm_k = norm_k.to(self.device, self.dtype)
-            if norm_v is not None:
-                norm_v = norm_v.to(self.device, self.dtype)
-
-            ip_proc = WanIPAttnProcessor(
-                original_processor=mod.processor,
-                to_k_ip=to_k_ip,
-                to_v_ip=to_v_ip,
-                norm_k_ip=norm_k,
-                norm_v_ip=norm_v,
-            )
-            mod.processor = ip_proc
-            self._processors.append(ip_proc)
+    def _build_proj_face(self):
+        """
+        Fixed linear projection: resampler output (1024) → T5 space (4096).
+
+        Xavier-uniform init so face tokens land at reasonable magnitude relative
+        to T5 embeddings. This projection is never trained – it's a fixed
+        consistent mapping that preserves the resampler's relative geometry.
+        """
+        self.proj_face = nn.Linear(self._RESAMPLER_DIM, self._T5_DIM, bias=False)
+        nn.init.xavier_uniform_(self.proj_face.weight)
+        self.proj_face = self.proj_face.to(self.device, self.dtype)
+        self.proj_face.eval()
+        n_params = self.proj_face.weight.numel()
+        print(f"[IP-Adapter] proj_face built ({n_params:,} params, xavier_uniform, frozen)")
 
-
-    # ──
+    # ── encoding ───────────────────────────────────────────────────────────────
 
     @torch.no_grad()
-    def
-        """
-
+    def _encode_face_tokens(self, image: Image.Image, timestep: int = 500) -> torch.Tensor:
+        """
+        Encode *image* → (1, 8, 4096) face tokens in T5 space.
+
+        The timestep passed to the TimeResampler controls which denoising
+        stage the resampler "thinks" it's at. 500 (mid-point) is a reasonable
+        default; lower values produce more detail-focused tokens.
+        """
+        inputs = self.vis_proc(images=image, return_tensors="pt").to(self.device)
         vis_out = self.vis_model(**inputs)
-        # Use
+        # Use patch tokens (last_hidden_state) rather than pooled for spatial detail
         vis_feats = vis_out.last_hidden_state.to(self.dtype)  # (1, N, 1152)
-
-[… remaining 14 lines of the previous implementation …]
+
+        t = torch.tensor([timestep], device=self.device, dtype=torch.long)
+        emb = self.resampler(vis_feats, t)   # (1, 8, 1024)
+        return self.proj_face(emb)           # (1, 8, 4096)
+
+    # ── main API ────────────────────────────────────────────────────────────────
+
+    def encode_prompt(
+        self,
+        face_image: Optional[Image.Image],
+        prompt: str,
+        negative_prompt: str = "",
+        ip_scale: float = 0.6,
+        num_videos_per_prompt: int = 1,
+        do_classifier_free_guidance: bool = False,
+        timestep: int = 500,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Returns (prompt_embeds, negative_prompt_embeds,
+                 prompt_attention_mask, negative_attention_mask)
+        ready to pass directly to pipe().
+
+        If *face_image* is None or *ip_scale* == 0, returns vanilla text embeds.
+
+        ip_scale blends face tokens into the prompt by scaling them before concat.
+        Scale of 1.0 = full face signal; 0.5 = half strength.
+        """
+        # ── text embeddings ──────────────────────────────────────────────────
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_attention_mask,
+        ) = self.pipe.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt if do_classifier_free_guidance else None,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            device=self.device,
+        )
+
+        if face_image is None or ip_scale == 0.0:
+            return (
+                prompt_embeds,
+                negative_prompt_embeds,
+                prompt_attention_mask,
+                negative_attention_mask,
+            )
+
+        # ── face tokens ──────────────────────────────────────────────────────
+        face_tokens = self._encode_face_tokens(face_image, timestep=timestep)
+        # Repeat for batch if needed
+        B = prompt_embeds.shape[0]
+        if B > 1:
+            face_tokens = face_tokens.expand(B, -1, -1)
+
+        # Scale face tokens – controls identity signal strength
+        face_tokens = face_tokens * ip_scale
+
+        # Append to prompt embeds
+        prompt_embeds = torch.cat([prompt_embeds, face_tokens], dim=1)
+        face_ones = torch.ones(B, self._NUM_FACE_TOKENS, device=self.device,
+                               dtype=prompt_attention_mask.dtype)
+        prompt_attention_mask = torch.cat([prompt_attention_mask, face_ones], dim=1)
+
+        # For negative: append zeros (face-neutral, not anti-face)
+        if negative_prompt_embeds is not None:
+            B_neg = negative_prompt_embeds.shape[0]
+            neg_face = torch.zeros(B_neg, self._NUM_FACE_TOKENS, self._T5_DIM,
+                                   device=self.device, dtype=self.dtype)
+            negative_prompt_embeds = torch.cat([negative_prompt_embeds, neg_face], dim=1)
+            neg_ones = torch.ones(B_neg, self._NUM_FACE_TOKENS, device=self.device,
+                                  dtype=negative_attention_mask.dtype)
+            negative_attention_mask = torch.cat([negative_attention_mask, neg_ones], dim=1)
+
+        print(f"[IP-Adapter] face tokens appended → "
+              f"prompt_embeds: {prompt_embeds.shape}, scale={ip_scale:.2f}")
+
+        return (
+            prompt_embeds,
+            negative_prompt_embeds,
+            prompt_attention_mask,
+            negative_attention_mask,
+        )
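A small check of the geometry claim behind proj_face ("a fixed linear map preserves the relative geometry of the resampler space"): a frozen xavier-initialised 1024 → 4096 projection approximately preserves cosine similarity between token vectors, which is what keeps the same face mapping to the same region of T5 space. Illustrative only, with random vectors standing in for resampler outputs:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    torch.manual_seed(0)
    proj = nn.Linear(1024, 4096, bias=False)
    nn.init.xavier_uniform_(proj.weight)

    a, b = torch.randn(1024), torch.randn(1024)
    with torch.no_grad():
        before = F.cosine_similarity(a, b, dim=0).item()
        after = F.cosine_similarity(proj(a), proj(b), dim=0).item()
    print(f"cosine before: {before:+.3f}   after projection: {after:+.3f}")
    # The two values stay close; preservation is approximate, not exact.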