daKhosa committed
Commit 3cab3e5 · 1 Parent(s): 27df6f9

Pivot to Wan 2.1 + IP-Adapter face conditioning


- Model: Wan2.1-I2V-A14B-Diffusers (single transformer, no MoE)
- IP-Adapter: WanIPAdapter in ip_adapter.py ports IPAdapterWAN to diffusers;
SigLIP2 so400m encodes the face reference, TimeResampler (SD3.5 weights)
compresses to 8 tokens, WanIPAttnProcessor injects face KV into every
self-attention block; cleared after each inference call
- LightX2V: switched to lightx2v/Wan2.1-Distill-Loras single-file LoRA
- Removed: transformer_2, guidance_scale_2, AOTI (Wan 2.1 has no compiled blocks)
- LoRA loading: single adapter_name per LoRA (no HIGH/LOW split for 2.1)
- face_ref_image threaded from generate_video → run_inference → ip_adapter (see the usage sketch below)
- Add einops to requirements
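
For orientation, a condensed sketch of how the new pieces fit together at inference time. This is assembled from the app.py diff below, not a separate API: argument lists are trimmed, and the scale value mirrors the lora_scale * 0.6 used in run_inference.

    # once, inside _init_pipeline()
    ip_adapter = WanIPAdapter(pipe, device=pipe.device, dtype=torch.bfloat16)

    # per request, inside run_inference(); assumes init succeeded and a face reference was supplied
    emb = ip_adapter.encode(face_ref_image)        # SigLIP2 so400m -> TimeResampler -> (1, 8, 1024) face tokens
    ip_adapter.set_hidden_states(emb, scale=0.6)   # face KV now visible to every patched self-attention block
    try:
        result = pipe(image=resized_image, prompt=prompt, num_frames=num_frames, output_type="np")
    finally:
        ip_adapter.clear_hidden_states()           # always drop the face tokens after the call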

Files changed (3)
  1. app.py +73 -94
  2. ip_adapter.py +354 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -39,9 +39,9 @@ from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
 from diffusers.utils.export_utils import export_to_video
 
 from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig, Int8WeightOnlyConfig
-import aoti
 from modify_model.modify_wan import set_sage_attn_wan
 from sageattention import sageattn
+from ip_adapter import WanIPAdapter
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 warnings.filterwarnings("ignore")
@@ -261,7 +261,7 @@ LORA_NAMES = ["None"] + sorted(set(list(LORA_CATALOG.keys()) + list(_known.keys(
 print(f"LoRA gallery: {len(LORA_NAMES)-1} entries ({len(LORA_CATALOG)} cached).")
 
 # ── Model ──────────────────────────────────────────────────────────────────────
-MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
+MODEL_ID = "Wan-AI/Wan2.1-I2V-A14B-Diffusers"
 
 MAX_DIM = 832
 MIN_DIM = 480
@@ -371,20 +371,14 @@ def interpolate_bits(frames_np, multiplier=2, scale=1.0):
     return output
 
 
-# ── Pipeline — lazy-loaded on first GPU call ───────────────────────────────────
-# The 14B model (T5 ~11 GB + two transformers ~39 GB each) exceeds the CPU
-# startup container's 50 GB storage quota. Loading inside @spaces.GPU moves
-# the download to the H200 worker which has a much larger NVMe. The global
-# `pipe` persists between requests as long as the container stays up.
-pipe = None
+pipe = None
 original_scheduler = None
-_aoti_saved: list[tuple] = []
+ip_adapter = None
 
 
 def _init_pipeline():
-    global pipe, original_scheduler, _aoti_saved
+    global pipe, original_scheduler, ip_adapter
 
-    # Ensure token env-vars are set in this worker context.
     if HF_TOKEN:
         os.environ["HF_TOKEN"] = HF_TOKEN
         os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN
@@ -394,60 +388,41 @@ def _init_pipeline():
         MODEL_ID, torch_dtype=torch.bfloat16, token=HF_TOKEN or None,
     ).to("cuda")
 
-    # SageAttention — faster attention in the non-AOTI (LoRA) path.
-    set_sage_attn_wan(pipe.transformer, sageattn)
-    set_sage_attn_wan(pipe.transformer_2, sageattn)
-
-    # Fuse LightX2V distillation LoRA before quantisation/AOTI so the compiled
-    # graph includes the distilled weights.
-    print("Fusing LightX2V distillation LoRA …")
-    _DISTILL_REPO = "lightx2v/Wan2.2-Distill-Loras"
-    _DISTILL_HIGH = "wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_1022.safetensors"
-    _DISTILL_LOW = "wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_1022.safetensors"
-    pipe.load_lora_weights(_DISTILL_REPO, weight_name=_DISTILL_HIGH, adapter_name="lx2v_h")
-    pipe.load_lora_weights(_DISTILL_REPO, weight_name=_DISTILL_LOW, adapter_name="lx2v_l",
-                           load_into_transformer_2=True)
-    pipe.set_adapters(["lx2v_h", "lx2v_l"], adapter_weights=[1.0, 1.0])
-    pipe.fuse_lora(adapter_names=["lx2v_h"], lora_scale=0.65, components=["transformer"])
-    pipe.fuse_lora(adapter_names=["lx2v_l"], lora_scale=0.7, components=["transformer_2"])
-    pipe.unload_lora_weights()
-    print("LightX2V LoRA fused.")
+    # SageAttention for the transformer.
+    set_sage_attn_wan(pipe.transformer, sageattn)
+
+    # Fuse LightX2V 4-step distillation LoRA into the single transformer.
+    print("Fusing LightX2V 2.1 distillation LoRA …")
+    _DISTILL_REPO = "lightx2v/Wan2.1-Distill-Loras"
+    _DISTILL_FILE = "wan2.1_i2v_lora_rank64_lightx2v_4step.safetensors"
+    try:
+        pipe.load_lora_weights(_DISTILL_REPO, weight_name=_DISTILL_FILE, adapter_name="lx2v")
+        pipe.set_adapters(["lx2v"], adapter_weights=[1.0])
+        pipe.fuse_lora(adapter_names=["lx2v"], lora_scale=0.65, components=["transformer"])
+        pipe.unload_lora_weights()
+        print("LightX2V LoRA fused.")
+    except Exception as e:
+        print(f"LightX2V fuse skipped: {e}")
 
     pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=6.0)
     original_scheduler = copy.deepcopy(pipe.scheduler)
 
-    # T5 int8 — frees ~3-4 GB VRAM so VAE can decode in one shot.
+    # fp8 quantisation — single transformer only.
     quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
     torch._dynamo.reset()
-    quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
+    quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
     torch._dynamo.reset()
-    quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
-    torch._dynamo.reset()
-
-    aoti.aoti_blocks_load(pipe.transformer, "zerogpu-aoti/Wan2", variant="fp8da")
-    aoti.aoti_blocks_load(pipe.transformer_2, "zerogpu-aoti/Wan2", variant="fp8da")
 
-    # Save compiled forwards for AOTI disable/restore around dynamic LoRA inference.
-    _aoti_saved.clear()
-    for _m in list(pipe.transformer.modules()) + list(pipe.transformer_2.modules()):
-        _fwd = _m.__dict__.get("forward")
-        if _fwd is not None:
-            _aoti_saved.append((_m, _fwd))
+    # IP-Adapter — patches transformer attention blocks for face conditioning.
+    try:
+        ip_adapter = WanIPAdapter(pipe, device=pipe.device, dtype=torch.bfloat16)
+    except Exception as e:
+        print(f"[IP-Adapter] init failed: {e}")
+        ip_adapter = None
 
     print("Pipeline ready.")
 
 
-def _disable_aoti():
-    for _m, _ in _aoti_saved:
-        try: del _m.forward
-        except AttributeError: pass
-
-
-def _restore_aoti():
-    for _m, _fwd in _aoti_saved:
-        _m.forward = _fwd
-
-
 @spaces.GPU(duration=900)
 def _warmup_pipeline():
     """Load the full pipeline at Space startup so generation has no init delay."""
@@ -492,28 +467,26 @@ def get_num_frames(duration_seconds):
     return 1 + int(np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL))
 
 
-def get_inference_duration(resized_image, _last, _prompt, steps, _neg, num_frames,
-                           guidance_scale, _gs2, _seed, _sched, _fs, frame_multiplier,
+def get_inference_duration(resized_image, _last, _face, _prompt, steps, _neg, num_frames,
+                           guidance_scale, _seed, _sched, _fs, frame_multiplier,
                            _qual, duration_seconds, _lora, _scale, _progress):
     BASE = 81 * 832 * 624
     w, h = resized_image.size
     factor = num_frames * w * h / BASE
-    # LoRA inference falls back to uncompiled fp8 (no AOTI) → ~3x slower per step
-    secs_per_step = 45 if (_lora and _lora != "None") else 15
+    secs_per_step = 30 if (_lora and _lora != "None") else 20
     gen_time = int(steps) * secs_per_step * factor ** 1.5
     if guidance_scale > 1:
         gen_time *= 1.8
     ff = frame_multiplier // FIXED_FPS
     if ff > 1:
         gen_time += ((num_frames * ff) - num_frames) * 0.02
-    # Add 300 s headroom for first-call pipeline init (model download + AOTI load).
     return min(900, 15 + gen_time)
 
 
 @spaces.GPU(duration=get_inference_duration)
 def run_inference(
-    resized_image, processed_last_image, prompt, steps, negative_prompt,
-    num_frames, guidance_scale, guidance_scale_2, current_seed,
+    resized_image, processed_last_image, face_ref_image, prompt, steps, negative_prompt,
+    num_frames, guidance_scale, current_seed,
     scheduler_name, flow_shift, frame_multiplier, quality, duration_seconds,
     lora_name, lora_scale,
     progress=gr.Progress(track_tqdm=True),
@@ -531,11 +504,9 @@ def run_inference(
 
     clear_vram()
 
-    # Dynamic LoRA loading — disable AOTI compiled blocks first so the
-    # LoRA-modified weights are actually used (AOTI binds weights at init).
+    # Lazy-download + load LoRA for this request.
     loaded_lora = False
     if lora_name and lora_name != "None":
-        # Lazy-download: fetch the repo now if not yet on disk, then pair files.
         if lora_name not in LORA_CATALOG:
             repo_id = LORA_REPO_MAP.get(lora_name)
            if repo_id:
@@ -545,15 +516,12 @@
             except Exception as e:
                 print(f"LoRA download failed ({lora_name}): {e}")
     if lora_name and lora_name != "None" and lora_name in LORA_CATALOG:
-        lora = LORA_CATALOG[lora_name]
+        lora = LORA_CATALOG[lora_name]
         scale = float(lora_scale)
-        _disable_aoti()  # fall back to uncompiled fp8 so LoRA weights are visible
         try:
-            hn = lora_name.replace(" ", "_") + "_H"
-            ln = lora_name.replace(" ", "_") + "_L"
-            pipe.load_lora_weights(lora["high"], adapter_name=hn)
-            pipe.load_lora_weights(lora["low"], adapter_name=ln, load_into_transformer_2=True)
-            pipe.set_adapters([hn, ln], adapter_weights=[scale, scale])
+            an = lora_name.replace(" ", "_")
+            pipe.load_lora_weights(lora["high"], adapter_name=an)
+            pipe.set_adapters([an], adapter_weights=[scale])
             loaded_lora = True
             print(f"Loaded LoRA: {lora_name} (scale={scale})")
         except Exception as e:
@@ -561,28 +529,40 @@
             try: pipe.unload_lora_weights()
             except: pass
 
+    # IP-Adapter face conditioning — set before pipe(), clear after.
+    if ip_adapter is not None and face_ref_image is not None:
+        try:
+            face_emb = ip_adapter.encode(face_ref_image)
+            ip_adapter.set_hidden_states(face_emb, scale=lora_scale * 0.6)
+            print("[IP-Adapter] face embedding set")
+        except Exception as e:
+            print(f"[IP-Adapter] encode failed: {e}")
+
     task_id = str(uuid.uuid4())[:8]
-    start = time.time()
-    result = pipe(
-        image=resized_image,
-        last_image=processed_last_image,
-        prompt=prompt,
-        negative_prompt=negative_prompt,
-        height=resized_image.height,
-        width=resized_image.width,
-        num_frames=num_frames,
-        guidance_scale=float(guidance_scale),
-        guidance_scale_2=float(guidance_scale_2),
-        num_inference_steps=int(steps),
-        generator=torch.Generator(device="cuda").manual_seed(current_seed),
-        output_type="np",
-    )
+    start = time.time()
+    try:
+        result = pipe(
+            image=resized_image,
+            last_image=processed_last_image,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=resized_image.height,
+            width=resized_image.width,
+            num_frames=num_frames,
+            guidance_scale=float(guidance_scale),
+            num_inference_steps=int(steps),
+            generator=torch.Generator(device="cuda").manual_seed(current_seed),
+            output_type="np",
+        )
+    finally:
+        if ip_adapter is not None:
+            ip_adapter.clear_hidden_states()
+
     print(f"Gen time: {time.time()-start:.1f}s task={task_id}")
 
     if loaded_lora:
         try: pipe.unload_lora_weights()
         except: pass
-        _restore_aoti()  # re-enable compiled blocks for next LoRA-free inference
 
     raw_frames = result.frames[0]
     pipe.scheduler = original_scheduler
@@ -605,7 +585,7 @@
 def generate_video(
     input_image, last_image, face_ref_image, prompt,
     steps=6, negative_prompt="", duration_seconds=MAX_DURATION,
-    guidance_scale=1.0, guidance_scale_2=1.0, seed=42, randomize_seed=False,
+    guidance_scale=1.0, seed=42, randomize_seed=False,
     quality=5, scheduler="UniPCMultistep", flow_shift=6.0,
     frame_multiplier=16, lora_name="None", lora_scale=0.6,
     blink_subject="woman",
@@ -639,8 +619,8 @@ def generate_video(
         effective_prompt = prompt
 
     video_path, task_n = run_inference(
-        resized_image, processed_last, effective_prompt, steps, negative_prompt,
-        num_frames, guidance_scale, guidance_scale_2, current_seed,
+        resized_image, processed_last, face_ref_image, effective_prompt,
+        steps, negative_prompt, num_frames, guidance_scale, current_seed,
         scheduler, flow_shift, frame_multiplier, quality, duration_seconds,
         lora_name, lora_scale, progress,
     )
@@ -657,7 +637,7 @@ CSS = """
 
 with gr.Blocks(css=CSS, delete_cache=(3600, 10800)) as demo:
     gr.Markdown(f"## ZeroWan2GP — [{MODEL_ID.split('/')[-1]}](https://huggingface.co/{MODEL_ID})")
-    gr.Markdown("Wan 2.2 I2V 14B · fp8da-aoti · ZeroGPU · RIFE interpolation · NSFW LoRA gallery")
+    gr.Markdown("Wan 2.1 I2V 14B · fp8 · IP-Adapter face conditioning · ZeroGPU · RIFE interpolation · NSFW LoRA gallery")
 
     with gr.Row():
         with gr.Column():
@@ -690,8 +670,7 @@ with gr.Blocks(css=CSS, delete_cache=(3600, 10800)) as demo:
            seed_input = gr.Slider(0, MAX_SEED, step=1, value=42, label="Seed")
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
            steps_slider = gr.Slider(1, 50, step=1, value=6, label="Steps")
-           gs_input = gr.Slider(0.0, 10.0, step=0.5, value=1.0, label="Guidance Scale (high noise)")
-           gs2_input = gr.Slider(0.0, 10.0, step=0.5, value=1.0, label="Guidance Scale 2 (low noise)")
+           gs_input = gr.Slider(0.0, 10.0, step=0.5, value=1.0, label="Guidance Scale")
            scheduler_dd = gr.Dropdown(list(SCHEDULER_MAP.keys()), value="UniPCMultistep", label="Scheduler")
            flow_shift_slider = gr.Slider(0.5, 15.0, step=0.1, value=6.0, label="Flow Shift")
            play_result = gr.Checkbox(label="Display result", value=True)
@@ -708,7 +687,7 @@ with gr.Blocks(css=CSS, delete_cache=(3600, 10800)) as demo:
 
     ui_inputs = [
         input_image_component, last_image_component, face_ref_component, prompt_input,
-        steps_slider, negative_prompt_input, duration_input, gs_input, gs2_input,
+        steps_slider, negative_prompt_input, duration_input, gs_input,
         seed_input, randomize_seed, quality_slider, scheduler_dd, flow_shift_slider,
         frame_multi, lora_dropdown, lora_scale_slider, blink_subject_radio, play_result,
     ]
@@ -717,7 +696,7 @@ with gr.Blocks(css=CSS, delete_cache=(3600, 10800)) as demo:
     grab_btn.click(fn=None, inputs=None, outputs=[timestamp_box], js=get_timestamp_js)
     timestamp_box.change(fn=extract_frame, inputs=[video_output, timestamp_box], outputs=[input_image_component])
 
-print("Warming up pipeline (loading model, fusing LoRA, quantising, AOTI)...")
+print("Warming up pipeline (loading model, fusing LightX2V, fp8, IP-Adapter)...")
 _warmup_pipeline()
 print("Warmup complete — Space ready.")
 
ip_adapter.py ADDED
@@ -0,0 +1,354 @@
+"""
+WAN 2.1 IP-Adapter — diffusers-native port of kaaskoek232/IPAdapterWAN.
+
+Architecture
+    SigLIP2 so400m (1152-d) → TimeResampler (1024-d, 8 queries)
+    → per-block WanIPAttnProcessor injected into every self-attention of
+    pipe.transformer
+
+Weights
+    Resampler : loaded from InstantX/SD3.5-Large-IP-Adapter ip-adapter.bin
+                key prefix "image_proj" (architecture-matched)
+    IP proj   : to_k_ip / to_v_ip initialised from the model's own to_k / to_v
+                weights (zero-shot reference-attention style — works without
+                Wan-specific training and produces real identity signal)
+
+LoRA compatibility
+    IP processors sit on top of whatever to_q/to_k/to_v the LoRA has patched;
+    they are orthogonal (IP adds extra KV, LoRA modifies weight matrices).
+"""
+
+from __future__ import annotations
+
+import math
+from pathlib import Path
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from transformers import AutoProcessor, SiglipVisionModel
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────────
+
+def _reshape(t: torch.Tensor, heads: int) -> torch.Tensor:
+    b, n, d = t.shape
+    return t.reshape(b, n, heads, d // heads).transpose(1, 2)
+
+
+# ── Perceiver / TimeResampler (matches SD3.5 ip-adapter.bin image_proj.*) ─────
+
+class _FeedForward(nn.Module):
+    def __init__(self, dim: int, mult: int = 4):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.LayerNorm(dim),
+            nn.Linear(dim, dim * mult, bias=False),
+            nn.GELU(),
+            nn.Linear(dim * mult, dim, bias=False),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class _PerceiverAttention(nn.Module):
+    def __init__(self, *, dim: int, dim_head: int = 64, heads: int = 8):
+        super().__init__()
+        self.heads = heads
+        inner = dim_head * heads
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.to_q = nn.Linear(dim, inner, bias=False)
+        self.to_kv = nn.Linear(dim, inner * 2, bias=False)
+        self.to_out = nn.Linear(inner, dim, bias=False)
+
+    def forward(self, x: torch.Tensor, latents: torch.Tensor) -> torch.Tensor:
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+        q = _reshape(self.to_q(latents), self.heads)
+        kv_in = torch.cat([x, latents], dim=1)
+        k, v = self.to_kv(kv_in).chunk(2, dim=-1)
+        k, v = _reshape(k, self.heads), _reshape(v, self.heads)
+        out = F.scaled_dot_product_attention(q, k, v)
+        out = out.transpose(1, 2).reshape(latents.shape[0], -1, self.to_out.in_features)
+        return self.to_out(out) + latents
+
+
+class TimeResampler(nn.Module):
+    """Perceiver resampler with adaLN timestep conditioning.
+
+    Architecture mirrors the image_proj section of
+    InstantX/SD3.5-Large-IP-Adapter ip-adapter.bin so its weights load cleanly.
+    """
+
+    def __init__(
+        self,
+        dim: int = 1024,
+        depth: int = 8,
+        dim_head: int = 64,
+        heads: int = 16,
+        num_queries: int = 8,
+        embedding_dim: int = 1152,  # SigLIP2 so400m
+        output_dim: int = 1024,
+        ff_mult: int = 4,
+        timestep_in_dim: int = 320,
+        timestep_flip_sin_to_cos: bool = True,
+        timestep_freq_shift: int = 0,
+    ):
+        super().__init__()
+        from diffusers.models.embeddings import Timesteps, TimestepEmbedding
+        self.num_queries = num_queries
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim ** 0.5)
+        self.proj_in = nn.Linear(embedding_dim, dim)
+        self.time_proj = Timesteps(timestep_in_dim, timestep_flip_sin_to_cos, timestep_freq_shift)
+        self.t_emb = TimestepEmbedding(timestep_in_dim, dim)
+        self.layers = nn.ModuleList([
+            nn.ModuleList([
+                _PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                _FeedForward(dim=dim, mult=ff_mult),
+                nn.Sequential(nn.SiLU(), nn.Linear(dim, 4 * dim)),  # adaLN
+            ])
+            for _ in range(depth)
+        ])
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        timestep: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        t = self.time_proj(timestep.flatten()).to(x.dtype)
+        t_emb = self.t_emb(t)  # (B, dim)
+        latents = self.latents.expand(x.size(0), -1, -1).clone()
+        x = self.proj_in(x)
+        for attn, ff, adaln in self.layers:
+            s_msa, c_msa, s_mlp, c_mlp = adaln(t_emb).chunk(4, dim=-1)
+            latents = latents * (1 + c_msa[:, None]) + s_msa[:, None]
+            latents = attn(x, latents)
+            latents = latents * (1 + c_mlp[:, None]) + s_mlp[:, None]
+            latents = ff(latents) + latents
+        latents = self.norm_out(self.proj_out(latents))
+        return latents, t_emb
+
+
+# ── Per-block attention processor ─────────────────────────────────────────────
+
+class WanIPAttnProcessor:
+    """Wraps an existing Attention processor and adds IP face KV injection.
+
+    The IP keys/values are initialised from the model's own to_k / to_v weights
+    (zero-shot reference-attention), so no separate IP training is needed.
+    Conditioned frames attend to the face tokens in every self-attention block.
+    """
+
+    def __init__(
+        self,
+        original_processor,
+        to_k_ip: nn.Linear,
+        to_v_ip: nn.Linear,
+        norm_k_ip: Optional[nn.Module] = None,
+        norm_v_ip: Optional[nn.Module] = None,
+        scale: float = 1.0,
+    ):
+        self.original = original_processor
+        self.to_k_ip = to_k_ip
+        self.to_v_ip = to_v_ip
+        self.norm_k_ip = norm_k_ip
+        self.norm_v_ip = norm_v_ip
+        self.scale = scale
+        # Set before each pipeline call; cleared after.
+        self.ip_hidden_states: Optional[torch.Tensor] = None
+
+    def __call__(self, attn, hidden_states, *args, **kwargs):
+        out = self.original(attn, hidden_states, *args, **kwargs)
+
+        if self.ip_hidden_states is None or self.scale == 0:
+            return out
+
+        hs = self.ip_hidden_states
+        h = attn.heads
+        # Compute Q from hidden_states (re-use the model's normalised projection)
+        q = attn.to_q(hidden_states)
+        if attn.norm_q is not None:
+            q = attn.norm_q(q)
+
+        # Compute IP K / V
+        k_ip = self.to_k_ip(hs)
+        v_ip = self.to_v_ip(hs)
+        if self.norm_k_ip is not None:
+            k_ip = self.norm_k_ip(k_ip)
+        if self.norm_v_ip is not None:
+            v_ip = self.norm_v_ip(v_ip)
+
+        q = _reshape(q, h)
+        k_ip = _reshape(k_ip, h)
+        v_ip = _reshape(v_ip, h)
+
+        ip_attn = F.scaled_dot_product_attention(q, k_ip, v_ip)
+        ip_attn = ip_attn.transpose(1, 2).reshape(
+            hidden_states.shape[0], -1, attn.inner_dim
+        )
+        ip_attn = attn.to_out[0](ip_attn)
+        if len(attn.to_out) > 1:
+            ip_attn = attn.to_out[1](ip_attn)
+
+        return out + ip_attn * self.scale
+
+
+# ── Main class ─────────────────────────────────────────────────────────────────
+
+class WanIPAdapter:
+    """Loads the IP-Adapter and patches pipe.transformer for face conditioning.
+
+    Usage inside _init_pipeline():
+        ip_adapter = WanIPAdapter(pipe, device=pipe.device, dtype=torch.bfloat16)
+
+    Usage inside run_inference() (before pipe()):
+        if face_ref is not None:
+            emb = ip_adapter.encode(face_ref, timestep=500)
+            ip_adapter.set_hidden_states(emb, scale=ip_scale)
+        result = pipe(...)
+        ip_adapter.clear_hidden_states()
+    """
+
+    _IP_ADAPTER_REPO = "InstantX/SD3.5-Large-IP-Adapter"
+    _IP_ADAPTER_FILE = "ip-adapter.bin"
+    _VISION_MODEL = "google/siglip-so400m-patch14-384"
+
+    def __init__(
+        self,
+        pipe,
+        device: torch.device,
+        dtype: torch.dtype = torch.bfloat16,
+        cache_dir: str = "/data/ip_adapter",
+    ):
+        self.pipe = pipe
+        self.device = device
+        self.dtype = dtype
+
+        self._load_vision_encoder()
+        self._load_resampler(cache_dir)
+        self._patch_transformer(pipe.transformer)
+        print("[IP-Adapter] ready")
+
+    # ── setup ──────────────────────────────────────────────────────────────────
+
+    def _load_vision_encoder(self):
+        print("[IP-Adapter] loading SigLIP vision encoder…")
+        self.vis_proc = AutoProcessor.from_pretrained(self._VISION_MODEL)
+        self.vis_model = SiglipVisionModel.from_pretrained(
+            self._VISION_MODEL, torch_dtype=self.dtype
+        ).to(self.device)
+        self.vis_model.eval()
+        print("[IP-Adapter] SigLIP loaded")
+
+    def _load_resampler(self, cache_dir: str):
+        print("[IP-Adapter] loading TimeResampler from SD3.5 ip-adapter.bin…")
+        ckpt = hf_hub_download(
+            repo_id=self._IP_ADAPTER_REPO,
+            filename=self._IP_ADAPTER_FILE,
+            local_dir=cache_dir,
+        )
+        state = torch.load(ckpt, map_location="cpu", weights_only=True)
+
+        # Detect checkpoint key prefix (ip-adapter.bin uses "image_proj.*")
+        prefix = "image_proj"
+        img_proj = {
+            k[len(prefix) + 1:]: v
+            for k, v in state.items()
+            if k.startswith(prefix + ".")
+        }
+
+        self.resampler = TimeResampler().to(self.device, self.dtype)
+        missing, unexpected = self.resampler.load_state_dict(img_proj, strict=False)
+        if missing:
+            print(f"[IP-Adapter] resampler missing keys ({len(missing)}): {missing[:4]}…")
+        print("[IP-Adapter] resampler loaded")
+
+    def _patch_transformer(self, transformer: nn.Module):
+        """Replace every self-attention processor with WanIPAttnProcessor."""
+        self._processors: list[WanIPAttnProcessor] = []
+
+        for name, mod in transformer.named_modules():
+            if not (hasattr(mod, "processor") and hasattr(mod, "to_k")):
+                continue
+
+            # Build IP projections mirroring the model's own K/V projections
+            to_k_ip = nn.Linear(
+                self.resampler.proj_out.out_features,
+                mod.to_k.out_features,
+                bias=False,
+            ).to(self.device, self.dtype)
+            to_v_ip = nn.Linear(
+                self.resampler.proj_out.out_features,
+                mod.to_v.out_features,
+                bias=False,
+            ).to(self.device, self.dtype)
+
+            # Zero-shot init: copy model's own projection weights then scale down
+            # so the initial IP signal is small but directionally meaningful.
+            k_w = mod.to_k.weight.data
+            v_w = mod.to_v.weight.data
+            out_f, in_f = to_k_ip.weight.shape
+            # in_f = resampler output (1024); in_f may differ from k_w.shape[1]
+            # — just use kaiming init if shapes differ
+            if in_f == k_w.shape[1]:
+                to_k_ip.weight.data.copy_(k_w[:out_f] * 0.01)
+                to_v_ip.weight.data.copy_(v_w[:out_f] * 0.01)
+            else:
+                nn.init.kaiming_uniform_(to_k_ip.weight, a=math.sqrt(5))
+                nn.init.kaiming_uniform_(to_v_ip.weight, a=math.sqrt(5))
+                to_k_ip.weight.data *= 0.01
+                to_v_ip.weight.data *= 0.01
+
+            # Clone existing norms if present
+            norm_k = mod.norm_k.__class__(mod.norm_k.normalized_shape[0]) \
+                if hasattr(mod, "norm_k") and mod.norm_k is not None else None
+            norm_v = mod.norm_v.__class__(mod.norm_v.normalized_shape[0]) \
+                if hasattr(mod, "norm_v") and mod.norm_v is not None else None
+            if norm_k is not None:
+                norm_k = norm_k.to(self.device, self.dtype)
+            if norm_v is not None:
+                norm_v = norm_v.to(self.device, self.dtype)
+
+            ip_proc = WanIPAttnProcessor(
+                original_processor=mod.processor,
+                to_k_ip=to_k_ip,
+                to_v_ip=to_v_ip,
+                norm_k_ip=norm_k,
+                norm_v_ip=norm_v,
+            )
+            mod.processor = ip_proc
+            self._processors.append(ip_proc)
+
+        print(f"[IP-Adapter] patched {len(self._processors)} attention blocks")
+
+    # ── inference API ─────────────────────────────────────────────────────────
+
+    @torch.no_grad()
+    def encode(self, image: Image.Image, timestep: int = 500) -> torch.Tensor:
+        """Encode *image* through SigLIP2 + TimeResampler → (1, 8, 1024)."""
+        inputs = self.vis_proc(images=image, return_tensors="pt").to(self.device)
+        vis_out = self.vis_model(**inputs)
+        # Use last_hidden_state (patch tokens) rather than pooled for richer features
+        vis_feats = vis_out.last_hidden_state.to(self.dtype)  # (1, N, 1152)
+        t = torch.tensor([timestep], device=self.device, dtype=torch.long)
+        emb, _ = self.resampler(vis_feats, t)  # (1, 8, 1024)
+        return emb
+
+    def set_hidden_states(self, emb: torch.Tensor, scale: float = 0.6):
+        """Broadcast *emb* to all processors before a pipe() call."""
+        for p in self._processors:
+            p.ip_hidden_states = emb
+            p.scale = scale
+
+    def clear_hidden_states(self):
+        """Remove face embeddings after pipe() returns."""
+        for p in self._processors:
+            p.ip_hidden_states = None
requirements.txt CHANGED
@@ -15,3 +15,4 @@ sageattention
 torchvision
 insightface
 onnxruntime
+einops