TestingwithNeg

Running on Zero

App Files Files Community

dagloop5 committed 3 days ago

Commit

7b5d90c

verified ·

1 Parent(s): 0374898

Update app.py

Browse files

Files changed (1) hide show

app.py +221 -120

app.py CHANGED Viewed

@@ -75,6 +75,8 @@ from ltx_pipelines.utils.helpers import (
 from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
 from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
 # Force-patch xformers attention into the LTX attention module.
 from ltx_core.model.transformer import attention as _attn_mod
@@ -339,10 +341,6 @@ LORA_CACHE_DIR = Path("lora_cache")
 LORA_CACHE_DIR.mkdir(exist_ok=True)
 current_lora_key: str | None = None
-PENDING_LORA_KEY: str | None = None
-PENDING_LORA_STATE: dict[str, torch.Tensor] | None = None
-PENDING_LORA_STATUS: str = "No LoRA state prepared yet."
 weights_dir = Path("weights")
 weights_dir.mkdir(exist_ok=True)
 checkpoint_path = hf_hub_download(
@@ -419,7 +417,110 @@ def _make_lora_key(pose_strength: float, general_strength: float, motion_strengt
     return key, key_str
-def prepare_lora_cache(
     pose_strength: float,
     general_strength: float,
     motion_strength: float,
@@ -435,128 +536,130 @@ def prepare_lora_cache(
     progress=gr.Progress(track_tqdm=True),
 ):
     """
-    CPU-only step:
-    - checks cache
-    - loads cached fused transformer state_dict, or
-    - builds fused transformer on CPU and saves it
-    The resulting state_dict is stored in memory and can be applied later.
     """
-    global PENDING_LORA_KEY, PENDING_LORA_STATE, PENDING_LORA_STATUS
-    ledger = pipeline.model_ledger
-    key, _ = _make_lora_key(pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength)
-    cache_path = LORA_CACHE_DIR / f"{key}.safetensors"
-    progress(0.05, desc="Preparing LoRA state")
-    if cache_path.exists():
         try:
-            progress(0.20, desc="Loading cached fused state")
-            state = load_file(str(cache_path))
-            PENDING_LORA_KEY = key
-            PENDING_LORA_STATE = state
-            PENDING_LORA_STATUS = f"Loaded cached LoRA state: {cache_path.name}"
-            return PENDING_LORA_STATUS
         except Exception as e:
-            print(f"[LoRA] Cache load failed: {type(e).__name__}: {e}")
-    entries = [
-        (pose_lora_path, round(float(pose_strength), 2)),
-        (general_lora_path, round(float(general_strength), 2)),
-        (motion_lora_path, round(float(motion_strength), 2)),
-        (dreamlay_lora_path, round(float(dreamlay_strength), 2)),
-        (mself_lora_path, round(float(mself_strength), 2)),
-        (dramatic_lora_path, round(float(dramatic_strength), 2)),
-        (fluid_lora_path, round(float(fluid_strength), 2)),
-        (liquid_lora_path, round(float(liquid_strength), 2)),
-        (demopose_lora_path, round(float(demopose_strength), 2)),
-        (voice_lora_path, round(float(voice_strength), 2)),
-        (realism_lora_path, round(float(realism_strength), 2)),
-        (transition_lora_path, round(float(transition_strength), 2)),
-    ]
-    loras_for_builder = [
-        LoraPathStrengthAndSDOps(path, strength, LTXV_LORA_COMFY_RENAMING_MAP)
-        for path, strength in entries
-        if path is not None and float(strength) != 0.0
-    ]
-    if not loras_for_builder:
-        PENDING_LORA_KEY = None
-        PENDING_LORA_STATE = None
-        PENDING_LORA_STATUS = "No non-zero LoRA strengths selected; nothing to prepare."
-        return PENDING_LORA_STATUS
-    tmp_ledger = None
-    new_transformer_cpu = None
     try:
-        progress(0.35, desc="Building fused CPU transformer")
-        tmp_ledger = pipeline.model_ledger.__class__(
-            dtype=ledger.dtype,
-            device=torch.device("cpu"),
-            checkpoint_path=str(checkpoint_path),
-            spatial_upsampler_path=str(spatial_upsampler_path),
-            gemma_root_path=str(gemma_root),
-            loras=tuple(loras_for_builder),
-            quantization=getattr(ledger, "quantization", None),
-        )
-        new_transformer_cpu = tmp_ledger.transformer()
-        progress(0.70, desc="Extracting fused state_dict")
-        state = {
-            k: v.detach().cpu().contiguous()
-            for k, v in new_transformer_cpu.state_dict().items()
-        }
-        save_file(state, str(cache_path))
-        PENDING_LORA_KEY = key
-        PENDING_LORA_STATE = state
-        PENDING_LORA_STATUS = f"Built and cached LoRA state: {cache_path.name}"
-        return PENDING_LORA_STATUS
     except Exception as e:
         import traceback
-        print(f"[LoRA] Prepare failed: {type(e).__name__}: {e}")
         print(traceback.format_exc())
-        PENDING_LORA_KEY = None
-        PENDING_LORA_STATE = None
-        PENDING_LORA_STATUS = f"LoRA prepare failed: {type(e).__name__}: {e}"
-        return PENDING_LORA_STATUS
-    finally:
-        try:
-            del new_transformer_cpu
-        except Exception:
-            pass
         try:
-            del tmp_ledger
         except Exception:
             pass
-        gc.collect()
-def apply_prepared_lora_state_to_pipeline():
-    """
-    Fast step: copy the already prepared CPU state into the live transformer.
-    This is the only part that should remain near generation time.
-    """
-    global current_lora_key, PENDING_LORA_KEY, PENDING_LORA_STATE
-    if PENDING_LORA_STATE is None or PENDING_LORA_KEY is None:
-        print("[LoRA] No prepared LoRA state available; skipping.")
-        return False
-    if current_lora_key == PENDING_LORA_KEY:
-        print("[LoRA] Prepared LoRA state already active; skipping.")
-        return True
-    existing_transformer = _transformer
-    with torch.no_grad():
-        missing, unexpected = existing_transformer.load_state_dict(PENDING_LORA_STATE, strict=False)
-        if missing or unexpected:
-            print(f"[LoRA] load_state_dict mismatch: missing={len(missing)}, unexpected={len(unexpected)}")
-    current_lora_key = PENDING_LORA_KEY
-    print("[LoRA] Prepared LoRA state applied to the pipeline.")
-    return True
 # Preload all models for ZeroGPU tensor packing.
 print("Preloading all models (including Gemma and audio components)...")
@@ -770,8 +873,6 @@ def generate_video(
         log_memory("before pipeline call")
-        apply_prepared_lora_state_to_pipeline()
         video, audio = pipeline(
             prompt=prompt,
             negative_prompt=negative_prompt,
@@ -956,7 +1057,7 @@ with gr.Blocks(title="LTX-2.3 Distilled with LoRAs, Negative Prompting, and Adva
     high_res.change(fn=on_highres_toggle, inputs=[first_image, last_image, high_res], outputs=[width, height])
     prepare_lora_btn.click(
-        fn=prepare_lora_cache,
         inputs=[pose_strength, general_strength, motion_strength, dreamlay_strength,
                 mself_strength, dramatic_strength, fluid_strength, liquid_strength,
                 demopose_strength, voice_strength, realism_strength, transition_strength],

 from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
 from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
+from ltx_core.loader.module_ops.apply_loras import apply_loras
+from safetensors import safe_open
 # Force-patch xformers attention into the LTX attention module.
 from ltx_core.model.transformer import attention as _attn_mod
 LORA_CACHE_DIR.mkdir(exist_ok=True)
 current_lora_key: str | None = None
 weights_dir = Path("weights")
 weights_dir.mkdir(exist_ok=True)
 checkpoint_path = hf_hub_download(
     return key, key_str
+# =============================================================================
+# LoRA Cache (In-Memory) - Ultra-Fast In-Place Application
+# =============================================================================
+# In-memory caches to avoid redundant disk I/O
+LORA_SD_CACHE: dict[str, StateDict] = {}       # lora_path -> loaded StateDict
+FUSED_CACHE: dict[str, dict] = {}              # cache key -> fused state dict (CPU)
+current_lora_key: str | None = None
+def load_lora_into_cache(lora_path: str) -> StateDict:
+    """
+    Load a LoRA safetensor file into a cached StateDict.
+    Subsequent calls return the cached version instantly.
+    This replaces repeated disk reads with a one-time load + memory cache.
+    """
+    if lora_path in LORA_SD_CACHE:
+        return LORA_SD_CACHE[lora_path]
+    print(f"[LoRA] Loading {os.path.basename(lora_path)} into memory cache...")
+    # Use safe_open for memory-efficient streaming reads of large files
+    tensors = {}
+    with safe_open(lora_path, framework="safetensors") as f:
+        for key in f.keys():
+            tensors[key] = f.get_tensor(key)
+    state_dict = StateDict(
+        sd=tensors,
+        device=torch.device("cpu"),
+        size=sum(t.nbytes for t in tensors.values()),
+        dtype=set(t.dtype for t in tensors.values())
+    )
+    LORA_SD_CACHE[lora_path] = state_dict
+    print(f"[LoRA] Cached {len(tensors)} tensors from {os.path.basename(lora_path)}")
+    return state_dict
+def build_fused_state_dict(
+    base_transformer,
+    lora_configs: list[tuple[str, float]],
+    progress_callback=None
+) -> dict[str, torch.Tensor]:
+    """
+    Fuse multiple LoRAs into a single state dict ready for load_state_dict().
+    Uses LTX's apply_loras function which handles FP8 quantization correctly.
+    Args:
+        base_transformer: The preloaded transformer model
+        lora_configs: List of (lora_path, strength) tuples for non-zero LoRAs
+        progress_callback: Optional callback(step, desc) for progress updates
+    Returns:
+        Dictionary of fused weights ready for load_state_dict()
+    """
+    if not lora_configs:
+        # No LoRAs - return base transformer state dict
+        return {k: v.clone() for k, v in base_transformer.state_dict().items()}
+    if progress_callback:
+        progress_callback(0.1, "Loading LoRA state dicts into memory")
+    # Step 1: Load all LoRA state dicts (uses cache after first load)
+    lora_sd_with_strengths = []
+    for lora_path, strength in lora_configs:
+        sd = load_lora_into_cache(lora_path)
+        lora_sd_with_strengths.append(LoraStateDictWithStrength(sd, float(strength)))
+    if progress_callback:
+        progress_callback(0.3, "Extracting base transformer state dict")
+    # Step 2: Get base transformer state dict (already in memory from preloading!)
+    base_dict = base_transformer.state_dict()
+    base_sd = StateDict(
+        sd={k: v.detach().cpu().contiguous() for k, v in base_dict.items()},
+        device=torch.device("cpu"),
+        size=sum(v.nbytes for v in base_dict.values()),
+        dtype=set(v.dtype for v in base_dict.values())
+    )
+    if progress_callback:
+        progress_callback(0.5, "Fusing LoRAs with base weights (CPU)")
+    # Step 3: Fuse using LTX's apply_loras function
+    # This function handles:
+    # - FP8 quantized weights (_fuse_delta_with_scaled_fp8 / _fuse_delta_with_cast_fp8)
+    # - BFloat16 weights (_fuse_delta_with_bfloat16)
+    # - Proper delta accumulation for multiple LoRAs
+    fused_sd = apply_loras(
+        model_sd=base_sd,
+        lora_sd_and_strengths=lora_sd_with_strengths,
+        dtype=torch.bfloat16
+    )
+    if progress_callback:
+        progress_callback(0.9, "Extracting fused state dict")
+    # Step 4: Return the fused state dict as a plain dict (for load_state_dict)
+    return fused_sd.sd
+def on_prepare_loras_click(
     pose_strength: float,
     general_strength: float,
     motion_strength: float,
     progress=gr.Progress(track_tqdm=True),
 ):
     """
+    Called when user clicks the 'Prepare LoRA Cache' button.
+    This function:
+    1. Checks if LoRA combination is already applied (skip if so)
+    2. Checks in-memory FUSED_CACHE (skip building if cached)
+    3. Loads LoRA files into cache (reuses LORA_SD_CACHE on subsequent calls)
+    4. Builds fused state dict if needed (only new combinations)
+    5. Applies to the preloaded transformer
+    Only runs on button click, NOT on slider change.
     """
+    global current_lora_key, FUSED_CACHE
+    # Compute the cache key for this combination of strengths
+    key, _ = _make_lora_key(
+        pose_strength, general_strength, motion_strength, dreamlay_strength,
+        mself_strength, dramatic_strength, fluid_strength, liquid_strength,
+        demopose_strength, voice_strength, realism_strength, transition_strength
+    )
+    # Already applied with these exact strengths? Nothing to do.
+    if current_lora_key == key:
+        return f"✓ LoRAs already applied with current strengths"
+    progress(0.0, desc="Starting LoRA preparation")
+    # Build the list of active (non-zero) LoRAs
+    active_loras = []
+    lora_entries = [
+        (pose_lora_path, pose_strength, "Anthro Enhancer"),
+        (general_lora_path, general_strength, "Reasoning Enhancer"),
+        (motion_lora_path, motion_strength, "Anthro Posing"),
+        (dreamlay_lora_path, dreamlay_strength, "Dreamlay"),
+        (mself_lora_path, mself_strength, "Mself"),
+        (dramatic_lora_path, dramatic_strength, "Dramatic"),
+        (fluid_lora_path, fluid_strength, "Fluid Helper"),
+        (liquid_lora_path, liquid_strength, "Liquid Helper"),
+        (demopose_lora_path, demopose_strength, "Audio Helper"),
+        (voice_lora_path, voice_strength, "Voice Helper"),
+        (realism_lora_path, realism_strength, "Anthro Realism"),
+        (transition_lora_path, transition_strength, "POV"),
+    ]
+    for path, strength, name in lora_entries:
+        if float(strength) != 0.0:
+            active_loras.append((path, float(strength)))
+            print(f"[LoRA] Active: {name} = {strength}")
+    if not active_loras:
+        # No LoRAs selected - apply base model weights (reset from any previous LoRAs)
+        print("[LoRA] No LoRAs selected, resetting to base model weights")
         try:
+            transformer = ledger.transformer()
+            base_weights = {k: v.cpu() for k, v in transformer.state_dict().items()}
+            transformer.load_state_dict(base_weights, strict=False)
+            if torch.cuda.is_available():
+                transformer = transformer.to("cuda")
+            current_lora_key = key
+            progress(1.0, desc="Done")
+            return "✓ Reset to base model (no LoRAs active)"
         except Exception as e:
+            return f"✗ Reset failed: {e}"
+    # Check in-memory cache for this strength combination
+    if key in FUSED_CACHE:
+        print(f"[LoRA] Using cached fused state for: {key[:16]}...")
+        fused_state = FUSED_CACHE[key]
+        progress(0.85, desc="Using cached fused state")
+    else:
+        # Need to build the fused state dict (the expensive part)
+        print(f"[LoRA] Building new fused state dict for {len(active_loras)} LoRA(s)...")
+        # Progress callback that maps to Gradio's progress tracker
+        def progress_cb(step, desc):
+            progress(0.1 + step * 0.8, desc=desc)
+        transformer = ledger.transformer()
+        fused_state = build_fused_state_dict(transformer, active_loras, progress_cb)
+        # Cache the fused state for future reuse (keyed by strength combination)
+        FUSED_CACHE[key] = fused_state
+        print(f"[LoRA] Cached fused state for: {key[:16]}...")
+    # Apply fused state to transformer
+    progress(0.92, desc="Applying fused weights to transformer")
     try:
+        transformer = ledger.transformer()
+        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Move transformer to CPU for loading (avoids device mismatch)
+        transformer = transformer.to("cpu")
+        torch.cuda.empty_cache()  # Free VRAM from the CPU copy
+        # Load the fused state dict
+        missing, unexpected = transformer.load_state_dict(fused_state, strict=False)
+        if missing:
+            print(f"[LoRA] Warning: {len(missing)} keys not found in fused state")
+        if unexpected:
+            print(f"[LoRA] Warning: {len(unexpected)} unexpected keys in fused state")
+        # Move transformer to target device (GPU for generation)
+        if target_device.type != "cpu":
+            transformer = transformer.to(target_device)
+        current_lora_key = key
+        progress(1.0, desc="Done")
+        return f"✓ Applied {len(active_loras)} LoRA(s) successfully"
     except Exception as e:
         import traceback
+        print(f"[LoRA] Apply failed: {e}")
         print(traceback.format_exc())
+        # Try to restore transformer to GPU on error
         try:
+            transformer = ledger.transformer()
+            if next(transformer.parameters()).device.type == "cpu":
+                if torch.cuda.is_available():
+                    transformer = transformer.to("cuda")
         except Exception:
             pass
+        return f"✗ LoRA application failed: {e}"
 # Preload all models for ZeroGPU tensor packing.
 print("Preloading all models (including Gemma and audio components)...")
         log_memory("before pipeline call")
         video, audio = pipeline(
             prompt=prompt,
             negative_prompt=negative_prompt,
     high_res.change(fn=on_highres_toggle, inputs=[first_image, last_image, high_res], outputs=[width, height])
     prepare_lora_btn.click(
+        fn=on_prepare_loras_click,
         inputs=[pose_strength, general_strength, motion_strength, dreamlay_strength,
                 mself_strength, dramatic_strength, fluid_strength, liquid_strength,
                 demopose_strength, voice_strength, realism_strength, transition_strength],