Spaces:

dagloop5
/

Testing2

Running on Zero

App Files Files Community

dagloop5 committed 10 days ago

Commit

61ae37f

verified ·

1 Parent(s): 844ace6

Update app.py

Browse files

Files changed (1) hide show

app.py +213 -225

app.py CHANGED Viewed

@@ -51,25 +51,29 @@ from safetensors import safe_open
 import json
 import requests
-from ltx_core.components.diffusion_steps import EulerDiffusionStep
 from ltx_core.components.noisers import GaussianNoiser
-from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
-from ltx_core.model.upsampler import upsample_video
-from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number, decode_video as vae_decode_video
-from ltx_core.quantization import QuantizationPolicy
-from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
-from ltx_pipelines.distilled import DistilledPipeline
-from ltx_pipelines.utils import euler_denoising_loop
-from ltx_pipelines.utils.args import ImageConditioningInput
-from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
 from ltx_pipelines.utils.helpers import (
-    cleanup_memory,
     combined_image_conditionings,
-    denoise_video_only,
-    encode_prompts,
-    simple_denoising_func,
 )
-from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
 from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
@@ -101,167 +105,169 @@ RESOLUTIONS = {
 }
-class LTX23DistilledA2VPipeline(DistilledPipeline):
-    """DistilledPipeline with optional audio conditioning."""
     def __call__(
         self,
         prompt: str,
         seed: int,
         height: int,
         width: int,
         num_frames: int,
         frame_rate: float,
         images: list[ImageConditioningInput],
-        audio_path: str | None = None,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
-    ):
-        # Standard path when no audio input is provided.
-        print(prompt)
-        if audio_path is None:
-            return super().__call__(
-                prompt=prompt,
-                seed=seed,
-                height=height,
-                width=width,
-                num_frames=num_frames,
-                frame_rate=frame_rate,
-                images=images,
-                tiling_config=tiling_config,
-                enhance_prompt=enhance_prompt,
-            )
         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
-        stepper = EulerDiffusionStep()
         dtype = torch.bfloat16
-        (ctx_p,) = encode_prompts(
-            [prompt],
-            self.model_ledger,
             enhance_first_prompt=enhance_prompt,
-            enhance_prompt_image=images[0].path if len(images) > 0 else None,
         )
-        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
-        video_duration = num_frames / frame_rate
-        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
-        if decoded_audio is None:
-            raise ValueError(f"Could not extract audio stream from {audio_path}")
-        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
-        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
-        expected_frames = audio_shape.frames
-        actual_frames = encoded_audio_latent.shape[2]
-        if actual_frames > expected_frames:
-            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
-        elif actual_frames < expected_frames:
-            pad = torch.zeros(
-                encoded_audio_latent.shape[0],
-                encoded_audio_latent.shape[1],
-                expected_frames - actual_frames,
-                encoded_audio_latent.shape[3],
-                device=encoded_audio_latent.device,
-                dtype=encoded_audio_latent.dtype,
-            )
-            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
-        video_encoder = self.model_ledger.video_encoder()
-        transformer = self.model_ledger.transformer()
-        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
-        def denoising_loop(sigmas, video_state, audio_state, stepper):
-            return euler_denoising_loop(
-                sigmas=sigmas,
-                video_state=video_state,
-                audio_state=audio_state,
-                stepper=stepper,
-                denoise_fn=simple_denoising_func(
-                    video_context=video_context,
-                    audio_context=audio_context,
-                    transformer=transformer,
-                ),
-            )
         stage_1_output_shape = VideoPixelShape(
-            batch=1,
-            frames=num_frames,
-            width=width // 2,
-            height=height // 2,
-            fps=frame_rate,
         )
-        stage_1_conditionings = combined_image_conditionings(
-            images=images,
-            height=stage_1_output_shape.height,
-            width=stage_1_output_shape.width,
-            video_encoder=video_encoder,
-            dtype=dtype,
-            device=self.device,
         )
-        video_state = denoise_video_only(
-            output_shape=stage_1_output_shape,
-            conditionings=stage_1_conditionings,
             noiser=noiser,
-            sigmas=stage_1_sigmas,
             stepper=stepper,
-            denoising_loop_fn=denoising_loop,
-            components=self.pipeline_components,
-            dtype=dtype,
-            device=self.device,
-            initial_audio_latent=encoded_audio_latent,
         )
-        torch.cuda.synchronize()
-        cleanup_memory()
-        upscaled_video_latent = upsample_video(
-            latent=video_state.latent[:1],
-            video_encoder=video_encoder,
-            upsampler=self.model_ledger.spatial_upsampler(),
-        )
-        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
-        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
-        stage_2_conditionings = combined_image_conditionings(
-            images=images,
-            height=stage_2_output_shape.height,
-            width=stage_2_output_shape.width,
-            video_encoder=video_encoder,
-            dtype=dtype,
-            device=self.device,
         )
-        video_state = denoise_video_only(
-            output_shape=stage_2_output_shape,
-            conditionings=stage_2_conditionings,
             noiser=noiser,
-            sigmas=stage_2_sigmas,
             stepper=stepper,
-            denoising_loop_fn=denoising_loop,
-            components=self.pipeline_components,
-            dtype=dtype,
-            device=self.device,
-            noise_scale=stage_2_sigmas[0],
-            initial_video_latent=upscaled_video_latent,
-            initial_audio_latent=encoded_audio_latent,
         )
-        torch.cuda.synchronize()
-        del transformer
-        del video_encoder
-        cleanup_memory()
-        decoded_video = vae_decode_video(
-            video_state.latent,
-            self.model_ledger.video_decoder(),
-            tiling_config,
-            generator,
-        )
-        original_audio = Audio(
-            waveform=decoded_audio.waveform.squeeze(0),
-            sampling_rate=decoded_audio.sampling_rate,
-        )
-        return decoded_video, original_audio
 # Model repos
@@ -276,11 +282,11 @@ print("=" * 80)
 # LoRA cache directory and currently-applied key
 LORA_CACHE_DIR = Path("lora_cache")
 LORA_CACHE_DIR.mkdir(exist_ok=True)
-current_lora_key: str | None = None
 PENDING_LORA_KEY: str | None = None
-PENDING_LORA_STATE: dict[str, torch.Tensor] | None = None
-PENDING_LORA_STATUS: str = "No LoRA state prepared yet."
 weights_dir = Path("weights")
 weights_dir.mkdir(exist_ok=True)
@@ -376,29 +382,19 @@ def prepare_lora_cache(
     progress=gr.Progress(track_tqdm=True),
 ):
     """
-    CPU-only step:
-    - checks cache
-    - loads cached fused transformer state_dict, or
-    - builds fused transformer on CPU and saves it
-    The resulting state_dict is stored in memory and can be applied later.
     """
-    global PENDING_LORA_KEY, PENDING_LORA_STATE, PENDING_LORA_STATUS
-    ledger = pipeline.model_ledger
-    key, _ = _make_lora_key(pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength)
-    cache_path = LORA_CACHE_DIR / f"{key}.safetensors"
-    progress(0.05, desc="Preparing LoRA state")
-    if cache_path.exists():
-        try:
-            progress(0.20, desc="Loading cached fused state")
-            state = load_file(str(cache_path))
-            PENDING_LORA_KEY = key
-            PENDING_LORA_STATE = state
-            PENDING_LORA_STATUS = f"Loaded cached LoRA state: {cache_path.name}"
-            return PENDING_LORA_STATUS
-        except Exception as e:
-            print(f"[LoRA] Cache load failed: {type(e).__name__}: {e}")
     entries = [
         (pose_lora_path, round(float(pose_strength), 2)),
@@ -414,6 +410,7 @@ def prepare_lora_cache(
         (realism_lora_path, round(float(realism_strength), 2)),
         (transition_lora_path, round(float(transition_strength), 2)),
     ]
     loras_for_builder = [
         LoraPathStrengthAndSDOps(path, strength, LTXV_LORA_COMFY_RENAMING_MAP)
         for path, strength in entries
@@ -422,35 +419,31 @@ def prepare_lora_cache(
     if not loras_for_builder:
         PENDING_LORA_KEY = None
-        PENDING_LORA_STATE = None
         PENDING_LORA_STATUS = "No non-zero LoRA strengths selected; nothing to prepare."
         return PENDING_LORA_STATUS
-    tmp_ledger = None
-    new_transformer_cpu = None
     try:
-        progress(0.35, desc="Building fused CPU transformer")
-        tmp_ledger = pipeline.model_ledger.__class__(
-            dtype=ledger.dtype,
-            device=torch.device("cpu"),
-            checkpoint_path=str(checkpoint_path),
-            spatial_upsampler_path=str(spatial_upsampler_path),
-            gemma_root_path=str(gemma_root),
-            loras=tuple(loras_for_builder),
-            quantization=getattr(ledger, "quantization", None),
-        )
-        new_transformer_cpu = tmp_ledger.transformer()
-        progress(0.70, desc="Extracting fused state_dict")
-        state = {
-            k: v.detach().cpu().contiguous()
-            for k, v in new_transformer_cpu.state_dict().items()
-        }
-        save_file(state, str(cache_path))
         PENDING_LORA_KEY = key
-        PENDING_LORA_STATE = state
-        PENDING_LORA_STATUS = f"Built and cached LoRA state: {cache_path.name}"
         return PENDING_LORA_STATUS
     except Exception as e:
@@ -458,45 +451,32 @@ def prepare_lora_cache(
         print(f"[LoRA] Prepare failed: {type(e).__name__}: {e}")
         print(traceback.format_exc())
         PENDING_LORA_KEY = None
-        PENDING_LORA_STATE = None
         PENDING_LORA_STATUS = f"LoRA prepare failed: {type(e).__name__}: {e}"
         return PENDING_LORA_STATUS
-    finally:
-        try:
-            del new_transformer_cpu
-        except Exception:
-            pass
-        try:
-            del tmp_ledger
-        except Exception:
-            pass
-        gc.collect()
-def apply_prepared_lora_state_to_pipeline():
-    """
-    Fast step: copy the already prepared CPU state into the live transformer.
-    This is the only part that should remain near generation time.
-    """
-    global current_lora_key, PENDING_LORA_KEY, PENDING_LORA_STATE
-    if PENDING_LORA_STATE is None or PENDING_LORA_KEY is None:
-        print("[LoRA] No prepared LoRA state available; skipping.")
         return False
     if current_lora_key == PENDING_LORA_KEY:
-        print("[LoRA] Prepared LoRA state already active; skipping.")
         return True
-    existing_transformer = _transformer
-    with torch.no_grad():
-        missing, unexpected = existing_transformer.load_state_dict(PENDING_LORA_STATE, strict=False)
-        if missing or unexpected:
-            print(f"[LoRA] load_state_dict mismatch: missing={len(missing)}, unexpected={len(unexpected)}")
     current_lora_key = PENDING_LORA_KEY
-    print("[LoRA] Prepared LoRA state applied to the pipeline.")
     return True
 # ---- REPLACE PRELOAD BLOCK START ----
@@ -588,8 +568,8 @@ def on_highres_toggle(first_image, last_image, high_res):
 def get_gpu_duration(
     first_image,
     last_image,
-    input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
     enhance_prompt: bool = True,
@@ -618,8 +598,8 @@ def get_gpu_duration(
 def generate_video(
     first_image,
     last_image,
-    input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
     enhance_prompt: bool = True,
@@ -682,15 +662,18 @@ def generate_video(
         video, audio = pipeline(
             prompt=prompt,
             seed=current_seed,
             height=int(height),
             width=int(width),
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
-            audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
         log_memory("after pipeline call")
@@ -723,7 +706,6 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             with gr.Row():
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
-            input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
@@ -731,6 +713,12 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
                 lines=3,
                 placeholder="Describe the motion and animation you want...",
             )
             duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, value=10.0, step=0.1)
@@ -817,13 +805,13 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             [
                 None,
                 "pinkknit.jpg",
-                None,
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
                 "has a strong fisheye effect, creating a circular frame around them. They "
                 "crowd together closely, forming a symmetrical cluster while staring "
                 "directly into the lens.",
                 3.0,
                 80.0,
                 False,
@@ -846,7 +834,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             ],
         ],
         inputs=[
-            first_image, last_image, input_audio, prompt, duration, gpu_duration,
             enhance_prompt, seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength,
         ],
@@ -879,7 +867,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
     generate_btn.click(
         fn=generate_video,
         inputs=[
-            first_image, last_image, input_audio, prompt, duration, gpu_duration, enhance_prompt,
             seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength,
         ],

 import json
 import requests
+from ltx_core.components.diffusion_steps import Res2sDiffusionStep
+from ltx_core.components.guiders import MultiModalGuider, MultiModalGuiderParams
 from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
+from ltx_core.types import Audio, VideoLatentShape, VideoPixelShape
+from ltx_pipelines.utils.args import ImageConditioningInput, hq_2_stage_arg_parser
+from ltx_pipelines.utils.blocks import (
+    AudioDecoder,
+    DiffusionStage,
+    ImageConditioner,
+    PromptEncoder,
+    VideoDecoder,
+    VideoUpsampler,
+)
+from ltx_pipelines.utils.constants import LTX_2_3_HQ_PARAMS, STAGE_2_DISTILLED_SIGMAS
+from ltx_pipelines.utils.denoisers import GuidedDenoiser, SimpleDenoiser
 from ltx_pipelines.utils.helpers import (
+    assert_resolution,
     combined_image_conditionings,
+    get_device,
 )
+from ltx_pipelines.utils.media_io import encode_video
+from ltx_pipelines.utils.samplers import res2s_audio_video_denoising_loop
 from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
 from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
 }
class LTX23NegativePromptTwoStagePipeline:
    """Two-stage audio/video generation pipeline with negative-prompt guidance.

    Stage 1 denoises at half the requested width/height with a
    ``GuidedDenoiser`` that receives both the positive and the negative prompt
    contexts. The resulting video latent is upsampled and refined in stage 2 at
    the full resolution with a ``SimpleDenoiser`` (positive context only).
    Finally the video and audio latents are decoded.

    NOTE(review): ``LTX2Scheduler``, ``ModalitySpec``, ``QuantizationPolicy``,
    ``Registry`` and ``Iterator`` are referenced by this class but are not
    imported in the visible part of the file -- confirm they are in scope.
    """

    def __init__(
        self,
        checkpoint_path: str,
        spatial_upsampler_path: str,
        gemma_root: str,
        loras: tuple[LoraPathStrengthAndSDOps, ...],
        device: torch.device | None = None,
        quantization: QuantizationPolicy | None = None,
        registry: Registry | None = None,
        torch_compile: bool = False,
    ):
        """Build every pipeline component from the given checkpoints.

        Args:
            checkpoint_path: Checkpoint handed to every component.
            spatial_upsampler_path: Checkpoint handed to the video upsampler.
            gemma_root: Root path handed to the prompt encoder.
            loras: LoRA descriptors applied to both diffusion stages.
            device: Target device; ``get_device()`` is used when omitted.
            quantization: Optional quantization policy for both diffusion stages.
            registry: Optional registry forwarded to every component.
            torch_compile: Forwarded to both diffusion stages.
        """
        self.device = device or get_device()
        self.dtype = torch.bfloat16
        self._scheduler = LTX2Scheduler()

        self.prompt_encoder = PromptEncoder(checkpoint_path, gemma_root, self.dtype, self.device, registry=registry)
        self.image_conditioner = ImageConditioner(checkpoint_path, self.dtype, self.device, registry=registry)
        self.upsampler = VideoUpsampler(checkpoint_path, spatial_upsampler_path, self.dtype, self.device, registry=registry)
        self.video_decoder = VideoDecoder(checkpoint_path, self.dtype, self.device, registry=registry)
        self.audio_decoder = AudioDecoder(checkpoint_path, self.dtype, self.device, registry=registry)

        # Both stages are configured identically (same checkpoint and LoRAs).
        self.stage_1 = DiffusionStage(
            checkpoint_path,
            self.dtype,
            self.device,
            loras=tuple(loras),
            quantization=quantization,
            registry=registry,
            torch_compile=torch_compile,
        )
        self.stage_2 = DiffusionStage(
            checkpoint_path,
            self.dtype,
            self.device,
            loras=tuple(loras),
            quantization=quantization,
            registry=registry,
            torch_compile=torch_compile,
        )

    def _image_conditionings(self, images: list[ImageConditioningInput], height: int, width: int):
        """Encode the conditioning images for a stage running at ``height`` x ``width``."""
        return self.image_conditioner(
            lambda enc: combined_image_conditionings(
                images=images,
                height=height,
                width=width,
                video_encoder=enc,
                dtype=self.dtype,
                device=self.device,
            )
        )

    def __call__(
        self,
        prompt: str,
        negative_prompt: str,
        seed: int,
        height: int,
        width: int,
        num_frames: int,
        frame_rate: float,
        images: list[ImageConditioningInput],
        tiling_config: TilingConfig | None = None,
        enhance_prompt: bool = False,
        streaming_prefetch_count: int | None = None,
        max_batch_size: int = 1,
        stage_1_sigmas: torch.Tensor | None = None,
        stage_2_sigmas: torch.Tensor = STAGE_2_DISTILLED_SIGMAS,
        video_guider_params: MultiModalGuiderParams | None = None,
        audio_guider_params: MultiModalGuiderParams | None = None,
        num_inference_steps: int | None = None,
    ) -> tuple[Iterator[torch.Tensor], Audio]:
        """Generate video and audio for ``prompt`` while steering away from ``negative_prompt``.

        Args:
            prompt: Positive text prompt.
            negative_prompt: Text describing what the guidance should avoid.
            seed: Seed for the noise generator and for prompt enhancement.
            height: Final output height in pixels (stage 1 runs at ``height // 2``).
            width: Final output width in pixels (stage 1 runs at ``width // 2``).
            num_frames: Number of frames to generate.
            frame_rate: Frames per second of the generated video.
            images: Image conditioning inputs; the first element of the first
                entry is passed to prompt enhancement.
            tiling_config: Optional tiling configuration for video decoding.
            enhance_prompt: Whether the first (positive) prompt is enhanced.
            streaming_prefetch_count: Forwarded to the prompt encoder and both stages.
            max_batch_size: Forwarded to stage 1.
            stage_1_sigmas: Explicit stage-1 sigma schedule; when omitted the
                schedule is produced by the scheduler.
            stage_2_sigmas: Stage-2 sigma schedule.
            video_guider_params: Guidance parameters for the video modality.
            audio_guider_params: Guidance parameters for the audio modality.
            num_inference_steps: Number of scheduler steps used only when
                ``stage_1_sigmas`` is omitted; defaults to the HQ preset.

        Returns:
            ``(decoded_video, decoded_audio)`` as produced by the video and
            audio decoders.
        """
        assert_resolution(height=height, width=width, is_two_stage=True)

        generator = torch.Generator(device=self.device).manual_seed(seed)
        noiser = GaussianNoiser(generator=generator)

        ctx_p, ctx_n = self.prompt_encoder(
            [prompt, negative_prompt],
            enhance_first_prompt=enhance_prompt,
            enhance_prompt_image=images[0][0] if len(images) > 0 else None,
            enhance_prompt_seed=seed,
            streaming_prefetch_count=streaming_prefetch_count,
        )
        v_context_p, a_context_p = ctx_p.video_encoding, ctx_p.audio_encoding
        v_context_n, a_context_n = ctx_n.video_encoding, ctx_n.audio_encoding

        # ---- Stage 1: guided denoising at half resolution ----
        stage_1_output_shape = VideoPixelShape(
            batch=1, frames=num_frames, width=width // 2, height=height // 2, fps=frame_rate
        )
        stage_1_conditionings = self._image_conditionings(
            images, stage_1_output_shape.height, stage_1_output_shape.width
        )

        stepper = Res2sDiffusionStep()
        if stage_1_sigmas is None:
            if num_inference_steps is None:
                # The step count used to be read from an undefined name, which
                # raised NameError on this path. Fall back to the HQ preset.
                # NOTE(review): assumes LTX_2_3_HQ_PARAMS exposes
                # `num_inference_steps` -- confirm against ltx_pipelines.
                num_inference_steps = LTX_2_3_HQ_PARAMS.num_inference_steps
            empty_latent = torch.empty(VideoLatentShape.from_pixel_shape(stage_1_output_shape).to_torch_shape())
            stage_1_sigmas = self._scheduler.execute(latent=empty_latent, steps=num_inference_steps)
        sigmas = stage_1_sigmas.to(dtype=torch.float32, device=self.device)

        video_state, audio_state = self.stage_1(
            denoiser=GuidedDenoiser(
                v_context=v_context_p,
                a_context=a_context_p,
                video_guider=MultiModalGuider(
                    params=video_guider_params,
                    negative_context=v_context_n,
                ),
                audio_guider=MultiModalGuider(
                    params=audio_guider_params,
                    negative_context=a_context_n,
                ),
            ),
            sigmas=sigmas,
            noiser=noiser,
            stepper=stepper,
            width=stage_1_output_shape.width,
            height=stage_1_output_shape.height,
            frames=num_frames,
            fps=frame_rate,
            video=ModalitySpec(context=v_context_p, conditionings=stage_1_conditionings),
            audio=ModalitySpec(context=a_context_p),
            loop=res2s_audio_video_denoising_loop,
            streaming_prefetch_count=streaming_prefetch_count,
            max_batch_size=max_batch_size,
        )

        # ---- Stage 2: upsample, then refine at full resolution ----
        upscaled_video_latent = self.upsampler(video_state.latent[:1])
        stage_2_conditionings = self._image_conditionings(images, height, width)

        # Both modalities are re-noised to the first stage-2 sigma before refinement.
        stage_2_noise_scale = stage_2_sigmas[0].item()
        video_state, audio_state = self.stage_2(
            denoiser=SimpleDenoiser(v_context=v_context_p, a_context=a_context_p),
            sigmas=stage_2_sigmas.to(dtype=torch.float32, device=self.device),
            noiser=noiser,
            stepper=stepper,
            width=width,
            height=height,
            frames=num_frames,
            fps=frame_rate,
            video=ModalitySpec(
                context=v_context_p,
                conditionings=stage_2_conditionings,
                noise_scale=stage_2_noise_scale,
                initial_latent=upscaled_video_latent,
            ),
            audio=ModalitySpec(
                context=a_context_p,
                noise_scale=stage_2_noise_scale,
                initial_latent=audio_state.latent,
            ),
            loop=res2s_audio_video_denoising_loop,
            streaming_prefetch_count=streaming_prefetch_count,
        )

        decoded_video = self.video_decoder(video_state.latent, tiling_config, generator)
        decoded_audio = self.audio_decoder(audio_state.latent)
        return decoded_video, decoded_audio
 # Model repos
 # LoRA cache directory and currently-applied key
 LORA_CACHE_DIR = Path("lora_cache")
 LORA_CACHE_DIR.mkdir(exist_ok=True)
+current_lora_key: str | None = None
 PENDING_LORA_KEY: str | None = None
+PENDING_LORA_LORAS: tuple[LoraPathStrengthAndSDOps, ...] | None = None
+PENDING_LORA_STATUS: str = "No LoRA config prepared yet."
 weights_dir = Path("weights")
 weights_dir.mkdir(exist_ok=True)
     progress=gr.Progress(track_tqdm=True),
 ):
     """
+    Prepare the LoRA selection for the guided pipeline.
+    This caches the LoRA config, not fused weights.
     """
+    global PENDING_LORA_KEY, PENDING_LORA_LORAS, PENDING_LORA_STATUS
+    key = _make_lora_key(
+        pose_strength, general_strength, motion_strength, dreamlay_strength,
+        mself_strength, dramatic_strength, fluid_strength, liquid_strength,
+        demopose_strength, voice_strength, realism_strength, transition_strength
+    )
+    cache_path = LORA_CACHE_DIR / f"{key}.json"
+    progress(0.05, desc="Preparing LoRA config")
     entries = [
         (pose_lora_path, round(float(pose_strength), 2)),
         (realism_lora_path, round(float(realism_strength), 2)),
         (transition_lora_path, round(float(transition_strength), 2)),
     ]
     loras_for_builder = [
         LoraPathStrengthAndSDOps(path, strength, LTXV_LORA_COMFY_RENAMING_MAP)
         for path, strength in entries
     if not loras_for_builder:
         PENDING_LORA_KEY = None
+        PENDING_LORA_LORAS = None
         PENDING_LORA_STATUS = "No non-zero LoRA strengths selected; nothing to prepare."
         return PENDING_LORA_STATUS
     try:
+        if cache_path.exists():
+            progress(0.20, desc="Loading cached LoRA config")
+            data = json.loads(cache_path.read_text())
+            loras_for_builder = [
+                LoraPathStrengthAndSDOps(item["path"], item["strength"], LTXV_LORA_COMFY_RENAMING_MAP)
+                for item in data
+                if float(item["strength"]) != 0.0
+            ]
+        else:
+            progress(0.30, desc="Saving LoRA config cache")
+            cache_path.write_text(
+                json.dumps(
+                    [{"path": path, "strength": strength} for path, strength in entries if float(strength) != 0.0],
+                    indent=2,
+                )
+            )
         PENDING_LORA_KEY = key
+        PENDING_LORA_LORAS = tuple(loras_for_builder)
+        PENDING_LORA_STATUS = f"Prepared LoRA config: {cache_path.name}"
         return PENDING_LORA_STATUS
     except Exception as e:
         print(f"[LoRA] Prepare failed: {type(e).__name__}: {e}")
         print(traceback.format_exc())
         PENDING_LORA_KEY = None
+        PENDING_LORA_LORAS = None
         PENDING_LORA_STATUS = f"LoRA prepare failed: {type(e).__name__}: {e}"
         return PENDING_LORA_STATUS
def apply_prepared_lora_config_to_pipeline():
    """Swap the global pipeline for one built with the pending LoRA selection.

    Returns:
        ``False`` when no LoRA config has been prepared. ``True`` when the
        pending config is already the active one, or after the pipeline has
        been rebuilt with it.
    """
    global current_lora_key, pipeline

    nothing_prepared = PENDING_LORA_KEY is None or PENDING_LORA_LORAS is None
    if nothing_prepared:
        print("[LoRA] No prepared LoRA config available; skipping.")
        return False

    already_active = PENDING_LORA_KEY == current_lora_key
    if already_active:
        print("[LoRA] Prepared LoRA config already active; skipping.")
        return True

    # LoRAs are applied by constructing a fresh pipeline around them.
    build_kwargs = {
        "checkpoint_path": str(checkpoint_path),
        "spatial_upsampler_path": str(spatial_upsampler_path),
        "gemma_root": str(gemma_root),
        "loras": PENDING_LORA_LORAS,
        "quantization": QuantizationPolicy.fp8_cast(),
    }
    pipeline = LTX23NegativePromptTwoStagePipeline(**build_kwargs)

    current_lora_key = PENDING_LORA_KEY
    print("[LoRA] Prepared LoRA config applied by rebuilding the pipeline.")
    return True
 # ---- REPLACE PRELOAD BLOCK START ----
 def get_gpu_duration(
     first_image,
     last_image,
     prompt: str,
+    negative_prompt: str,
     duration: float,
     gpu_duration: float,
     enhance_prompt: bool = True,
 def generate_video(
     first_image,
     last_image,
     prompt: str,
+    negative_prompt: str,
     duration: float,
     gpu_duration: float,
     enhance_prompt: bool = True,
         video, audio = pipeline(
             prompt=prompt,
+            negative_prompt=negative_prompt,
             seed=current_seed,
             height=int(height),
             width=int(width),
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
+            # if your wrapper exposes them:
+            video_guider_params=video_guider_params,
+            audio_guider_params=audio_guider_params,
         )
         log_memory("after pipeline call")
             with gr.Row():
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
                 lines=3,
                 placeholder="Describe the motion and animation you want...",
             )
+            negative_prompt = gr.Textbox(
+                label="Negative Prompt",
+                value="",
+                lines=2,
+                placeholder="Describe what you want to avoid...",
+            )
             duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, value=10.0, step=0.1)
             [
                 None,
                 "pinkknit.jpg",
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
                 "has a strong fisheye effect, creating a circular frame around them. They "
                 "crowd together closely, forming a symmetrical cluster while staring "
                 "directly into the lens.",
+                "",
                 3.0,
                 80.0,
                 False,
             ],
         ],
         inputs=[
+            first_image, last_image, prompt, negative_prompt, duration, gpu_duration,
             enhance_prompt, seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength,
         ],
     generate_btn.click(
         fn=generate_video,
         inputs=[
+            first_image, last_image, prompt, negative_prompt, duration, gpu_duration, enhance_prompt,
             seed, randomize_seed, height, width,
             pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength,
         ],