TestingwithNeg

Running on Zero

App Files Files Community

dagloop5 committed 8 days ago

Commit

bdb37cb

verified ·

1 Parent(s): 5f0bc90

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -24

app.py CHANGED Viewed

@@ -125,6 +125,7 @@ class HQPipelineWithCachedLoRA:
     2. Handles ALL LoRAs via cached state (distilled + 12 custom)
     3. Supports CFG/negative prompts and guidance parameters
     4. Reuses single transformer for both stages
     """
     def __init__(
@@ -140,7 +141,6 @@ class HQPipelineWithCachedLoRA:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.dtype = torch.bfloat16
-        # Create ONE ModelLedger for everything
         print("    Creating ModelLedger (no LoRAs)...")
         self.model_ledger = ModelLedger(
             dtype=self.dtype,
@@ -148,17 +148,15 @@ class HQPipelineWithCachedLoRA:
             checkpoint_path=checkpoint_path,
             gemma_root_path=gemma_root,
             spatial_upsampler_path=spatial_upsampler_path,
-            loras=(),  # NO LoRAs
             quantization=quantization,
         )
-        # Pipeline components
         self.pipeline_components = PipelineComponents(
             dtype=self.dtype,
             device=self.device,
         )
-        # Storage for cached LoRA state
         self._cached_state = None
     def apply_cached_lora_state(self, state_dict):
@@ -175,7 +173,6 @@ class HQPipelineWithCachedLoRA:
         width: int,
         num_frames: int,
         frame_rate: float,
-        num_inference_steps: int,
         video_guider_params: MultiModalGuiderParams,
         audio_guider_params: MultiModalGuiderParams,
         images: list,
@@ -186,7 +183,6 @@ class HQPipelineWithCachedLoRA:
         from ltx_core.tools import VideoLatentShape
         from ltx_core.components.noisers import GaussianNoiser
         from ltx_core.components.diffusion_steps import Res2sDiffusionStep
-        from ltx_core.components.schedulers import LTX2Scheduler
         from ltx_core.types import VideoPixelShape
         from ltx_core.model.upsampler import upsample_video
         from ltx_core.model.video_vae import decode_video as vae_decode_video
@@ -199,12 +195,7 @@ class HQPipelineWithCachedLoRA:
         generator = torch.Generator(device=device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
-        # Apply cached LoRA state if available
-        if self._cached_state is not None:
-            print("[LoRA] Applying cached state to transformer...")
-            transformer = self.model_ledger.transformer()
-            with torch.no_grad():
-                transformer.load_state_dict(self._cached_state, strict=False)
         ctx_p, ctx_n = encode_prompts(
             [prompt, negative_prompt],
@@ -217,7 +208,7 @@ class HQPipelineWithCachedLoRA:
         v_context_p, a_context_p = ctx_p.video_encoding, ctx_p.audio_encoding
         v_context_n, a_context_n = ctx_n.video_encoding, ctx_n.audio_encoding
-        # ===================== STAGE 1 =====================
         stage_1_output_shape = VideoPixelShape(
             batch=1, frames=num_frames,
             width=width // 2, height=height // 2, fps=frame_rate
@@ -238,13 +229,10 @@ class HQPipelineWithCachedLoRA:
         transformer = self.model_ledger.transformer()
-        empty_latent = torch.empty(VideoLatentShape.from_pixel_shape(stage_1_output_shape).to_torch_shape())
         stepper = Res2sDiffusionStep()
-        sigmas = (
-            LTX2Scheduler()
-            .execute(latent=empty_latent, steps=num_inference_steps)
-            .to(dtype=torch.float32, device=device)
-        )
         def first_stage_denoising_loop(sigmas, video_state, audio_state, stepper):
             return res2s_audio_video_denoising_loop(
@@ -265,7 +253,7 @@ class HQPipelineWithCachedLoRA:
             output_shape=stage_1_output_shape,
             conditionings=stage_1_conditionings,
             noiser=noiser,
-            sigmas=sigmas,
             stepper=stepper,
             denoising_loop_fn=first_stage_denoising_loop,
             components=self.pipeline_components,
@@ -298,11 +286,11 @@ class HQPipelineWithCachedLoRA:
         del video_encoder
         cleanup_memory()
-        # ===================== STAGE 2 =====================
         transformer = self.model_ledger.transformer()
         from ltx_pipelines.utils.constants import STAGE_2_DISTILLED_SIGMA_VALUES
-        distilled_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=device)
         def second_stage_denoising_loop(sigmas, video_state, audio_state, stepper):
             return res2s_audio_video_denoising_loop(
@@ -321,13 +309,13 @@ class HQPipelineWithCachedLoRA:
             output_shape=stage_2_output_shape,
             conditionings=stage_2_conditionings,
             noiser=noiser,
-            sigmas=distilled_sigmas,
             stepper=stepper,
             denoising_loop_fn=second_stage_denoising_loop,
             components=self.pipeline_components,
             dtype=dtype,
             device=device,
-            noise_scale=distilled_sigmas[0],
             initial_video_latent=upscaled_video_latent,
             initial_audio_latent=audio_state.latent,
         )

     2. Handles ALL LoRAs via cached state (distilled + 12 custom)
     3. Supports CFG/negative prompts and guidance parameters
     4. Reuses single transformer for both stages
+    5. Uses 8 steps at half resolution + 3 steps at full resolution
     """
     def __init__(
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.dtype = torch.bfloat16
         print("    Creating ModelLedger (no LoRAs)...")
         self.model_ledger = ModelLedger(
             dtype=self.dtype,
             checkpoint_path=checkpoint_path,
             gemma_root_path=gemma_root,
             spatial_upsampler_path=spatial_upsampler_path,
+            loras=(),
             quantization=quantization,
         )
         self.pipeline_components = PipelineComponents(
             dtype=self.dtype,
             device=self.device,
         )
         self._cached_state = None
     def apply_cached_lora_state(self, state_dict):
         width: int,
         num_frames: int,
         frame_rate: float,
         video_guider_params: MultiModalGuiderParams,
         audio_guider_params: MultiModalGuiderParams,
         images: list,
         from ltx_core.tools import VideoLatentShape
         from ltx_core.components.noisers import GaussianNoiser
         from ltx_core.components.diffusion_steps import Res2sDiffusionStep
         from ltx_core.types import VideoPixelShape
         from ltx_core.model.upsampler import upsample_video
         from ltx_core.model.video_vae import decode_video as vae_decode_video
         generator = torch.Generator(device=device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
+        # NO LoRA application here - done in apply_prepared_lora_state_to_pipeline()
         ctx_p, ctx_n = encode_prompts(
             [prompt, negative_prompt],
         v_context_p, a_context_p = ctx_p.video_encoding, ctx_p.audio_encoding
         v_context_n, a_context_n = ctx_n.video_encoding, ctx_n.audio_encoding
+        # ===================== STAGE 1: 8 steps at half resolution =====================
         stage_1_output_shape = VideoPixelShape(
             batch=1, frames=num_frames,
             width=width // 2, height=height // 2, fps=frame_rate
         transformer = self.model_ledger.transformer()
+        # Use DISTILLED_SIGMA_VALUES for 8 steps at half resolution
+        from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES
+        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=device)
         stepper = Res2sDiffusionStep()
         def first_stage_denoising_loop(sigmas, video_state, audio_state, stepper):
             return res2s_audio_video_denoising_loop(
             output_shape=stage_1_output_shape,
             conditionings=stage_1_conditionings,
             noiser=noiser,
+            sigmas=stage_1_sigmas,
             stepper=stepper,
             denoising_loop_fn=first_stage_denoising_loop,
             components=self.pipeline_components,
         del video_encoder
         cleanup_memory()
+        # ===================== STAGE 2: 3 steps at full resolution =====================
         transformer = self.model_ledger.transformer()
         from ltx_pipelines.utils.constants import STAGE_2_DISTILLED_SIGMA_VALUES
+        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=device)
         def second_stage_denoising_loop(sigmas, video_state, audio_state, stepper):
             return res2s_audio_video_denoising_loop(
             output_shape=stage_2_output_shape,
             conditionings=stage_2_conditionings,
             noiser=noiser,
+            sigmas=stage_2_sigmas,
             stepper=stepper,
             denoising_loop_fn=second_stage_denoising_loop,
             components=self.pipeline_components,
             dtype=dtype,
             device=device,
+            noise_scale=stage_2_sigmas[0],
             initial_video_latent=upscaled_video_latent,
             initial_audio_latent=audio_state.latent,
         )