neuralvfx committed · Commit 7f0b483 · 0 parents

Initial commit with large files tracked by LFS
.gitattributes ADDED
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
__pycache__/
*.pyc
README.md ADDED
---
license: apache-2.0
datasets:
- opendiffusionai/laion2b-squareish-1536px
base_model:
- Tongyi-MAI/Z-Image
tags:
- z-image
- controlnet
thumbnail: https://huggingface.co/neuralvfx/Z-Image-SAM-ControlNet/resolve/main/assets/stacked_vertical.png
---

# Z-Image-SAM-ControlNet
![side by side](assets/side_by_side_d.png)
## Fun Facts
- This ControlNet is trained exclusively on segmentation maps generated by [Segment Anything (SAM)](https://aidemos.meta.com/segment-anything/)
- Base model used was [Tongyi-MAI/Z-Image](https://huggingface.co/Tongyi-MAI/Z-Image)
- Trained at 1024x1024 resolution
- Trained on 220K segmented images from [laion2b-squareish-1536px](https://huggingface.co/datasets/opendiffusionai/laion2b-squareish-1536px)
- Trained using this repo: [https://github.com/aigc-apps/VideoX-Fun](https://github.com/aigc-apps/VideoX-Fun)
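
At inference time you supply a SAM-style segmentation map as the control image. A minimal sketch of producing one, assuming the `segment-anything` package and a locally downloaded `sam_vit_h_4b8939.pth` checkpoint (neither ships with this repo); the flat random-color rendering of the masks is an assumption, not the documented training palette:

```python
# Illustrative only: render a SAM segmentation of an input photo as a
# flat-color control image.
import numpy as np
import cv2
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry

sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
mask_generator = SamAutomaticMaskGenerator(sam)

image = cv2.cvtColor(cv2.imread("input.png"), cv2.COLOR_BGR2RGB)
masks = mask_generator.generate(image)  # dicts with a boolean "segmentation" mask each

# Paint the largest regions first so smaller ones stay visible on top.
seg = np.zeros_like(image)
for m in sorted(masks, key=lambda m: m["area"], reverse=True):
    seg[m["segmentation"]] = np.random.randint(0, 256, 3)
cv2.imwrite("control.png", cv2.cvtColor(seg, cv2.COLOR_RGB2BGR))
```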

# Showcases
<table style="width:100%; table-layout:fixed;">
  <tr>
    <td><img src="./assets/resized_kitten_seg.png" ></td>
    <td><img src="./assets/resized_kitten.png" ></td>
  </tr>
  <tr>
    <td><img src="./assets/resized_dread_girl_seg.png" ></td>
    <td><img src="./assets/resized_dread_girl.png" ></td>
  </tr>
  <tr>
    <td><img src="./assets/resized_house_seg.png" ></td>
    <td><img src="./assets/resized_house.png" ></td>
  </tr>
</table>

# ComfyUI Usage
1) Copy the weights from `comfy-ui-patch/z-image-sam-controlnet.safetensors` to `ComfyUI/models/model_patches`
2) Use `ModelPatchLoader` to load the patch
3) Plug `MODEL_PATCH` into `model_patch` on `ZImageFunControlnet`
4) Plug the model, VAE, and image into `ZImageFunControlnet`
5) Plug the output of `ZImageFunControlnet` into the KSampler
![videoXFun Nodes](assets/comfyui.png)

# Hugging Face Usage

## Compatibility
This pipeline targets diffusers 0.37.0:
```bash
pip install -U diffusers==0.37.0
```

## Download
```bash
sudo apt-get install git-lfs
git lfs install

git clone https://huggingface.co/neuralvfx/Z-Image-SAM-ControlNet

cd Z-Image-SAM-ControlNet
```

## Inference
```python
import torch
from diffusers.utils import load_image
from diffusers_local.pipeline_z_image_control_unified import ZImageControlUnifiedPipeline
from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel

transformer = ZImageControlTransformer2DModel.from_pretrained(
    ".",
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    add_control_noise_refiner=True,
)

pipe = ZImageControlUnifiedPipeline.from_pretrained(
    "Tongyi-MAI/Z-Image",
    torch_dtype=torch.bfloat16,
    transformer=transformer,
)

pipe.enable_model_cpu_offload()

image = pipe(
    prompt="some beach wood washed up on the sunny sand, spelling the words z-image, with footprints and waves crashing",
    # Negative prompt in Chinese, roughly: "low resolution, low quality, deformed
    # limbs, deformed fingers, oversaturated, waxy look, featureless faces, overly
    # smooth, AI-generated feel. Chaotic composition. Blurry, distorted text."
    negative_prompt="低分辨率,低画质,肢体畸形,手指畸形,画面过饱和,蜡像感,人脸无细节,过度光滑,画面具有AI感。构图混乱。文字模糊,扭曲。",
    control_image=load_image("assets/z-image.png"),
    height=1024,
    width=1024,
    num_inference_steps=50,
    guidance_scale=4.0,
    controlnet_conditioning_scale=1.0,
    generator=torch.Generator("cuda").manual_seed(22),
).images[0]

image.save("output.png")
```
assets/comfyui.png ADDED

Git LFS Details

  • SHA256: 5cdf98017e5c0cd6e880adefce747f2832f91fe7d0d1af1f358c312cf63bda09
  • Pointer size: 130 Bytes
  • Size of remote file: 69.7 kB
assets/gallery.png ADDED

Git LFS Details

  • SHA256: 249d295ade52970079957ac6d7bf43d5acaf79475b2fdf0d60a2e2e1e562f61b
  • Pointer size: 131 Bytes
  • Size of remote file: 658 kB
assets/girl_icon.png ADDED

Git LFS Details

  • SHA256: 1068f895476ee2375316db163ae9d12d42fb1996d38fb19bf7e2ae8d2e6e98dc
  • Pointer size: 131 Bytes
  • Size of remote file: 460 kB
assets/resized_dread_girl.png ADDED

Git LFS Details

  • SHA256: 4c0fa84baad4f6c2e264053d41aaed3cf4696cd7250a24a7def82dd1ba9af13b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.11 MB
assets/resized_dread_girl_seg.png ADDED

Git LFS Details

  • SHA256: af9c472d3bb2050f0ba2c4e5d849d67a1a4e148c9746e3dad0e72752ca11a02f
  • Pointer size: 131 Bytes
  • Size of remote file: 275 kB
assets/resized_house.png ADDED

Git LFS Details

  • SHA256: 95a999066e9c8264a1b4e2a8fcde497f8f1d1f594abec026d36765505212e0f7
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
assets/resized_house_seg.png ADDED

Git LFS Details

  • SHA256: f841f654fe557240cc8159a7540c2d7caaf4aad8d73d21bdc9b3ad740e055c42
  • Pointer size: 131 Bytes
  • Size of remote file: 217 kB
assets/resized_kitten.png ADDED

Git LFS Details

  • SHA256: 0587c5229256f6be0282bce9a3839c2bfbfd55bbf4ee3b6faf62d0759a390bc6
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
assets/resized_kitten_seg.png ADDED

Git LFS Details

  • SHA256: 2cbcc3608f07af9cd2f76d4ed783b42820a28f289f24c888ebd451d93cb20de3
  • Pointer size: 131 Bytes
  • Size of remote file: 186 kB
assets/side_by_side_d.png ADDED

Git LFS Details

  • SHA256: d3b1a1f0c3d95bfa107a92a32e6cbcd1c33b6ead024b6c2703c985c558bdbf06
  • Pointer size: 132 Bytes
  • Size of remote file: 1.34 MB
assets/stacked_vertical.png ADDED

Git LFS Details

  • SHA256: 834b5452dc8418327c3be3ee32ef3dd93592c9d2d3f0a39f6edf199694da3753
  • Pointer size: 131 Bytes
  • Size of remote file: 863 kB
assets/z-image.png ADDED

Git LFS Details

  • SHA256: 27557ac106417f123b6707a13820aa09c3f0fe664c6fac03d4f354c314d9368d
  • Pointer size: 130 Bytes
  • Size of remote file: 33.7 kB
comfy-ui-patch/z-image-sam-controlnet.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d64755decb4b48ee265e271d9a65b2e5fba0d06bca79a7d382dfc7d7829ee15a
size 6712485600
config.json ADDED
{
  "_class_name": "ZImageControlTransformer2DModel",
  "_diffusers_version": "0.37.0",
  "add_control_noise_refiner": true,
  "add_control_noise_refiner_correctly": true,
  "all_f_patch_size": [
    1
  ],
  "all_patch_size": [
    2
  ],
  "axes_dims": [
    32,
    48,
    48
  ],
  "axes_lens": [
    1536,
    512,
    512
  ],
  "cap_feat_dim": 2560,
  "control_in_dim": 33,
  "control_layers_places": [
    0,
    2,
    4,
    6,
    8,
    10,
    12,
    14,
    16,
    18,
    20,
    22,
    24,
    26,
    28
  ],
  "control_refiner_layers_places": [
    0,
    1
  ],
  "dim": 3840,
  "in_channels": 16,
  "n_heads": 30,
  "n_kv_heads": 30,
  "n_layers": 30,
  "n_refiner_layers": 2,
  "norm_eps": 1e-05,
  "qk_norm": true,
  "rope_theta": 256.0,
  "siglip_feat_dim": null,
  "t_scale": 1000.0
}
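
A detail worth noting in the config above: `control_layers_places` attaches a control block to every other layer (0, 2, ..., 28) of the 30-layer backbone. A quick sanity check of that layout (illustrative, not part of the repo):

```python
# Verify that control blocks cover every even-indexed backbone layer.
import json

with open("config.json") as f:
    cfg = json.load(f)

assert cfg["control_layers_places"] == list(range(0, cfg["n_layers"], 2))
print(f"{len(cfg['control_layers_places'])} control layers over {cfg['n_layers']} backbone layers")
```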
diffusers_local/patch.py ADDED
import importlib
import os
from typing import Optional, Set

import diffusers.loaders.single_file_model as single_file_model
import diffusers.pipelines.pipeline_loading_utils as pipe_loading_utils
import torch
from diffusers.loaders.single_file_utils import (
    convert_animatediff_checkpoint_to_diffusers,
    convert_auraflow_transformer_checkpoint_to_diffusers,
    convert_autoencoder_dc_checkpoint_to_diffusers,
    convert_chroma_transformer_checkpoint_to_diffusers,
    convert_controlnet_checkpoint,
    convert_cosmos_transformer_checkpoint_to_diffusers,
    convert_flux2_transformer_checkpoint_to_diffusers,
    convert_flux_transformer_checkpoint_to_diffusers,
    convert_hidream_transformer_to_diffusers,
    convert_hunyuan_video_transformer_to_diffusers,
    convert_ldm_unet_checkpoint,
    convert_ldm_vae_checkpoint,
    convert_ltx_transformer_checkpoint_to_diffusers,
    convert_ltx_vae_checkpoint_to_diffusers,
    convert_lumina2_to_diffusers,
    convert_mochi_transformer_checkpoint_to_diffusers,
    convert_sana_transformer_to_diffusers,
    convert_sd3_transformer_checkpoint_to_diffusers,
    convert_stable_cascade_unet_single_file_to_diffusers,
    convert_wan_transformer_to_diffusers,
    convert_wan_vae_to_diffusers,
    convert_z_image_transformer_checkpoint_to_diffusers,
    create_controlnet_diffusers_config_from_ldm,
    create_unet_diffusers_config_from_ldm,
    create_vae_diffusers_config_from_ldm,
)
from diffusers.pipelines.pipeline_loading_utils import _unwrap_model
from diffusers.utils import (
    _maybe_remap_transformers_class,
    get_class_from_dynamic_module,
)


try:
    from diffusers.hooks.group_offloading import (
        _GROUP_ID_LAZY_LEAF,
        GroupOffloadingConfig,
        ModelHook,
        ModuleGroup,
        _apply_group_offloading_hook,
        _apply_lazy_group_offloading_hook,
        _find_parent_module_in_module_dict,
        _gather_buffers_with_no_group_offloading_parent,
        _gather_parameters_with_no_group_offloading_parent,
        send_to_device,
    )

except ImportError:
    ModelHook = object
    ModuleGroup = object
    GroupOffloadingConfig = object

    def _apply_group_offloading_hook(*args, **kwargs):
        pass


_MY_GO_LC_SUPPORTED_PYTORCH_LAYERS = (
    torch.nn.Conv1d,
    torch.nn.Conv2d,
    torch.nn.Conv3d,
    torch.nn.ConvTranspose1d,
    torch.nn.ConvTranspose2d,
    torch.nn.ConvTranspose3d,
    torch.nn.Linear,
    torch.nn.Sequential,  # The layer type we want to add
)


class GroupOffloadingHook(ModelHook):
    r"""
    A hook that offloads groups of torch.nn.Module to the CPU for storage and onloads to accelerator device for
    computation. Each group has one "onload leader" module that is responsible for onloading, and an "offload leader"
    module that is responsible for offloading. If prefetching is enabled, the onload leader of the previous module
    group is responsible for onloading the current module group.
    """

    _is_stateful = False

    def __init__(self, group: ModuleGroup, *, config: GroupOffloadingConfig) -> None:
        self.group = group
        self.next_group: Optional[ModuleGroup] = None
        self.config = config

    def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
        if self.group.offload_leader == module:
            self.group.offload_()
        return module

    def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
        # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward
        # method is the onload_leader of the group.
        if self.group.onload_leader is None:
            self.group.onload_leader = module

        if self.group.onload_leader == module:
            # STEP 1: GUARANTEE THE CURRENT GROUP'S STATE
            # This section ensures that the parameters for the *current* module are on the correct device
            # before its forward pass is executed.

            # This block handles modules that are part of the prefetching chain (`onload_self` is False).
            # The original design relied on the previous module to initiate the onload, which proved fragile.
            # Our robust fix makes each module responsible for itself:
            # 1. `self.group.onload_()`: Guarantees the data transfer is initiated, acting as a backup if the
            #    previous module in the chain failed to do so.
            # 2. `self.group.stream.synchronize()`: This is the critical synchronization barrier. It forces the
            #    CPU to wait until the asynchronous transfer to the GPU is complete, preventing device mismatch errors.
            if not self.group.onload_self and self.group.stream is not None:
                self.group.onload_()
                self.group.stream.synchronize()

            # This block handles the first module in an execution chain (`onload_self` is True).
            # It is responsible for loading itself onto the device.
            if self.group.onload_self:
                self.group.onload_()
                # If streams are used, the onload() call above is asynchronous. We MUST synchronize here
                # to ensure the module is ready before its computation starts.
                if self.group.stream is not None:
                    self.group.stream.synchronize()

            # At this point, we are 100% certain that the current group's parameters are on the onload_device.

            # STEP 2: INITIATE PREFETCHING FOR THE NEXT GROUP
            # With the current group secured, we can now look ahead and start the asynchronous data transfer
            # for the next module in the execution chain. This allows the data transfer to overlap with the
            # computation of the current module's forward pass, which is the core benefit of prefetching.
            should_onload_next_group = self.next_group is not None and not self.next_group.onload_self
            if should_onload_next_group:
                self.next_group.onload_()

        # The rest of the function handles moving positional (*args) and keyword (**kwargs)
        # arguments to the correct device.
        args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking)

        exclude_kwargs = self.config.exclude_kwargs or []
        if exclude_kwargs:
            moved_kwargs = send_to_device(
                {k: v for k, v in kwargs.items() if k not in exclude_kwargs},
                self.group.onload_device,
                non_blocking=self.group.non_blocking,
            )
            kwargs.update(moved_kwargs)
        else:
            kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking)

        return args, kwargs

    def post_forward(self, module: torch.nn.Module, output):
        if self.group.offload_leader == module:
            self.group.offload_()
        return output


def _apply_group_offloading_leaf_level_patched(module: torch.nn.Module, config: GroupOffloadingConfig) -> None:
    """
    Fixed version of _apply_group_offloading_leaf_level that supports nn.Sequential.
    """
    modules_with_group_offloading: Set[str] = set()
    for name, submodule in module.named_modules():
        if not isinstance(submodule, _MY_GO_LC_SUPPORTED_PYTORCH_LAYERS):
            continue

        group = ModuleGroup(
            modules=[submodule],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_to_disk_path=config.offload_to_disk_path,
            offload_leader=submodule,
            onload_leader=submodule,
            non_blocking=config.non_blocking,
            stream=config.stream,
            record_stream=config.record_stream,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=name,
        )
        _apply_group_offloading_hook(submodule, group, config=config)
        modules_with_group_offloading.add(name)

    # Parameters and Buffers at all non-leaf levels need to be offloaded/onloaded separately when the forward pass
    # of the module is called
    module_dict = dict(module.named_modules())
    parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading)
    buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading)

    # Find the closest module parent for each parameter and buffer, and attach group hooks
    parent_to_parameters = {}
    for name, param in parameters:
        parent_name = _find_parent_module_in_module_dict(name, module_dict)
        if parent_name in parent_to_parameters:
            parent_to_parameters[parent_name].append(param)
        else:
            parent_to_parameters[parent_name] = [param]

    parent_to_buffers = {}
    for name, buffer in buffers:
        parent_name = _find_parent_module_in_module_dict(name, module_dict)
        if parent_name in parent_to_buffers:
            parent_to_buffers[parent_name].append(buffer)
        else:
            parent_to_buffers[parent_name] = [buffer]

    parent_names = set(parent_to_parameters.keys()) | set(parent_to_buffers.keys())
    for name in parent_names:
        parameters = parent_to_parameters.get(name, [])
        buffers = parent_to_buffers.get(name, [])
        parent_module = module_dict[name]
        group = ModuleGroup(
            modules=[],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_leader=parent_module,
            onload_leader=parent_module,
            offload_to_disk_path=config.offload_to_disk_path,
            parameters=parameters,
            buffers=buffers,
            non_blocking=config.non_blocking,
            stream=config.stream,
            record_stream=config.record_stream,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=name,
        )
        _apply_group_offloading_hook(parent_module, group, config=config)

    if config.stream is not None:
        # When using streams, we need to know the layer execution order for applying prefetching (to overlap data transfer
        # and computation). Since we don't know the order beforehand, we apply a lazy prefetching hook that will find the
        # execution order and apply prefetching in the correct order.
        unmatched_group = ModuleGroup(
            modules=[],
            offload_device=config.offload_device,
            onload_device=config.onload_device,
            offload_to_disk_path=config.offload_to_disk_path,
            offload_leader=module,
            onload_leader=module,
            parameters=None,
            buffers=None,
            non_blocking=False,
            stream=None,
            record_stream=False,
            low_cpu_mem_usage=config.low_cpu_mem_usage,
            onload_self=True,
            group_id=_GROUP_ID_LAZY_LEAF,
        )
        _apply_lazy_group_offloading_hook(module, unmatched_group, config=config)


try:
    import diffusers.hooks.group_offloading as group_offloading_module

    setattr(group_offloading_module, "_apply_group_offloading_leaf_level", _apply_group_offloading_leaf_level_patched)
    setattr(group_offloading_module, "GroupOffloadingHook", GroupOffloadingHook)
except ImportError as e:
    print(f"-> ERROR: Could not import the `diffusers.hooks.group_offloading` module to apply the patch: {e}")

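
# Editor's note (illustrative, not part of this file): once this module is
# imported, the monkeypatch above takes effect, and leaf-level group offloading
# can be enabled through the standard diffusers API, e.g.:
#
#     import diffusers_local.patch  # applies the patch on import
#     transformer.enable_group_offload(
#         onload_device=torch.device("cuda"),
#         offload_device=torch.device("cpu"),
#         offload_type="leaf_level",
#         use_stream=True,  # overlap host-to-device transfers with compute
#     )
#
# `transformer` here stands for the ZImageControlTransformer2DModel loaded in
# the README example, and the `enable_group_offload` call assumes a recent
# diffusers release that provides it.
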
def convert_z_image_control_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
    Z_IMAGE_KEYS_RENAME_DICT = {
        "final_layer.": "all_final_layer.2-1.",
        "x_embedder.": "all_x_embedder.2-1.",
        ".attention.out.bias": ".attention.to_out.0.bias",
        ".attention.k_norm.weight": ".attention.norm_k.weight",
        ".attention.q_norm.weight": ".attention.norm_q.weight",
        ".attention.out.weight": ".attention.to_out.0.weight",
        "control_x_embedder.": "control_all_x_embedder.2-1.",
    }

    def convert_z_image_fused_attention(key: str, state_dict: dict[str, object]) -> None:
        if ".attention.qkv.weight" not in key:
            return

        fused_qkv_weight = state_dict.pop(key)
        to_q_weight, to_k_weight, to_v_weight = torch.chunk(fused_qkv_weight, 3, dim=0)
        new_q_name = key.replace(".attention.qkv.weight", ".attention.to_q.weight")
        new_k_name = key.replace(".attention.qkv.weight", ".attention.to_k.weight")
        new_v_name = key.replace(".attention.qkv.weight", ".attention.to_v.weight")

        state_dict[new_q_name] = to_q_weight
        state_dict[new_k_name] = to_k_weight
        state_dict[new_v_name] = to_v_weight
        return

    TRANSFORMER_SPECIAL_KEYS_REMAP = {
        ".attention.qkv.weight": convert_z_image_fused_attention,
    }

    def update_state_dict(state_dict: dict[str, object], old_key: str, new_key: str) -> None:
        state_dict[new_key] = state_dict.pop(old_key)

    converted_state_dict = {key: checkpoint.pop(key) for key in list(checkpoint.keys())}

    # Handle single file --> diffusers key remapping via the remap dict
    for key in list(converted_state_dict.keys()):
        new_key = key[:]
        for replace_key, rename_key in Z_IMAGE_KEYS_RENAME_DICT.items():
            new_key = new_key.replace(replace_key, rename_key)

        update_state_dict(converted_state_dict, key, new_key)

    # Handle any special logic which can't be expressed by a simple 1:1 remapping with the handlers in
    # special_keys_remap
    for key in list(converted_state_dict.keys()):
        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
            if special_key not in key:
                continue
            handler_fn_inplace(key, converted_state_dict)

    return converted_state_dict

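
# Editor's note (illustrative, not part of this file): the fused-QKV split above
# turns one [3*dim, dim] projection into three [dim, dim] projections:
#
#     fused = torch.randn(3 * 3840, 3840)     # qkv.weight as stored on disk
#     q, k, v = torch.chunk(fused, 3, dim=0)  # each [3840, 3840]
#
# which matches dim=3840 from config.json.
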
SINGLE_FILE_LOADABLE_CLASSES = {
    "StableCascadeUNet": {
        "checkpoint_mapping_fn": convert_stable_cascade_unet_single_file_to_diffusers,
    },
    "UNet2DConditionModel": {
        "checkpoint_mapping_fn": convert_ldm_unet_checkpoint,
        "config_mapping_fn": create_unet_diffusers_config_from_ldm,
        "default_subfolder": "unet",
        "legacy_kwargs": {
            "num_in_channels": "in_channels",  # Legacy kwargs supported by `from_single_file` mapped to new args
        },
    },
    "AutoencoderKL": {
        "checkpoint_mapping_fn": convert_ldm_vae_checkpoint,
        "config_mapping_fn": create_vae_diffusers_config_from_ldm,
        "default_subfolder": "vae",
    },
    "ControlNetModel": {
        "checkpoint_mapping_fn": convert_controlnet_checkpoint,
        "config_mapping_fn": create_controlnet_diffusers_config_from_ldm,
    },
    "SD3Transformer2DModel": {
        "checkpoint_mapping_fn": convert_sd3_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "MotionAdapter": {
        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
    },
    "SparseControlNetModel": {
        "checkpoint_mapping_fn": convert_animatediff_checkpoint_to_diffusers,
    },
    "FluxTransformer2DModel": {
        "checkpoint_mapping_fn": convert_flux_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "ChromaTransformer2DModel": {
        "checkpoint_mapping_fn": convert_chroma_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "LTXVideoTransformer3DModel": {
        "checkpoint_mapping_fn": convert_ltx_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "AutoencoderKLLTXVideo": {
        "checkpoint_mapping_fn": convert_ltx_vae_checkpoint_to_diffusers,
        "default_subfolder": "vae",
    },
    "AutoencoderDC": {"checkpoint_mapping_fn": convert_autoencoder_dc_checkpoint_to_diffusers},
    "MochiTransformer3DModel": {
        "checkpoint_mapping_fn": convert_mochi_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "HunyuanVideoTransformer3DModel": {
        "checkpoint_mapping_fn": convert_hunyuan_video_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "AuraFlowTransformer2DModel": {
        "checkpoint_mapping_fn": convert_auraflow_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "Lumina2Transformer2DModel": {
        "checkpoint_mapping_fn": convert_lumina2_to_diffusers,
        "default_subfolder": "transformer",
    },
    "SanaTransformer2DModel": {
        "checkpoint_mapping_fn": convert_sana_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "WanTransformer3DModel": {
        "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "WanVACETransformer3DModel": {
        "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "AutoencoderKLWan": {
        "checkpoint_mapping_fn": convert_wan_vae_to_diffusers,
        "default_subfolder": "vae",
    },
    "HiDreamImageTransformer2DModel": {
        "checkpoint_mapping_fn": convert_hidream_transformer_to_diffusers,
        "default_subfolder": "transformer",
    },
    "CosmosTransformer3DModel": {
        "checkpoint_mapping_fn": convert_cosmos_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "QwenImageTransformer2DModel": {
        "checkpoint_mapping_fn": lambda x: x,
        "default_subfolder": "transformer",
    },
    "Flux2Transformer2DModel": {
        "checkpoint_mapping_fn": convert_flux2_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "ZImageTransformer2DModel": {
        "checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
    "ZImageControlTransformer2DModel": {
        "checkpoint_mapping_fn": convert_z_image_control_transformer_checkpoint_to_diffusers,
        "default_subfolder": "transformer",
    },
}


def get_class_obj_and_candidates(library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None):
    """Simple helper method to retrieve class object of module as well as potential parent class objects"""
    component_folder = os.path.join(cache_dir, component_name) if component_name and cache_dir else None

    if is_pipeline_module:
        pipeline_module = getattr(pipelines, library_name)

        class_obj = getattr(pipeline_module, class_name)
        class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
    elif component_folder and os.path.isfile(os.path.join(component_folder, library_name + ".py")):
        # load custom component
        class_obj = get_class_from_dynamic_module(component_folder, module_file=library_name + ".py", class_name=class_name)
        class_candidates = dict.fromkeys(importable_classes.keys(), class_obj)
    else:
        # else we just import it from the library.
        library = importlib.import_module(library_name)

        # Handle deprecated Transformers classes
        if library_name == "transformers":
            class_name = _maybe_remap_transformers_class(class_name) or class_name

        try:
            class_obj = getattr(library, class_name)
        except Exception:
            module = importlib.import_module("diffusers_local")
            class_obj = getattr(module, class_name)
        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}

    return class_obj, class_candidates


def _get_single_file_loadable_mapping_class(cls):
    diffusers_module = importlib.import_module("diffusers")
    class_name_str = cls.__name__
    for loadable_class_str in SINGLE_FILE_LOADABLE_CLASSES:
        try:
            loadable_class = getattr(diffusers_module, loadable_class_str)
        except Exception:
            module = importlib.import_module("diffusers_local")
            loadable_class = getattr(module, loadable_class_str)
        if issubclass(cls, loadable_class):
            return loadable_class_str

    return class_name_str


def maybe_raise_or_warn(library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module):
    """Simple helper method to raise or warn in case incorrect module has been passed"""
    if not is_pipeline_module:
        library = importlib.import_module(library_name)

        # Handle deprecated Transformers classes
        if library_name == "transformers":
            class_name = _maybe_remap_transformers_class(class_name) or class_name

        try:
            class_obj = getattr(library, class_name)
        except Exception:
            module = importlib.import_module("diffusers_local")
            class_obj = getattr(module, class_name)

        class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}

        expected_class_obj = None
        for class_name, class_candidate in class_candidates.items():
            if class_candidate is not None and issubclass(class_obj, class_candidate):
                expected_class_obj = class_candidate

        # Dynamo wraps the original model in a private class.
        # I didn't find a public API to get the original class.
        sub_model = passed_class_obj[name]
        unwrapped_sub_model = _unwrap_model(sub_model)
        model_cls = unwrapped_sub_model.__class__

        if not issubclass(model_cls, expected_class_obj):
            raise ValueError(f"{passed_class_obj[name]} is of type: {model_cls}, but should be {expected_class_obj}")
    else:
        print(f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it has the correct type")


pipe_loading_utils.get_class_obj_and_candidates = get_class_obj_and_candidates
pipe_loading_utils.maybe_raise_or_warn = maybe_raise_or_warn
single_file_model.SINGLE_FILE_LOADABLE_CLASSES = SINGLE_FILE_LOADABLE_CLASSES
single_file_model._get_single_file_loadable_mapping_class = _get_single_file_loadable_mapping_class
diffusers_local/pipeline_z_image_control_unified.py ADDED
# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
# Refactored and optimized by DEVAIEXP Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import inspect
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms as T
from diffusers import AutoencoderKL, DiffusionPipeline, FlowMatchEulerDiscreteScheduler
from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
from diffusers.loaders import FromSingleFileMixin, ZImageLoraLoaderMixin
from diffusers.pipelines.z_image.pipeline_output import ZImagePipelineOutput
from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
from PIL import Image, ImageFilter
from transformers import AutoTokenizer, PreTrainedModel

from diffusers_local.z_image_control_transformer_2d import ZImageControlTransformer2DModel


logger = logging.get_logger(__name__)


def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    """
    Calculates the shift value `mu` for the scheduler based on the image sequence length.

    This function implements a linear interpolation to determine the shift value based on the input
    image's sequence length, scaling between a base and a maximum shift value.

    Args:
        image_seq_len (`int`):
            The sequence length of the image latents (height * width).
        base_seq_len (`int`, *optional*, defaults to 256):
            The base sequence length for the shift calculation.
        max_seq_len (`int`, *optional*, defaults to 4096):
            The maximum sequence length for the shift calculation.
        base_shift (`float`, *optional*, defaults to 0.5):
            The shift value corresponding to `base_seq_len`.
        max_shift (`float`, *optional*, defaults to 1.15):
            The shift value corresponding to `max_seq_len`.

    Returns:
        `float`: The calculated shift value `mu`.
    """
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu

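
# Editor's note (illustrative, not part of this file): for the README's
# 1024x1024 example, assuming image_seq_len counts 2x2-patchified latent tokens
# (VAE factor 8, patch size 2), image_seq_len = (1024 // 16) ** 2 = 4096 =
# max_seq_len, so calculate_shift returns mu = max_shift = 1.15.
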
def retrieve_latents(encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"):
    """
    Retrieves latents from a VAE encoder output.

    Args:
        encoder_output (`torch.Tensor`):
            The output of a VAE encoder.
        generator (`torch.Generator`, *optional*):
            A random number generator for sampling from the latent distribution.
        sample_mode (`str`, *optional*, defaults to "sample"):
            The method to retrieve latents. Can be "sample" to sample from the distribution or
            "argmax" to take the mode.

    Returns:
        `torch.Tensor`: The retrieved latents.
    """
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigma schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps

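
# Editor's note (illustrative, not part of this file): retrieve_timesteps can be
# driven by a custom sigma schedule instead of a step count, e.g.:
#
#     from diffusers import FlowMatchEulerDiscreteScheduler
#     sched = FlowMatchEulerDiscreteScheduler()
#     sigmas = [1.0, 0.8, 0.6, 0.4, 0.2]  # a hypothetical 5-step schedule
#     ts, n = retrieve_timesteps(sched, device="cpu", sigmas=sigmas)
#     # n == 5 and ts holds the scheduler's matching timesteps
#
# assuming the installed scheduler's `set_timesteps` accepts `sigmas` (the
# function checks this via `inspect.signature` and raises otherwise).
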
class ZImageControlUnifiedPipeline(DiffusionPipeline, ZImageLoraLoaderMixin, FromSingleFileMixin):
    model_cpu_offload_seq = "text_encoder->vae->transformer"
    _optional_components = []
    _callback_tensor_inputs = ["latents", "prompt_embeds"]

    def __init__(
        self,
        scheduler: FlowMatchEulerDiscreteScheduler,
        vae: AutoencoderKL,
        text_encoder: PreTrainedModel,
        tokenizer: AutoTokenizer,
        transformer: ZImageControlTransformer2DModel,
    ):
        """
        Initializes the ZImageControlUnifiedPipeline.

        Args:
            scheduler (`FlowMatchEulerDiscreteScheduler`):
                A scheduler to be used in combination with `transformer` to denoise the latents.
            vae (`AutoencoderKL`):
                Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
            text_encoder (`PreTrainedModel`):
                A pretrained text encoder model.
            tokenizer (`AutoTokenizer`):
                A tokenizer to prepare text prompts for the `text_encoder`.
            transformer (`ZImageControlTransformer2DModel`):
                The main transformer model for the diffusion process.
        """
        super().__init__()
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            scheduler=scheduler,
            transformer=transformer,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
        self.mask_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True)

    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
        device: Optional[torch.device] = None,
        num_images_per_prompt: int = 1,
        do_classifier_free_guidance: bool = True,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        prompt_embeds: Optional[List[torch.FloatTensor]] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        max_sequence_length: int = 512,
    ):
        """
        Encodes the prompt into text embeddings.

        Args:
            prompt (`Union[str, List[str]]`):
                The prompt or prompts to guide the image generation.
            device (`Optional[torch.device]`):
                The device to move the embeddings to.
            num_images_per_prompt (`int`):
                The number of images to generate per prompt.
            do_classifier_free_guidance (`bool`):
                Whether to generate embeddings for classifier-free guidance.
            negative_prompt (`Optional[Union[str, List[str]]]`):
                The negative prompt or prompts.
            prompt_embeds (`Optional[List[torch.FloatTensor]]`):
                Pre-generated positive prompt embeddings.
            negative_prompt_embeds (`Optional[torch.FloatTensor]`):
                Pre-generated negative prompt embeddings.
            max_sequence_length (`int`):
                The maximum sequence length for tokenization.

        Returns:
            `Tuple[List[torch.Tensor], List[torch.Tensor]]`: A tuple containing the positive and negative prompt embeddings.
        """
        device = device or self._execution_device
        prompt = [prompt] if isinstance(prompt, str) else prompt

        if prompt_embeds is None:
            prompt_embeds = self._encode_prompt(
                prompt=prompt,
                device=device,
                max_sequence_length=max_sequence_length,
            )
        if num_images_per_prompt > 1:
            prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)]

        if do_classifier_free_guidance:
            if negative_prompt_embeds is None:
                if negative_prompt is None:
                    negative_prompt = [""] * len(prompt)
                else:
                    negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
                    assert len(prompt) == len(negative_prompt)
                negative_prompt_embeds = self._encode_prompt(
                    prompt=negative_prompt,
                    device=device,
                    max_sequence_length=max_sequence_length,
                )

            if num_images_per_prompt > 1:
                negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)]

        return prompt_embeds, negative_prompt_embeds

    def _encode_prompt(self, prompt: Union[str, List[str]], device: torch.device, max_sequence_length: int) -> List[torch.Tensor]:
        """
        Internal helper to encode a list of prompts into embeddings, applying chat templates if available.

        Args:
            prompt (`Union[str, List[str]]`):
                A list of strings to be encoded.
            device (`torch.device`):
                The target device for the embeddings.
            max_sequence_length (`int`):
                The maximum length for tokenization.

        Returns:
            `List[torch.Tensor]`: A list of embedding tensors, one for each prompt.
        """
        formatted_prompts = []
        for p in prompt:
            messages = [{"role": "user", "content": p}]
            if hasattr(self.tokenizer, "apply_chat_template"):
                formatted_prompts.append(self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True))
            else:
                formatted_prompts.append(p)

        text_inputs = self.tokenizer(
            formatted_prompts,
            padding="max_length",
            max_length=max_sequence_length,
            truncation=True,
            return_tensors="pt",
        ).to(device)

        prompt_masks = text_inputs.attention_mask.bool()

        with torch.no_grad():
            prompt_embeds_batch = self.text_encoder(input_ids=text_inputs.input_ids, attention_mask=prompt_masks, output_hidden_states=True).hidden_states[-2]

        embeddings_list = []
        for i in range(prompt_embeds_batch.shape[0]):
            embeddings_list.append(prompt_embeds_batch[i][prompt_masks[i]])

        return embeddings_list

    def get_timesteps(self, num_inference_steps, strength, device):
        """
        Calculates the timesteps for the scheduler based on the number of inference steps and strength.
        This is primarily used for image-to-image pipelines.

        Args:
            num_inference_steps (`int`): The total number of diffusion steps.
            strength (`float`): The strength of the denoising process. A value of 1.0 means full denoising.
            device (`torch.device`): The device to place the timesteps on.

        Returns:
            `Tuple[torch.Tensor, int]`: A tuple containing the timesteps and the number of steps to run.
        """
        init_timestep = min(num_inference_steps * strength, num_inference_steps)

        t_start = int(max(num_inference_steps - init_timestep, 0))
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
        if hasattr(self.scheduler, "set_begin_index"):
            self.scheduler.set_begin_index(t_start * self.scheduler.order)

        return timesteps, num_inference_steps - t_start

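    # Editor's note (illustrative, not part of this file): with
    # num_inference_steps=50 and strength=0.6, init_timestep = min(30, 50) = 30
    # and t_start = 50 - 30 = 20, so the img2img run keeps the last 30 of the 50
    # scheduled timesteps (assuming scheduler.order == 1).
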
331
+ def prepare_latents(
332
+ self,
333
+ batch_size: int,
334
+ num_channels_latents: int,
335
+ height: int,
336
+ width: int,
337
+ dtype: torch.dtype,
338
+ device: torch.device,
339
+ generator: torch.Generator,
340
+ image: Optional[PipelineImageInput] = None,
341
+ timestep: Optional[torch.Tensor] = None,
342
+ latents: Optional[torch.Tensor] = None,
343
+ ):
344
+ """
345
+ Prepares the initial latents for the diffusion process.
346
+
347
+ This function handles three cases:
348
+ 1. `latents` are provided: They are returned directly.
349
+ 2. `image` is None (Text-to-Image): Random noise is generated.
350
+ 3. `image` is provided (Image-to-Image): The image is encoded, and noise is added according to the timestep.
351
+
352
+ Args:
353
+ batch_size (`int`): The number of latents to generate.
354
+ num_channels_latents (`int`): The number of channels in the latents.
355
+ height (`int`): The height of the output image in pixels.
356
+ width (`int`): The width of the output image in pixels.
357
+ dtype (`torch.dtype`): The data type for the latents.
358
+ device (`torch.device`): The device to create the latents on.
359
+ generator (`torch.Generator`): A random generator for creating the initial noise.
360
+ image (`Optional[PipelineImageInput]`): An initial image for img2img mode.
361
+ timestep (`Optional[torch.Tensor]`): The starting timestep for adding noise in img2img mode.
362
+ latents (`Optional[torch.Tensor]`): Pre-generated latents.
363
+
364
+ Returns:
365
+ `torch.Tensor`: The prepared latents.
366
+ """
367
+ latent_height = 2 * (int(height) // (self.vae_scale_factor * 2))
368
+ latent_width = 2 * (int(width) // (self.vae_scale_factor * 2))
369
+ shape = (batch_size, num_channels_latents, latent_height, latent_width)
370
+
371
+ if latents is not None:
372
+ return latents.to(device=device, dtype=dtype)
373
+
374
+ if image is None:
375
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
376
+ return latents
377
+
378
+ image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
379
+ with torch.no_grad():
380
+ if image_tensor.shape[1] != num_channels_latents:
381
+ if isinstance(generator, list):
382
+ image_latents = [retrieve_latents(self.vae.encode(image_tensor[i : i + 1]), generator=generator[i]) for i in range(image_tensor.shape[0])]
383
+ image_latents = torch.cat(image_latents, dim=0)
384
+ else:
385
+ image_latents = retrieve_latents(self.vae.encode(image_tensor), generator=generator)
386
+
387
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
388
+ image_latents = image_latents.to(dtype)
389
+
390
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
391
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
392
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
393
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
394
+ raise ValueError(f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts.")
395
+
396
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
397
+ latents = self.scheduler.scale_noise(image_latents, timestep, noise)
398
+
399
+ return latents
400
+
401
+ def _prepare_image_latents(
402
+ self,
403
+ image: PipelineImageInput,
404
+ mask_image: PipelineImageInput,
405
+ width: int,
406
+ height: int,
407
+ batch_size: int,
408
+ num_images_per_prompt: int,
409
+ device: torch.device,
410
+ dtype: torch.dtype,
411
+ do_preprocess: bool = True,
412
+ ) -> torch.Tensor:
413
+ """
414
+ Generic function to encode an image into 5D latents for inpainting context.
415
+
416
+ If `do_preprocess` is True, it processes the image (PIL/np).
417
+ If `do_preprocess` is False, it assumes 'image' is already a ready-to-use tensor.
418
+
419
+ Args:
420
+ image (`PipelineImageInput`): The input image. Can be None to return zeros.
421
+ width (`int`): The target width.
422
+ height (`int`): The target height.
423
+ batch_size (`int`): The prompt batch size.
424
+ num_images_per_prompt (`int`): The number of images per prompt.
425
+ device (`torch.device`): The target device.
426
+ dtype (`torch.dtype`): The target data type.
427
+ do_preprocess (`bool`): Whether to preprocess the image.
428
+
429
+ Returns:
430
+ `torch.Tensor`: A 5D tensor of the encoded image latents.
431
+ """
432
+ if image is None:
433
+ latent_h = height // self.vae_scale_factor
434
+ latent_w = width // self.vae_scale_factor
435
+ shape = (batch_size * num_images_per_prompt, self.transformer.in_channels, 1, latent_h, latent_w)
436
+ return torch.zeros(shape, device=device, dtype=dtype)
437
+
438
+ if do_preprocess:
439
+ image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
440
+ else:
441
+ image_tensor = image.to(device=device, dtype=self.vae.dtype)
442
+
443
+ if mask_image is not None:
444
+ mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
445
+ # Tile para 3 canais (RGB)
446
+ mask_condition = torch.tile(mask_condition, [1, 3, 1, 1])
447
+ # Aplica máscara: mantém apenas áreas escuras (< 0.5)
448
+ image_tensor = image_tensor * (mask_condition < 0.5)
449
+
450
+ with torch.no_grad():
451
+ latents = retrieve_latents(self.vae.encode(image_tensor), sample_mode="argmax")
452
+ latents = (latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
453
+
454
+ effective_batch_size = batch_size * num_images_per_prompt
455
+ if latents.shape[0] != effective_batch_size:
456
+ repeat_by = effective_batch_size // latents.shape[0]
457
+ latents = latents.repeat_interleave(repeat_by, dim=0)
458
+
459
+ return latents.to(dtype=dtype).unsqueeze(2)
460
+
461
+ def _prepare_mask_latents(
462
+ self,
463
+ mask_image: PipelineImageInput,
464
+ width: int,
465
+ height: int,
466
+ batch_size: int,
467
+ num_images_per_prompt: int,
468
+ reference_latents_shape: Tuple,
469
+ device: torch.device,
470
+ dtype: torch.dtype,
471
+ invert_mask: bool = False,
472
+ do_unsqueeze: bool = True,
473
+ ) -> torch.Tensor:
474
+ """
475
+ Processes a MASK using the mask_processor, inverts it, resizes it, and formats it for the control_context.
476
+
477
+ Args:
478
+ mask_image (`PipelineImageInput`): The mask image. Can be None to return zeros.
479
+ width (`int`): The target width.
480
+ height (`int`): The target height.
481
+ batch_size (`int`): The prompt batch size.
482
+ num_images_per_prompt (`int`): The number of images per prompt.
483
+ reference_latents_shape (`Tuple`): The shape of the inpainting latents for resizing.
484
+ device (`torch.device`): The target device.
485
+ dtype (`torch.dtype`): The target data type.
486
+
487
+ Returns:
488
+ `torch.Tensor`: A 5D tensor of the processed mask latents.
489
+ """
490
+ if mask_image is None:
491
+ placeholder_shape = (
492
+ batch_size * num_images_per_prompt,
493
+ 1,
494
+ 1,
495
+ reference_latents_shape[-2],
496
+ reference_latents_shape[-1],
497
+ )
498
+ return torch.zeros(placeholder_shape, device=device, dtype=dtype)
499
+
500
+ mask_tensor = self.mask_processor.preprocess(mask_image, height=height, width=width)
501
+ mask_tensor = mask_tensor.to(device=device, dtype=dtype)
502
+
503
+ if invert_mask:
504
+ mask_tensor = 1.0 - mask_tensor
505
+
506
+ mask_latents = F.interpolate(mask_tensor, size=reference_latents_shape[-2:], mode="nearest")
507
+
508
+ if do_unsqueeze:
509
+ mask_latents = mask_latents.unsqueeze(2)
510
+
511
+ return mask_latents
512
+
513
+ def prepare_control_latents(
514
+ self, image: PipelineImageInput, width: int, height: int, batch_size: int, num_images_per_prompt: int, device: torch.device, dtype: torch.dtype
515
+ ) -> torch.Tensor:
516
+ """
517
+ Preprocesses a control image, ENCODES it with the VAE to latent space,
518
+ and returns a 5D tensor ready for the transformer model.
519
+
520
+ Args:
521
+ image (`PipelineImageInput`): The control image. Can be None to return zeros.
522
+ width (`int`): The target width.
523
+ height (`int`): The target height.
524
+ batch_size (`int`): The prompt batch size.
525
+ num_images_per_prompt (`int`): The number of images per prompt.
526
+ device (`torch.device`): The target device.
527
+ dtype (`torch.dtype`): The target data type.
528
+
529
+ Returns:
530
+ `torch.Tensor`: A 5D tensor of the control image latents.
531
+ """
532
+ if image is None:
533
+ latent_h = 2 * (int(height) // (self.vae_scale_factor * 2))
534
+ latent_w = 2 * (int(width) // (self.vae_scale_factor * 2))
535
+ return torch.zeros(
536
+ (batch_size * num_images_per_prompt, self.transformer.in_channels, 1, latent_h, latent_w),
537
+ device=device,
538
+ dtype=dtype,
539
+ )
540
+
541
+ image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
542
+ with torch.no_grad():
543
+ latents = retrieve_latents(self.vae.encode(image_tensor), sample_mode="argmax")
544
+ latents = (latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
545
+
546
+ effective_batch_size = batch_size * num_images_per_prompt
547
+ if latents.shape[0] < effective_batch_size:
548
+ latents = latents.repeat_interleave(effective_batch_size // latents.shape[0], dim=0)
549
+
550
+ return latents.to(dtype=dtype).unsqueeze(2)
551
+
552
+ def _expand_and_feather_mask(self, mask_image, expand_pixels=10, feather_radius=8, is_inpaint_mode=True):
553
+ """
554
+ Expands the white area of a mask using PyTorch for performance and then smooths its edges with Pillow.
555
+
556
+ Args:
557
+ mask_image (PIL.Image.Image | np.ndarray | torch.Tensor): The input mask.
558
+ expand_pixels (int): How many pixels to expand the white area.
559
+ feather_radius (int): The radius of the Gaussian blur for the gradient.
560
+ is_inpaint_mode (bool): Flag to enable/disable the operation.
561
+
562
+ Returns:
563
+ PIL.Image.Image | np.ndarray | torch.Tensor: The processed mask, in the same format as the input.
564
+ """
565
+ if not is_inpaint_mode or (expand_pixels <= 0 and feather_radius <= 0):
566
+ return mask_image
567
+
568
+ # --- 1. CONVERT TO A PYTORCH TENSOR ---
569
+ input_type = type(mask_image)
570
+
571
+ if isinstance(mask_image, Image.Image):
572
+ # Convert PIL Image to tensor
573
+ mask_tensor = T.ToTensor()(mask_image.convert("L"))
574
+ elif isinstance(mask_image, np.ndarray):
575
+ # Convert NumPy array to tensor
576
+ mask_tensor = torch.from_numpy(mask_image).permute(2, 0, 1) if mask_image.ndim == 3 else torch.from_numpy(mask_image).unsqueeze(0)
577
+ elif isinstance(mask_image, torch.Tensor):
578
+ mask_tensor = mask_image
579
+ else:
580
+ raise TypeError(f"Unsupported mask type: {input_type}")
581
+
582
+ # Ensure the tensor is on the correct device and in (batch, channels, H, W) format
583
+ mask_tensor = mask_tensor.to(device=self.device, dtype=torch.float32)
584
+ if mask_tensor.ndim == 3:
585
+ mask_tensor = mask_tensor.unsqueeze(0) # Add the batch dimension if needed
586
+
587
+ # --- 2. EXPANSION (DILATION) ON THE GPU WITH PYTORCH ---
588
+ if expand_pixels > 0:
589
+ kernel_size = expand_pixels * 2 + 1
590
+ padding = expand_pixels
591
+
592
+ # Max pooling with stride=1 implements dilation for tensors
593
+ mask_tensor = F.max_pool2d(
594
+ mask_tensor,
595
+ kernel_size=kernel_size,
596
+ stride=1,
597
+ padding=padding
598
+ )
599
+
600
+ # --- 3. CONVERT BACK TO A PIL IMAGE ---
601
+ # `ToPILImage` expects a [C, H, W] tensor, so we drop the batch dimension
602
+ to_pil = T.ToPILImage()
603
+ mask_pil = to_pil(mask_tensor.squeeze(0).cpu())
604
+
605
+ # --- 4. FEATHERING (GAUSSIAN BLUR) WITH PILLOW ---
606
+ if feather_radius > 0:
607
+ mask_pil = mask_pil.filter(ImageFilter.GaussianBlur(radius=feather_radius))
608
+
609
+ # --- 5. FINAL CONVERSION BACK TO THE ORIGINAL TYPE ---
610
+ if input_type is torch.Tensor:
611
+ # Convert back to a tensor if the input was a tensor
612
+ return T.ToTensor()(mask_pil).to(device=self.device, dtype=mask_image.dtype)
613
+ elif input_type is np.ndarray:
614
+ # Convert back to a NumPy array if the input was an array
615
+ return np.array(mask_pil)
616
+ else: # input_type is Image.Image
617
+ return mask_pil
618
+
619
+ def _apply_mask_blur(self, mask_image, mask_blur_radius, is_inpaint_mode):
620
+ """
621
+ Apply Gaussian blur to a mask image for inpainting operations.
622
+ Args:
623
+ mask_image (Image.Image | np.ndarray | torch.Tensor): The mask image to be blurred.
624
+ Can be provided as a PIL Image, NumPy array, or PyTorch tensor.
625
+ mask_blur_radius (float): The radius of the Gaussian blur filter in pixels.
626
+ Only applied if is_inpaint_mode is True and mask_blur_radius > 0.
627
+ is_inpaint_mode (bool): Flag indicating whether the pipeline is in inpainting mode.
628
+ Blur is only applied when this is True.
629
+ Returns:
630
+ Image.Image | np.ndarray | torch.Tensor: The mask image with Gaussian blur applied
631
+ if is_inpaint_mode is True and mask_blur_radius > 0. Otherwise, returns the
632
+ original mask_image unchanged. The return type matches the input type.
633
+ """
634
+ mask_to_use = mask_image
635
+ if is_inpaint_mode and mask_blur_radius > 0:
636
+ if isinstance(mask_image, Image.Image):
637
+ mask_pil = mask_image
638
+ elif isinstance(mask_image, np.ndarray):
639
+ mask_pil = Image.fromarray(mask_image)
640
+ elif isinstance(mask_image, torch.Tensor):
641
+ mask_pil = Image.fromarray(mask_image.cpu().numpy().astype(np.uint8))
642
+ else:
643
+ mask_pil = mask_image
644
+
645
+ mask_to_use = mask_pil.filter(ImageFilter.GaussianBlur(radius=mask_blur_radius))
646
+ return mask_to_use
647
+
648
+ @property
649
+ def guidance_scale(self):
650
+ return self._guidance_scale
651
+
652
+ @property
653
+ def do_classifier_free_guidance(self):
654
+ return self._guidance_scale > 1
655
+
656
+ @property
657
+ def joint_attention_kwargs(self):
658
+ return self._joint_attention_kwargs
659
+
660
+ @property
661
+ def num_timesteps(self):
662
+ return self._num_timesteps
663
+
664
+ @property
665
+ def interrupt(self):
666
+ return self._interrupt
667
+
668
+ def __call__(
669
+ self,
670
+ prompt: Union[str, List[str]],
671
+ image: Optional[PipelineImageInput] = None,
672
+ mask_image: Optional[PipelineImageInput] = None,
673
+ inpaint_mode: Literal["default", "diff", "diff+inpaint"] = "default",
674
+ mask_blur_radius: float = 8.0,
675
+ control_image: Optional[PipelineImageInput] = None,
676
+ height: Optional[int] = None,
677
+ width: Optional[int] = None,
678
+ num_inference_steps: int = 20,
679
+ sigmas: Optional[List[float]] = None,
680
+ strength: float = 1.0,
681
+ guidance_scale: float = 4.0,
682
+ cfg_normalization: bool = False,
683
+ cfg_truncation: float = 1.0,
684
+ negative_prompt: Optional[Union[str, List[str]]] = None,
685
+ num_images_per_prompt: int = 1,
686
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
687
+ latents: Optional[torch.Tensor] = None,
688
+ prompt_embeds: Optional[List[torch.FloatTensor]] = None,
689
+ negative_prompt_embeds: Optional[List[torch.FloatTensor]] = None,
690
+ controlnet_conditioning_scale: float = 1.0,
691
+ controlnet_refiner_conditioning_scale: float = 1.0,
692
+ output_type: str = "pil",
693
+ return_dict: bool = True,
694
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
695
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
696
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
697
+ max_sequence_length: int = 512,
698
+ ):
699
+ r"""
700
+ The main entry point for the Z-Image unified pipeline for generation.
701
+
702
+ Args:
703
+ prompt (`str` or `List[str]`, *optional*):
704
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
705
+ image (`PipelineImageInput`, *optional*):
706
+ The initial image for image-to-image or inpainting modes.
707
+ mask_image (`PipelineImageInput`, *optional*):
708
+ The mask image for inpainting. White areas are inpainted, black areas are preserved.
709
+ inpaint_mode (`str`, *optional*, defaults to `"default"`):
710
+ The inpainting mode. Can be "default", "diff", or "diff+inpaint". Determines how the inpainting
711
+ process is handled.
712
+ mask_blur_radius (`float`, *optional*, defaults to 8.0):
713
+ The radius for blurring the edges of the inpainting mask to create a smoother transition.
714
+ control_image (`PipelineImageInput`, *optional*):
715
+ The conditioning image for control modes (e.g., Canny, depth).
716
+ height (`int`, *optional*, defaults to 1024):
717
+ The height in pixels of the generated image.
718
+ width (`int`, *optional*, defaults to 1024):
719
+ The width in pixels of the generated image.
720
+ num_inference_steps (`int`, *optional*, defaults to 20):
721
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
722
+ expense of slower inference.
723
+ sigmas (`List[float]`, *optional*):
724
+ Custom sigmas to use for the denoising process. If not defined, the scheduler's default behavior
725
+ will be used.
726
+ strength (`float`, *optional*, defaults to 1.0):
727
+ Denoising strength for image-to-image. A value of 1.0 means the initial image is fully replaced,
728
+ while a lower value preserves more of the original image structure. Only used in img2img mode.
729
+ guidance_scale (`float`, *optional*, defaults to 4.0):
730
+ The scale for classifier-free guidance. A value > 1 enables it. Higher values encourage images
731
+ closer to the prompt, potentially at the cost of quality.
732
+ cfg_normalization (`bool`, *optional*, defaults to False):
733
+ Whether to apply normalization to the guidance, which can prevent oversaturation.
734
+ cfg_truncation (`float`, *optional*, defaults to 1.0):
735
+ A value between 0.0 and 1.0 that disables CFG for the final portion of the denoising steps,
736
+ specified as a fraction of total steps. For example, 0.8 disables CFG for the last 20% of steps.
737
+ negative_prompt (`str` or `List[str]`, *optional*):
738
+ The prompt or prompts not to guide the image generation.
739
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
740
+ The number of images to generate per prompt.
741
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
742
+ A torch generator to make generation deterministic.
743
+ latents (`torch.FloatTensor`, *optional*):
744
+ Pre-generated noisy latents to be used as inputs for image generation.
745
+ prompt_embeds (`List[torch.FloatTensor]`, *optional*):
746
+ Pre-generated positive text embeddings.
747
+ negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*):
748
+ Pre-generated negative text embeddings.
749
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
750
+ The scale of the control conditioning influence.
751
+ controlnet_refiner_conditioning_scale (`float`, *optional*, defaults to 1.0):
752
+ The scale of the control refiner conditioning influence.
753
+ output_type (`str`, *optional*, defaults to `"pil"`):
754
+ The output format of the generated image. Choose between "pil" (`PIL.Image.Image`), "np" (`np.ndarray`), or "latent".
755
+ return_dict (`bool`, *optional*, defaults to `True`):
756
+ Whether to return a `ZImagePipelineOutput` instead of a plain tuple.
757
+ joint_attention_kwargs (`dict`, *optional*):
758
+ A kwargs dictionary for the `AttentionProcessor`.
759
+ callback_on_step_end (`Callable`, *optional*):
760
+ A function that is called at the end of each denoising step.
761
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
762
+ The list of tensor inputs for the `callback_on_step_end` function.
763
+ max_sequence_length (`int`, *optional*, defaults to 512):
764
+ Maximum sequence length to use with the `prompt`.
765
+
766
+ Examples:
767
+
768
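+         A minimal usage sketch (assumes `pipe` is an already-instantiated copy of this
+         pipeline on an appropriate device; the file path is a placeholder):
+
+         ```py
+         >>> import torch
+         >>> from diffusers.utils import load_image
+
+         >>> control = load_image("control.png")
+         >>> image = pipe(
+         ...     prompt="a photo of a kitten",
+         ...     control_image=control,
+         ...     num_inference_steps=20,
+         ...     guidance_scale=4.0,
+         ...     generator=torch.Generator().manual_seed(0),
+         ... ).images[0]
+         >>> image.save("output.png")
+         ```
+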
+ Returns:
769
+ [`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`:
770
+ If `return_dict` is True, a `ZImagePipelineOutput` is returned, otherwise a `tuple` with the generated images.
771
+ """
772
+ self._guidance_scale = guidance_scale
773
+ self._joint_attention_kwargs = joint_attention_kwargs
774
+ self._interrupt = False
775
+ self._cfg_normalization = cfg_normalization
776
+ self._cfg_truncation = cfg_truncation
777
+ is_two_stage_control_model = self.transformer.control_in_dim > self.transformer.in_channels if hasattr(self.transformer, "control_in_dim") else False
778
+ device = self._execution_device
779
+ dtype = self.transformer.dtype
780
+ vae_scale = self.vae_scale_factor * 2
781
+ has_inpaint_inputs = image is not None and mask_image is not None
782
+ is_inpaint_control_mode = has_inpaint_inputs and inpaint_mode in ["default", "diff+inpaint"]
783
+ is_diff_mode = has_inpaint_inputs and inpaint_mode in ["diff", "diff+inpaint"]
784
+ is_img2img_mode = image is not None and not has_inpaint_inputs
785
+
786
+ ref_image = control_image if control_image is not None else image
787
+ image_height = None
788
+ image_width = None
789
+ if ref_image is not None:
790
+ if isinstance(ref_image, Image.Image):
791
+ image_height, image_width = ref_image.height, ref_image.width
792
+ else:
793
+ image_height, image_width = ref_image.shape[-2], ref_image.shape[-1]
794
+
795
+ height = height or image_height or 1024
796
+ width = width or image_width or 1024
797
+
798
+ if height % vae_scale != 0 or width % vae_scale != 0:
799
+ raise ValueError(f"Height/width must be divisible by {vae_scale}.")
800
+
801
+ batch_size = len(prompt) if isinstance(prompt, list) else 1 if prompt else len(prompt_embeds)
802
+ effective_batch_size = batch_size * num_images_per_prompt
803
+
804
+ if prompt_embeds is not None and prompt is None:
805
+ if self.do_classifier_free_guidance and negative_prompt_embeds is None:
806
+ raise ValueError(
807
+ "When `prompt_embeds` is provided without `prompt`, `negative_prompt_embeds` must also be provided for classifier-free guidance."
808
+ )
809
+ else:
810
+ (
811
+ prompt_embeds,
812
+ negative_prompt_embeds,
813
+ ) = self.encode_prompt(
814
+ prompt=prompt,
815
+ num_images_per_prompt=num_images_per_prompt,
816
+ negative_prompt=negative_prompt,
817
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
818
+ prompt_embeds=prompt_embeds,
819
+ negative_prompt_embeds=negative_prompt_embeds,
820
+ device=device,
821
+ max_sequence_length=max_sequence_length,
822
+ )
823
+
824
+ if self.do_classifier_free_guidance:
825
+ prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds
826
+ else:
827
+ prompt_embeds_model_input = prompt_embeds
828
+
829
+ if control_image is not None or is_inpaint_control_mode:
830
+ control_latents = self.prepare_control_latents(control_image, width, height, batch_size, num_images_per_prompt, device, dtype)
831
+
832
+ if is_two_stage_control_model:
833
+ image_for_inpaint = None if is_diff_mode and not is_inpaint_control_mode else image
834
+ mask_for_inpaint = None if is_diff_mode and not is_inpaint_control_mode else mask_image
835
+
836
+ if is_inpaint_control_mode:
837
+ mask_for_inpaint = self._apply_mask_blur(mask_for_inpaint, mask_blur_radius, True)
838
+
839
+ inpaint_latents = self._prepare_image_latents(
840
+ image_for_inpaint, mask_for_inpaint, width, height, batch_size, num_images_per_prompt, device, dtype
841
+ )
842
+
843
+ mask_latents = self._prepare_mask_latents(
844
+ mask_for_inpaint,
845
+ width,
846
+ height,
847
+ batch_size,
848
+ num_images_per_prompt,
849
+ inpaint_latents.shape,
850
+ device,
851
+ dtype,
852
+ invert_mask=is_inpaint_control_mode,
853
+ do_unsqueeze=True,
854
+ )
855
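+ # Two-stage control: concatenate the control, mask, and inpaint latents along the channel dimension.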
+ control_context = torch.cat([control_latents, mask_latents, inpaint_latents], dim=1)
856
+ else:
857
+ control_context = control_latents
858
+ else:
859
+ control_context = None
860
+
861
+ if self.do_classifier_free_guidance and control_context is not None:
862
+ control_context_model_input = control_context.repeat(2, 1, 1, 1, 1)
863
+ else:
864
+ control_context_model_input = control_context
865
+
866
+ image_seq_len = (height // (self.vae_scale_factor * 2)) * (width // (self.vae_scale_factor * 2))
867
+ mu = calculate_shift(image_seq_len)
868
+ self.scheduler.sigma_min = 0.0
869
+ timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas, mu=mu)
870
+ self._num_timesteps = len(timesteps)
871
+
872
+ if is_img2img_mode:
873
+ strength = min(strength, 1.0)
874
+ else:
875
+ strength = 1.0
876
+
877
+ if strength < 1.0:
878
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
879
+ t_start = max(num_inference_steps - init_timestep, 0)
880
+ timesteps = timesteps[t_start * self.scheduler.order :]
881
+ num_steps_to_run = len(timesteps) // self.scheduler.order
882
+ else:
883
+ num_steps_to_run = num_inference_steps
884
+
885
+ latent_timestep = timesteps[:1].repeat(effective_batch_size) if strength < 1.0 else None
886
+
887
+ use_image_for_latents = is_img2img_mode
888
+
889
+ latents = self.prepare_latents(
890
+ effective_batch_size,
891
+ self.transformer.in_channels,
892
+ height,
893
+ width,
894
+ torch.float32,
895
+ device,
896
+ generator,
897
+ image=image if use_image_for_latents else None,
898
+ timestep=latent_timestep if use_image_for_latents else None,
899
+ latents=latents,
900
+ )
901
+
902
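+ # "diff" mode: precompute the noisy trajectory of the original image at every scheduler
+ # sigma, plus one time-varying mask per step, so masked regions can be re-composited
+ # from the clean image during denoising.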
+ if is_diff_mode:
903
+ original_image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device=device, dtype=self.vae.dtype)
904
+ with torch.no_grad():
905
+ original_clean_latents = retrieve_latents(self.vae.encode(original_image_tensor), sample_mode="argmax")
906
+ original_clean_latents = (original_clean_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
907
+ original_clean_latents = original_clean_latents.to(dtype)
908
+
909
+ noise = randn_tensor(original_clean_latents.shape, generator=generator, device=device, dtype=dtype)
910
+ latents_list = []
911
+ step_indices = [(self.scheduler.timesteps == t).nonzero().item() for t in timesteps]
912
+ for i in step_indices:
913
+ sigma = self.scheduler.sigmas[i]
914
+ noisy_latent = (1.0 - sigma) * original_clean_latents + sigma * noise
915
+ latents_list.append(noisy_latent)
916
+
917
+ original_latents_trajectory = torch.cat(latents_list, dim=0)
918
+ blurred_mask_image = self._apply_mask_blur(mask_image, mask_blur_radius, True)
919
+ map_processed = self._prepare_mask_latents(
920
+ blurred_mask_image,
921
+ width,
922
+ height,
923
+ batch_size,
924
+ num_images_per_prompt,
925
+ latents.shape,
926
+ device,
927
+ dtype,
928
+ invert_mask=True,
929
+ do_unsqueeze=False,
930
+ )
931
+
932
+ thresholds = torch.arange(len(timesteps), device=device, dtype=dtype) / len(timesteps)
933
+ thresholds = thresholds.view(-1, 1, 1, 1)
934
+ time_masks = map_processed > thresholds
935
+
936
+ num_warmup_steps = len(timesteps) - num_steps_to_run * self.scheduler.order
937
+ with torch.inference_mode():
938
+ with self.progress_bar(total=num_steps_to_run) as progress_bar:
939
+ for i, t in enumerate(timesteps):
940
+ if self.interrupt:
941
+ continue
942
+
943
+ if is_diff_mode:
944
+ if i == 0:
945
+ latents = original_latents_trajectory[:1]
946
+ else:
947
+ current_mask = time_masks[i].to(latents.dtype)
948
+ current_original_latent = original_latents_trajectory[i:i+1]
949
+
950
+ if current_mask.ndim == 3:
951
+ current_mask = current_mask.unsqueeze(1)
952
+
953
+ latents = current_original_latent * current_mask + latents * (1 - current_mask)
954
+
955
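+ # Scheduler timesteps run from 1000 down to 0; map them to a normalized time in [0, 1] that increases toward the clean image.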
+ timestep = t.expand(latents.shape[0])
956
+ timestep = (1000 - timestep) / 1000
957
+
958
+ t_norm = timestep[0].item()
959
+ current_guidance_scale = self.guidance_scale
960
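+ # CFG truncation: disable guidance for the late steps once normalized time passes the cfg_truncation fraction.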
+ if self.do_classifier_free_guidance and self._cfg_truncation is not None and float(self._cfg_truncation) <= 1:
961
+ if t_norm > self._cfg_truncation:
962
+ current_guidance_scale = 0.0
963
+ apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0
964
+
965
+ if apply_cfg:
966
+ latent_model_input = latents.repeat(2, 1, 1, 1)
967
+ timestep_model_input = timestep.repeat(2)
968
+ else:
969
+ latent_model_input = latents
970
+ timestep_model_input = timestep
971
+
972
+ latent_model_input = latent_model_input.to(self.transformer.dtype)
973
+ latent_model_input = latent_model_input.unsqueeze(2)
974
+ latent_model_input_list = list(latent_model_input.unbind(dim=0))
975
+
976
+ model_out_list = self.transformer(
977
+ x=latent_model_input_list,
978
+ t=timestep_model_input,
979
+ cap_feats=prompt_embeds_model_input,
980
+ control_context=control_context_model_input,
981
+ conditioning_scale=controlnet_conditioning_scale,
982
+ refiner_conditioning_scale=controlnet_refiner_conditioning_scale,
983
+ )[0]
984
+
985
+ if apply_cfg:
986
+ pos_out = model_out_list[:effective_batch_size]
987
+ neg_out = model_out_list[effective_batch_size:]
988
+
989
+ noise_pred = []
990
+ for j in range(effective_batch_size):
991
+ pos = pos_out[j].float()
992
+ neg = neg_out[j].float()
993
+
994
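+ # Guidance as implemented here: pred = pos + scale * (pos - neg).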
+ pred = pos + current_guidance_scale * (pos - neg)
995
+
996
+ if self._cfg_normalization and float(self._cfg_normalization) > 0.0:
997
+ ori_pos_norm = torch.linalg.vector_norm(pos)
998
+ new_pos_norm = torch.linalg.vector_norm(pred)
999
+ max_new_norm = ori_pos_norm * float(self._cfg_normalization)
1000
+ if new_pos_norm > max_new_norm:
1001
+ pred = pred * (max_new_norm / new_pos_norm)
1002
+
1003
+ noise_pred.append(pred)
1004
+
1005
+ noise_pred = torch.stack(noise_pred, dim=0)
1006
+ else:
1007
+ noise_pred = torch.stack([t.float() for t in model_out_list], dim=0)
1008
+
1009
+ noise_pred = noise_pred.squeeze(2)
1010
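+ # The model output is negated to match the update direction expected by the scheduler.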
+ noise_pred = -noise_pred
1011
+
1012
+ latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents).prev_sample
1013
+
1014
+ if callback_on_step_end is not None:
1015
+ callback_kwargs = {}
1016
+ for k in callback_on_step_end_tensor_inputs:
1017
+ callback_kwargs[k] = locals()[k]
1018
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1019
+
1020
+ if isinstance(callback_outputs, dict):
1021
+ latents = callback_outputs.pop("latents", latents)
1022
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1023
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1024
+
1025
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1026
+ progress_bar.update()
1027
+
1028
+ if output_type != "latent":
1029
+ latents = latents.to(self.vae.dtype)
1030
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1031
+ with torch.no_grad():
1032
+ image = self.vae.decode(latents, return_dict=False)[0]
1033
+ image = self.image_processor.postprocess(image, output_type=output_type)
1034
+ else:
1035
+ image = latents
1036
+
1037
+ self.maybe_free_model_hooks()
1038
+
1039
+ if not return_dict:
1040
+ return (image,)
1041
+
1042
+ return ZImagePipelineOutput(images=image)
diffusers_local/z_image_control_transformer_2d.py ADDED
@@ -0,0 +1,1460 @@
1
+ # Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved.
2
+ # Refactored and optimized by DEVAIEXP Team
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import math
18
+ from typing import Dict, List, Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.models.attention_dispatch import dispatch_attention_fn
26
+ from diffusers.models.attention_processor import Attention, AttentionProcessor
27
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
28
+ from diffusers.models.modeling_utils import ModelMixin
29
+ from diffusers.models.normalization import RMSNorm
30
+ from diffusers.utils import (
31
+ is_torch_version,
32
+ )
33
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
34
+ from torch.nn.utils.rnn import pad_sequence
35
+
36
+
37
+ ADALN_EMBED_DIM = 256
38
+ SEQ_MULTI_OF = 32
39
+
40
+
41
+ def zero_module(module):
42
+ """
43
+ Initializes the parameters of a given module with zeros.
44
+
45
+ Args:
46
+ module (nn.Module): The module to be zero-initialized.
47
+
48
+ Returns:
49
+ nn.Module: The same module with its parameters initialized to zero.
50
+ """
51
+ for p in module.parameters():
52
+ nn.init.zeros_(p)
53
+ return module
54
+
55
+
56
+ class TimestepEmbedder(nn.Module):
57
+ """
58
+ A module to embed timesteps into a higher-dimensional space using sinusoidal embeddings
59
+ followed by a multilayer perceptron (MLP).
60
+ """
61
+
62
+ def __init__(self, out_size, mid_size=None, frequency_embedding_size=256):
63
+ """
64
+ Initializes the TimestepEmbedder module.
65
+
66
+ Args:
67
+ out_size (int): The output dimension of the embedding.
68
+ mid_size (int, optional): The intermediate dimension of the MLP. Defaults to `out_size`.
69
+ frequency_embedding_size (int, optional): The dimension of the sinusoidal frequency embedding. Defaults to 256.
70
+ """
71
+ super().__init__()
72
+ if mid_size is None:
73
+ mid_size = out_size
74
+ self.mlp = nn.Sequential(
75
+ nn.Linear(
76
+ frequency_embedding_size,
77
+ mid_size,
78
+ bias=True,
79
+ ),
80
+ nn.SiLU(),
81
+ nn.Linear(
82
+ mid_size,
83
+ out_size,
84
+ bias=True,
85
+ ),
86
+ )
87
+ self.frequency_embedding_size = frequency_embedding_size
88
+
89
+ @staticmethod
90
+ def timestep_embedding(t, dim, max_period=10000):
91
+ """
92
+ Creates sinusoidal timestep embeddings.
93
+
94
+ Args:
95
+ t (torch.Tensor): A 1-D Tensor of N timesteps.
96
+ dim (int): The dimension of the embedding.
97
+ max_period (int, optional): The maximum period for the sinusoidal frequencies. Defaults to 10000.
98
+
99
+ Returns:
100
+ torch.Tensor: The timestep embeddings with shape (N, dim).
101
+ """
102
+ with torch.amp.autocast("cuda", enabled=False):
103
+ half = dim // 2
104
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half)
105
+ args = t[:, None] * freqs[None]
106
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
107
+ if dim % 2:
108
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
109
+ return embedding
110
+
111
+ def forward(self, t):
112
+ """
113
+ Processes the input timesteps to generate embeddings.
114
+
115
+ Args:
116
+ t (torch.Tensor): The input timesteps.
117
+
118
+ Returns:
119
+ torch.Tensor: The final timestep embeddings after passing through the MLP.
120
+ """
121
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
122
+ weight_dtype = self.mlp[0].weight.dtype
123
+ if weight_dtype.is_floating_point:
124
+ t_freq = t_freq.to(weight_dtype)
125
+ t_emb = self.mlp(t_freq)
126
+ return t_emb
127
+
128
+
129
+ class FeedForward(nn.Module):
130
+ """
131
+ A Feed-Forward Network module using SwiGLU activation.
132
+ """
133
+
134
+ def __init__(self, dim: int, hidden_dim: int):
135
+ """
136
+ Initializes the FeedForward module.
137
+
138
+ Args:
139
+ dim (int): Input and output dimension.
140
+ hidden_dim (int): The hidden dimension of the network.
141
+ """
142
+ super().__init__()
143
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
144
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
145
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
146
+
147
+ def _forward_silu_gating(self, x1, x3):
148
+ """
149
+ Applies the SiLU gating mechanism.
150
+
151
+ Args:
152
+ x1 (torch.Tensor): The first intermediate tensor.
153
+ x3 (torch.Tensor): The second intermediate tensor (gate).
154
+
155
+ Returns:
156
+ torch.Tensor: The result of the gating operation.
157
+ """
158
+ return F.silu(x1) * x3
159
+
160
+ def forward(self, x):
161
+ """
162
+ Defines the forward pass of the FeedForward network.
163
+
164
+ Args:
165
+ x (torch.Tensor): The input tensor.
166
+
167
+ Returns:
168
+ torch.Tensor: The output tensor.
169
+ """
170
+ return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x)))
171
+
172
+
173
+ class FinalLayer(nn.Module):
174
+ """
175
+ The final layer of the transformer, which applies AdaLN modulation and a linear projection.
176
+ """
177
+
178
+ def __init__(self, hidden_size, out_channels):
179
+ """
180
+ Initializes the FinalLayer module.
181
+
182
+ Args:
183
+ hidden_size (int): The input hidden size.
184
+ out_channels (int): The output dimension (number of channels).
185
+ """
186
+ super().__init__()
187
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
188
+ self.linear = nn.Linear(hidden_size, out_channels, bias=True)
189
+ self.adaLN_modulation = nn.Sequential(
190
+ nn.SiLU(),
191
+ nn.Linear(min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True),
192
+ )
193
+
194
+ def forward(self, x, c):
195
+ """
196
+ Defines the forward pass for the final layer.
197
+
198
+ Args:
199
+ x (torch.Tensor): The main input tensor from the transformer blocks.
200
+ c (torch.Tensor): The conditioning tensor (usually from timestep embedding) for AdaLN modulation.
201
+
202
+ Returns:
203
+ torch.Tensor: The final output tensor projected to the patch dimension.
204
+ """
205
+ scale = 1.0 + self.adaLN_modulation(c)
206
+ x = self.norm_final(x) * scale.unsqueeze(1)
207
+ x = self.linear(x)
208
+ return x
209
+
210
+
211
+ class RopeEmbedder:
212
+ """
213
+ Computes Rotary Positional Embeddings (RoPE) for 3D coordinates.
214
+ """
215
+
216
+ def __init__(self, theta: float = 256.0, axes_dims: List[int] = (32, 48, 48), axes_lens: List[int] = (1024, 512, 512)):
217
+ """
218
+ Initializes the RopeEmbedder.
219
+
220
+ Args:
221
+ theta (float, optional): The base for the rotary frequencies. Defaults to 256.0.
222
+ axes_dims (List[int], optional): The dimensions for each axis (F, H, W). Defaults to (32, 48, 48).
223
+ axes_lens (List[int], optional): The maximum length for each axis. Defaults to (1024, 512, 512).
224
+ """
225
+ self.theta = theta
226
+ self.axes_dims = axes_dims
227
+ self.axes_lens = axes_lens
228
+ self.freqs_cis_cache = {}
229
+
230
+ def _precompute_freqs_cis(self, device):
231
+ """
232
+ Precomputes and caches the rotary frequency tensors (cos and sin values).
233
+
234
+ Args:
235
+ device (torch.device): The device to store the cached tensors on.
236
+
237
+ Returns:
238
+ List[torch.Tensor]: A list of precomputed frequency tensors for each axis.
239
+ """
240
+ if device in self.freqs_cis_cache:
241
+ return self.freqs_cis_cache[device]
242
+ freqs_cis_list = []
243
+ for dim, max_len in zip(self.axes_dims, self.axes_lens):
244
+ half = dim // 2
245
+ freqs = 1.0 / (self.theta ** (torch.arange(0, half, device=device, dtype=torch.float32) / half))
246
+ t = torch.arange(max_len, device=device, dtype=torch.float32)
247
+ freqs = torch.outer(t, freqs)
248
+ emb = torch.stack([freqs.cos(), freqs.sin()], dim=-1)
249
+ freqs_cis_list.append(emb)
250
+ self.freqs_cis_cache[device] = freqs_cis_list
251
+ return freqs_cis_list
252
+
253
+ def __call__(self, ids: torch.Tensor):
254
+ """
255
+ Generates RoPE embeddings for a batch of 3D coordinates.
256
+
257
+ Args:
258
+ ids (torch.Tensor): A tensor of coordinates with shape (N, 3).
259
+
260
+ Returns:
261
+ torch.Tensor: The concatenated RoPE embeddings for the input coordinates.
262
+ """
263
+ assert ids.ndim == 2 and ids.shape[1] == len(self.axes_dims)
264
+ device = ids.device
265
+ freqs_cis_list = self._precompute_freqs_cis(device)
266
+ result = []
267
+ for i in range(len(self.axes_dims)):
268
+ result.append(freqs_cis_list[i][ids[:, i]])
269
+ return torch.cat(result, dim=-2)
270
+
271
+
272
+ class ZSingleStreamAttnProcessor:
273
+ """
274
+ An attention processor that applies Rotary Positional Embeddings (RoPE) to query and key tensors
275
+ before computing scaled dot-product attention.
276
+ """
277
+
278
+ _attention_backend = None
279
+ _parallel_config = None
280
+
281
+ def __init__(self):
282
+ """
283
+ Initializes the ZSingleStreamAttnProcessor.
284
+ """
285
+ if not hasattr(F, "scaled_dot_product_attention"):
286
+ raise ImportError("ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher.")
287
+
288
+ def __call__(
289
+ self,
290
+ attn: Attention,
291
+ hidden_states: torch.Tensor,
292
+ encoder_hidden_states: Optional[torch.Tensor] = None,
293
+ attention_mask: Optional[torch.Tensor] = None,
294
+ freqs_cis: Optional[torch.Tensor] = None,
295
+ ) -> torch.Tensor:
296
+ """
297
+ The forward call for the attention processor.
298
+
299
+ Args:
300
+ attn (Attention): The attention layer that this processor is attached to.
301
+ hidden_states (torch.Tensor): The input hidden states.
302
+ encoder_hidden_states (Optional[torch.Tensor], optional): Not used in self-attention. Defaults to None.
303
+ attention_mask (Optional[torch.Tensor], optional): The attention mask. Defaults to None.
304
+ freqs_cis (Optional[torch.Tensor], optional): The precomputed RoPE frequencies. Defaults to None.
305
+
306
+ Returns:
307
+ torch.Tensor: The output of the attention mechanism.
308
+ """
309
+
310
+ def apply_rotary_emb(q_or_k: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
311
+ """
312
+ Applies RoPE to a query or key tensor.
313
+ """
314
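+ # Rotate each (even, odd) channel pair: x0' = x0*cos - x1*sin, x1' = x0*sin + x1*cos.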
+ x = q_or_k.transpose(1, 2)
315
+ x_reshaped = x.float().reshape(*x.shape[:-1], -1, 2)
316
+ x0 = x_reshaped[..., 0]
317
+ x1 = x_reshaped[..., 1]
318
+ freqs_cos = freqs_cis[..., 0].unsqueeze(1)
319
+ freqs_sin = freqs_cis[..., 1].unsqueeze(1)
320
+ x_rotated_0 = x0 * freqs_cos - x1 * freqs_sin
321
+ x_rotated_1 = x0 * freqs_sin + x1 * freqs_cos
322
+ x_rotated = torch.stack((x_rotated_0, x_rotated_1), dim=-1)
323
+ x_out = x_rotated.flatten(-2).transpose(1, 2)
324
+ return x_out.to(q_or_k.dtype)
325
+
326
+ query = attn.to_q(hidden_states)
327
+ key = attn.to_k(hidden_states)
328
+ value = attn.to_v(hidden_states)
329
+
330
+ query = query.unflatten(-1, (attn.heads, -1))
331
+ key = key.unflatten(-1, (attn.heads, -1))
332
+ value = value.unflatten(-1, (attn.heads, -1))
333
+
334
+ if attn.norm_q is not None:
335
+ query = attn.norm_q(query)
336
+ if attn.norm_k is not None:
337
+ key = attn.norm_k(key)
338
+
339
+ if freqs_cis is not None:
340
+ query = apply_rotary_emb(query, freqs_cis)
341
+ key = apply_rotary_emb(key, freqs_cis)
342
+
343
+ if attention_mask is not None and attention_mask.ndim == 2:
344
+ attention_mask = attention_mask[:, None, None, :]
345
+
346
+ hidden_states = dispatch_attention_fn(
347
+ query,
348
+ key,
349
+ value,
350
+ attn_mask=attention_mask,
351
+ dropout_p=0.0,
352
+ is_causal=False,
353
+ backend=self._attention_backend,
354
+ parallel_config=self._parallel_config,
355
+ )
356
+
357
+ hidden_states = hidden_states.flatten(2, 3)
358
+
359
+ output = attn.to_out[0](hidden_states.to(hidden_states.dtype))
360
+ if len(attn.to_out) > 1:
361
+ output = attn.to_out[1](output)
362
+
363
+ return output
364
+
365
+
366
+ @maybe_allow_in_graph
367
+ class ZImageTransformerBlock(nn.Module):
368
+ """
369
+ A standard transformer block consisting of a self-attention layer and a feed-forward network.
370
+ Includes support for AdaLN modulation.
371
+ """
372
+
373
+ def __init__(
374
+ self,
375
+ layer_id: int,
376
+ dim: int,
377
+ n_heads: int,
378
+ n_kv_heads: int,
379
+ norm_eps: float,
380
+ qk_norm: bool,
381
+ modulation=True,
382
+ ):
383
+ """
384
+ Initializes the ZImageTransformerBlock.
385
+
386
+ Args:
387
+ layer_id (int): The index of the layer.
388
+ dim (int): The dimension of the input and output features.
389
+ n_heads (int): The number of attention heads.
390
+ n_kv_heads (int): The number of key/value heads (not directly used in this simplified attention).
391
+ norm_eps (float): Epsilon for RMSNorm.
392
+ qk_norm (bool): Whether to apply normalization to query and key tensors.
393
+ modulation (bool, optional): Whether to enable AdaLN modulation. Defaults to True.
394
+ """
395
+ super().__init__()
396
+ self.dim = dim
397
+ self.head_dim = dim // n_heads
398
+ self.attention = Attention(
399
+ query_dim=dim,
400
+ cross_attention_dim=None,
401
+ dim_head=dim // n_heads,
402
+ heads=n_heads,
403
+ qk_norm="rms_norm" if qk_norm else None,
404
+ eps=1e-5,
405
+ bias=False,
406
+ out_bias=False,
407
+ processor=ZSingleStreamAttnProcessor(),
408
+ )
409
+
410
+ self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8))
411
+ self.layer_id = layer_id
412
+
413
+ self.attention_norm1 = RMSNorm(dim, eps=norm_eps)
414
+ self.ffn_norm1 = RMSNorm(dim, eps=norm_eps)
415
+
416
+ self.attention_norm2 = RMSNorm(dim, eps=norm_eps)
417
+ self.ffn_norm2 = RMSNorm(dim, eps=norm_eps)
418
+
419
+ self.modulation = modulation
420
+ if modulation:
421
+ self.adaLN_modulation = nn.Sequential(
422
+ nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True),
423
+ )
424
+
425
+ @property
426
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
427
+ """
428
+ Returns a dictionary of all attention processors used in the module.
429
+ """
430
+ processors = {}
431
+
432
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
433
+ if hasattr(module, "get_processor"):
434
+ processors[f"{name}.processor"] = module.get_processor()
435
+ for sub_name, child in module.named_children():
436
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
437
+ return processors
438
+
439
+ for name, module in self.named_children():
440
+ fn_recursive_add_processors(name, module, processors)
441
+
442
+ return processors
443
+
444
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
445
+ """
446
+ Sets the attention processor for the attention layer in this block.
447
+ """
448
+ count = len(self.attn_processors.keys())
449
+
450
+ if isinstance(processor, dict) and len(processor) != count:
451
+ raise ValueError(
452
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
453
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
454
+ )
455
+
456
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
457
+ if hasattr(module, "set_processor"):
458
+ if not isinstance(processor, dict):
459
+ module.set_processor(processor)
460
+ else:
461
+ module.set_processor(processor.pop(f"{name}.processor"))
462
+ for sub_name, child in module.named_children():
463
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
464
+
465
+ for name, module in self.named_children():
466
+ fn_recursive_attn_processor(name, module, processor)
467
+
468
+ def forward(self, x, attn_mask, freqs_cis, adaln_input=None):
469
+ """
470
+ Defines the forward pass for the transformer block.
471
+
472
+ Args:
473
+ x (torch.Tensor): The input tensor.
474
+ attn_mask (torch.Tensor): The attention mask.
475
+ freqs_cis (torch.Tensor): The RoPE frequencies.
476
+ adaln_input (torch.Tensor, optional): The conditioning tensor for AdaLN. Defaults to None.
477
+
478
+ Returns:
479
+ torch.Tensor: The output tensor of the block.
480
+ """
481
+ if self.modulation:
482
+ scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2)
483
+ scale_msa = scale_msa + 1.0
484
+ gate_msa = gate_msa.tanh()
485
+ scale_mlp = scale_mlp + 1.0
486
+ gate_mlp = gate_mlp.tanh()
487
+
488
+ normed = self.attention_norm1(x)
489
+ normed = normed * scale_msa
490
+ attn_out = self.attention(normed, attention_mask=attn_mask, freqs_cis=freqs_cis)
491
+ attn_out = self.attention_norm2(attn_out) * gate_msa
492
+ x = x + attn_out
493
+
494
+ normed = self.ffn_norm1(x)
495
+ normed = normed * scale_mlp
496
+ ffn_out = self.feed_forward(normed)
497
+ ffn_out = self.ffn_norm2(ffn_out) * gate_mlp
498
+ x = x + ffn_out
499
+ else:
500
+ normed = self.attention_norm1(x)
501
+ attn_out = self.attention(normed, attention_mask=attn_mask, freqs_cis=freqs_cis)
502
+ x = x + self.attention_norm2(attn_out)
503
+ normed = self.ffn_norm1(x)
504
+ ffn_out = self.feed_forward(normed)
505
+ x = x + self.ffn_norm2(ffn_out)
506
+ return x
507
+
508
+
509
+ class ZImageControlTransformerBlock(ZImageTransformerBlock):
510
+ """
511
+ A specialized transformer block for the control pathway. It inherits from ZImageTransformerBlock
512
+ and adds projection layers to generate and combine control signals.
513
+ """
514
+
515
+ def __init__(self, layer_id: int, dim: int, n_heads: int, n_kv_heads: int, norm_eps: float, qk_norm: bool, modulation=True, block_id=0):
516
+ """
517
+ Initializes the ZImageControlTransformerBlock.
518
+
519
+ Args:
520
+ layer_id (int): The index of the layer.
521
+ dim (int): The dimension of the features.
522
+ n_heads (int): The number of attention heads.
523
+ n_kv_heads (int): The number of key/value heads.
524
+ norm_eps (float): Epsilon for RMSNorm.
525
+ qk_norm (bool): Whether to apply normalization to query and key.
526
+ modulation (bool, optional): Whether to enable AdaLN modulation. Defaults to True.
527
+ block_id (int, optional): The index of this control block. Defaults to 0.
528
+ """
529
+ super().__init__(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation)
530
+ self.block_id = block_id
531
+ if block_id == 0:
532
+ self.before_proj = zero_module(nn.Linear(self.dim, self.dim))
533
+ self.after_proj = zero_module(nn.Linear(self.dim, self.dim))
534
+
535
+ def forward(self, c, x, **kwargs):
536
+ """
537
+ Defines the forward pass for the control block.
538
+
539
+ Args:
540
+ c (torch.Tensor): The control signal tensor.
541
+ x (torch.Tensor): The reference tensor from the main pathway.
542
+ **kwargs: Additional arguments for the parent's forward method.
543
+
544
+ Returns:
545
+ torch.Tensor: A stacked tensor containing the skip connection and the final output.
546
+ """
547
+ if self.block_id == 0:
548
+ c = self.before_proj(c) + x
549
+ all_c = []
550
+ else:
551
+ all_c = list(torch.unbind(c))
552
+ c = all_c.pop(-1)
553
+
554
+ c = super().forward(c, **kwargs)
555
+ c_skip = self.after_proj(c)
556
+ all_c += [c_skip, c]
557
+ c = torch.stack(all_c)
558
+ return c
559
+
560
+
561
+ class BaseZImageTransformerBlock(ZImageTransformerBlock):
562
+ """
563
+ The main transformer block used in the primary pathway. It inherits from ZImageTransformerBlock
564
+ and adds the logic to inject control "hints" from the control pathway.
565
+ """
566
+
567
+ def __init__(self, layer_id: int, dim: int, n_heads: int, n_kv_heads: int, norm_eps: float, qk_norm: bool, modulation=True, block_id=0):
568
+ """
569
+ Initializes the BaseZImageTransformerBlock.
570
+
571
+ Args:
572
+ layer_id (int): The index of the layer.
573
+ dim (int): The dimension of the features.
574
+ n_heads (int): The number of attention heads.
575
+ n_kv_heads (int): The number of key/value heads.
576
+ norm_eps (float): Epsilon for RMSNorm.
577
+ qk_norm (bool): Whether to apply normalization to query and key.
578
+ modulation (bool, optional): Whether to enable AdaLN modulation. Defaults to True.
579
+ block_id (int, optional): The index used to retrieve the corresponding control hint. Defaults to 0.
580
+ """
581
+ super().__init__(layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation)
582
+ self.block_id = block_id
583
+
584
+ def forward(self, hidden_states, hints=None, context_scale=1.0, **kwargs):
585
+ """
586
+ Defines the forward pass, including the injection of control hints.
587
+
588
+ Args:
589
+ hidden_states (torch.Tensor): The input tensor.
590
+ hints (List[torch.Tensor], optional): A list of control hints from the control pathway. Defaults to None.
591
+ context_scale (float, optional): A scale factor for the control hints. Defaults to 1.0.
592
+ **kwargs: Additional arguments for the parent's forward method.
593
+
594
+ Returns:
595
+ torch.Tensor: The output tensor of the block.
596
+ """
597
+ hidden_states = super().forward(hidden_states, **kwargs)
598
+ if self.block_id is not None and hints is not None:
599
+ hidden_states = hidden_states + hints[self.block_id] * context_scale
600
+ return hidden_states
601
+
602
+
603
+ class ZImageControlTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
604
+ _supports_gradient_checkpointing = True
605
+ _keys_to_ignore_on_load_unexpected = [
606
+ r"control_layers\..*",
607
+ r"control_noise_refiner\..*",
608
+ r"control_all_x_embedder\..*",
609
+ ]
610
+ _no_split_modules = ["ZImageTransformerBlock", "BaseZImageTransformerBlock", "ZImageControlTransformerBlock"]
611
+ _skip_layerwise_casting_patterns = ["t_embedder", "cap_embedder"]
612
+ _group_offload_block_modules = ["t_embedder", "cap_embedder"]
613
+
614
+ @register_to_config
615
+ def __init__(
616
+ self,
617
+ control_layers_places=None,
618
+ control_refiner_layers_places=None,
619
+ control_in_dim=None,
620
+ add_control_noise_refiner=False,
621
+ all_patch_size=(2,),
622
+ all_f_patch_size=(1,),
623
+ in_channels=16,
624
+ dim=3840,
625
+ n_layers=30,
626
+ n_refiner_layers=2,
627
+ n_heads=30,
628
+ n_kv_heads=30,
629
+ norm_eps=1e-5,
630
+ qk_norm=True,
631
+ cap_feat_dim=2560,
632
+ rope_theta=256.0,
633
+ t_scale=1000.0,
634
+ axes_dims=[32, 48, 48],
635
+ axes_lens=[1024, 512, 512],
636
+ use_controlnet=True,
637
+ checkpoint_ratio=0.5,
638
+ ):
639
+ """
640
+ Initializes the ZImageControlTransformer2DModel.
641
+
642
+ Args:
643
+ control_layers_places (List[int], optional): Indices of main layers where control hints are injected.
644
+ control_refiner_layers_places (List[int], optional): Indices of noise refiner layers for two-stage control.
645
+ control_in_dim (int, optional): Input channel dimension for the control context.
646
+ add_control_noise_refiner (bool, optional): Whether to add a dedicated refiner for the control signal.
647
+ all_patch_size (Tuple[int], optional): Tuple of patch sizes for spatial dimensions.
648
+ all_f_patch_size (Tuple[int], optional): Tuple of patch sizes for the frame dimension.
649
+ in_channels (int, optional): Number of input channels for the latent image.
650
+ dim (int, optional): The main dimension of the transformer model.
651
+ n_layers (int, optional): The number of main transformer layers.
652
+ n_refiner_layers (int, optional): The number of layers in the refiner blocks.
653
+ n_heads (int, optional): The number of attention heads.
654
+ n_kv_heads (int, optional): The number of key/value heads.
655
+ norm_eps (float, optional): Epsilon for RMSNorm.
656
+ qk_norm (bool, optional): Whether to apply normalization to query and key.
657
+ cap_feat_dim (int, optional): The dimension of the input caption features.
658
+ rope_theta (float, optional): The base for RoPE.
659
+ t_scale (float, optional): A scaling factor for the timestep.
660
+ axes_dims (List[int], optional): Dimensions for each axis in RoPE.
661
+ axes_lens (List[int], optional): Maximum lengths for each axis in RoPE.
662
+ use_controlnet (bool, optional): If False, control-related layers will not be created to save memory.
663
+ checkpoint_ratio (float, optional): The ratio of layers to apply gradient checkpointing to.
664
+ """
665
+ super().__init__()
666
+ self.use_controlnet = use_controlnet
667
+ self.in_channels = in_channels
668
+ self.out_channels = in_channels
669
+ self.all_patch_size = all_patch_size
670
+ self.all_f_patch_size = all_f_patch_size
671
+ self.dim = dim
672
+ self.control_in_dim = self.dim if control_in_dim is None else control_in_dim
673
+ self.is_two_stage_control = self.control_in_dim > 16
674
+ self.n_heads = n_heads
675
+ self.rope_theta = rope_theta
676
+ self.t_scale = t_scale
677
+ self.gradient_checkpointing = False
678
+ self.checkpoint_ratio = checkpoint_ratio
679
+ assert len(all_patch_size) == len(all_f_patch_size)
680
+
681
+ self.control_layers_places = list(range(0, n_layers, 2)) if control_layers_places is None else control_layers_places
682
+ self.control_refiner_layers_places = list(range(0, n_refiner_layers)) if control_refiner_layers_places is None else control_refiner_layers_places
683
+ self.add_control_noise_refiner = add_control_noise_refiner
684
+ assert 0 in self.control_layers_places
685
+ self.control_layers_mapping = {i: n for n, i in enumerate(self.control_layers_places)}
686
+ self.control_refiner_layers_mapping = {i: n for n, i in enumerate(self.control_refiner_layers_places)}
687
+
688
+ self.all_x_embedder = nn.ModuleDict(
689
+ {
690
+ f"{patch_size}-{f_patch_size}": nn.Linear(f_patch_size * patch_size * patch_size * in_channels, dim, bias=True)
691
+ for patch_size, f_patch_size in zip(all_patch_size, all_f_patch_size)
692
+ }
693
+ )
694
+
695
+ self.all_final_layer = nn.ModuleDict(
696
+ {
697
+ f"{patch_size}-{f_patch_size}": FinalLayer(dim, patch_size * patch_size * f_patch_size * self.out_channels)
698
+ for patch_size, f_patch_size in zip(all_patch_size, all_f_patch_size)
699
+ }
700
+ )
701
+
702
+ self.context_refiner = nn.ModuleList(
703
+ [ZImageTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=False) for i in range(n_refiner_layers)]
704
+ )
705
+ self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024)
706
+ self.cap_embedder = nn.Sequential(RMSNorm(cap_feat_dim, eps=norm_eps), nn.Linear(cap_feat_dim, dim, bias=True))
707
+ self.x_pad_token = nn.Parameter(torch.empty((1, dim)))
708
+ self.cap_pad_token = nn.Parameter(torch.empty((1, dim)))
709
+
710
+ head_dim = dim // n_heads
711
+ assert head_dim == sum(axes_dims)
712
+ self.axes_dims = axes_dims
713
+ self.axes_lens = axes_lens
714
+ self.rope_embedder = RopeEmbedder(theta=rope_theta, axes_dims=axes_dims, axes_lens=axes_lens)
715
+
716
+ self.layers = nn.ModuleList(
717
+ [BaseZImageTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, block_id=self.control_layers_mapping.get(i)) for i in range(n_layers)]
718
+ )
719
+
720
+ self.noise_refiner = nn.ModuleList(
721
+ [
722
+ BaseZImageTransformerBlock(
723
+ 1000 + i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=True, block_id=self.control_refiner_layers_mapping.get(i)
724
+ )
725
+ for i in range(n_refiner_layers)
726
+ ]
727
+ )
728
+
729
+ if self.use_controlnet:
730
+ self.control_layers = nn.ModuleList(
731
+ [ZImageControlTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, block_id=i) for i in self.control_layers_places]
732
+ )
733
+ self.control_all_x_embedder = nn.ModuleDict(
734
+ {
735
+ f"{patch_size}-{f_patch_size}": nn.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True)
736
+ for patch_size, f_patch_size in zip(all_patch_size, all_f_patch_size)
737
+ }
738
+ )
739
+
740
+ if self.is_two_stage_control:
741
+ if self.add_control_noise_refiner:
742
+ self.control_noise_refiner = nn.ModuleList(
743
+ [
744
+ ZImageControlTransformerBlock(1000 + layer_id, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=True, block_id=layer_id)
745
+ for layer_id in range(n_refiner_layers)
746
+ ]
747
+ )
748
+ else:
749
+ self.control_noise_refiner = None
750
+ else: # V1
751
+ self.control_noise_refiner = nn.ModuleList(
752
+ [ZImageTransformerBlock(1000 + i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, modulation=True) for i in range(n_refiner_layers)]
753
+ )
754
+ else:
755
+ self.control_layers = None
756
+ self.control_all_x_embedder = None
757
+ self.control_noise_refiner = None
758
+
759
+ def _unpatchify(self, x_image_tokens: torch.Tensor, all_sizes: List[Tuple], patch_size: int, f_patch_size: int) -> torch.Tensor:
760
+ """
761
+ Converts a sequence of image tokens back into a batched image tensor. This version is robust
762
+ to batches containing images of different original sizes.
763
+
764
+ Args:
765
+ x_image_tokens (torch.Tensor): A tensor of image tokens with shape [B, SeqLen, Dim].
766
+ all_sizes (List[Tuple]): A list of tuples with the original (F, H, W) size for each image in the batch.
767
+ patch_size (int): The spatial patch size (height and width).
768
+ f_patch_size (int): The frame/temporal patch size.
769
+
770
+ Returns:
771
+ torch.Tensor: The reconstructed latent tensor with shape [B, C, F, H, W].
772
+ """
773
+ pH = pW = patch_size
774
+ pF = f_patch_size
775
+ batch_size = x_image_tokens.shape[0]
776
+ unpatched_images = []
777
+
778
+ for i in range(batch_size):
779
+ F, H, W = all_sizes[i]
780
+ F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
781
+ original_seq_len = F_tokens * H_tokens * W_tokens
782
+ current_image_tokens = x_image_tokens[i, :original_seq_len, :]
783
+ unpatched_image = current_image_tokens.view(F_tokens, H_tokens, W_tokens, pF, pH, pW, self.out_channels)
784
+ unpatched_image = unpatched_image.permute(6, 0, 3, 1, 4, 2, 5).reshape(self.out_channels, F, H, W)
785
+ unpatched_images.append(unpatched_image)
786
+
787
+ try:
788
+ final_tensor = torch.stack(unpatched_images, dim=0)
789
+ except RuntimeError:
790
+ raise ValueError(
791
+ "Could not stack unpatched images into a single batch tensor. "
792
+ "This typically occurs if you are trying to generate images of different sizes in the same batch."
793
+ )
794
+
795
+ return final_tensor
796
+
+     def _patchify(
+         self,
+         all_image: List[torch.Tensor],
+         patch_size: int,
+         f_patch_size: int,
+         cap_padding_len: int,
+     ):
+         """
+         Converts a list of image tensors into patch sequences and computes their positional IDs.
+
+         Args:
+             all_image (List[torch.Tensor]): A list of image tensors to process.
+             patch_size (int): The spatial patch size.
+             f_patch_size (int): The frame/temporal patch size.
+             cap_padding_len (int): The length of the padded caption sequence, used as an offset for image position IDs.
+
+         Returns:
+             Tuple: A tuple containing lists of processed patches, sizes, position IDs, and padding masks.
+         """
+         pH = pW = patch_size
+         pF = f_patch_size
+         device = all_image[0].device
+
+         all_image_out = []
+         all_image_size = []
+         all_image_pos_ids = []
+         all_image_pad_mask = []
+
+         for image in all_image:
+             C, F, H, W = image.size()
+             all_image_size.append((F, H, W))
+             F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
+
+             image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
+             image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
+
+             image_ori_len = len(image)
+             image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
+
+             image_ori_pos_ids = self._create_coordinate_grid(
+                 size=(F_tokens, H_tokens, W_tokens),
+                 start=(cap_padding_len + 1, 0, 0),
+                 device=device,
+             ).flatten(0, 2)
+             image_padding_pos_ids = (
+                 self._create_coordinate_grid(
+                     size=(1, 1, 1),
+                     start=(0, 0, 0),
+                     device=device,
+                 )
+                 .flatten(0, 2)
+                 .repeat(image_padding_len, 1)
+             )
+             image_padded_pos_ids = torch.cat([image_ori_pos_ids, image_padding_pos_ids], dim=0)
+             all_image_pos_ids.append(image_padded_pos_ids)
+             all_image_pad_mask.append(
+                 torch.cat(
+                     [
+                         torch.zeros((image_ori_len,), dtype=torch.bool, device=device),
+                         torch.ones((image_padding_len,), dtype=torch.bool, device=device),
+                     ],
+                     dim=0,
+                 )
+             )
+             image_padded_feat = torch.cat([image, image[-1:].repeat(image_padding_len, 1)], dim=0)
+             all_image_out.append(image_padded_feat)
+
+         return (
+             all_image_out,
+             all_image_size,
+             all_image_pos_ids,
+             all_image_pad_mask,
+         )
+
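+     # Padding note (illustrative; SEQ_MULTI_OF is a constant defined earlier in this
+     # file, and its value of 32 is assumed here purely for the arithmetic):
+     # (-n) % SEQ_MULTI_OF rounds a length-n sequence up to the next multiple, e.g.
+     # n = 1000 needs 24 padding tokens for a total of 1024. Padding tokens repeat the
+     # last real patch and are flagged True in the returned pad mask.
+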
+     def _patchify_and_embed(
+         self,
+         all_image: List[torch.Tensor],
+         all_cap_feats: List[torch.Tensor],
+         patch_size: int,
+         f_patch_size: int,
+     ):
+         """
+         Processes a batch of images and caption features by converting them into padded patch sequences
+         and generating their corresponding positional IDs and padding masks. This is the general-purpose,
+         robust version that iterates through the batch.
+
+         Args:
+             all_image (List[torch.Tensor]): A list of image tensors.
+             all_cap_feats (List[torch.Tensor]): A list of caption feature tensors.
+             patch_size (int): The spatial patch size.
+             f_patch_size (int): The frame/temporal patch size.
+
+         Returns:
+             Tuple: A tuple containing all processed data structures (image patches, caption features, sizes,
+             position IDs, and padding masks) as lists.
+         """
+         pH = pW = patch_size
+         pF = f_patch_size
+         device = all_image[0].device
+
+         all_image_out, all_image_size, all_image_pos_ids, all_image_pad_mask = [], [], [], []
+         all_cap_pos_ids, all_cap_pad_mask, all_cap_feats_out = [], [], []
+
+         for image, cap_feat in zip(all_image, all_cap_feats):
+             cap_ori_len = len(cap_feat)
+             cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
+             cap_total_len = cap_ori_len + cap_padding_len
+
+             cap_padded_pos_ids = self._create_coordinate_grid(size=(cap_total_len, 1, 1), start=(1, 0, 0), device=device).flatten(0, 2)
+             all_cap_pos_ids.append(cap_padded_pos_ids)
+
+             cap_mask = torch.ones(cap_total_len, dtype=torch.bool, device=device)
+             cap_mask[:cap_ori_len] = False
+             all_cap_pad_mask.append(cap_mask)
+
+             if cap_padding_len > 0:
+                 padding_tensor = cap_feat[-1:].repeat(cap_padding_len, 1)
+                 cap_padded_feat = torch.cat([cap_feat, padding_tensor], dim=0)
+             else:
+                 cap_padded_feat = cap_feat
+             all_cap_feats_out.append(cap_padded_feat)
+
+             C, Fr, H, W = image.size()
+             all_image_size.append((Fr, H, W))
+             F_tokens, H_tokens, W_tokens = Fr // pF, H // pH, W // pW
+
+             image_reshaped = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW).permute(1, 3, 5, 2, 4, 6, 0).reshape(-1, pF * pH * pW * C)
+
+             image_ori_len = image_reshaped.shape[0]
+             image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
+             image_total_len = image_ori_len + image_padding_len
+
+             image_ori_pos_ids = self._create_coordinate_grid(size=(F_tokens, H_tokens, W_tokens), start=(cap_total_len + 1, 0, 0), device=device).flatten(0, 2)
+             if image_padding_len > 0:
+                 image_padding_pos_ids = torch.zeros((image_padding_len, 3), dtype=torch.int32, device=device)
+                 image_padded_pos_ids = torch.cat([image_ori_pos_ids, image_padding_pos_ids], dim=0)
+             else:
+                 image_padded_pos_ids = image_ori_pos_ids
+             all_image_pos_ids.append(image_padded_pos_ids)
+
+             image_mask = torch.ones(image_total_len, dtype=torch.bool, device=device)
+             image_mask[:image_ori_len] = False
+             all_image_pad_mask.append(image_mask)
+
+             if image_padding_len > 0:
+                 padding_tensor = image_reshaped[-1:].repeat(image_padding_len, 1)
+                 image_padded_feat = torch.cat([image_reshaped, padding_tensor], dim=0)
+             else:
+                 image_padded_feat = image_reshaped
+             all_image_out.append(image_padded_feat)
+
+         return (
+             all_image_out,
+             all_cap_feats_out,
+             all_image_size,
+             all_image_pos_ids,
+             all_cap_pos_ids,
+             all_image_pad_mask,
+             all_cap_pad_mask,
+         )
+
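+     # Rotary-position layout produced above: along the first (frame) axis, position 0
+     # is reserved for padding tokens, captions occupy positions 1..cap_total_len, and
+     # image tokens start at cap_total_len + 1, so caption and image tokens never share
+     # a rotary position.
+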
+     def _process_cap_feats_with_cfg_cache(self, cap_feats_list, cap_pos_ids, cap_inner_pad_mask):
+         """
+         Processes caption features with duplicate detection to avoid redundant computation,
+         especially for Classifier-Free Guidance (CFG), where prompts are repeated within a batch.
+
+         Args:
+             cap_feats_list (List[torch.Tensor]): List of padded caption feature tensors.
+             cap_pos_ids (List[torch.Tensor]): List of corresponding position ID tensors.
+             cap_inner_pad_mask (List[torch.Tensor]): List of corresponding padding masks.
+
+         Returns:
+             Tuple: A tuple of batched tensors for padded features, RoPE frequencies, attention mask, and sequence lengths.
+         """
+         device = cap_feats_list[0].device
+         bsz = len(cap_feats_list)
+
+         shapes_equal = all(c.shape == cap_feats_list[0].shape for c in cap_feats_list)
+
+         if shapes_equal and bsz >= 2:
+             unique_indices = [0]
+             unique_tensors = [cap_feats_list[0]]
+             tensor_mapping = [0]
+
+             for i in range(1, bsz):
+                 found_match = False
+                 for j, unique_tensor in enumerate(unique_tensors):
+                     if torch.equal(cap_feats_list[i], unique_tensor):
+                         tensor_mapping.append(j)
+                         found_match = True
+                         break
+
+                 if not found_match:
+                     unique_indices.append(i)
+                     unique_tensors.append(cap_feats_list[i])
+                     tensor_mapping.append(len(unique_tensors) - 1)
+
+             if len(unique_tensors) < bsz:
+                 unique_cap_feats_list = [cap_feats_list[i] for i in unique_indices]
+                 unique_cap_pos_ids = [cap_pos_ids[i] for i in unique_indices]
+                 unique_cap_inner_pad_mask = [cap_inner_pad_mask[i] for i in unique_indices]
+
+                 cap_item_seqlens_unique = [len(i) for i in unique_cap_feats_list]
+                 cap_max_item_seqlen = max(cap_item_seqlens_unique)
+
+                 cap_feats_cat = torch.cat(unique_cap_feats_list, dim=0)
+                 cap_feats_embedded = self.cap_embedder(cap_feats_cat)
+                 cap_feats_embedded[torch.cat(unique_cap_inner_pad_mask)] = self.cap_pad_token
+                 cap_feats_padded_unique = pad_sequence(list(cap_feats_embedded.split(cap_item_seqlens_unique, dim=0)), batch_first=True, padding_value=0.0)
+
+                 cap_freqs_cis_cat = self.rope_embedder(torch.cat(unique_cap_pos_ids, dim=0))
+                 cap_freqs_cis_unique = pad_sequence(list(cap_freqs_cis_cat.split(cap_item_seqlens_unique, dim=0)), batch_first=True, padding_value=0.0)
+
+                 cap_feats_padded = cap_feats_padded_unique[tensor_mapping]
+                 cap_freqs_cis = cap_freqs_cis_unique[tensor_mapping]
+
+                 seq_lens_tensor = torch.tensor([cap_max_item_seqlen] * bsz, device=device, dtype=torch.int32)
+                 arange = torch.arange(cap_max_item_seqlen, device=device, dtype=torch.int32)
+                 cap_attn_mask = arange[None, :] < seq_lens_tensor[:, None]
+
+                 cap_item_seqlens = [cap_max_item_seqlen] * bsz
+
+                 return cap_feats_padded, cap_freqs_cis, cap_attn_mask, cap_item_seqlens
+
+         cap_item_seqlens = [len(i) for i in cap_feats_list]
+         cap_max_item_seqlen = max(cap_item_seqlens)
+         cap_feats_cat = torch.cat(cap_feats_list, dim=0)
+         cap_feats_embedded = self.cap_embedder(cap_feats_cat)
+         cap_feats_embedded[torch.cat(cap_inner_pad_mask)] = self.cap_pad_token
+         cap_feats_padded = pad_sequence(list(cap_feats_embedded.split(cap_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)
+
+         cap_freqs_cis_cat = self.rope_embedder(torch.cat(cap_pos_ids, dim=0))
+         cap_freqs_cis = pad_sequence(list(cap_freqs_cis_cat.split(cap_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)
+
+         seq_lens_tensor = torch.tensor(cap_item_seqlens, device=device, dtype=torch.int32)
+         arange = torch.arange(cap_max_item_seqlen, device=device, dtype=torch.int32)
+         cap_attn_mask = arange[None, :] < seq_lens_tensor[:, None]
+
+         return cap_feats_padded, cap_freqs_cis, cap_attn_mask, cap_item_seqlens
+
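+     # Illustrative CFG example (names are hypothetical): for a batch such as
+     # [cond_a, uncond, cond_b, uncond], the unique tensors are [cond_a, uncond, cond_b]
+     # and tensor_mapping == [0, 1, 2, 1]; the caption embedder then runs on three rows
+     # instead of four, and row 1 of the padded result is reused for both uncond entries.
+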
+     @staticmethod
+     def _create_coordinate_grid(size, start=None, device=None):
+         """
+         Creates a 3D coordinate grid.
+
+         Args:
+             size (Tuple[int]): The dimensions of the grid (F, H, W).
+             start (Tuple[int], optional): The starting coordinates for each axis. Defaults to (0, 0, 0).
+             device (torch.device, optional): The device to create the tensor on. Defaults to None.
+
+         Returns:
+             torch.Tensor: The coordinate grid tensor.
+         """
+         if start is None:
+             start = (0,) * len(size)
+         axes = [torch.arange(x0, x0 + span, dtype=torch.int32, device=device) for x0, span in zip(start, size)]
+         grids = torch.meshgrid(axes, indexing="ij")
+         return torch.stack(grids, dim=-1)
+
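+     # Quick check (illustrative): _create_coordinate_grid(size=(1, 2, 2), start=(5, 0, 0))
+     # returns a (1, 2, 2, 3) tensor holding [5, 0, 0], [5, 0, 1], [5, 1, 0], [5, 1, 1],
+     # i.e. every (frame, height, width) coordinate offset by `start`.
+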
+     def _apply_transformer_blocks(self, hidden_states, layers, checkpoint_ratio=0.5, **kwargs):
+         """
+         Applies a list of transformer layers to the hidden states, with optional selective gradient checkpointing.
+
+         Args:
+             hidden_states (torch.Tensor): The input tensor.
+             layers (nn.ModuleList): The list of transformer layers to apply.
+             checkpoint_ratio (float, optional): The ratio of layers to apply gradient checkpointing to. Defaults to 0.5.
+             **kwargs: Additional keyword arguments to pass to each layer's forward method.
+
+         Returns:
+             torch.Tensor: The output tensor after applying all layers.
+         """
+         if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+             def create_custom_forward(module, **static_kwargs):
+                 def custom_forward(*inputs):
+                     return module(*inputs, **static_kwargs)
+
+                 return custom_forward
+
+             ckpt_kwargs = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+
+             checkpoint_every_n = max(1, int(1.0 / checkpoint_ratio)) if checkpoint_ratio > 0 else len(layers) + 1
+
+             for i, layer in enumerate(layers):
+                 # A non-positive ratio disables checkpointing entirely; without the extra
+                 # guard, layer 0 would still be checkpointed because 0 % n == 0.
+                 if checkpoint_ratio > 0 and i % checkpoint_every_n == 0:
+                     hidden_states = torch.utils.checkpoint.checkpoint(
+                         create_custom_forward(layer, **kwargs),
+                         hidden_states,
+                         **ckpt_kwargs,
+                     )
+                 else:
+                     hidden_states = layer(hidden_states, **kwargs)
+         else:
+             for layer in layers:
+                 hidden_states = layer(hidden_states, **kwargs)
+
+         return hidden_states
+
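+     # Checkpointing cadence (illustrative): checkpoint_ratio=0.5 yields
+     # checkpoint_every_n=2, so layers 0, 2, 4, ... recompute activations during the
+     # backward pass while the others keep theirs; checkpoint_ratio=1.0 checkpoints
+     # every layer, and a non-positive ratio checkpoints none.
+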
+     def _prepare_control_inputs(self, control_context, cap_feats_ref, t, patch_size, f_patch_size, device):
+         """
+         Prepares the control context for the transformer, including patchifying, embedding, and generating
+         positional information. Includes a fast path for batches with uniform shapes.
+
+         Args:
+             control_context (torch.Tensor or List[torch.Tensor]): The control context input.
+             cap_feats_ref (List[torch.Tensor]): A reference to caption features for padding calculation.
+             t (torch.Tensor): The timestep tensor.
+             patch_size (int): The spatial patch size.
+             f_patch_size (int): The frame/temporal patch size.
+             device (torch.device): The target device.
+
+         Returns:
+             Dict: A dictionary containing the processed control tensors ('c', 'c_item_seqlens', 'attn_mask', etc.).
+         """
+         if isinstance(control_context, torch.Tensor) and control_context.ndim == 5:
+             control_list = list(torch.unbind(control_context, dim=0))
+         else:
+             control_list = control_context
+         # Derive the batch size from the list form so a List[torch.Tensor] input works too.
+         bsz = len(control_list)
+
+         pH = pW = patch_size
+         pF = f_patch_size
+         cap_padding_len = cap_feats_ref[0].size(0) if isinstance(cap_feats_ref, list) else cap_feats_ref.shape[1]
+
+         shapes = [c.shape for c in control_list]
+         same_shape = all(s == shapes[0] for s in shapes)
+
+         if same_shape and bsz >= 2:
+             control_batch = torch.stack(control_list, dim=0)
+             B, C, F, H, W = control_batch.shape
+             F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW
+
+             control_batch = control_batch.view(B, C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
+             control_batch = control_batch.permute(0, 2, 4, 6, 3, 5, 7, 1).reshape(B, F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
+
+             ori_len = control_batch.shape[1]
+             padding_len = (-ori_len) % SEQ_MULTI_OF
+
+             if padding_len > 0:
+                 pad_tensor = control_batch[:, -1:, :].repeat(1, padding_len, 1)
+                 control_batch = torch.cat([control_batch, pad_tensor], dim=1)
+
+             c = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_batch)
+
+             final_seq_len = control_batch.shape[1]
+             pos_ids_ori = self._create_coordinate_grid(
+                 size=(F_tokens, H_tokens, W_tokens),
+                 start=(cap_padding_len + 1, 0, 0),
+                 device=device,
+             ).flatten(0, 2)  # [ori_len, 3]
+
+             pos_ids_pad = torch.zeros((padding_len, 3), dtype=torch.int32, device=device)
+             pos_ids_padded = torch.cat([pos_ids_ori, pos_ids_pad], dim=0)
+
+             c_freqs_cis_single = self.rope_embedder(pos_ids_padded)
+             c_freqs_cis = c_freqs_cis_single.unsqueeze(0).repeat(B, 1, 1, 1)
+             c_attn_mask = torch.ones((B, final_seq_len), dtype=torch.bool, device=device)
+
+             return {"c": c, "c_item_seqlens": [final_seq_len] * B, "attn_mask": c_attn_mask, "freqs_cis": c_freqs_cis, "adaln_input": t.type_as(c)}
+
+         (c_patches, _, c_pos_ids, c_inner_pad_mask) = self._patchify(control_list, patch_size, f_patch_size, cap_padding_len)
+
+         c_item_seqlens = [len(p) for p in c_patches]
+         c_max_item_seqlen = max(c_item_seqlens)
+
+         c = torch.cat(c_patches, dim=0)
+         c = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](c)
+         c[torch.cat(c_inner_pad_mask)] = self.x_pad_token
+         c = list(c.split(c_item_seqlens, dim=0))
+
+         c_freqs_cis_list = []
+         for pos_ids in c_pos_ids:
+             c_freqs_cis_list.append(self.rope_embedder(pos_ids))
+
+         c_padded = pad_sequence(c, batch_first=True, padding_value=0.0)
+         c_freqs_cis_padded = pad_sequence(c_freqs_cis_list, batch_first=True, padding_value=0.0)
+
+         seq_lens_tensor = torch.tensor(c_item_seqlens, device=device, dtype=torch.int32)
+         arange = torch.arange(c_max_item_seqlen, device=device, dtype=torch.int32)
+         c_attn_mask = arange[None, :] < seq_lens_tensor[:, None]
+
+         return {"c": c_padded, "c_item_seqlens": c_item_seqlens, "attn_mask": c_attn_mask, "freqs_cis": c_freqs_cis_padded, "adaln_input": t.type_as(c_padded)}
+
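+     # Both branches above project the control patches through control_all_x_embedder
+     # keyed by "{patch_size}-{f_patch_size}", so each registered patch-size combination
+     # has its own input projection; the fast path only skips the per-item Python loop
+     # when every control image in the batch shares one shape.
+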
+     def _patchify_and_embed_batch_optimized(self, all_image, all_cap_feats, patch_size, f_patch_size):
+         """
+         An optimized version of _patchify_and_embed for batches where all images and captions have
+         uniform shapes. It processes the entire batch using vectorized operations instead of a loop.
+
+         Args:
+             all_image (List[torch.Tensor]): List of image tensors, all of the same shape.
+             all_cap_feats (List[torch.Tensor]): List of caption features, all of the same shape.
+             patch_size (int): The spatial patch size.
+             f_patch_size (int): The frame/temporal patch size.
+
+         Returns:
+             Tuple: A tuple containing all processed data structures, matching the output of the standard method.
+         """
+         pH = pW = patch_size
+         pF = f_patch_size
+         device = all_image[0].device
+
+         image_shapes = [img.shape for img in all_image]
+         cap_shapes = [cap.shape for cap in all_cap_feats]
+
+         same_image_shape = all(s == image_shapes[0] for s in image_shapes)
+         same_cap_shape = all(s == cap_shapes[0] for s in cap_shapes)
+
+         if not (same_image_shape and same_cap_shape):
+             return self._patchify_and_embed(all_image, all_cap_feats, patch_size, f_patch_size)
+
+         images_batch = torch.stack(all_image, dim=0)
+         caps_batch = torch.stack(all_cap_feats, dim=0)
+
+         B, C, Fr, H, W = images_batch.shape
+         cap_ori_len = caps_batch.shape[1]
+
+         cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF
+         cap_total_len = cap_ori_len + cap_padding_len
+
+         if cap_padding_len > 0:
+             cap_pad = caps_batch[:, -1:, :].repeat(1, cap_padding_len, 1)
+             caps_batch = torch.cat([caps_batch, cap_pad], dim=1)
+
+         cap_pos_ids = self._create_coordinate_grid(size=(cap_total_len, 1, 1), start=(1, 0, 0), device=device).flatten(0, 2).unsqueeze(0).repeat(B, 1, 1)
+
+         cap_mask = torch.zeros((B, cap_total_len), dtype=torch.bool, device=device)
+         if cap_padding_len > 0:
+             cap_mask[:, cap_ori_len:] = True
+
+         F_tokens, H_tokens, W_tokens = Fr // pF, H // pH, W // pW
+         images_reshaped = (
+             images_batch.view(B, C, F_tokens, pF, H_tokens, pH, W_tokens, pW)
+             .permute(0, 2, 4, 6, 3, 5, 7, 1)
+             .reshape(B, F_tokens * H_tokens * W_tokens, pF * pH * pW * C)
+         )
+
+         image_ori_len = images_reshaped.shape[1]
+         image_padding_len = (-image_ori_len) % SEQ_MULTI_OF
+         image_total_len = image_ori_len + image_padding_len
+
+         if image_padding_len > 0:
+             img_pad = images_reshaped[:, -1:, :].repeat(1, image_padding_len, 1)
+             images_reshaped = torch.cat([images_reshaped, img_pad], dim=1)
+
+         image_pos_ids = (
+             self._create_coordinate_grid(size=(F_tokens, H_tokens, W_tokens), start=(cap_total_len + 1, 0, 0), device=device)
+             .flatten(0, 2)
+             .unsqueeze(0)
+             .repeat(B, 1, 1)
+         )
+
+         if image_padding_len > 0:
+             img_pos_pad = torch.zeros((B, image_padding_len, 3), dtype=torch.int32, device=device)
+             image_pos_ids = torch.cat([image_pos_ids, img_pos_pad], dim=1)
+
+         image_mask = torch.zeros((B, image_total_len), dtype=torch.bool, device=device)
+         if image_padding_len > 0:
+             image_mask[:, image_ori_len:] = True
+
+         all_image_size = [(Fr, H, W)] * B
+
+         return (
+             list(torch.unbind(images_reshaped, dim=0)),
+             list(torch.unbind(caps_batch, dim=0)),
+             all_image_size,
+             list(torch.unbind(image_pos_ids, dim=0)),
+             list(torch.unbind(cap_pos_ids, dim=0)),
+             list(torch.unbind(image_mask, dim=0)),
+             list(torch.unbind(cap_mask, dim=0)),
+         )
+
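+     # The vectorized path unbinds its batched tensors back into per-item lists so its
+     # return signature matches _patchify_and_embed, letting forward() below stay
+     # agnostic about which path produced the patches.
+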
+     def forward(
+         self,
+         x: List[torch.Tensor],
+         t,
+         cap_feats: List[torch.Tensor],
+         patch_size=2,
+         f_patch_size=1,
+         control_context=None,
+         conditioning_scale=1.0,
+         refiner_conditioning_scale=1.0,
+     ):
+         """
+         The main forward pass of the transformer model.
+
+         Args:
+             x (List[torch.Tensor]):
+                 A list of latent image tensors.
+             t (torch.Tensor):
+                 A batch of timesteps.
+             cap_feats (List[torch.Tensor]):
+                 A list of caption feature tensors.
+             patch_size (int, optional):
+                 The spatial patch size to use. Defaults to 2.
+             f_patch_size (int, optional):
+                 The frame/temporal patch size to use. Defaults to 1.
+             control_context (torch.Tensor, optional):
+                 The control context tensor. Defaults to None.
+             conditioning_scale (float, optional):
+                 The scale for applying control hints. Defaults to 1.0.
+             refiner_conditioning_scale (float, optional):
+                 The scale for applying refiner control hints. Defaults to 1.0.
+
+         Returns:
+             Transformer2DModelOutput: An object containing the final denoised sample.
+         """
+
+         is_control_mode = self.use_controlnet and control_context is not None and conditioning_scale > 0
+         if refiner_conditioning_scale is None:
+             refiner_conditioning_scale = conditioning_scale or 1.0
+
+         assert patch_size in self.all_patch_size
+         assert f_patch_size in self.all_f_patch_size
+
+         bsz = len(x)
+         device = x[0].device
+
+         t = t * self.t_scale
+         t = self.t_embedder(t)
+
+         can_optimize_patchify = (
+             bsz == len(cap_feats) and bsz >= 2 and all(img.shape == x[0].shape for img in x) and all(cap.shape == cap_feats[0].shape for cap in cap_feats)
+         )
+
+         if can_optimize_patchify:
+             (x_list, cap_feats_list, x_size, x_pos_ids, cap_pos_ids, x_inner_pad_mask, cap_inner_pad_mask) = self._patchify_and_embed_batch_optimized(
+                 x, cap_feats, patch_size, f_patch_size
+             )
+         else:
+             (x_list, cap_feats_list, x_size, x_pos_ids, cap_pos_ids, x_inner_pad_mask, cap_inner_pad_mask) = self._patchify_and_embed(
+                 x, cap_feats, patch_size, f_patch_size
+             )
+
+         x_item_seqlens = [len(i) for i in x_list]
+         x_max_item_seqlen = max(x_item_seqlens) if x_item_seqlens else 0
+         x_cat = torch.cat(x_list, dim=0) if x_list else torch.empty(0, 0, device=device)
+         x_embedded = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x_cat)
+         if x_inner_pad_mask and torch.cat(x_inner_pad_mask).any():
+             x_embedded[torch.cat(x_inner_pad_mask)] = self.x_pad_token
+         x = pad_sequence(list(x_embedded.split(x_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)
+         adaln_input = t.to(device).type_as(x)
+
+         cap_feats_padded, cap_freqs_cis, cap_attn_mask, cap_item_seqlens = self._process_cap_feats_with_cfg_cache(
+             cap_feats_list, cap_pos_ids, cap_inner_pad_mask
+         )
+
+         x_freqs_cis_cat = self.rope_embedder(torch.cat(x_pos_ids, dim=0)) if x_pos_ids else torch.empty(0, device=device)
+         x_freqs_cis = pad_sequence(list(x_freqs_cis_cat.split(x_item_seqlens, dim=0)), batch_first=True, padding_value=0.0)
+
+         seq_lens_tensor = torch.tensor(x_item_seqlens, device=device, dtype=torch.int32)
+         arange = torch.arange(x_max_item_seqlen, device=device, dtype=torch.int32)
+         x_attn_mask = arange[None, :] < seq_lens_tensor[:, None]
+
+         refiner_hints = None
+         if is_control_mode and self.is_two_stage_control:
+             prepared_control = self._prepare_control_inputs(control_context, cap_feats_padded, t, patch_size, f_patch_size, device)
+             c = prepared_control["c"]
+             kwargs_for_control_refiner = {
+                 "x": x,
+                 "attn_mask": x_attn_mask,  # was prepared_control["attn_mask"]
+                 "freqs_cis": x_freqs_cis,  # was prepared_control["freqs_cis"]
+                 "adaln_input": adaln_input,
+             }
+             c_processed = self._apply_transformer_blocks(
+                 c,
+                 self.control_noise_refiner if self.add_control_noise_refiner else self.control_layers,
+                 checkpoint_ratio=self.checkpoint_ratio,
+                 **kwargs_for_control_refiner,
+             )
+             refiner_hints = torch.unbind(c_processed)[:-1]
+             control_context_processed = torch.unbind(c_processed)[-1]
+             control_context_item_seqlens = prepared_control["c_item_seqlens"]
+
+         kwargs_for_refiner = {
+             "attn_mask": x_attn_mask,
+             "freqs_cis": x_freqs_cis,
+             "adaln_input": adaln_input,
+             "context_scale": refiner_conditioning_scale,
+         }
+         if refiner_hints is not None:
+             kwargs_for_refiner["hints"] = refiner_hints
+         x = self._apply_transformer_blocks(x, self.noise_refiner, checkpoint_ratio=1.0, **kwargs_for_refiner)
+
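+         # At this point x holds the refined latent tokens; in two-stage control mode,
+         # the per-layer refiner_hints were injected above and scaled by
+         # refiner_conditioning_scale via the context_scale keyword.
+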
+         kwargs_for_context = {"attn_mask": cap_attn_mask, "freqs_cis": cap_freqs_cis}
+         cap_feats = self._apply_transformer_blocks(cap_feats_padded, self.context_refiner, checkpoint_ratio=1.0, **kwargs_for_context)
+
+         unified_item_seqlens = [a + b for a, b in zip(x_item_seqlens, cap_item_seqlens)]
+         unified_max_item_seqlen = max(unified_item_seqlens) if unified_item_seqlens else 0
+         unified = torch.zeros((bsz, unified_max_item_seqlen, x.shape[-1]), dtype=x.dtype, device=device)
+         unified_freqs_cis = torch.zeros((bsz, unified_max_item_seqlen, x_freqs_cis.shape[-2], x_freqs_cis.shape[-1]), dtype=x_freqs_cis.dtype, device=device)
+
+         for i in range(bsz):
+             x_len = x_item_seqlens[i]
+             cap_len = cap_item_seqlens[i]
+             unified[i, :x_len] = x[i, :x_len]
+             unified[i, x_len : x_len + cap_len] = cap_feats[i, :cap_len]
+             unified_freqs_cis[i, :x_len] = x_freqs_cis[i, :x_len]
+             unified_freqs_cis[i, x_len : x_len + cap_len] = cap_freqs_cis[i, :cap_len]
+
+         seq_lens_tensor = torch.tensor(unified_item_seqlens, device=device, dtype=torch.int32)
+         arange = torch.arange(unified_max_item_seqlen, device=device, dtype=torch.int32)
+         unified_attn_mask = arange[None, :] < seq_lens_tensor[:, None]
+
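+         # Unified sequence layout per batch row i (illustrative): [x_item_seqlens[i]
+         # image tokens | cap_item_seqlens[i] caption tokens | zero padding up to
+         # unified_max_item_seqlen]; unified_attn_mask marks only the first
+         # x_len + cap_len positions as valid.
+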
+         hints = None
+         if is_control_mode:
+             kwargs_for_hints = {
+                 "attn_mask": unified_attn_mask,
+                 "freqs_cis": unified_freqs_cis,
+                 "adaln_input": adaln_input,
+             }
+             if self.is_two_stage_control:
+                 control_context_unified_list = [
+                     torch.cat([control_context_processed[i][: control_context_item_seqlens[i]], cap_feats[i, : cap_item_seqlens[i]]], dim=0) for i in range(bsz)
+                 ]
+                 c = pad_sequence(control_context_unified_list, batch_first=True, padding_value=0.0)
+                 new_kwargs = dict(x=unified, **kwargs_for_hints)
+                 c_processed = self._apply_transformer_blocks(c, self.control_layers, checkpoint_ratio=self.checkpoint_ratio, **new_kwargs)
+                 hints = torch.unbind(c_processed)[:-1]
+             else:
+                 prepared_control = self._prepare_control_inputs(control_context, cap_feats_padded, t, patch_size, f_patch_size, device)
+                 c = prepared_control["c"]
+                 kwargs_for_v1_refiner = {
+                     "attn_mask": prepared_control["attn_mask"],
+                     "freqs_cis": prepared_control["freqs_cis"],
+                     "adaln_input": prepared_control["adaln_input"],
+                 }
+                 c = self._apply_transformer_blocks(c, self.control_noise_refiner, checkpoint_ratio=self.checkpoint_ratio, **kwargs_for_v1_refiner)
+                 c_item_seqlens = prepared_control["c_item_seqlens"]
+                 control_context_unified_list = [torch.cat([c[i, : c_item_seqlens[i]], cap_feats[i, : cap_item_seqlens[i]]], dim=0) for i in range(bsz)]
+                 c_unified = pad_sequence(control_context_unified_list, batch_first=True, padding_value=0.0)
+                 new_kwargs = dict(x=unified, **kwargs_for_hints)
+                 c_processed = self._apply_transformer_blocks(c_unified, self.control_layers, checkpoint_ratio=self.checkpoint_ratio, **new_kwargs)
+                 hints = torch.unbind(c_processed)[:-1]
+
+         kwargs_for_layers = {"attn_mask": unified_attn_mask, "freqs_cis": unified_freqs_cis, "adaln_input": adaln_input}
+         if hints is not None:
+             kwargs_for_layers["hints"] = hints
+             kwargs_for_layers["context_scale"] = conditioning_scale
+         unified = self._apply_transformer_blocks(unified, self.layers, checkpoint_ratio=self.checkpoint_ratio, **kwargs_for_layers)
+
+         unified_out = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input)
+         x_image_tokens = unified_out[:, :x_max_item_seqlen]
+         x_final_tensor = self._unpatchify(x_image_tokens, x_size, patch_size, f_patch_size)
+
+         return Transformer2DModelOutput(sample=x_final_tensor)