kelseye committed · verified
Commit 9355758 · 1 Parent(s): 3ad80be

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/cat_Inpaint_1.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/cat_Inpaint_2.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/cat_base.jpg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,194 @@
---
license: apache-2.0
---
# Templates-Inpainting (FLUX.2-klein-base-4B)

This model is part of the open-source Diffusion Templates series from [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio). Built specifically for inpainting, it takes an original image and a mask image and generates new content in the masked region from a natural-language prompt, blending it seamlessly with the surrounding unmasked background.
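
The mask is a plain black-and-white image at the size of the original; following the convention in the bundled `model.py` (pixels brighter than 127 mark the region to regenerate), white means "repaint" and black means "keep". A minimal sketch for drawing such a mask with PIL; the file names and rectangle coordinates are placeholders:

```python
from PIL import Image, ImageDraw

base = Image.open("assets/cat_base.jpg")        # placeholder source image
mask = Image.new("L", base.size, 0)             # all black: keep everything
draw = ImageDraw.Draw(mask)
draw.rectangle((120, 200, 420, 560), fill=255)  # white box: region to repaint
mask.convert("RGB").save("my_mask.jpg")
```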

## Results

| Reference | Prompt | Mask | Generated |
|:---:|:---|:---:|:---:|
| ![](./assets/cat_base.jpg) | An orange cat is sitting on a stone. | ![](./assets/cat_mask_1.jpg) | ![](./assets/cat_Inpaint_1.jpg) |
| ![](./assets/cat_base.jpg) | A cat wearing sunglasses is sitting on a stone. | ![](./assets/cat_mask_2.jpg) | ![](./assets/cat_Inpaint_2.jpg) |

| Reference | Prompt | Mask | Generated |
|:---:|:---|:---:|:---:|
| ![](./assets/girl_base.jpg) | A beautiful young woman wearing a woven straw hat with a ribbon standing in a sunflower field. | ![](./assets/girl_mask_1.jpg) | ![](./assets/girl_Inpaint_1.jpg) |
| ![](./assets/girl_base.jpg) | A beautiful young woman wearing an elegant white dress standing in a glowing sunflower field. | ![](./assets/girl_mask_2.jpg) | ![](./assets/girl_Inpaint_2.jpg) |

| Reference | Prompt | Mask | Generated |
|:---:|:---|:---:|:---:|
| ![](./assets/room_base.jpg) | A sleek glass vase with a single blooming white lily and an open minimalist art book resting on the circular white marble coffee table. | ![](./assets/room1.jpg) | ![](./assets/room_Inpaint_1.jpg) |
| ![](./assets/room_base.jpg) | A large, minimalist flower painting hanging on the clean off-white wall above the sofa, soft shadows. | ![](./assets/room2.jpg) | ![](./assets/room_Inpaint_2.jpg) |

## Inference Code

* Install [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio):

```shell
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

* Direct inference (requires 40 GB of GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image
```

```python
pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
)
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-Inpaint")],
)
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="An orange cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
        "force_inpaint": True,
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
    }],
)
image.save("image_Inpaint_1.jpg")
image = template(
    pipe,
    prompt="A cat wearing sunglasses is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
)
image.save("image_Inpaint_2.jpg")
```
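
To run the same pipeline on your own picture, point the template inputs at local files (the paths and prompt below are placeholders). Per the bundled `model.py`, `"force_inpaint": True` additionally hands the original image and mask back to the pipeline so the unmasked background is preserved in the final output:

```python
# Placeholder paths: any RGB image plus a same-size black-and-white mask
# (white = region to regenerate).
my_image = Image.open("my_photo.jpg")
my_mask = Image.open("my_mask.jpg")
image = template(
    pipe,
    prompt="A red wooden bench in a park.",  # placeholder prompt
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{"image": my_image, "mask": my_mask, "force_inpaint": True}],
    negative_template_inputs=[{"image": my_image, "mask": my_mask}],
)
image.save("my_result.jpg")
```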

* Inference with lazy loading and VRAM management enabled (requires 24 GB of GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image
```

```python
# Weights rest on disk, are staged on the CPU in FP8, and are cast to BF16
# on the GPU for computation.
vram_config = {
    "offload_dtype": "disk",
    "offload_device": "disk",
    "onload_dtype": torch.float8_e4m3fn,
    "onload_device": "cpu",
    "preparing_dtype": torch.float8_e4m3fn,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,  # total VRAM in GiB minus a 0.5 GiB safety margin
)
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-Inpaint")],
    lazy_loading=True,
)
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="An orange cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
        "force_inpaint": True,
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
    }],
)
image.save("image_Inpaint_1.jpg")
image = template(
    pipe,
    prompt="A cat wearing sunglasses is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
)
image.save("image_Inpaint_2.jpg")
```

## Training Code

After installing DiffSynth-Studio, use the following script to start training. Masks do not need to be provided: the bundled `TrainDataProcessor` in `model.py` synthesizes a random rectangular mask for each training image on the fly. For more information, please refer to the [DiffSynth-Studio documentation](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/).

```shell
modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "flux2/Template-KleinBase4B-Inpaint/*" --local_dir ./data/diffsynth_example_dataset

accelerate launch examples/flux2/model_training/train.py \
  --dataset_base_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-Inpaint \
  --dataset_metadata_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-Inpaint/metadata.jsonl \
  --extra_inputs "template_inputs" \
  --max_pixels 1048576 \
  --dataset_repeat 50 \
  --model_id_with_origin_paths "black-forest-labs/FLUX.2-klein-4B:text_encoder/*.safetensors,black-forest-labs/FLUX.2-klein-base-4B:transformer/*.safetensors,black-forest-labs/FLUX.2-klein-4B:vae/diffusion_pytorch_model.safetensors" \
  --template_model_id_or_path "DiffSynth-Studio/Template-KleinBase4B-Inpaint:" \
  --tokenizer_path "black-forest-labs/FLUX.2-klein-4B:tokenizer/" \
  --learning_rate 1e-4 \
  --num_epochs 2 \
  --remove_prefix_in_ckpt "pipe.template_model." \
  --output_path "./models/train/Template-KleinBase4B-Inpaint_full" \
  --trainable_models "template_model" \
  --use_gradient_checkpointing \
  --find_unused_parameters
```
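
After training, the checkpoint written under `--output_path` should be usable in place of the released template weights. A sketch, assuming `ModelConfig` accepts a local `path` argument as in other DiffSynth-Studio training examples; the exact checkpoint file name is an assumption:

```python
# Assumption: training saves per-epoch safetensors files under --output_path;
# adjust the file name to whatever your run actually produced.
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(path="models/train/Template-KleinBase4B-Inpaint_full/epoch-1.safetensors")],
)
```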
README_from_modelscope.md ADDED
@@ -0,0 +1,196 @@
---
frameworks:
- Pytorch
license: Apache License 2.0
tags: []
tasks:
- text-to-image-synthesis
---

# Templates-Inpainting (FLUX.2-klein-base-4B)

This model is part of the open-source Diffusion Templates series from [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio). Built specifically for inpainting, it takes an original image and a mask image and generates new content in the masked region from a natural-language prompt, blending it seamlessly with the surrounding unmasked background.

## Results

| Reference | Prompt | Mask | Generated |
|:---:|:---|:---:|:---:|
| ![](./assets/cat_base.jpg) | An orange cat is sitting on a stone. | ![](./assets/cat_mask_1.jpg) | ![](./assets/cat_Inpaint_1.jpg) |
| ![](./assets/cat_base.jpg) | A cat wearing sunglasses is sitting on a stone. | ![](./assets/cat_mask_2.jpg) | ![](./assets/cat_Inpaint_2.jpg) |

| Reference | Prompt | Mask | Generated |
|:---:|:---|:---:|:---:|
| ![](./assets/girl_base.jpg) | A beautiful young woman wearing a woven straw hat with a ribbon standing in a sunflower field. | ![](./assets/girl_mask_1.jpg) | ![](./assets/girl_Inpaint_1.jpg) |
| ![](./assets/girl_base.jpg) | A beautiful young woman wearing an elegant white dress standing in a glowing sunflower field. | ![](./assets/girl_mask_2.jpg) | ![](./assets/girl_Inpaint_2.jpg) |

| Reference | Prompt | Mask | Generated |
|:---:|:---|:---:|:---:|
| ![](./assets/room_base.jpg) | A sleek glass vase with a single blooming white lily and an open minimalist art book resting on the circular white marble coffee table. | ![](./assets/room1.jpg) | ![](./assets/room_Inpaint_1.jpg) |
| ![](./assets/room_base.jpg) | A large, minimalist flower painting hanging on the clean off-white wall above the sofa, soft shadows. | ![](./assets/room2.jpg) | ![](./assets/room_Inpaint_2.jpg) |

## Inference Code

* Install [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio):

```shell
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

* Direct inference (requires 40 GB of GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image

pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
)
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-Inpaint")],
)
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="An orange cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
        "force_inpaint": True,
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
    }],
)
image.save("image_Inpaint_1.jpg")
image = template(
    pipe,
    prompt="A cat wearing sunglasses is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
)
image.save("image_Inpaint_2.jpg")
```

* Inference with lazy loading and VRAM management enabled (requires 24 GB of GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image

# Weights rest on disk, are staged on the CPU in FP8, and are cast to BF16
# on the GPU for computation.
vram_config = {
    "offload_dtype": "disk",
    "offload_device": "disk",
    "onload_dtype": torch.float8_e4m3fn,
    "onload_device": "cpu",
    "preparing_dtype": torch.float8_e4m3fn,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,  # total VRAM in GiB minus a 0.5 GiB safety margin
)
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-Inpaint")],
    lazy_loading=True,
)
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="An orange cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
        "force_inpaint": True,
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_1.jpg"),
    }],
)
image.save("image_Inpaint_1.jpg")
image = template(
    pipe,
    prompt="A cat wearing sunglasses is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.open("data/examples/templates/image_reference.jpg"),
        "mask": Image.open("data/examples/templates/image_mask_2.jpg"),
    }],
)
image.save("image_Inpaint_2.jpg")
```

## Training Code

After installing DiffSynth-Studio, use the following script to start training. Masks do not need to be provided: the bundled `TrainDataProcessor` in `model.py` synthesizes a random rectangular mask for each training image on the fly. For more information, please refer to the [DiffSynth-Studio documentation](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/).

```shell
modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "flux2/Template-KleinBase4B-Inpaint/*" --local_dir ./data/diffsynth_example_dataset

accelerate launch examples/flux2/model_training/train.py \
  --dataset_base_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-Inpaint \
  --dataset_metadata_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-Inpaint/metadata.jsonl \
  --extra_inputs "template_inputs" \
  --max_pixels 1048576 \
  --dataset_repeat 50 \
  --model_id_with_origin_paths "black-forest-labs/FLUX.2-klein-4B:text_encoder/*.safetensors,black-forest-labs/FLUX.2-klein-base-4B:transformer/*.safetensors,black-forest-labs/FLUX.2-klein-4B:vae/diffusion_pytorch_model.safetensors" \
  --template_model_id_or_path "DiffSynth-Studio/Template-KleinBase4B-Inpaint:" \
  --tokenizer_path "black-forest-labs/FLUX.2-klein-4B:tokenizer/" \
  --learning_rate 1e-4 \
  --num_epochs 2 \
  --remove_prefix_in_ckpt "pipe.template_model." \
  --output_path "./models/train/Template-KleinBase4B-Inpaint_full" \
  --trainable_models "template_model" \
  --use_gradient_checkpointing \
  --find_unused_parameters
```
assets/cat_Inpaint_1.jpg ADDED

Git LFS Details

  • SHA256: 61f144a198a21d0e552a6c259a2d8376e4edd9129b63c0b8d07772b2a3f9ffb8
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
assets/cat_Inpaint_2.jpg ADDED

Git LFS Details

  • SHA256: 8a75e21783a96c9738d6a68f69390ca62fd19711e084e887a105c4604efcf2dc
  • Pointer size: 131 Bytes
  • Size of remote file: 132 kB
assets/cat_base.jpg ADDED

Git LFS Details

  • SHA256: f113000383ad9e079689cd5be415aea94e80ae5ef597ec062e5ad94f4c95a63a
  • Pointer size: 131 Bytes
  • Size of remote file: 129 kB
assets/cat_mask_1.jpg ADDED
assets/cat_mask_2.jpg ADDED
assets/girl_Inpaint_1.jpg ADDED
assets/girl_Inpaint_2.jpg ADDED
assets/girl_base.jpg ADDED
assets/girl_mask_1.jpg ADDED
assets/girl_mask_2.jpg ADDED
assets/room1.jpg ADDED
assets/room2.jpg ADDED
assets/room_Inpaint_1.jpg ADDED
assets/room_Inpaint_2.jpg ADDED
assets/room_base.jpg ADDED
configuration.json ADDED
@@ -0,0 +1 @@
{"framework":"Pytorch","task":"text-to-image-synthesis"}
model.py ADDED
@@ -0,0 +1,378 @@
from typing import Any, Dict, List, Optional, Tuple, Union
import torch
import math
import torch.nn as nn
from einops import rearrange
from diffsynth.core.attention import attention_forward
from diffsynth.core.gradient import gradient_checkpoint_forward
from diffsynth.models.flux2_dit import apply_rotary_emb, Flux2PosEmbed
from diffsynth.models.general_modules import get_timestep_embedding
from PIL import Image
import numpy as np


class AdaLayerNormContinuous(nn.Module):
    def __init__(self, dim_in, dim_out, eps=1e-6):
        super().__init__()
        self.linear = nn.Linear(dim_in, dim_out * 2, bias=False)
        self.norm = nn.LayerNorm(dim_in, eps=eps, elementwise_affine=False, bias=False)

    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
        scale, shift = self.linear(torch.nn.functional.silu(conditioning_embedding)).chunk(2, dim=1)
        x = self.norm(x) * (1 + scale) + shift
        return x


class Flux2FeedForward(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.linear_in = nn.Linear(dim, dim * 3 * 2, bias=False)
        self.linear_out = nn.Linear(dim * 3, dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # SwiGLU-style gating: one projected half gates the other.
        x1, x2 = self.linear_in(x).chunk(2, dim=-1)
        x = torch.nn.functional.silu(x1) * x2
        x = self.linear_out(x)
        return x


class Flux2TransformerBlock(nn.Module):
    def __init__(self, dim, num_heads, eps=1e-6):
        super().__init__()
        self.img_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.txt_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)

        self.img_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.img_ff = Flux2FeedForward(dim)
        self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.txt_ff = Flux2FeedForward(dim)

        self.num_heads = num_heads
        self.img_to_qkv = torch.nn.Linear(dim, 3 * dim, bias=False)
        self.img_norm_q = torch.nn.RMSNorm(dim // num_heads, eps=eps)
        self.img_norm_k = torch.nn.RMSNorm(dim // num_heads, eps=eps)
        self.img_to_out = torch.nn.Linear(dim, dim, bias=False)
        self.txt_to_qkv = torch.nn.Linear(dim, 3 * dim, bias=False)
        self.txt_norm_q = torch.nn.RMSNorm(dim // num_heads, eps=eps)
        self.txt_norm_k = torch.nn.RMSNorm(dim // num_heads, eps=eps)
        self.txt_to_out = torch.nn.Linear(dim, dim, bias=False)

    def attention(self, img: torch.Tensor, txt: torch.Tensor, image_rotary_emb: torch.Tensor, **kwargs):
        img_q, img_k, img_v = self.img_to_qkv(img).chunk(3, dim=-1)
        txt_q, txt_k, txt_v = self.txt_to_qkv(txt).chunk(3, dim=-1)
        img_q, img_k, img_v, txt_q, txt_k, txt_v = tuple(map(lambda x: x.unflatten(-1, (self.num_heads, -1)), (img_q, img_k, img_v, txt_q, txt_k, txt_v)))
        img_q = self.img_norm_q(img_q)
        img_k = self.img_norm_k(img_k)
        txt_q = self.txt_norm_q(txt_q)
        txt_k = self.txt_norm_k(txt_k)

        # Joint attention over the concatenated [text, image] sequence.
        q = torch.cat([txt_q, img_q], dim=1)
        k = torch.cat([txt_k, img_k], dim=1)
        v = torch.cat([txt_v, img_v], dim=1)
        q = apply_rotary_emb(q, image_rotary_emb, sequence_dim=1)
        k = apply_rotary_emb(k, image_rotary_emb, sequence_dim=1)

        img = attention_forward(q, k, v, q_pattern="b s n d", k_pattern="b s n d", v_pattern="b s n d", out_pattern="b s (n d)")
        txt, img = img.split_with_sizes([txt.shape[1], img.shape[1] - txt.shape[1]], dim=1)
        txt = self.txt_to_out(txt)
        img = self.img_to_out(img)
        return img, txt, (k, v)

    def forward(self, img, txt, temb_mod_params_img, temb_mod_params_txt, image_rotary_emb):
        (img_shift_msa, img_scale_msa, img_gate_msa), (img_shift_mlp, img_scale_mlp, img_gate_mlp) = temb_mod_params_img
        (txt_shift_msa, txt_scale_msa, txt_gate_msa), (txt_shift_mlp, txt_scale_mlp, txt_gate_mlp) = temb_mod_params_txt

        norm_img = (1 + img_scale_msa) * self.img_norm1(img) + img_shift_msa
        norm_txt = (1 + txt_scale_msa) * self.txt_norm1(txt) + txt_shift_msa
        img_attn_out, txt_attn_out, kv_cache = self.attention(norm_img, norm_txt, image_rotary_emb)

        img = img + img_gate_msa * img_attn_out
        norm_img = self.img_norm2(img) * (1 + img_scale_mlp) + img_shift_mlp
        img = img + img_gate_mlp * self.img_ff(norm_img)

        txt = txt + txt_gate_msa * txt_attn_out
        norm_txt = self.txt_norm2(txt) * (1 + txt_scale_mlp) + txt_shift_mlp
        txt = txt + txt_gate_mlp * self.txt_ff(norm_txt)
        return txt, img, kv_cache


class Flux2SingleTransformerBlock(nn.Module):
    def __init__(self, dim, num_heads, eps: float = 1e-6):
        super().__init__()
        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.dim = dim
        self.num_heads = num_heads
        self.norm_q = torch.nn.RMSNorm(dim // num_heads, eps=eps, elementwise_affine=True)
        self.norm_k = torch.nn.RMSNorm(dim // num_heads, eps=eps, elementwise_affine=True)
        self.to_qkv_mlp_proj = torch.nn.Linear(dim, dim * 3 + dim * 3 * 2, bias=False)
        self.to_out = torch.nn.Linear(dim + dim * 3, dim, bias=False)

    def attention(self, x: torch.Tensor, image_rotary_emb: Optional[torch.Tensor] = None, **kwargs):
        # Fused projection: QKV and the parallel MLP branch share one matmul.
        x = self.to_qkv_mlp_proj(x)
        qkv, mlp_x = torch.split(x, [3 * self.dim, self.dim * 3 * 2], dim=-1)
        q, k, v = tuple(map(lambda x: x.unflatten(-1, (self.num_heads, -1)), qkv.chunk(3, dim=-1)))

        q = self.norm_q(q)
        k = self.norm_k(k)
        q = apply_rotary_emb(q, image_rotary_emb, sequence_dim=1)
        k = apply_rotary_emb(k, image_rotary_emb, sequence_dim=1)
        x = attention_forward(q, k, v, q_pattern="b s n d", k_pattern="b s n d", v_pattern="b s n d", out_pattern="b s (n d)")

        x1, x2 = mlp_x.chunk(2, dim=-1)
        x = torch.cat([x, torch.nn.functional.silu(x1) * x2], dim=-1)
        x = self.to_out(x)
        return x, (k, v)

    def forward(self, x, temb_mod_params, image_rotary_emb):
        mod_shift, mod_scale, mod_gate = temb_mod_params
        norm_x = (1 + mod_scale) * self.norm(x) + mod_shift
        attn_output, kv_cache = self.attention(x=norm_x, image_rotary_emb=image_rotary_emb)
        x = x + mod_gate * attn_output
        return x, kv_cache


class Flux2TimestepGuidanceEmbeddings(nn.Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.dim_in = dim_in
        self.timestep_embedder = torch.nn.Sequential(nn.Linear(dim_in, dim_out, bias=False), nn.SiLU(), nn.Linear(dim_out, dim_out, bias=False))

    def forward(self, timestep: torch.Tensor) -> torch.Tensor:
        timesteps_proj = get_timestep_embedding(timestep, self.dim_in, flip_sin_to_cos=True, downscale_freq_shift=0)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(timestep.dtype))
        return timesteps_emb


class Flux2Modulation(nn.Module):
    def __init__(self, dim: int, mod_param_sets: int = 2, bias: bool = False):
        super().__init__()
        self.mod_param_sets = mod_param_sets
        self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)

    def forward(self, temb: torch.Tensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
        # Produce mod_param_sets groups of (shift, scale, gate) from the timestep embedding.
        mod = torch.nn.functional.silu(temb)
        mod = self.linear(mod)
        mod = mod.unsqueeze(1)
        mod_params = torch.chunk(mod, 3 * self.mod_param_sets, dim=-1)
        return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(self.mod_param_sets))


class Flux2DiTVariantModel(torch.nn.Module):
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 128,
        out_channels: Optional[int] = None,
        num_layers: int = 5,
        num_single_layers: int = 20,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 7680,
        timestep_guidance_channels: int = 256,
        axes_dims_rope: Tuple[int, ...] = (32, 32, 32, 32),
        rope_theta: int = 2000,
    ):
        super().__init__()
        self.out_channels = out_channels or in_channels
        self.inner_dim = num_attention_heads * attention_head_dim

        # 1. Sinusoidal positional embedding for RoPE on image and text tokens
        self.pos_embed = Flux2PosEmbed(theta=rope_theta, axes_dim=axes_dims_rope)

        # 2. Combined timestep + guidance embedding
        self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings(
            dim_in=timestep_guidance_channels,
            dim_out=self.inner_dim,
        )

        # 3. Modulation (double stream and single stream blocks share modulation parameters, resp.)
        # Two sets of shift/scale/gate modulation parameters for the double stream attn and FF sub-blocks
        self.double_stream_modulation_img = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
        self.double_stream_modulation_txt = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
        # Only one set of modulation parameters as the attn and FF sub-blocks are run in parallel for single stream
        self.single_stream_modulation = Flux2Modulation(self.inner_dim, mod_param_sets=1, bias=False)

        # 4. Input projections
        self.img_embedder = nn.Linear(in_channels, self.inner_dim, bias=False)
        self.txt_embedder = nn.Linear(joint_attention_dim, self.inner_dim, bias=False)

        # 5. Double Stream Transformer Blocks
        self.transformer_blocks = nn.ModuleList([Flux2TransformerBlock(dim=self.inner_dim, num_heads=num_attention_heads) for _ in range(num_layers)])

        # 6. Single Stream Transformer Blocks
        self.single_transformer_blocks = nn.ModuleList([Flux2SingleTransformerBlock(dim=self.inner_dim, num_heads=num_attention_heads) for _ in range(num_single_layers)])

        # 7. Output layers
        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim)
        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)

    def prepare_static_parameters(self, img, txt):
        # The template always runs at timestep 0; position ids give each
        # conditioning latent its own offset (10, 20, ...) on the first axis.
        timestep = torch.zeros((1,), dtype=txt.dtype, device=txt.device)
        img_ids = []
        for latent_id, latent in enumerate(img):
            _, _, height, width = latent.shape
            x_ids = torch.cartesian_prod(torch.tensor([(latent_id + 1) * 10]), torch.arange(height), torch.arange(width), torch.arange(1))
            img_ids.append(x_ids)
        img_ids = torch.cat(img_ids, dim=0).to(txt.device)
        txt_ids = torch.cartesian_prod(torch.arange(1), torch.arange(1), torch.arange(1), torch.arange(txt.shape[1])).to(txt.device)
        return timestep, img_ids, txt_ids

    def patchify(self, img):
        # Flatten each latent's spatial grid into a token sequence and
        # concatenate all conditioning latents along the sequence axis.
        img_ = []
        for latent in img:
            latent = rearrange(latent, "B C H W -> B (H W) C")
            img_.append(latent)
        img_ = torch.concat(img_, dim=1)
        return img_

    def process_image(self, image, mask):
        # Blank out the masked region of the source image: pixels where the
        # (grayscale) mask is bright (> 127) are the area to be regenerated.
        mask = mask.convert("RGB").resize(image.size)
        mask = np.array(mask).mean(axis=-1)
        image = np.array(image)
        image[mask > 127] = 0
        return Image.fromarray(image), Image.fromarray(mask.astype(np.uint8)).convert("RGB")

    @torch.no_grad()
    def process_inputs(
        self,
        pipe,
        image,
        mask,
        prompt="Complete the content in the annotated region of the image.",
        force_inpaint=False,
        **kwargs
    ):
        # Encode the masked image and the mask with the VAE and embed the
        # template prompt; these become the context from which the KV cache is built.
        masked_image, mask = self.process_image(image, mask)
        images = [masked_image, mask]
        pipe.load_models_to_device(["vae"])
        kv_cache_input_latents = [pipe.vae.encode(pipe.preprocess_image(image)) for image in images]
        prompt_emb_unit = [unit for unit in pipe.units if unit.__class__.__name__ == "Flux2Unit_Qwen3PromptEmbedder"][0]
        kv_cache_prompt_emb = prompt_emb_unit.process(pipe, prompt)["prompt_embeds"]
        pipe.load_models_to_device([])
        return {
            "kv_cache_input_latents": kv_cache_input_latents,
            "kv_cache_prompt_emb": kv_cache_prompt_emb,
            "image": image,
            "mask": mask,
            "force_inpaint": force_inpaint,
        }

    def forward(
        self,
        kv_cache_input_latents,
        kv_cache_prompt_emb,
        use_gradient_checkpointing=False,
        use_gradient_checkpointing_offload=False,
        image=None,
        mask=None,
        force_inpaint=False,
        **kwargs,
    ):
        img = kv_cache_input_latents
        txt = kv_cache_prompt_emb
        num_txt_tokens = txt.shape[1]

        # 1. Calculate timestep embedding and modulation parameters
        timestep, img_ids, txt_ids = self.prepare_static_parameters(img, txt)
        img = self.patchify(img)

        temb = self.time_guidance_embed(timestep)
        double_stream_mod_img = self.double_stream_modulation_img(temb)
        double_stream_mod_txt = self.double_stream_modulation_txt(temb)
        single_stream_mod = self.single_stream_modulation(temb)[0]

        # 2. Input projection for image (img) and conditioning text (txt)
        img = self.img_embedder(img)
        txt = self.txt_embedder(txt)

        # 3. Calculate RoPE embeddings from image and text tokens
        image_rotary_emb = self.pos_embed(img_ids)
        text_rotary_emb = self.pos_embed(txt_ids)
        concat_rotary_emb = (
            torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0),
            torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0),
        )

        # 4. Double Stream Transformer Blocks
        kv_cache = {}
        for block_id, block in enumerate(self.transformer_blocks):
            txt, img, kv_cache_ = gradient_checkpoint_forward(
                block,
                use_gradient_checkpointing=use_gradient_checkpointing,
                use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
                img=img,
                txt=txt,
                temb_mod_params_img=double_stream_mod_img,
                temb_mod_params_txt=double_stream_mod_txt,
                image_rotary_emb=concat_rotary_emb,
            )
            kv_cache[f"double_{block_id}"] = kv_cache_
        # Concatenate text and image streams for single-block inference
        img = torch.cat([txt, img], dim=1)

        # 5. Single Stream Transformer Blocks
        for block_id, block in enumerate(self.single_transformer_blocks):
            img, kv_cache_ = gradient_checkpoint_forward(
                block,
                use_gradient_checkpointing=use_gradient_checkpointing,
                use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
                x=img,
                temb_mod_params=single_stream_mod,
                image_rotary_emb=concat_rotary_emb,
            )
            kv_cache[f"single_{block_id}"] = kv_cache_
        # # Remove text tokens from concatenated stream
        # img = img[:, num_txt_tokens:, ...]

        # # 6. Output layers
        # img = self.norm_out(img, temb)
        # output = self.proj_out(img)

        # The template's product is the per-block KV cache; the denoising
        # pipeline consumes it as extra attention context.
        results = {"kv_cache": kv_cache}
        if force_inpaint:
            # Also return the original image and mask so the pipeline can
            # blend the generated region back into the untouched background.
            results.update({
                "input_image": image,
                "inpaint_mask": mask,
                "inpaint_blur_size": 1,
                "inpaint_blur_sigma": 1,
            })
        return results


class TrainDataProcessor:
    def __init__(self):
        from diffsynth.core import UnifiedDataset
        self.image_operator = UnifiedDataset.default_image_operator(
            base_path="",  # If your dataset contains relative paths, please specify the root path here.
            max_pixels=1024*1024,
            height_division_factor=16,
            width_division_factor=16,
        )

    def generate_bbox(self, height, width):
        # Sample a random box, at least 10 px per side, fully inside the image.
        h = torch.randint(10, height - 10, (1,)).item()
        w = torch.randint(10, width - 10, (1,)).item()
        x = torch.randint(0, height - h, (1,)).item()
        y = torch.randint(0, width - w, (1,)).item()
        return x, x + h, y, y + w

    def generate_mask(self, image):
        # Black out a random rectangle in the image and return the matching
        # white-on-black mask, mirroring the inference-time convention.
        image = np.array(image)
        height, width, _ = image.shape
        x, x_, y, y_ = self.generate_bbox(height, width)
        image[x: x_, y: y_] = 0

        mask = np.zeros_like(image)
        mask[x: x_, y: y_] = 255
        return Image.fromarray(image), Image.fromarray(mask)

    def __call__(self, image, **kwargs):
        image = self.image_operator(image)
        image, mask = self.generate_mask(image)
        return {
            "image": image,
            "mask": mask,
        }


TEMPLATE_MODEL = Flux2DiTVariantModel
TEMPLATE_MODEL_PATH = "model.safetensors"
TEMPLATE_DATA_PROCESSOR = TrainDataProcessor
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:57ef6ed07ef3a159bb2c5424c12efef054b3250c2d25ba1ff1cf6960b6d4cb94
size 7751106784