| import os |
| import re |
| import time |
| from io import BytesIO |
| import uuid |
| from dataclasses import dataclass |
| from glob import iglob |
| import argparse |
| from einops import rearrange |
| |
| from PIL import ExifTags, Image |
| from safetensors.torch import load_file, save_file |
| import spaces |
|
|
| import torch |
| import torch.nn.functional as F |
| import gradio as gr |
| import numpy as np |
| from transformers import pipeline |
|
|
| from src.flux.sampling import denoise_fireflow, get_schedule, prepare, prepare_image, unpack, denoise_rf, denoise_rf_solver, denoise_midpoint, denoise_rf_inversion, denoise_multi_turn_consistent, get_noise |
| from src.flux.util import (configs, embed_watermark, load_ae, load_clip, load_flow_model, load_t5) |
|
|
| @dataclass |
| class SamplingOptions: |
| source_prompt: str |
| target_prompt: str |
| |
| width: int |
| height: int |
| num_steps: int |
| guidance: float |
| seed: int | None |
|
|
|
|
| torch_device = "cuda" if torch.cuda.is_available() else "cpu" |
| offload = False |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| name = 'flux-dev' |
| ae = load_ae(name, device="cpu" if offload else torch_device) |
| t5 = load_t5(device, max_length=256 if name == "flux-schnell" else 512) |
| clip = load_clip(device) |
| model = load_flow_model(name, device="cpu" if offload else torch_device) |
| t5.eval() |
| clip.eval() |
| ae.eval() |
| model.eval() |
|
|
| is_schnell = False |
| add_sampling_metadata = True |
|
|
| |
| if os.path.exists("history_gradio/history.safetensors"): |
| os.remove("history_gradio/history.safetensors") |
|
|
| out_root = 'src/gradio_utils/gradio_outputs' |
| out_root_prompt = 'src/gradio_utils/gradio_prompts' |
| if not os.path.exists(out_root): |
| os.makedirs(out_root) |
| if not os.path.exists(out_root_prompt): |
| os.makedirs(out_root_prompt) |
|
|
| exp_folders = [d for d in os.listdir(out_root) if d.startswith("exp_") and d[4:].isdigit()] |
| if exp_folders: |
| max_idx = max(int(d[4:]) for d in exp_folders) |
| name_dir = f"exp_{max_idx + 1}" |
| else: |
| name_dir = "exp_0" |
| output_dir = os.path.join(out_root, name_dir) |
| output_prompt = os.path.join(out_root_prompt, name_dir) |
|
|
| if not os.path.exists(output_dir): |
| os.makedirs(output_dir) |
| if not os.path.exists(output_prompt): |
| os.makedirs(output_prompt) |
| if not os.path.exists("heatmap"): |
| os.makedirs("heatmap") |
| if not os.path.exists("heatmap/average_heatmaps"): |
| os.makedirs("heatmap/average_heatmaps") |
| source_image = None |
| history_tensors = { |
| "source img": torch.zeros((1, 1, 1)), |
| "prev img": torch.zeros((1, 1, 1))} |
| instructions = [''] |
|
|
|
|
| def read_sorted_prompts(folder_path): |
| |
| files = sorted([f for f in os.listdir(folder_path) if f.endswith('.txt')]) |
| prompts = [] |
| for filename in files: |
| file_path = os.path.join(folder_path, filename) |
| with open(file_path, 'r') as f: |
| prompt = f.read().strip() |
| prompts.append(prompt) |
| return prompts |
|
|
|
|
| @torch.inference_mode() |
| def reset(): |
|
|
| |
| if os.path.exists("history_gradio/history.safetensors"): |
| os.remove("history_gradio/history.safetensors") |
|
|
| global out_root, out_root_prompt, output_dir, output_prompt, history_tensors, source_image, instructions |
| if not os.path.exists(out_root): |
| os.makedirs(out_root) |
| if not os.path.exists(out_root_prompt): |
| os.makedirs(out_root_prompt) |
| exp_folders = [d for d in os.listdir(out_root) if d.startswith("exp_") and d[4:].isdigit()] |
| if exp_folders: |
| max_idx = max(int(d[4:]) for d in exp_folders) |
| name_dir = f"exp_{max_idx + 1}" |
| else: |
| name_dir = "exp_0" |
| output_dir = os.path.join(out_root, name_dir) |
| output_prompt = os.path.join(out_root_prompt, name_dir) |
|
|
| if not os.path.exists(output_dir): |
| os.makedirs(output_dir) |
| if not os.path.exists(output_prompt): |
| os.makedirs(output_prompt) |
| if not os.path.exists("heatmap"): |
| os.makedirs("heatmap") |
| if not os.path.exists("heatmap/average_heatmaps"): |
| os.makedirs("heatmap/average_heatmaps") |
| instructions = [''] |
| source_image = None |
| history_tensors = { |
| "source img": torch.zeros((1, 1, 1)), |
| "prev img": torch.zeros((1, 1, 1))} |
| |
| source_prompt = "(Optional) Describe the content of the uploaded image." |
| traget_prompt = "(Required) Describe the desired content of the edited image." |
| gallery = None |
| output_image = None |
| init_image = None |
| return source_prompt, traget_prompt, gallery, output_image, init_image |
|
|
|
|
| @torch.inference_mode() |
| def process_image( |
| init_image, |
| source_prompt, |
| target_prompt, |
| editing_strategy, |
| denoise_strategy, |
| num_steps, |
| guidance, |
| attn_guidance_start_block, |
| inject_step, |
| init_image_2=None): |
| if init_image is None: |
| img, gr_gallery = generate_image(prompt=target_prompt) |
| else: |
| img, gr_gallery = edit(init_image, source_prompt, target_prompt, editing_strategy, denoise_strategy, num_steps, guidance, attn_guidance_start_block, inject_step, init_image_2) |
| return img, gr_gallery |
|
|
| |
| @spaces.GPU(duration=120) |
| @torch.inference_mode() |
| def generate_image( |
| width=512, |
| height=512, |
| num_steps=28, |
| guidance=3.5, |
| seed=None, |
| prompt='', |
| init_image=None, |
| image2image_strength=0.0, |
| ): |
| global ae, t5, clip, model, name, is_schnell, output_dir, output_prompt, add_sampling_metadata, offload, history_tensors |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| torch.cuda.empty_cache() |
| seed = None |
|
|
| if seed is None: |
| g_seed = torch.Generator(device="cpu").seed() |
| print(f"Generating '{prompt}' with seed {g_seed}") |
| t0 = time.perf_counter() |
|
|
| if init_image is not None: |
| if isinstance(init_image, np.ndarray): |
| init_image = torch.from_numpy(init_image).permute(2, 0, 1).float() / 255.0 |
| init_image = init_image.unsqueeze(0) |
| init_image = init_image.to(device) |
| init_image = torch.nn.functional.interpolate(init_image, (height, width)) |
| if offload: |
| ae.encoder.to(device) |
| init_image = ae.encode(init_image) |
| if offload: |
| ae = ae.cpu() |
| torch.cuda.empty_cache() |
|
|
| |
| x = get_noise( |
| 1, |
| height, |
| width, |
| device=device, |
| dtype=torch.bfloat16, |
| seed=g_seed, |
| ) |
| timesteps = get_schedule( |
| num_steps, |
| x.shape[-1] * x.shape[-2] // 4, |
| shift=(not is_schnell), |
| ) |
| if init_image is not None: |
| t_idx = int((1 - image2image_strength) * num_steps) |
| t = timesteps[t_idx] |
| timesteps = timesteps[t_idx:] |
| x = t * x + (1.0 - t) * init_image.to(x.dtype) |
|
|
| if offload: |
| t5, clip = t5.to(device), clip.to(device) |
| inp = prepare(t5=t5, clip=clip, img=x, prompt=prompt) |
|
|
| |
| if offload: |
| t5, clip = t5.cpu(), clip.cpu() |
| torch.cuda.empty_cache() |
| model = model.to(device) |
|
|
| |
| info = {} |
| info['feature'] = {} |
| info['inject_step'] = 0 |
| info['editing_strategy']= "" |
| info['start_layer_index'] = 0 |
| info['end_layer_index'] = 37 |
| info['reuse_v']= False |
| qkv_ratio = '1.0,1.0,1.0' |
| info['qkv_ratio'] = list(map(float, qkv_ratio.split(','))) |
| x = denoise_rf(model, **inp, timesteps=timesteps, guidance=guidance, inverse=False, info=info) |
|
|
| |
| if offload: |
| model.cpu() |
| torch.cuda.empty_cache() |
| ae.decoder.to(x.device) |
|
|
| |
| x = unpack(x[0].float(), height, width) |
| device = torch.device("cuda") |
| with torch.autocast(device_type=device.type, dtype=torch.bfloat16): |
| x = ae.decode(x) |
|
|
| if offload: |
| ae.decoder.cpu() |
| torch.cuda.empty_cache() |
|
|
| t1 = time.perf_counter() |
|
|
| print(f"Done in {t1 - t0:.1f}s.") |
| |
| x = x.clamp(-1, 1) |
| x = embed_watermark(x.float()) |
| x = rearrange(x[0], "c h w -> h w c") |
|
|
| img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy()) |
| |
| filename = os.path.join(output_dir,f"round_0000_[{prompt}].jpg") |
| os.makedirs(os.path.dirname(filename), exist_ok=True) |
| exif_data = Image.Exif() |
| if init_image is None: |
| exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux" |
| else: |
| exif_data[ExifTags.Base.Software] = "AI generated;img2img;flux" |
| exif_data[ExifTags.Base.Make] = "Black Forest Labs" |
| exif_data[ExifTags.Base.Model] = name |
| if add_sampling_metadata: |
| exif_data[ExifTags.Base.ImageDescription] = prompt |
| img.save(filename, format="jpeg", exif=exif_data, quality=95, subsampling=0) |
| instructions = [prompt] |
|
|
| prompt_path = os.path.join(output_prompt, f"round_0000.txt") |
| with open(prompt_path, "w") as f: |
| f.write(prompt) |
|
|
| |
| img_and_prompt = [] |
| history_imgs = sorted(os.listdir(output_dir)) |
| instructions = read_sorted_prompts(output_prompt) |
| for img_file, prompt_txt in zip(history_imgs, instructions): |
| img_and_prompt.append((os.path.join(output_dir, img_file), prompt_txt)) |
| history_gallery = gr.Gallery(value=img_and_prompt, label="History Image", interactive=True, columns=3) |
| return img, history_gallery |
|
|
|
|
| @spaces.GPU(duration=200) |
| @torch.inference_mode() |
| def edit(init_image, source_prompt, target_prompt, editing_strategy, denoise_strategy, num_steps, guidance, attn_guidance_start_block, inject_step, init_image_2=None): |
| global ae, t5, clip, model, name, is_schnell, output_dir, output_prompt, add_sampling_metadata, offload, source_image, history_tensors, instructions |
| |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| torch.cuda.empty_cache() |
| seed = None |
|
|
| |
| info = {} |
| shape = init_image.shape |
| new_h = shape[0] if shape[0] % 16 == 0 else shape[0] - shape[0] % 16 |
| new_w = shape[1] if shape[1] % 16 == 0 else shape[1] - shape[1] % 16 |
|
|
| if not any("round_0000" in fname for fname in os.listdir(output_dir)): |
| Image.fromarray(init_image).save(os.path.join(output_dir,"round_0000_[source].jpg")) |
| prompt_path = os.path.join(output_prompt, f"round_0000.txt") |
| with open(prompt_path, "w") as f: |
| f.write('') |
|
|
| init_image = init_image[:new_h, :new_w, :] |
| width, height = init_image.shape[0], init_image.shape[1] |
|
|
| init_image = torch.from_numpy(init_image).permute(2, 0, 1).float() / 127.5 - 1 |
| init_image = init_image.unsqueeze(0) |
| init_image = init_image.to(device) |
| if offload: |
| model.cpu() |
| torch.cuda.empty_cache() |
| ae.encoder.to(device) |
| |
| with torch.no_grad(): |
| init_image = ae.encode(init_image.to()).to(torch.bfloat16) |
|
|
| if init_image_2 is None: |
| print("init_image_2 is not provided, proceeding with single image processing.") |
| else: |
| init_image_2_pil = Image.fromarray(init_image_2) |
| init_image_2_pil = init_image_2_pil.resize((new_w, new_h), Image.Resampling.LANCZOS) |
| init_image_2 = np.array(init_image_2_pil) |
| init_image_2 = torch.from_numpy(init_image_2).permute(2, 0, 1).float() / 127.5 - 1 |
|
|
| rng = torch.Generator(device=torch.device("cpu")) |
| opts = SamplingOptions( |
| source_prompt=source_prompt, |
| target_prompt=target_prompt, |
| width=width, |
| height=height, |
| num_steps=num_steps, |
| guidance=guidance, |
| seed=None, |
| ) |
| if opts.seed is None: |
| opts.seed = torch.Generator(device=torch.device("cpu")).seed() |
| |
| print(f"Editing with prompt:\n{opts.source_prompt}") |
| t0 = time.perf_counter() |
|
|
| if offload: |
| ae = ae.cpu() |
| torch.cuda.empty_cache() |
| t5, clip = t5.to(torch_device), clip.to(torch_device) |
| opts.seed = None |
|
|
|
|
| |
| info = {} |
| info['feature'] = {} |
| info['inject_step'] = inject_step |
| info['editing_strategy']= " ".join(editing_strategy) |
| info['start_layer_index'] = 0 |
| info['end_layer_index'] = 37 |
| info['reuse_v']= False |
| qkv_ratio = '1.0,1.0,1.0' |
| info['qkv_ratio'] = list(map(float, qkv_ratio.split(','))) |
| info['attn_guidance'] = attn_guidance_start_block |
| info['lqr_stop'] = 0.25 |
|
|
| |
| with torch.no_grad(): |
| inp = prepare(t5, clip, init_image, prompt=opts.source_prompt) |
| inp_target = prepare(t5, clip, init_image, prompt=opts.target_prompt) |
| if source_image is None: |
| source_image = inp['img'] |
| inp_target_2 = None |
| if not init_image_2 is None: |
| inp_target_2 = prepare_image(init_image_2) |
|
|
| timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell")) |
| |
|
|
| |
| |
| if offload: |
| t5, clip = t5.cpu(), clip.cpu() |
| torch.cuda.empty_cache() |
| model = model.to(torch_device) |
|
|
| |
| denoise_strategies = ['fireflow', 'rf', 'rf_solver', 'midpoint', 'rf_inversion', 'multi_turn_consistent'] |
| denoise_funcs = [denoise_fireflow, denoise_rf, denoise_rf_solver, denoise_midpoint, denoise_rf_inversion, denoise_multi_turn_consistent] |
| denoise_func = denoise_funcs[denoise_strategies.index(denoise_strategy)] |
| with torch.no_grad(): |
| z, info = denoise_func(model, **inp, timesteps=timesteps, guidance=1, inverse=True, info=info) |
| |
| |
| |
| inp_target["img"] = z |
| timesteps = get_schedule(opts.num_steps, inp_target["img"].shape[1], shift=(name != "flux-schnell")) |
|
|
| if torch.all(history_tensors['source img'] == 0): |
| history_tensors = { |
| "source img": inp["img"], |
| "prev img": inp_target_2} |
| else: |
| if inp_target_2 is None: |
| history_tensors["prev img"] = inp["img"] |
| else: |
| history_tensors["source img"] = inp["img"] |
| history_tensors["prev img"] = inp_target_2 |
|
|
| |
| if denoise_strategy in ['rf_inversion', 'multi_turn_consistent']: |
| x, _ = denoise_func(model, **inp_target, timesteps=timesteps, guidance=guidance, inverse=False, info=info, img_LQR=history_tensors) |
| else: |
| x, _ = denoise_func(model, **inp_target, timesteps=timesteps, guidance=opts.guidance, inverse=False, info=info) |
| |
|
|
| |
| info = {} |
| history_tensors["source img"] = source_image |
| history_tensors["prev img"] = x |
|
|
| |
| x = unpack(x.float(), opts.width, opts.height) |
|
|
| if offload: |
| model.cpu() |
| torch.cuda.empty_cache() |
| ae.decoder.to(x.device) |
| |
| device = torch.device("cuda") |
| with torch.autocast(device_type=device.type, dtype=torch.bfloat16): |
| x = ae.decode(x) |
|
|
|
|
| if torch.cuda.is_available(): |
| torch.cuda.synchronize() |
| t1 = time.perf_counter() |
|
|
| |
| x = x.clamp(-1, 1) |
| x = embed_watermark(x.float()) |
| x = rearrange(x[0], "c h w -> h w c") |
|
|
| img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy()) |
| exif_data = Image.Exif() |
| exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux" |
| exif_data[ExifTags.Base.Make] = "Black Forest Labs" |
| exif_data[ExifTags.Base.Model] = name |
| if add_sampling_metadata: |
| exif_data[ExifTags.Base.ImageDescription] = source_prompt |
|
|
|
|
|
|
| |
|
|
| |
| if not os.path.exists(output_dir): |
| os.makedirs(output_dir) |
| idx = 1 |
| |
| else: |
| fns = [fn for fn in os.listdir(output_dir)] |
| if len(fns) > 0: |
| idx = max(int(fn.split("_")[1]) for fn in fns) + 1 |
| else: |
| idx = 1 |
| formatted_idx = str(idx).zfill(4) |
| os.makedirs(output_prompt, exist_ok=True) |
| |
| if denoise_strategy == 'multi_turn_consistent': |
| denoise_strategy = 'MTC' |
| if target_prompt == '': |
| target_prompt = 'Reconstruction' |
| if target_prompt == source_prompt: |
| target_prompt = 'Reconstruction: ' + target_prompt |
|
|
| target_suffix = " ".join(target_prompt.split()[-5:]) |
| output_name = f"round_{formatted_idx}_{target_suffix}_{denoise_strategy}.jpg" |
| |
| fn = os.path.join(output_dir, output_name) |
| |
| print(f"Done in {t1 - t0:.1f}s. Saving {fn}") |
| img.save(fn) |
|
|
| if 'Reconstruction' in target_prompt: |
| target_prompt = source_prompt |
| instructions.append(target_prompt) |
| print("End Edit") |
|
|
| prompt_path = os.path.join(output_prompt, f"round_{formatted_idx}.txt") |
| with open(prompt_path, "w") as f: |
| f.write(target_prompt) |
|
|
| |
| img_and_prompt = [] |
| history_imgs = sorted(os.listdir(output_dir)) |
| instructions = read_sorted_prompts(output_prompt) |
| for img_file, prompt_txt in zip(history_imgs, instructions): |
| img_and_prompt.append((os.path.join(output_dir, img_file), prompt_txt)) |
| history_gallery = gr.Gallery(value=img_and_prompt, label="History Image", interactive=True, columns=3) |
|
|
| return img, history_gallery |
|
|
|
|
| def on_select(gallery, selected: gr.SelectData): |
| return gallery[selected.index][0], gallery[selected.index][1] |
| |
|
|
| def on_upload(path, uploaded: gr.EventData): |
| return path[0][0] |
|
|
| def on_change(init_image, changed: gr.EventData): |
| img_path = list(changed.target.temp_files) |
| return gr.Gallery(value=[(img_path[0], "")], label="History Image", interactive=True, columns=3), img_path[0] |
|
|
|
|
| def create_demo(model_name: str, device: str = "cuda" if torch.cuda.is_available() else "cpu"): |
|
|
| description = r""" |
| <h3>Tips π:</h3> |
| <ol> |
| <li>The app starts with default settings. To begin: <strong>(1) Click Reset Button.</strong> (2)Try the example image (at the bottom of the page) / Upload your own / Generate one with a target prompt.</li> |
| <li> Adaptive Attention (attn_guidance): The option<i> Top activated attn-maps</i> is effective only when this editing technique is selected. </li> |
| <li> If you like this project, please β us on <a href='https://github.com/ZhouZJ-DL/Multi-turn_Consistent_Image_Editing' target='_blank'>GitHub</a> or cite our <a href='https://arxiv.org/abs/2505.04320' target='_blank'>paper</a>. Thanks for your support! </li> |
| </ol> |
| """ |
| css = ''' |
| .gradio-container {width: 85% !important} |
| ''' |
|
|
| is_schnell = model_name == "flux-schnell" |
| |
| |
| examples = [ |
| ["src/gradio_utils/gradio_examples/000000000011.jpg", "", "an eagle standing on the branch", ['attn_guidance'], 15, 3.5, 11, 0], |
| ] |
|
|
| with gr.Blocks() as demo: |
| gr.Markdown(f"# Multi-turn Consistent Image Editing (FLUX.1-dev)") |
| gr.Markdown(description) |
| with gr.Row(): |
| with gr.Column(): |
| reset_btn = gr.Button("Reset", variant="primary") |
| source_prompt = gr.Textbox(label="Source Prompt", value="(Optional) Describe the content of the uploaded image.") |
| target_prompt = gr.Textbox(label="Target Prompt", value="(Required) Describe the desired content of the edited image.") |
| with gr.Row(): |
| init_image = gr.Image(label="Initial Image", visible=False, width=200) |
| init_image_2 = gr.Image(label="Input Image 2", visible=False, width=200) |
| gallery = gr.Gallery(label ="History Image", interactive=True, columns=3) |
| editing_strategy = gr.CheckboxGroup( |
| label="Editing Technique", |
| choices=['attn_guidance', 'replace_v', 'add_q', 'add_k', 'add_v', 'replace_q', 'replace_k'], |
| value=['attn_guidance'], |
| interactive=True |
| ) |
| denoise_strategy = gr.Dropdown( |
| ['multi_turn_consistent', 'fireflow', 'rf', 'rf_solver', 'midpoint', 'rf_inversion'], |
| label="Denoising Technique", value='multi_turn_consistent') |
| generate_btn = gr.Button("Generate", variant="primary") |
| |
| with gr.Column(): |
| with gr.Accordion("Advanced Options", open=True): |
| num_steps = gr.Slider(1, 30, 15, step=1, label="Number of steps") |
| guidance = gr.Slider(1.0, 10.0, 3.5, step=0.1, label="Text Guidance", interactive=not is_schnell) |
| attn_guidance_start_block = gr.Slider(0, 18, 11, step=1, label="Top activated attn-maps", interactive=not is_schnell) |
| inject_step = gr.Slider(0, 15, 1, step=1, label="Number of inject steps") |
| output_image = gr.Image(label="Generated/Edited Image") |
| example_image = gr.Image(label="example Image", visible=False, width=200) |
|
|
| gallery.select(on_select, gallery, [init_image, source_prompt]) |
| |
| gallery.upload(on_upload, gallery, init_image) |
| example_image.change(on_change, example_image, [gallery, init_image]) |
|
|
| generate_btn.click( |
| fn=process_image, |
| inputs=[init_image, source_prompt, target_prompt, editing_strategy, denoise_strategy, num_steps, guidance, attn_guidance_start_block, inject_step, init_image_2], |
| outputs=[output_image, gallery] |
| ) |
| reset_btn.click(fn = reset, outputs=[source_prompt, target_prompt, gallery, output_image, init_image]) |
| |
| |
| gr.Examples( |
| examples=examples, |
| inputs=[ |
| example_image, |
| source_prompt, |
| target_prompt, |
| editing_strategy, |
| num_steps, |
| guidance, |
| attn_guidance_start_block, |
| inject_step |
| ] |
| ) |
|
|
|
|
| return demo |
|
|
|
|
| demo = create_demo(name, "cuda") |
| |
| demo.launch(debug=True) |
|
|