Spaces:

WeReCooking
/

FE2E-CPU

Paused

App Files Files Community

Nekochu committed on May 21

Commit

563ef1b

verified ·

1 Parent(s): 10d0786

Revert to PyTorch INT8 (ONNX export produces NaN)

Browse files

Files changed (1) hide show

app.py +63 -165

app.py CHANGED Viewed

@@ -1,90 +1,80 @@
-"""FE2E: Depth + Normal estimation from a single image (CPU, ONNX INT8 DiT + PyTorch VAE)"""
 from __future__ import annotations
 import gc
-import math
 import os
-import shutil
 import sys
 import time
-import numpy as np
 import torch
-from einops import rearrange, repeat
-from PIL import Image
-from torchvision.transforms import functional as TF
-import torch.nn.functional as Func
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 MODELS_DIR = "/tmp/fe2e_models"
-ONNX_DIR = os.path.join(MODELS_DIR, "onnx")
-os.makedirs(ONNX_DIR, exist_ok=True)
-EMPTY_PROMPT_CACHE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "latent", "no_info.npz")
-def _download(repo, filename, dest_dir, token=None):
     from huggingface_hub import hf_hub_download
     basename = os.path.basename(filename)
-    dest = os.path.join(dest_dir, basename)
     if not os.path.exists(dest):
         print(f"[init] Downloading {repo}/{filename}...")
         src = hf_hub_download(repo, filename, token=token)
         shutil.copy2(src, dest)
-        size_mb = os.path.getsize(dest) / 1024 / 1024
-        print(f"[init] {basename}: {size_mb:.0f} MB")
     return dest
-def _load_all():
-    import onnxruntime as ort
     token = os.environ.get("HF_TOKEN")
-    repo = "WeReCooking2/FE2E-INT8"
-    print("[init] Downloading ONNX INT8 DiT...")
-    onnx_path = _download(repo, "onnx/dit_int8.onnx", ONNX_DIR, token)
-    _download(repo, "onnx/dit_int8.onnx.data", ONNX_DIR, token)
-    print("[init] Downloading VAE...")
-    vae_path = _download(repo, "vae_full.pt", MODELS_DIR, token)
-    print("[init] Creating ONNX Runtime session (mmap + low memory)...")
-    t0 = time.time()
-    opts = ort.SessionOptions()
-    opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_BASIC
-    opts.inter_op_num_threads = 1
-    opts.intra_op_num_threads = 2
-    opts.enable_mem_pattern = True
-    opts.enable_mem_reuse = True
-    opts.add_session_config_entry("session.disable_prepacking", "1")
-    dit_session = ort.InferenceSession(onnx_path, opts, providers=["CPUExecutionProvider"])
-    print(f"[init] DiT session ready in {time.time() - t0:.0f}s")
-    print("[init] Loading VAE...")
-    vae = torch.load(vae_path, map_location="cpu", weights_only=False, mmap=True)
-    vae.eval()
     gc.collect()
-    print("[init] Loading empty prompt cache...")
-    data = np.load(EMPTY_PROMPT_CACHE, allow_pickle=True)
-    embeds = torch.from_numpy(data["embeds"]).unsqueeze(0)
-    masks = torch.from_numpy(data["masks"]).unsqueeze(0)
-    print("[init] Ready.")
-    return dit_session, vae, embeds, masks
-print("[init] Loading models at startup...")
-DIT_SESSION, VAE, PROMPT_EMBEDS, PROMPT_MASKS = _load_all()
-print("[init] All models loaded, starting Gradio...")
 def generate(image):
-    """Run depth + normal estimation on a single image."""
     import gradio as gr
-    import matplotlib.cm as cm
     if image is None:
         raise gr.Error("Please upload an image.")
@@ -94,135 +84,43 @@ def generate(image):
     elif not isinstance(image, Image.Image):
         image = Image.fromarray(image).convert("RGB")
-    orig_size = image.size
-    print(f"[gen] Input: {orig_size}")
     t0 = time.time()
-    # --- Resize to 1024x768 (matches ONNX model's fixed shape) ---
-    image_resized = image.resize((1024, 768))
-    img_tensor = TF.to_tensor(image_resized).unsqueeze(0)  # [1, 3, 768, 1024]
-    height, width = 768, 1024
-    # --- VAE encode ---
-    with torch.inference_mode():
-        ref_latent = VAE.encode(img_tensor * 2 - 1)  # [1, 16, 96, 128]
-    # --- Prepare DiT inputs ---
-    h_lat, w_lat = ref_latent.shape[2], ref_latent.shape[3]  # 96, 128
-    h_half, w_half = h_lat // 2, w_lat // 2  # 48, 64
-    n_patches = h_half * w_half  # 3072
-    # Noise (zeros for single denoise)
-    noise = torch.zeros(1, 16, h_lat, w_lat, dtype=torch.float32)
-    # Rearrange to patches
-    noise_patches = rearrange(noise, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
-    ref_patches = rearrange(ref_latent, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
-    # Concatenate noise + reference along sequence dim
-    img = torch.cat([noise_patches, ref_patches], dim=1)  # [1, 6144, 64]
-    # Duplicate for CFG (conditional + unconditional)
-    img = torch.cat([img, img], dim=0)  # [2, 6144, 64]
-    # Position IDs for noise patches
-    img_ids_noise = torch.zeros(h_half, w_half, 3)
-    img_ids_noise[..., 1] = torch.arange(h_half)[:, None].float()
-    img_ids_noise[..., 2] = torch.arange(w_half)[None, :].float()
-    img_ids_noise = rearrange(img_ids_noise, "h w c -> 1 (h w) c")
-    # Position IDs for reference patches (same layout)
-    img_ids_ref = torch.zeros(h_half, w_half, 3)
-    img_ids_ref[..., 1] = torch.arange(h_half)[:, None].float()
-    img_ids_ref[..., 2] = torch.arange(w_half)[None, :].float()
-    img_ids_ref = rearrange(img_ids_ref, "h w c -> 1 (h w) c")
-    img_ids = torch.cat([img_ids_noise, img_ids_ref], dim=1)  # [1, 6144, 3]
-    img_ids = img_ids.repeat(2, 1, 1)  # [2, 6144, 3]
-    # Text embeddings from cache
-    txt = torch.cat([PROMPT_EMBEDS, PROMPT_EMBEDS], dim=0)  # [2, 640, 3584]
-    mask = torch.cat([PROMPT_MASKS, PROMPT_MASKS], dim=0)  # [2, 640]
-    txt_ids = torch.zeros(2, txt.shape[1], 3)  # [2, 640, 3]
-    # Timestep = 1.0 (single denoise step)
-    t_vec = torch.full((2,), 1.0, dtype=torch.float32)
-    # --- Run DiT via ONNX Runtime ---
-    print("[gen] Running ONNX INT8 DiT inference...")
-    t_dit = time.time()
-    feeds = {
-        "img": img.numpy().astype(np.float16),
-        "img_ids": img_ids.numpy().astype(np.float16),
-        "txt_ids": txt_ids.numpy().astype(np.float16),
-        "timesteps": t_vec.numpy().astype(np.float16),
-        "llm_embedding": txt.numpy().astype(np.float16),
-        "t_vec": t_vec.numpy().astype(np.float16),
-        "mask": mask.numpy().astype(np.float16),
-    }
-    pred_np = DIT_SESSION.run(None, feeds)[0]  # [2, 6144, 64] float16
-    pred = torch.from_numpy(pred_np).float()
-    dit_elapsed = time.time() - t_dit
-    print(f"[gen] DiT inference: {dit_elapsed:.1f}s")
-    # --- CFG ---
-    cfg_guidance = 6.0
-    cond = pred[:1]
-    uncond = pred[1:]
-    pred_cfg = uncond + cfg_guidance * (cond - uncond)  # [1, 6144, 64]
-    # --- Apply single denoise step: img + (0 - 1) * pred = img - pred ---
-    img_out = img[:1].float() + (0.0 - 1.0) * pred_cfg  # [1, 6144, 64]
-    # Split: first half is output (depth/normal latent), second half was reference
-    output_patches = img_out[:, :n_patches]  # [1, 3072, 64]
-    pred1_ref = cond[:, n_patches:]  # [1, 3072, 64] (reference prediction)
-    # --- Unpack patches back to spatial ---
-    depth_latent = rearrange(
-        output_patches, "b (h w) (c ph pw) -> b c (h ph) (w pw)",
-        h=h_half, w=w_half, ph=2, pw=2
-    )  # [1, 16, 96, 128]
-    normal_latent = rearrange(
-        pred1_ref, "b (h w) (c ph pw) -> b c (h ph) (w pw)",
-        h=h_half, w=w_half, ph=2, pw=2
-    )  # [1, 16, 96, 128]
-    # --- Unpack as the original code does (depth, normal from 2-panel layout) ---
-    # Actually, looking at the original code more carefully:
-    # Lpred, Rpred = unpack_latents(pred, h//16, w//16)
-    # which splits the patches into two panels
-    # But in single_denoise mode, the code uses denoise() not double_denoise()
-    # denoise() returns: img[:, :seq_len//2], pred1[:, seq_len//2:]
-    # So depth = first half of updated image, normal = second half of cond prediction
-    # --- VAE decode ---
     with torch.inference_mode():
-        depth_decoded = VAE.decode(depth_latent)
-        normal_decoded = VAE.decode(normal_latent)
-    depth_decoded = depth_decoded.clamp(-1, 1).mul(0.5).add(0.5)
     elapsed = time.time() - t0
-    # --- Normal map visualization ---
-    normal_np = normal_decoded[0].cpu().float().numpy().transpose(1, 2, 0)  # (H, W, 3)
     normal_norm = np.linalg.norm(normal_np, axis=-1, keepdims=True)
     normal_norm[normal_norm < 1e-12] = 1e-12
     normal_np = normal_np / normal_norm
     normal_rgb = (((normal_np + 1) * 0.5) * 255).clip(0, 255).astype(np.uint8)
-    normal_map = Image.fromarray(normal_rgb).resize(orig_size)
-    # --- Depth map visualization (turbo colormap) ---
-    depth_np = depth_decoded[0].cpu().float().mean(dim=0).numpy()
     depth_np = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min() + 1e-8)
     depth_colored = (cm.turbo(depth_np)[:, :, :3] * 255).astype(np.uint8)
-    depth_map = Image.fromarray(depth_colored).resize(orig_size)
-    status = f"Generated in {elapsed:.1f}s (DiT: {dit_elapsed:.1f}s, 1024x768, ONNX INT8)"
     print(f"[gen] {status}")
     return depth_map, normal_map, status
@@ -232,7 +130,7 @@ import gradio as gr
 with gr.Blocks(title="FE2E: Depth + Normal (CPU)") as demo:
     gr.Markdown(
         "**[FE2E](https://github.com/AMAP-ML/FE2E)** Depth + Normal from a single image (CVPR 2026). "
-        "Step1X-Edit DiT + LDRN LoRA (pre-merged), ONNX INT8 quantized on CPU."
     )
     with gr.Row(equal_height=True):
         input_img = gr.Image(label="Input", type="pil", height=256)

+"""FE2E: Depth + Normal estimation from a single image (CPU, pre-quantized INT8)"""
 from __future__ import annotations
 import gc
 import os
 import sys
 import time
 import torch
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 MODELS_DIR = "/tmp/fe2e_models"
+os.makedirs(MODELS_DIR, exist_ok=True)
+class Args:
     # Fixed inference settings for this app. An instance is attached to the
     # generator as `generator.args` and is also passed as `args=` to
     # `generate_image`. How each field is interpreted is decided inside
     # `infer.inference.ImageGenerator`, which is not visible here.
     # presumably selects the no-text-prompt path -- TODO confirm in ImageGenerator
+    prompt_type = "empty"
     # the UI text and the status line both describe a 1-step "single denoise" run
+    single_denoise = True
     # path to latent/no_info.npz, resolved relative to this file's directory
+    empty_prompt_cache = os.path.join(os.path.dirname(os.path.abspath(__file__)), "latent", "no_info.npz")
     # NOTE(review): "ln" looks like a normalization-layer choice -- confirm against ImageGenerator
+    norm_type = "ln"
+def _download(repo, filename, token=None):
     # Fetch `filename` from the Hugging Face repo `repo` and place a copy in
     # MODELS_DIR under the file's basename. Returns the path of that copy.
     # `token` is forwarded unchanged to hf_hub_download (None is allowed).
+    import shutil
     from huggingface_hub import hf_hub_download
     # only the last path component is kept, so "a/b.pt" and "c/b.pt" would share one destination
     basename = os.path.basename(filename)
+    dest = os.path.join(MODELS_DIR, basename)
     # NOTE(review): the cache check is existence-only; a truncated file left by an
     # interrupted copy would be reused as-is -- consider verifying size or hash
     if not os.path.exists(dest):
         print(f"[init] Downloading {repo}/{filename}...")
         src = hf_hub_download(repo, filename, token=token)
         # copy out of the path returned by hf_hub_download into MODELS_DIR
         shutil.copy2(src, dest)
         # log the size of the copied file in MiB
+        print(f"[init] {basename}: {os.path.getsize(dest)/1024/1024:.0f} MB")
     return dest
+def _load_generator():
     # Build and return a CPU-only ImageGenerator from two pre-serialized model
     # files (DiT and VAE). The files are fetched with _download, loaded with
     # torch.load, and attached to an ImageGenerator instance whose __init__ is
     # never run (see the __new__ call below).
+    from infer.inference import ImageGenerator
     # HF_TOKEN is optional: os.environ.get yields None when it is unset
     token = os.environ.get("HF_TOKEN")
+    dit_path = _download("WeReCooking2/FE2E-INT8", "dit_int8_full.pt", token)
+    vae_path = _download("WeReCooking2/FE2E-INT8", "vae_full.pt", token)
+    args = Args()
+    print("[init] Loading pre-quantized INT8 DiT (full model, mmap)...")
     # the timer starts after both downloads, so the durations printed below
     # (including "Total load") do not include download time
+    t0 = time.time()
     # NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
     # objects; this is only safe while the checkpoint repo is fully trusted
+    dit = torch.load(dit_path, map_location="cpu", weights_only=False, mmap=True)
+    gc.collect()
+    print(f"[init] DiT loaded in {time.time()-t0:.0f}s")
+    print("[init] Loading VAE (full model)...")
     # same trust caveat as the DiT load above (weights_only=False)
+    ae = torch.load(vae_path, map_location="cpu", weights_only=False, mmap=True)
     gc.collect()
     # __new__ allocates the object without calling ImageGenerator.__init__;
     # every attribute the generator is expected to need is assigned by hand below
+    generator = ImageGenerator.__new__(ImageGenerator)
+    generator.device = torch.device("cpu")
+    generator.args = args
+    generator.ae = ae
+    generator.dit = dit
     # no text encoder is loaded in this function
+    generator.llm_encoder = None
     # NOTE(review): quantized=False although the checkpoint file is named INT8 --
     # presumably the flag means "quantize at load time"; confirm in ImageGenerator
+    generator.quantized = False
+    generator.offload = False
+    generator.lora_module = None
+    print(f"[init] Ready. Total load: {time.time()-t0:.0f}s")
+    return generator
+print("[init] Loading model at startup (not lazy)...")
+GENERATOR = _load_generator()
+print("[init] Model ready, starting Gradio...")
 def generate(image):
     import gradio as gr
+    from PIL import Image
     if image is None:
         raise gr.Error("Please upload an image.")
     elif not isinstance(image, Image.Image):
         image = Image.fromarray(image).convert("RGB")
+    args = Args()
+    print(f"[gen] Input: {image.size}")
     t0 = time.time()
     with torch.inference_mode():
+        images, Lpred, Rpred = GENERATOR.generate_image(
+            prompt="",
+            negative_prompt="",
+            ref_images=image,
+            num_samples=1,
+            num_steps=1,
+            cfg_guidance=6.0,
+            seed=42,
+            show_progress=True,
+            args=args,
+        )
     elapsed = time.time() - t0
+    import numpy as np
+    import matplotlib.cm as cm
+    normal_np = Rpred[0].cpu().float().numpy().transpose(1, 2, 0)
     normal_norm = np.linalg.norm(normal_np, axis=-1, keepdims=True)
     normal_norm[normal_norm < 1e-12] = 1e-12
     normal_np = normal_np / normal_norm
     normal_rgb = (((normal_np + 1) * 0.5) * 255).clip(0, 255).astype(np.uint8)
+    normal_map = Image.fromarray(normal_rgb)
+    normal_map = normal_map.resize(image.size)
+    depth_np = Lpred[0].cpu().float().mean(dim=0).numpy()
     depth_np = (depth_np - depth_np.min()) / (depth_np.max() - depth_np.min() + 1e-8)
     depth_colored = (cm.turbo(depth_np)[:, :, :3] * 255).astype(np.uint8)
+    depth_map = Image.fromarray(depth_colored)
+    depth_map = depth_map.resize(image.size)
+    status = f"Generated in {elapsed:.1f}s ({image.size[0]}x{image.size[1]}, single denoise, INT8)"
     print(f"[gen] {status}")
     return depth_map, normal_map, status
 with gr.Blocks(title="FE2E: Depth + Normal (CPU)") as demo:
     gr.Markdown(
         "**[FE2E](https://github.com/AMAP-ML/FE2E)** Depth + Normal from a single image (CVPR 2026). "
+        "Takes ~29 min for 768x1024, 1 step, Step1X-Edit DiT + LDRN LoRA (pre-merged), INT8 quantized on CPU."
     )
     with gr.Row(equal_height=True):
         input_img = gr.Image(label="Input", type="pil", height=256)