Spaces:

WeReCooking
/

FE2E-CPU

Paused

App Files Community

Nekochu committed 14 days ago

Commit

551acb3

1 Parent(s): 405d2b1

fix: diffusers dep, layer-by-layer FP8->FP32 cast, LoRA merge in FP32, INT8 quant

Browse files

Files changed (2) hide show

Dockerfile +1 -1
app.py +82 -58

Dockerfile CHANGED Viewed

@@ -3,7 +3,7 @@ FROM python:3.11-slim
 RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
 RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu
-RUN pip install --no-cache-dir "gradio[mcp]" Pillow huggingface-hub safetensors einops numpy tqdm
 WORKDIR /app
 COPY . .

 RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
 RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir "gradio[mcp]" Pillow huggingface-hub safetensors einops numpy tqdm "diffusers[torch]" transformers
 WORKDIR /app
 COPY . .

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""FE2E: Depth + Normal estimation from a single image (CPU, FP32 with dynamic INT8)"""
 from __future__ import annotations
 import gc
@@ -6,7 +6,6 @@ import os
 import sys
 import time
-import numpy as np
 import torch
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -22,58 +21,89 @@ class Args:
     norm_type = "ln"
-def _download_models():
-    from huggingface_hub import hf_hub_download
     import shutil
     token = os.environ.get("HF_TOKEN")
-    files = {
-        "dit": ("rkfg/Step1X-Edit-FP8", "step1x-edit-i1258-FP8.safetensors"),
-        "vae": ("exander/FE2E", "pretrain/vae.safetensors"),
-        "lora": ("exander/FE2E", "LDRN.safetensors"),
-    }
-    paths = {}
-    for key, (repo, filename) in files.items():
-        basename = os.path.basename(filename)
-        dest = os.path.join(MODELS_DIR, basename)
-        if not os.path.exists(dest):
-            print(f"[init] Downloading {repo}/{filename}...")
-            src = hf_hub_download(repo, filename, token=token)
-            shutil.copy2(src, dest)
-            print(f"[init] {basename}: {os.path.getsize(dest)/1024/1024:.0f} MB")
-        paths[key] = dest
-    return paths
-def _load_generator(paths):
-    """Load model: FP8 weights cast to FP32 for CPU, with LoRA merged."""
-    from infer.inference import ImageGenerator
     args = Args()
-    print("[init] Loading model (FP8 -> FP32 on CPU)...")
     t0 = time.time()
-    generator = ImageGenerator(
-        dit_path=paths["dit"],
-        ae_path=paths["vae"],
-        quantized=True,
-        offload=False,
-        lora=paths["lora"],
-        device="cpu",
-        args=args,
-    )
-    # FP8 tensors can't compute on CPU, cast to FP32
-    generator.dit = generator.dit.float()
-    generator.ae = generator.ae.float()
-    # Dynamic INT8 quantization for linear layers (biggest speedup on CPU)
-    generator.dit = torch.quantization.quantize_dynamic(
-        generator.dit, {torch.nn.Linear}, dtype=torch.qint8
-    )
-    elapsed = time.time() - t0
-    print(f"[init] Model loaded + quantized in {elapsed:.0f}s")
     gc.collect()
     return generator
@@ -84,7 +114,7 @@ def generate(image):
     """Estimate depth and surface normals from a single image.
     Args:
-        image: Input image (PIL Image or filepath).
     Returns:
         tuple: (depth_map, normal_map, status_message)
@@ -95,8 +125,7 @@ def generate(image):
     global GENERATOR
     if GENERATOR is None:
-        paths = _download_models()
-        GENERATOR = _load_generator(paths)
     if image is None:
         raise gr.Error("Please upload an image.")
@@ -107,10 +136,10 @@ def generate(image):
         image = Image.fromarray(image).convert("RGB")
     args = Args()
-    print(f"[gen] Input: {image.size}, starting inference...")
     t0 = time.time()
-    with torch.inference_mode():
         images, Lpred, Rpred = GENERATOR.generate_image(
             prompt="",
             negative_prompt="",
@@ -125,14 +154,11 @@ def generate(image):
     elapsed = time.time() - t0
-    # Normal map from model output
     normal_map = images[0] if images else None
-    # Depth map from Lpred
     Lpred_img = Lpred[0].clamp(0, 1).cpu()
     depth_map = F.to_pil_image(Lpred_img)
-    status = f"Generated in {elapsed:.1f}s ({image.size[0]}x{image.size[1]}, single denoise)"
     print(f"[gen] {status}")
     return depth_map, normal_map, status
@@ -144,7 +170,6 @@ def main():
     infer = sub.add_parser("infer")
     infer.add_argument("-i", "--input", required=True)
     infer.add_argument("-o", "--output-dir", default=".")
     args = parser.parse_args()
     if args.command == "infer":
@@ -160,9 +185,8 @@ def main():
     with gr.Blocks(title="FE2E: Depth + Normal (CPU)") as demo:
         gr.Markdown(
-            "**[FE2E](https://github.com/AMAP-ML/FE2E)** Depth + Normal estimation from a single image. "
-            "Single denoise step via Step1X-Edit DiT + LDRN LoRA. "
-            "CPU inference with dynamic INT8 quantization."
         )
         with gr.Row():
             with gr.Column():

+"""FE2E: Depth + Normal estimation from a single image (CPU)"""
 from __future__ import annotations
 import gc
 import sys
 import time
 import torch
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
     norm_type = "ln"
+def _download(repo, filename, token=None):
     import shutil
+    from huggingface_hub import hf_hub_download
+    basename = os.path.basename(filename)
+    dest = os.path.join(MODELS_DIR, basename)
+    if not os.path.exists(dest):
+        print(f"[init] Downloading {repo}/{filename}...")
+        src = hf_hub_download(repo, filename, token=token)
+        shutil.copy2(src, dest)
+        print(f"[init] {basename}: {os.path.getsize(dest)/1024/1024:.0f} MB")
+    return dest
+def _load_generator():
+    """Load FP8 model, cast to FP32, merge LoRA, quantize INT8."""
+    from safetensors.torch import load_file
+    from modules.model_edit import Step1XParams, Step1XEdit
+    from modules.autoencoder import AutoEncoder
+    from infer.inference import ImageGenerator, load_state_dict, equip_dit_with_lora_sd_scripts
+    import numpy as np
     token = os.environ.get("HF_TOKEN")
+    dit_path = _download("rkfg/Step1X-Edit-FP8", "step1x-edit-i1258-FP8.safetensors", token)
+    vae_path = _download("exander/FE2E", "pretrain/vae.safetensors", token)
+    lora_path = _download("exander/FE2E", "LDRN.safetensors", token)
     args = Args()
+    print("[init] Building model on meta device...")
+    with torch.device("meta"):
+        ae = AutoEncoder(
+            resolution=256, in_channels=3, ch=128, out_ch=3,
+            ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16,
+            scale_factor=0.3611, shift_factor=0.1159,
+        )
+        step1x_params = Step1XParams(
+            in_channels=64, out_channels=64, vec_in_dim=768, context_in_dim=4096,
+            hidden_size=3072, mlp_ratio=4.0, num_heads=24, depth=19,
+            depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, qkv_bias=True,
+        )
+        dit = Step1XEdit(step1x_params)
+    # Load weights as FP8, then cast to FP32 layer by layer to avoid 46 GB peak
+    print("[init] Loading FP8 weights and casting to FP32 (layer by layer)...")
     t0 = time.time()
+    fp8_sd = load_file(dit_path, device="cpu")
+    dit_sd = {}
+    for k, v in fp8_sd.items():
+        dit_sd[k] = v.float()
+    del fp8_sd
+    gc.collect()
+    dit = dit.to(dtype=torch.float32)
+    missing, unexpected = dit.load_state_dict(dit_sd, strict=False, assign=True)
+    del dit_sd
+    gc.collect()
+    print(f"[init] DiT loaded in {time.time()-t0:.0f}s (missing={len(missing)}, unexpected={len(unexpected)})")
+    # Load VAE
+    ae = load_state_dict(ae, vae_path, "cpu")
+    ae = ae.float()
+    # Merge LoRA in FP32 (full precision merge)
+    print("[init] Merging LoRA...")
+    equip_dit_with_lora_sd_scripts(ae, [None], dit, lora_path, device="cpu")
+    # Dynamic INT8 quantization
+    print("[init] Applying dynamic INT8 quantization...")
+    dit = torch.quantization.quantize_dynamic(dit, {torch.nn.Linear}, dtype=torch.qint8)
     gc.collect()
+    # Build generator wrapper
+    generator = ImageGenerator.__new__(ImageGenerator)
+    generator.device = torch.device("cpu")
+    generator.args = args
+    generator.ae = ae
+    generator.dit = dit
+    generator.llm_encoder = None
+    generator.quantized = False
+    generator.offload = False
+    generator.lora_module = None
+    print("[init] Model ready for inference")
     return generator
     """Estimate depth and surface normals from a single image.
     Args:
+        image: Input image (PIL Image).
     Returns:
         tuple: (depth_map, normal_map, status_message)
     global GENERATOR
     if GENERATOR is None:
+        GENERATOR = _load_generator()
     if image is None:
         raise gr.Error("Please upload an image.")
         image = Image.fromarray(image).convert("RGB")
     args = Args()
+    print(f"[gen] Input: {image.size}")
     t0 = time.time()
+    with torch.inference_mode(), torch.no_grad():
         images, Lpred, Rpred = GENERATOR.generate_image(
             prompt="",
             negative_prompt="",
     elapsed = time.time() - t0
     normal_map = images[0] if images else None
     Lpred_img = Lpred[0].clamp(0, 1).cpu()
     depth_map = F.to_pil_image(Lpred_img)
+    status = f"Generated in {elapsed:.1f}s ({image.size[0]}x{image.size[1]}, single denoise, INT8)"
     print(f"[gen] {status}")
     return depth_map, normal_map, status
     infer = sub.add_parser("infer")
     infer.add_argument("-i", "--input", required=True)
     infer.add_argument("-o", "--output-dir", default=".")
     args = parser.parse_args()
     if args.command == "infer":
     with gr.Blocks(title="FE2E: Depth + Normal (CPU)") as demo:
         gr.Markdown(
+            "**[FE2E](https://github.com/AMAP-ML/FE2E)** Depth + Normal from a single image (CVPR 2026). "
+            "Single denoise step, Step1X-Edit DiT + LDRN LoRA, dynamic INT8 on CPU."
         )
         with gr.Row():
             with gr.Column():