PartPacker-CPU

Sleeping

App Files Files Community

cpuai committed 5 days ago

Commit

cfef1cb

verified ·

1 Parent(s): 0846da3

Update app.py

Browse files

Files changed (1) hide show

app.py +342 -68

app.py CHANGED Viewed

@@ -1,12 +1,18 @@
 import os
-import numpy as np
 import cv2
 import kiui
-import trimesh
-import torch
 import rembg
-from datetime import datetime
-import gradio as gr
 try:
     import spaces
@@ -19,61 +25,70 @@ except ImportError:
             def __call__(self, func):
                 return func
 from flow.model import Model
 from flow.configs.schema import ModelConfig
 from flow.utils import get_random_color, recenter_foreground
 from vae.utils import postprocess_mesh
-from huggingface_hub import hf_hub_download
-# =========================
-# CPU 基础设置
-# =========================
 DEVICE = torch.device("cpu")
 DTYPE = torch.float32
 CPU_THREADS = int(os.environ.get("CPU_THREADS", "2"))
 torch.set_num_threads(CPU_THREADS)
 torch.set_num_interop_threads(max(1, min(2, CPU_THREADS)))
 TRIMESH_GLB_EXPORT = np.array(
     [[0, 1, 0], [0, 0, 1], [1, 0, 0]],
     dtype=np.float32
 )
 MAX_SEED = np.iinfo(np.int32).max
 bg_remover = rembg.new_session()
-# =========================
-# 下载模型
-# =========================
-flow_ckpt_path = hf_hub_download(repo_id="nvidia/PartPacker", filename="flow.pt")
-vae_ckpt_path = hf_hub_download(repo_id="nvidia/PartPacker", filename="vae.pt")
-# =========================
-# 模型配置
-# =========================
-model_config = ModelConfig(
-    vae_conf="vae.configs.part_woenc",
-    vae_ckpt_path=vae_ckpt_path,
-    qknorm=True,
-    qknorm_type="RMSNorm",
-    use_pos_embed=False,
-    dino_model="dinov2_vitg14",
-    hidden_dim=1536,
-    flow_shift=3.0,
-    logitnorm_mean=1.0,
-    logitnorm_std=1.0,
-    latent_size=4096,
-    use_parts=True,
-)
-# =========================
 # 工具函数：强制整个模块转 float32
-# =========================
 def force_module_fp32(module: torch.nn.Module):
     """
-    递归把模块参数和 buffer 全部转成 float32。
-    这一步是解决 CPU 下 bfloat16/float32 混用问题的关键。
     """
     module.to(device=DEVICE)
     module.float()
@@ -81,31 +96,233 @@ def force_module_fp32(module: torch.nn.Module):
     for child in module.children():
         force_module_fp32(child)
     for name, buf in module.named_buffers(recurse=False):
-        if torch.is_floating_point(buf):
             setattr(module, name, buf.to(device=DEVICE, dtype=torch.float32))
     return module
-# =========================
 # 初始化模型（CPU + float32）
-# =========================
 print("正在加载模型到 CPU ...")
 model = Model(model_config)
 model.eval()
 model.to(DEVICE)
 # 显式按 CPU 加载权重
-ckpt_dict = torch.load(flow_ckpt_path, map_location=DEVICE, weights_only=True)
 model.load_state_dict(ckpt_dict, strict=True)
-# 关键：再次强制整个模型为 float32
 force_module_fp32(model)
 model.eval()
 print("模型加载完成。")
-print("主模型 dtype:", next(model.parameters()).dtype)
 def get_random_seed(randomize_seed, seed):
@@ -165,7 +382,9 @@ def process_3d(
         os.makedirs("output", exist_ok=True)
         output_glb_path = f"output/partpacker_{datetime.now().strftime('%Y%m%d_%H%M%S')}.glb"
-        # RGBA -> float32
         image = input_image.astype(np.float32) / 255.0
         image = image[..., :3] * image[..., 3:4] + (1.0 - image[..., 3:4])
@@ -178,56 +397,83 @@ def process_3d(
         )
         data = {
-            "cond_images": image_tensor.float()
         }
-        # 再保险：推理前确保模型仍是 float32
         force_module_fp32(model)
         model.eval()
         with torch.inference_mode():
-            results = model(
-                data,
-                num_steps=int(num_steps),
-                cfg_scale=float(cfg_scale)
-            )
-        latent = results["latent"]
-        # 关键：latent 强制 float32
-        if isinstance(latent, torch.Tensor):
-            latent = latent.to(device=DEVICE, dtype=torch.float32).contiguous()
-        else:
             raise gr.Error("模型输出 latent 异常。")
-        # VAE 输入前再做 float32 保证
         data_part0 = {
-            "latent": latent[:, : model.config.latent_size, :].float().contiguous()
         }
         data_part1 = {
-            "latent": latent[:, model.config.latent_size:, :].float().contiguous()
         }
-        # 再保险：把 VAE 也强制成 float32
-        force_module_fp32(model.vae)
-        model.vae.eval()
         with torch.inference_mode():
-            results_part0 = model.vae(data_part0, resolution=int(grid_res))
-            results_part1 = model.vae(data_part1, resolution=int(grid_res))
         if not simplify_mesh:
             target_num_faces = -1
         parts = []
         vertices, faces = results_part0["meshes"][0]
         mesh_part0 = trimesh.Trimesh(vertices, faces, process=False)
         mesh_part0.vertices = mesh_part0.vertices @ TRIMESH_GLB_EXPORT.T
         mesh_part0 = postprocess_mesh(mesh_part0, int(target_num_faces))
         parts.extend(mesh_part0.split(only_watertight=False))
         vertices, faces = results_part1["meshes"][0]
         mesh_part1 = trimesh.Trimesh(vertices, faces, process=False)
         mesh_part1.vertices = mesh_part1.vertices @ TRIMESH_GLB_EXPORT.T
         mesh_part1 = postprocess_mesh(mesh_part1, int(target_num_faces))
@@ -244,6 +490,8 @@ def process_3d(
         return output_glb_path
     except Exception as e:
         raise gr.Error(
             "CPU 生成失败："
@@ -282,8 +530,16 @@ with block:
     with gr.Row():
         with gr.Column():
-            input_image = gr.Image(label="上传图片", type="filepath")
-            seg_image = gr.Image(label="处理后图片", type="numpy", interactive=False, image_mode="RGBA")
             with gr.Accordion("高级设置", open=False):
                 num_steps = gr.Slider(
@@ -307,9 +563,16 @@ with block:
                     step=1,
                     value=128
                 )
                 with gr.Row():
                     randomize_seed = gr.Checkbox(label="随机种子", value=True)
-                    seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                 with gr.Row():
                     simplify_mesh = gr.Checkbox(label="简化网格", value=True)
@@ -349,8 +612,19 @@ with block:
         outputs=[seed]
     ).then(
         fn=process_3d,
-        inputs=[seg_image, num_steps, cfg_scale, input_grid_res, seed, simplify_mesh, target_num_faces],
         outputs=[output_model]
     )
-block.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 import os
+import contextlib
+import functools
+from datetime import datetime
 import cv2
+import gradio as gr
 import kiui
+import numpy as np
 import rembg
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import trimesh
+from huggingface_hub import hf_hub_download
 try:
     import spaces
             def __call__(self, func):
                 return func
 from flow.model import Model
 from flow.configs.schema import ModelConfig
 from flow.utils import get_random_color, recenter_foreground
 from vae.utils import postprocess_mesh
+# =========================================================
+# CPU / dtype 基础设置
+# =========================================================
 DEVICE = torch.device("cpu")
 DTYPE = torch.float32
+# Thread count comes from the CPU_THREADS environment variable (default 2);
+# tune it for the HF CPU Space machine the app runs on.
 CPU_THREADS = int(os.environ.get("CPU_THREADS", "2"))
 torch.set_num_threads(CPU_THREADS)
 torch.set_num_interop_threads(max(1, min(2, CPU_THREADS)))
+# Explicitly make float32 the default floating-point dtype.
+torch.set_default_dtype(torch.float32)
+# Turn gradient tracking off globally; this app only runs inference on CPU.
+# NOTE(review): any failure here is silently ignored by the bare `pass`.
+try:
+    torch.set_grad_enabled(False)
+except Exception:
+    pass
+# 3x3 axis permutation; process_3d applies it to mesh vertices as
+# `vertices @ TRIMESH_GLB_EXPORT.T`.
 TRIMESH_GLB_EXPORT = np.array(
     [[0, 1, 0], [0, 0, 1], [1, 0, 0]],
     dtype=np.float32
 )
+# Largest int32 value; used as the upper bound of the seed slider.
 MAX_SEED = np.iinfo(np.int32).max
+# rembg session created once at import time.
 bg_remover = rembg.new_session()
+# =========================================================
+# 工具函数：递归转换任意对象中的浮点 Tensor 为 float32
+# =========================================================
+def to_cpu_fp32(obj):
+    """
+    Return ``obj`` with every tensor it contains moved to DEVICE.
+    Floating-point tensors are additionally cast to float32; all other
+    tensors keep their dtype. dict, list and tuple containers are rebuilt
+    recursively as plain dict / list / tuple. Any other object is returned
+    unchanged.
+    """
+    if isinstance(obj, dict):
+        return {key: to_cpu_fp32(value) for key, value in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        converted = [to_cpu_fp32(item) for item in obj]
+        return converted if isinstance(obj, list) else tuple(converted)
+    if not torch.is_tensor(obj):
+        return obj
+    # Only floating-point tensors get a dtype change; integer / bool tensors
+    # are just moved to the target device.
+    if obj.is_floating_point():
+        return obj.to(device=DEVICE, dtype=torch.float32, non_blocking=False)
+    return obj.to(device=DEVICE, non_blocking=False)
+# =========================================================
 # 工具函数：强制整个模块转 float32
+# =========================================================
 def force_module_fp32(module: torch.nn.Module):
     """
+    Recursively move a module's parameters and buffers to CPU + float32.
+
+    Calls ``module.to(device=DEVICE)`` and ``module.float()``, repeats the
+    same for every child module, then re-assigns each floating-point buffer
+    owned directly by ``module`` as a float32 tensor on DEVICE.
+
+    Returns the same ``module`` object (modified in place).
     """
     module.to(device=DEVICE)
     module.float()
     for child in module.children():
         force_module_fp32(child)
+    # Handle this module's own buffers (children were handled by the recursion above).
     for name, buf in module.named_buffers(recurse=False):
+        if torch.is_tensor(buf) and buf.is_floating_point():
             setattr(module, name, buf.to(device=DEVICE, dtype=torch.float32))
     return module
+# =========================================================
+# 工具函数：禁用 CPU autocast
+# =========================================================
+@contextlib.contextmanager
+def disable_cpu_autocast():
+    """
+    Context manager that turns CPU autocast off for the enclosed code, so
+    nothing inside silently switches to bfloat16.
+
+    Entering autocast is best effort: if this torch build cannot create or
+    enter ``torch.autocast(device_type="cpu", enabled=False)``, the body
+    simply runs without it.
+
+    Exceptions raised by the enclosed code propagate unchanged. The generator
+    yields exactly once; yielding a second time from an ``except`` handler
+    would make contextlib raise ``RuntimeError`` and hide the real error.
+    """
+    with contextlib.ExitStack() as stack:
+        try:
+            stack.enter_context(
+                torch.autocast(device_type="cpu", enabled=False)
+            )
+        except Exception:
+            # Deliberate fallback: some environments / versions do not
+            # support this call, so degrade to a plain no-op context.
+            pass
+        yield
+# =========================================================
+# 兜底补丁 1：全局修补 F.linear
+# =========================================================
+def patch_functional_linear():
+    """
+    Monkey-patch ``torch.nn.functional.linear`` with a dtype-aligning wrapper.
+
+    When ``weight`` is a floating-point tensor, the wrapper casts:
+    * a floating-point CPU ``input`` whose dtype differs from ``weight``, and
+    * a floating-point ``bias`` whose dtype differs from ``weight``
+    to ``weight.dtype`` before calling the original function.
+
+    The patch is applied at most once: a wrapper already carrying the
+    ``_fp32_safe_patched`` marker is left in place. Returns None.
+    """
+    if getattr(F.linear, "_fp32_safe_patched", False):
+        return
+    wrapped_linear = F.linear
+    def _is_float_tensor(value):
+        return torch.is_tensor(value) and value.is_floating_point()
+    @functools.wraps(wrapped_linear)
+    def linear_fp32_safe(input, weight, bias=None):
+        if _is_float_tensor(weight):
+            target_dtype = weight.dtype
+            input_mismatch = (
+                _is_float_tensor(input)
+                and input.device.type == "cpu"
+                and input.dtype != target_dtype
+            )
+            if input_mismatch:
+                input = input.to(dtype=target_dtype)
+            bias_mismatch = (
+                bias is not None
+                and _is_float_tensor(bias)
+                and bias.dtype != target_dtype
+            )
+            if bias_mismatch:
+                bias = bias.to(dtype=target_dtype)
+        return wrapped_linear(input, weight, bias)
+    # Marker read by the guard at the top so repeated calls do not stack wrappers.
+    linear_fp32_safe._fp32_safe_patched = True
+    F.linear = linear_fp32_safe
+# =========================================================
+# 兜底补丁 2：给常见模块加 forward pre-hook
+# =========================================================
+def register_dtype_guard_hooks(root_module: nn.Module):
+    """
+    Register a forward pre-hook on every Linear / Conv / norm /
+    MultiheadAttention submodule of ``root_module`` (the root included).
+
+    The hook casts the positional inputs of the guarded module to the dtype
+    and device of its first floating-point parameter, or of its first
+    floating-point buffer when it has no such parameter. Non-floating
+    tensors are only moved to that device. Modules with neither are left
+    untouched.
+
+    Returns the list of hook handles created by register_forward_pre_hook.
+    """
+    guarded_types = (
+        nn.Linear,
+        nn.Conv1d,
+        nn.Conv2d,
+        nn.Conv3d,
+        nn.LayerNorm,
+        nn.GroupNorm,
+        nn.BatchNorm1d,
+        nn.BatchNorm2d,
+        nn.BatchNorm3d,
+        nn.MultiheadAttention,
+    )
+    def cast_obj_to_dtype(obj, dtype, device):
+        # Rebuild dict / list / tuple containers recursively, casting tensors.
+        if isinstance(obj, dict):
+            return {
+                key: cast_obj_to_dtype(value, dtype, device)
+                for key, value in obj.items()
+            }
+        if isinstance(obj, (list, tuple)):
+            items = [cast_obj_to_dtype(item, dtype, device) for item in obj]
+            return items if isinstance(obj, list) else tuple(items)
+        if not torch.is_tensor(obj):
+            return obj
+        if obj.is_floating_point():
+            return obj.to(device=device, dtype=dtype)
+        return obj.to(device=device)
+    def first_float_tensor(tensors):
+        return next(
+            (t for t in tensors if torch.is_tensor(t) and t.is_floating_point()),
+            None,
+        )
+    def pre_hook(module, inputs):
+        # Prefer the module's own parameters as the dtype reference,
+        # falling back to its own buffers.
+        reference = first_float_tensor(module.parameters(recurse=False))
+        if reference is None:
+            reference = first_float_tensor(module.buffers(recurse=False))
+        if reference is None:
+            return inputs
+        return cast_obj_to_dtype(inputs, reference.dtype, reference.device)
+    return [
+        submodule.register_forward_pre_hook(pre_hook)
+        for submodule in root_module.modules()
+        if isinstance(submodule, guarded_types)
+    ]
+# =========================================================
+# 兜底补丁 3：包装 forward，统一禁用 autocast + 输入转 fp32
+# =========================================================
+def wrap_forward_fp32(module: nn.Module):
+    """
+    Replace ``module.forward`` with a wrapper that:
+    1. passes positional and keyword arguments through to_cpu_fp32,
+    2. runs the original forward inside disable_cpu_autocast(),
+    3. passes the result through to_cpu_fp32 before returning it.
+
+    A module already marked with ``_forward_fp32_wrapped`` is left alone,
+    so calling this twice does not stack wrappers. Returns None.
+    """
+    already_wrapped = getattr(module, "_forward_fp32_wrapped", False)
+    if not already_wrapped:
+        inner_forward = module.forward
+        @functools.wraps(inner_forward)
+        def forward_fp32_safe(*args, **kwargs):
+            safe_args = to_cpu_fp32(args)
+            safe_kwargs = to_cpu_fp32(kwargs)
+            with disable_cpu_autocast():
+                result = inner_forward(*safe_args, **safe_kwargs)
+            return to_cpu_fp32(result)
+        module.forward = forward_fp32_safe
+        module._forward_fp32_wrapped = True
+# =========================================================
+# 下载模型
+# =========================================================
+# Fetch the two checkpoint files (flow.pt and vae.pt) from the
+# nvidia/PartPacker repository on the Hugging Face Hub.
+flow_ckpt_path = hf_hub_download(
+    repo_id="nvidia/PartPacker",
+    filename="flow.pt"
+)
+vae_ckpt_path = hf_hub_download(
+    repo_id="nvidia/PartPacker",
+    filename="vae.pt"
+)
+# =========================================================
+# Model configuration
+# =========================================================
+# vae_ckpt_path points at the vae.pt file downloaded above.
+# NOTE(review): the remaining values presumably have to match the released
+# checkpoint — verify against the upstream PartPacker configuration.
+model_config = ModelConfig(
+    vae_conf="vae.configs.part_woenc",
+    vae_ckpt_path=vae_ckpt_path,
+    qknorm=True,
+    qknorm_type="RMSNorm",
+    use_pos_embed=False,
+    dino_model="dinov2_vitg14",
+    hidden_dim=1536,
+    flow_shift=3.0,
+    logitnorm_mean=1.0,
+    logitnorm_std=1.0,
+    latent_size=4096,
+    use_parts=True,
+)
+# =========================================================
 # 初始化模型（CPU + float32）
+# =========================================================
 print("正在加载模型到 CPU ...")
+# Install the F.linear dtype guard before the model is built.
+patch_functional_linear()
 model = Model(model_config)
 model.eval()
 model.to(DEVICE)
 # Load the checkpoint explicitly onto the CPU.
+# If this torch version's torch.load does not accept weights_only=True
+# (TypeError), fall back to a plain torch.load.
+# NOTE(review): the fallback unpickles the whole file; only acceptable for
+# a trusted checkpoint.
+try:
+    ckpt_dict = torch.load(flow_ckpt_path, map_location=DEVICE, weights_only=True)
+except TypeError:
+    ckpt_dict = torch.load(flow_ckpt_path, map_location=DEVICE)
 model.load_state_dict(ckpt_dict, strict=True)
+# Force the whole model to float32.
 force_module_fp32(model)
 model.eval()
+# Wrap forward of the model (and of its `dit` / `vae` attributes when
+# present) so it runs with CPU autocast disabled and float32 inputs.
+wrap_forward_fp32(model)
+if hasattr(model, "dit"):
+    wrap_forward_fp32(model.dit)
+if hasattr(model, "vae"):
+    wrap_forward_fp32(model.vae)
+# Register dtype guard hooks; the handles are kept in _DTYPE_GUARD_HOOKS.
+# NOTE(review): if model.vae is a registered submodule of model, the second
+# call adds a second hook to every guarded VAE layer — confirm this is
+# intended.
+_DTYPE_GUARD_HOOKS = []
+_DTYPE_GUARD_HOOKS.extend(register_dtype_guard_hooks(model))
+if hasattr(model, "vae"):
+    _DTYPE_GUARD_HOOKS.extend(register_dtype_guard_hooks(model.vae))
 print("模型加载完成。")
+# Report the dtype of the first parameter; a model exposing no parameters
+# raises StopIteration from next(), which is reported instead of crashing.
+try:
+    print("主模型 dtype:", next(model.parameters()).dtype)
+except StopIteration:
+    print("主模型没有可见参数。")
 def get_random_seed(randomize_seed, seed):
         os.makedirs("output", exist_ok=True)
         output_glb_path = f"output/partpacker_{datetime.now().strftime('%Y%m%d_%H%M%S')}.glb"
+        # -------------------------------------------------
+        # 1) RGBA -> RGB 白底合成 -> float32
+        # -------------------------------------------------
         image = input_image.astype(np.float32) / 255.0
         image = image[..., :3] * image[..., 3:4] + (1.0 - image[..., 3:4])
         )
         data = {
+            "cond_images": image_tensor
         }
+        data = to_cpu_fp32(data)
+        # -------------------------------------------------
+        # 2) 推理前再次强制模型为 float32
+        # -------------------------------------------------
         force_module_fp32(model)
         model.eval()
+        if hasattr(model, "vae"):
+            force_module_fp32(model.vae)
+            model.vae.eval()
+        # -------------------------------------------------
+        # 3) 主模型推理：显式禁用 CPU autocast
+        # -------------------------------------------------
         with torch.inference_mode():
+            with disable_cpu_autocast():
+                results = model(
+                    data,
+                    num_steps=int(num_steps),
+                    cfg_scale=float(cfg_scale)
+                )
+        results = to_cpu_fp32(results)
+        latent = results.get("latent", None)
+        if not isinstance(latent, torch.Tensor):
             raise gr.Error("模型输出 latent 异常。")
+        latent = latent.to(device=DEVICE, dtype=torch.float32).contiguous()
+        # -------------------------------------------------
+        # 4) VAE 解码：再次显式禁用 CPU autocast
+        # -------------------------------------------------
         data_part0 = {
+            "latent": latent[:, : model.config.latent_size, :].contiguous()
         }
         data_part1 = {
+            "latent": latent[:, model.config.latent_size:, :].contiguous()
         }
+        data_part0 = to_cpu_fp32(data_part0)
+        data_part1 = to_cpu_fp32(data_part1)
         with torch.inference_mode():
+            with disable_cpu_autocast():
+                results_part0 = model.vae(data_part0, resolution=int(grid_res))
+                results_part1 = model.vae(data_part1, resolution=int(grid_res))
+        results_part0 = to_cpu_fp32(results_part0)
+        results_part1 = to_cpu_fp32(results_part1)
         if not simplify_mesh:
             target_num_faces = -1
         parts = []
+        # -------------------------------------------------
+        # 5) part 0 mesh
+        # -------------------------------------------------
         vertices, faces = results_part0["meshes"][0]
+        vertices = np.asarray(vertices, dtype=np.float32)
+        faces = np.asarray(faces, dtype=np.int64)
         mesh_part0 = trimesh.Trimesh(vertices, faces, process=False)
         mesh_part0.vertices = mesh_part0.vertices @ TRIMESH_GLB_EXPORT.T
         mesh_part0 = postprocess_mesh(mesh_part0, int(target_num_faces))
         parts.extend(mesh_part0.split(only_watertight=False))
+        # -------------------------------------------------
+        # 6) part 1 mesh
+        # -------------------------------------------------
         vertices, faces = results_part1["meshes"][0]
+        vertices = np.asarray(vertices, dtype=np.float32)
+        faces = np.asarray(faces, dtype=np.int64)
         mesh_part1 = trimesh.Trimesh(vertices, faces, process=False)
         mesh_part1.vertices = mesh_part1.vertices @ TRIMESH_GLB_EXPORT.T
         mesh_part1 = postprocess_mesh(mesh_part1, int(target_num_faces))
         return output_glb_path
+    except gr.Error:
+        raise
     except Exception as e:
         raise gr.Error(
             "CPU 生成失败："
     with gr.Row():
         with gr.Column():
+            input_image = gr.Image(
+                label="上传图片",
+                type="filepath"
+            )
+            seg_image = gr.Image(
+                label="处理后图片",
+                type="numpy",
+                interactive=False,
+                image_mode="RGBA"
+            )
             with gr.Accordion("高级设置", open=False):
                 num_steps = gr.Slider(
                     step=1,
                     value=128
                 )
                 with gr.Row():
                     randomize_seed = gr.Checkbox(label="随机种子", value=True)
+                    seed = gr.Slider(
+                        label="Seed",
+                        minimum=0,
+                        maximum=MAX_SEED,
+                        step=1,
+                        value=0
+                    )
                 with gr.Row():
                     simplify_mesh = gr.Checkbox(label="简化网格", value=True)
         outputs=[seed]
     ).then(
         fn=process_3d,
+        inputs=[
+            seg_image,
+            num_steps,
+            cfg_scale,
+            input_grid_res,
+            seed,
+            simplify_mesh,
+            target_num_faces
+        ],
         outputs=[output_model]
     )
+block.launch(
+    server_name="0.0.0.0",
+    server_port=int(os.environ.get("PORT", 7860))
+)