| import spaces |
| import os |
| import gradio as gr |
| import numpy as np |
| import torch |
| from PIL import Image |
| import trimesh |
| import random |
| from transformers import AutoModelForImageSegmentation |
| from torchvision import transforms |
| from huggingface_hub import hf_hub_download, snapshot_download |
| import subprocess |
| import shutil |
|
|
| |
| subprocess.run("pip install spandrel==0.4.1 --no-deps", shell=True, check=True) |
|
|
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
| DTYPE = torch.float16 |
|
|
| print("DEVICE: ", DEVICE) |
|
|
| DEFAULT_FACE_NUMBER = 100000 |
| MAX_SEED = np.iinfo(np.int32).max |
| TRIPOSG_REPO_URL = "https://github.com/VAST-AI-Research/TripoSG.git" |
| MV_ADAPTER_REPO_URL = "https://github.com/huanngzh/MV-Adapter.git" |
|
|
| RMBG_PRETRAINED_MODEL = "checkpoints/RMBG-1.4" |
| TRIPOSG_PRETRAINED_MODEL = "checkpoints/TripoSG" |
|
|
| TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp") |
| os.makedirs(TMP_DIR, exist_ok=True) |
|
|
| TRIPOSG_CODE_DIR = "./triposg" |
| if not os.path.exists(TRIPOSG_CODE_DIR): |
| os.system(f"git clone {TRIPOSG_REPO_URL} {TRIPOSG_CODE_DIR}") |
|
|
| MV_ADAPTER_CODE_DIR = "./mv_adapter" |
| if not os.path.exists(MV_ADAPTER_CODE_DIR): |
| os.system(f"git clone {MV_ADAPTER_REPO_URL} {MV_ADAPTER_CODE_DIR} && cd {MV_ADAPTER_CODE_DIR} && git checkout 7d37a97e9bc223cdb8fd26a76bd8dd46504c7c3d") |
|
|
| import sys |
| sys.path.append(TRIPOSG_CODE_DIR) |
| sys.path.append(os.path.join(TRIPOSG_CODE_DIR, "scripts")) |
| sys.path.append(MV_ADAPTER_CODE_DIR) |
| sys.path.append(os.path.join(MV_ADAPTER_CODE_DIR, "scripts")) |
|
|
| HEADER = """ |
| |
| # 🔮 Image to 3D with [TripoSG](https://github.com/VAST-AI-Research/TripoSG) |
| |
| ## State-of-the-art Open Source 3D Generation Using Large-Scale Rectified Flow Transformers |
| |
| <p style="font-size: 1.1em;">By <a href="https://www.tripo3d.ai/" style="color: #1E90FF; text-decoration: none; font-weight: bold;">Tripo</a></p> |
| |
| ## 📋 Quick Start Guide: |
| 1. **Upload an image** (single object works best) |
| 2. Click **Generate Shape** to create the 3D mesh |
| 3. Click **Apply Texture** to add textures |
| 4. Use **Download GLB** to save your 3D model |
| 5. Adjust parameters under **Generation Settings** for fine-tuning |
| |
| Best results come from clean, well-lit images with clear subject isolation. Try it now! |
| |
| <p style="font-size: 0.9em; margin-top: 10px;">Texture generation powered by <a href="https://github.com/huanngzh/MV-Adapter" style="color: #1E90FF; text-decoration: none;">MV-Adapter</a> - a versatile multi-view adapter for consistent texture generation. Try the <a href="https://huggingface.co/spaces/VAST-AI/MV-Adapter-I2MV-SDXL" style="color: #1E90FF; text-decoration: none;">MV-Adapter demo</a> for multi-view image generation.</p> |
| |
| """ |
|
|
| |
| from image_process import prepare_image |
| from briarmbg import BriaRMBG |
| snapshot_download("briaai/RMBG-1.4", local_dir=RMBG_PRETRAINED_MODEL) |
| rmbg_net = BriaRMBG.from_pretrained(RMBG_PRETRAINED_MODEL).to(DEVICE) |
| rmbg_net.eval() |
| from triposg.pipelines.pipeline_triposg import TripoSGPipeline |
| snapshot_download("VAST-AI/TripoSG", local_dir=TRIPOSG_PRETRAINED_MODEL) |
| triposg_pipe = TripoSGPipeline.from_pretrained(TRIPOSG_PRETRAINED_MODEL).to(DEVICE, DTYPE) |
|
|
| |
| NUM_VIEWS = 6 |
| from inference_ig2mv_sdxl import prepare_pipeline, preprocess_image, remove_bg |
| from mvadapter.utils import get_orthogonal_camera, tensor_to_image, make_image_grid |
| from mvadapter.utils.render import NVDiffRastContextWrapper, load_mesh, render |
| mv_adapter_pipe = prepare_pipeline( |
| base_model="stabilityai/stable-diffusion-xl-base-1.0", |
| vae_model="madebyollin/sdxl-vae-fp16-fix", |
| unet_model=None, |
| lora_model=None, |
| adapter_path="huanngzh/mv-adapter", |
| scheduler=None, |
| num_views=NUM_VIEWS, |
| device=DEVICE, |
| dtype=torch.float16, |
| ) |
| birefnet = AutoModelForImageSegmentation.from_pretrained( |
| "ZhengPeng7/BiRefNet", trust_remote_code=True |
| ) |
| birefnet.to(DEVICE) |
| transform_image = transforms.Compose( |
| [ |
| transforms.Resize((1024, 1024)), |
| transforms.ToTensor(), |
| transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), |
| ] |
| ) |
| remove_bg_fn = lambda x: remove_bg(x, birefnet, transform_image, DEVICE) |
|
|
| if not os.path.exists("checkpoints/RealESRGAN_x2plus.pth"): |
| hf_hub_download("dtarnow/UPscaler", filename="RealESRGAN_x2plus.pth", local_dir="checkpoints") |
| if not os.path.exists("checkpoints/big-lama.pt"): |
| subprocess.run("wget -P checkpoints/ https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt", shell=True, check=True) |
|
|
| def start_session(req: gr.Request): |
| save_dir = os.path.join(TMP_DIR, str(req.session_hash)) |
| os.makedirs(save_dir, exist_ok=True) |
| print("start session, mkdir", save_dir) |
|
|
| def end_session(req: gr.Request): |
| save_dir = os.path.join(TMP_DIR, str(req.session_hash)) |
| shutil.rmtree(save_dir) |
|
|
| def get_random_hex(): |
| random_bytes = os.urandom(8) |
| random_hex = random_bytes.hex() |
| return random_hex |
|
|
| def get_random_seed(randomize_seed, seed): |
| if randomize_seed: |
| seed = random.randint(0, MAX_SEED) |
| return seed |
|
|
| @spaces.GPU(duration=180) |
| def run_full(image: str, req: gr.Request): |
| seed = 0 |
| num_inference_steps = 50 |
| guidance_scale = 7.5 |
| simplify = True |
| target_face_num = DEFAULT_FACE_NUMBER |
| |
| image_seg = prepare_image(image, bg_color=np.array([1.0, 1.0, 1.0]), rmbg_net=rmbg_net) |
|
|
| outputs = triposg_pipe( |
| image=image_seg, |
| generator=torch.Generator(device=triposg_pipe.device).manual_seed(seed), |
| num_inference_steps=num_inference_steps, |
| guidance_scale=guidance_scale |
| ).samples[0] |
| print("mesh extraction done") |
| mesh = trimesh.Trimesh(outputs[0].astype(np.float32), np.ascontiguousarray(outputs[1])) |
|
|
| if simplify: |
| print("start simplify") |
| from utils import simplify_mesh |
| mesh = simplify_mesh(mesh, target_face_num) |
| |
| save_dir = os.path.join(TMP_DIR, "examples") |
| os.makedirs(save_dir, exist_ok=True) |
| mesh_path = os.path.join(save_dir, f"triposg_{get_random_hex()}.glb") |
| mesh.export(mesh_path) |
| print("save to ", mesh_path) |
|
|
| torch.cuda.empty_cache() |
|
|
| height, width = 768, 768 |
| |
| cameras = get_orthogonal_camera( |
| elevation_deg=[0, 0, 0, 0, 89.99, -89.99], |
| distance=[1.8] * NUM_VIEWS, |
| left=-0.55, |
| right=0.55, |
| bottom=-0.55, |
| top=0.55, |
| azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]], |
| device=DEVICE, |
| ) |
| ctx = NVDiffRastContextWrapper(device=DEVICE, context_type="cuda") |
|
|
| mesh = load_mesh(mesh_path, rescale=True, device=DEVICE) |
| render_out = render( |
| ctx, |
| mesh, |
| cameras, |
| height=height, |
| width=width, |
| render_attr=False, |
| normal_background=0.0, |
| ) |
| control_images = ( |
| torch.cat( |
| [ |
| (render_out.pos + 0.5).clamp(0, 1), |
| (render_out.normal / 2 + 0.5).clamp(0, 1), |
| ], |
| dim=-1, |
| ) |
| .permute(0, 3, 1, 2) |
| .to(DEVICE) |
| ) |
|
|
| image = Image.open(image) |
| image = remove_bg_fn(image) |
| image = preprocess_image(image, height, width) |
|
|
| pipe_kwargs = {} |
| if seed != -1 and isinstance(seed, int): |
| pipe_kwargs["generator"] = torch.Generator(device=DEVICE).manual_seed(seed) |
|
|
| images = mv_adapter_pipe( |
| "high quality", |
| height=height, |
| width=width, |
| num_inference_steps=15, |
| guidance_scale=3.0, |
| num_images_per_prompt=NUM_VIEWS, |
| control_image=control_images, |
| control_conditioning_scale=1.0, |
| reference_image=image, |
| reference_conditioning_scale=1.0, |
| negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast", |
| cross_attention_kwargs={"scale": 1.0}, |
| **pipe_kwargs, |
| ).images |
|
|
| torch.cuda.empty_cache() |
|
|
| mv_image_path = os.path.join(save_dir, f"mv_adapter_{get_random_hex()}.png") |
| make_image_grid(images, rows=1).save(mv_image_path) |
|
|
| from texture import TexturePipeline, ModProcessConfig |
| texture_pipe = TexturePipeline( |
| upscaler_ckpt_path="checkpoints/RealESRGAN_x2plus.pth", |
| inpaint_ckpt_path="checkpoints/big-lama.pt", |
| device=DEVICE, |
| ) |
|
|
| textured_glb_path = texture_pipe( |
| mesh_path=mesh_path, |
| save_dir=save_dir, |
| save_name=f"texture_mesh_{get_random_hex()}.glb", |
| uv_unwarp=True, |
| uv_size=4096, |
| rgb_path=mv_image_path, |
| rgb_process_config=ModProcessConfig(view_upscale=True, inpaint_mode="view"), |
| camera_azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]], |
| ) |
|
|
| return image_seg, mesh_path, textured_glb_path |
| |
|
|
| @spaces.GPU() |
| @torch.no_grad() |
| def run_segmentation(image: str): |
| image = prepare_image(image, bg_color=np.array([1.0, 1.0, 1.0]), rmbg_net=rmbg_net) |
| return image |
|
|
| @spaces.GPU(duration=90) |
| @torch.no_grad() |
| def image_to_3d( |
| image: Image.Image, |
| seed: int, |
| num_inference_steps: int, |
| guidance_scale: float, |
| simplify: bool, |
| target_face_num: int, |
| req: gr.Request |
| ): |
| outputs = triposg_pipe( |
| image=image, |
| generator=torch.Generator(device=triposg_pipe.device).manual_seed(seed), |
| num_inference_steps=num_inference_steps, |
| guidance_scale=guidance_scale |
| ).samples[0] |
| print("mesh extraction done") |
| mesh = trimesh.Trimesh(outputs[0].astype(np.float32), np.ascontiguousarray(outputs[1])) |
|
|
| if simplify: |
| print("start simplify") |
| from utils import simplify_mesh |
| mesh = simplify_mesh(mesh, target_face_num) |
| |
| save_dir = os.path.join(TMP_DIR, str(req.session_hash)) |
| mesh_path = os.path.join(save_dir, f"triposg_{get_random_hex()}.glb") |
| mesh.export(mesh_path) |
| print("save to ", mesh_path) |
|
|
| torch.cuda.empty_cache() |
|
|
| return mesh_path |
|
|
| @spaces.GPU(duration=120) |
| @torch.no_grad() |
| def run_texture(image: Image, mesh_path: str, seed: int, req: gr.Request): |
| height, width = 768, 768 |
| |
| cameras = get_orthogonal_camera( |
| elevation_deg=[0, 0, 0, 0, 89.99, -89.99], |
| distance=[1.8] * NUM_VIEWS, |
| left=-0.55, |
| right=0.55, |
| bottom=-0.55, |
| top=0.55, |
| azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]], |
| device=DEVICE, |
| ) |
| ctx = NVDiffRastContextWrapper(device=DEVICE, context_type="cuda") |
|
|
| mesh = load_mesh(mesh_path, rescale=True, device=DEVICE) |
| render_out = render( |
| ctx, |
| mesh, |
| cameras, |
| height=height, |
| width=width, |
| render_attr=False, |
| normal_background=0.0, |
| ) |
| control_images = ( |
| torch.cat( |
| [ |
| (render_out.pos + 0.5).clamp(0, 1), |
| (render_out.normal / 2 + 0.5).clamp(0, 1), |
| ], |
| dim=-1, |
| ) |
| .permute(0, 3, 1, 2) |
| .to(DEVICE) |
| ) |
|
|
| image = Image.open(image) |
| image = remove_bg_fn(image) |
| image = preprocess_image(image, height, width) |
|
|
| pipe_kwargs = {} |
| if seed != -1 and isinstance(seed, int): |
| pipe_kwargs["generator"] = torch.Generator(device=DEVICE).manual_seed(seed) |
|
|
| images = mv_adapter_pipe( |
| "high quality", |
| height=height, |
| width=width, |
| num_inference_steps=15, |
| guidance_scale=3.0, |
| num_images_per_prompt=NUM_VIEWS, |
| control_image=control_images, |
| control_conditioning_scale=1.0, |
| reference_image=image, |
| reference_conditioning_scale=1.0, |
| negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast", |
| cross_attention_kwargs={"scale": 1.0}, |
| **pipe_kwargs, |
| ).images |
|
|
| torch.cuda.empty_cache() |
|
|
| save_dir = os.path.join(TMP_DIR, str(req.session_hash)) |
| mv_image_path = os.path.join(save_dir, f"mv_adapter_{get_random_hex()}.png") |
| make_image_grid(images, rows=1).save(mv_image_path) |
|
|
| from texture import TexturePipeline, ModProcessConfig |
| texture_pipe = TexturePipeline( |
| upscaler_ckpt_path="checkpoints/RealESRGAN_x2plus.pth", |
| inpaint_ckpt_path="checkpoints/big-lama.pt", |
| device=DEVICE, |
| ) |
|
|
| textured_glb_path = texture_pipe( |
| mesh_path=mesh_path, |
| save_dir=save_dir, |
| save_name=f"texture_mesh_{get_random_hex()}.glb", |
| uv_unwarp=True, |
| uv_size=4096, |
| rgb_path=mv_image_path, |
| rgb_process_config=ModProcessConfig(view_upscale=True, inpaint_mode="view"), |
| camera_azimuth_deg=[x - 90 for x in [0, 90, 180, 270, 180, 180]], |
| ) |
|
|
| return textured_glb_path |
|
|
|
|
| with gr.Blocks(title="TripoSG") as demo: |
| gr.Markdown(HEADER) |
|
|
| with gr.Row(): |
| with gr.Column(): |
| with gr.Row(): |
| image_prompts = gr.Image(label="Input Image", type="filepath") |
| seg_image = gr.Image( |
| label="Segmentation Result", type="pil", format="png", interactive=False |
| ) |
|
|
| with gr.Accordion("Generation Settings", open=True): |
| seed = gr.Slider( |
| label="Seed", |
| minimum=0, |
| maximum=MAX_SEED, |
| step=0, |
| value=0 |
| ) |
| randomize_seed = gr.Checkbox(label="Randomize seed", value=True) |
| num_inference_steps = gr.Slider( |
| label="Number of inference steps", |
| minimum=8, |
| maximum=50, |
| step=1, |
| value=50, |
| ) |
| guidance_scale = gr.Slider( |
| label="CFG scale", |
| minimum=0.0, |
| maximum=20.0, |
| step=0.1, |
| value=7.0, |
| ) |
|
|
| with gr.Row(): |
| reduce_face = gr.Checkbox(label="Simplify Mesh", value=True) |
| target_face_num = gr.Slider(maximum=1000000, minimum=10000, value=DEFAULT_FACE_NUMBER, label="Target Face Number") |
|
|
| gen_button = gr.Button("Generate Shape", variant="primary") |
| gen_texture_button = gr.Button("Apply Texture", interactive=False) |
|
|
| with gr.Column(): |
| model_output = gr.Model3D(label="Generated GLB", interactive=False) |
| textured_model_output = gr.Model3D(label="Textured GLB", interactive=False) |
|
|
| with gr.Row(): |
| examples = gr.Examples( |
| examples=[ |
| f"{TRIPOSG_CODE_DIR}/assets/example_data/{image}" |
| for image in os.listdir(f"{TRIPOSG_CODE_DIR}/assets/example_data") |
| ], |
| fn=run_full, |
| inputs=[image_prompts], |
| outputs=[seg_image, model_output, textured_model_output], |
| cache_examples=True, |
| ) |
|
|
| gen_button.click( |
| run_segmentation, |
| inputs=[image_prompts], |
| outputs=[seg_image] |
| ).then( |
| get_random_seed, |
| inputs=[randomize_seed, seed], |
| outputs=[seed], |
| ).then( |
| image_to_3d, |
| inputs=[ |
| seg_image, |
| seed, |
| num_inference_steps, |
| guidance_scale, |
| reduce_face, |
| target_face_num |
| ], |
| outputs=[model_output] |
| ).then(lambda: gr.Button(interactive=True), outputs=[gen_texture_button]) |
|
|
| gen_texture_button.click( |
| run_texture, |
| inputs=[image_prompts, model_output, seed], |
| outputs=[textured_model_output] |
| ) |
|
|
| demo.load(start_session) |
| demo.unload(end_session) |
|
|
| demo.launch() |