Spaces:

Daankular
/

Image2Model

Running on Zero

Daankular committed 21 days ago

Commit

90db216

1 Parent(s): 6f79261

Update apply_texture to MV-Adapter new API

- init_adapter() removed; replaced by init_custom_adapter(num_views=6)
  followed by load_custom_adapter(path, weight_name=...)
- Add ShiftSNRScheduler wrapping (required by new API)
- Move dtype to pipe.to() after adapter init; add cond_encoder.to()
- Replace image=/cameras= call args with reference_image= + control_image=
(plucker embeddings from get_plucker_embeds_from_cameras_ortho)
- Remove stale torch.autocast wrapper
- Import get_plucker_embeds_from_cameras_ortho

Files changed (1) hide show

app.py +31 -22

app.py CHANGED Viewed

@@ -950,41 +950,50 @@ def apply_texture(glb_path, input_image, remove_background, variant, tex_seed,
         from mvadapter.pipelines.pipeline_mvadapter_i2mv_sdxl import MVAdapterI2MVSDXLPipeline
         from mvadapter.schedulers.scheduling_shift_snr import ShiftSNRScheduler
-        from mvadapter.utils import get_orthogonal_camera
         import torchvision.transforms.functional as TF
         progress(0.4, desc=f"Running MV-Adapter ({variant})...")
-        pipe = MVAdapterI2MVSDXLPipeline.from_pretrained(
-            sd_id,
-            torch_dtype=torch.float16,
-        ).to(DEVICE)
-        pipe.init_adapter(
-            image_encoder_path="openai/clip-vit-large-patch14",
-            ipa_weight_path=os.path.join(mvadapter_weights, "mvadapter_i2mv_sdxl.safetensors"),
-            adapter_tokens=256,
         )
         ref_pil = Image.open(img_path).convert("RGB")
         cameras = get_orthogonal_camera(
             elevation_deg=[0, 0, 0, 0, 0, 0],
             distance=[1.8] * 6,
             left=-0.55, right=0.55, bottom=-0.55, top=0.55,
-            azimuth_deg=[x - 90 for x in [0, 45, 90, 135, 180, 270]],
             device=DEVICE,
         )
-        with torch.autocast(DEVICE):
-            out = pipe(
-                image=ref_pil,
-                height=768, width=768,
-                num_images_per_prompt=6,
-                guidance_scale=3.0,
-                num_inference_steps=30,
-                generator=torch.Generator(device=DEVICE).manual_seed(int(tex_seed)),
-                cameras=cameras,
-            )
         mv_grid = out.images  # list of 6 PIL images
         grid_w  = mv_grid[0].width * len(mv_grid)

         from mvadapter.pipelines.pipeline_mvadapter_i2mv_sdxl import MVAdapterI2MVSDXLPipeline
         from mvadapter.schedulers.scheduling_shift_snr import ShiftSNRScheduler
+        from mvadapter.utils import get_orthogonal_camera, get_plucker_embeds_from_cameras_ortho
         import torchvision.transforms.functional as TF
         progress(0.4, desc=f"Running MV-Adapter ({variant})...")
+        pipe = MVAdapterI2MVSDXLPipeline.from_pretrained(sd_id)
+        pipe.scheduler = ShiftSNRScheduler.from_scheduler(
+            pipe.scheduler,
+            shift_mode="interpolated",
+            shift_scale=8.0,
+        )
+        pipe.init_custom_adapter(num_views=6)
+        pipe.load_custom_adapter(
+            mvadapter_weights, weight_name="mvadapter_i2mv_sdxl.safetensors"
         )
+        pipe.to(device=DEVICE, dtype=torch.float16)
+        pipe.cond_encoder.to(device=DEVICE, dtype=torch.float16)
         ref_pil = Image.open(img_path).convert("RGB")
         cameras = get_orthogonal_camera(
             elevation_deg=[0, 0, 0, 0, 0, 0],
             distance=[1.8] * 6,
             left=-0.55, right=0.55, bottom=-0.55, top=0.55,
+            azimuth_deg=[x - 90 for x in [0, 45, 90, 180, 270, 315]],
             device=DEVICE,
         )
+        plucker_embeds = get_plucker_embeds_from_cameras_ortho(
+            cameras.c2w, [1.1] * 6, width=768
+        )
+        control_images = ((plucker_embeds + 1.0) / 2.0).clamp(0, 1)
+        out = pipe(
+            "high quality",
+            height=768, width=768,
+            num_images_per_prompt=6,
+            guidance_scale=3.0,
+            num_inference_steps=30,
+            generator=torch.Generator(device=DEVICE).manual_seed(int(tex_seed)),
+            control_image=control_images,
+            control_conditioning_scale=1.0,
+            reference_image=ref_pil,
+            reference_conditioning_scale=1.0,
+            negative_prompt="watermark, ugly, deformed, noisy, blurry, low contrast",
+        )
         mv_grid = out.images  # list of 6 PIL images
         grid_w  = mv_grid[0].width * len(mv_grid)