Spaces:

XciD
/

tile-visualizer-api

Sleeping

App Files Files Community

XciD HF Staff committed 22 days ago

Commit

9a8a023

unverified ·

1 Parent(s): 6bc9ac3

feat: add AI render mode with SDXL + ControlNet Tile inpainting

Browse files

Files changed (2) hide show

app.py +96 -19
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -2,20 +2,25 @@ import spaces
 import numpy as np
 import torch
 import gradio as gr
-from PIL import Image
 from transformers import (
     AutoImageProcessor,
     Mask2FormerForUniversalSegmentation,
     AutoModelForDepthEstimation,
 )
-# Load models on CPU at startup. @spaces.GPU moves them to CUDA automatically.
 seg_processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-large-ade-semantic")
 seg_model = Mask2FormerForUniversalSegmentation.from_pretrained(
     "facebook/mask2former-swin-large-ade-semantic"
 )
-# Find floor/rug class IDs from model config
 FLOOR_KEYWORDS = {'floor', 'flooring', 'rug', 'carpet', 'mat'}
 FLOOR_IDS = set()
 id2label = seg_model.config.id2label
@@ -31,8 +36,29 @@ depth_model = AutoModelForDepthEstimation.from_pretrained(
     "depth-anything/Depth-Anything-V2-Large-hf", torch_dtype=torch.float16
 )
-@spaces.GPU
 @torch.inference_mode()
 def predict(image):
     if image is None:
@@ -46,10 +72,8 @@ def predict(image):
     device = seg_model.device
-    # Segmentation (Mask2Former) - keep float32 for numerical stability
     seg_inputs = seg_processor(images=image_resized, return_tensors="pt")
     seg_inputs = {k: v.to(device) for k, v in seg_inputs.items()}
     seg_outputs = seg_model(**seg_inputs)
     seg_result = seg_processor.post_process_semantic_segmentation(
         seg_outputs, target_sizes=[(proc_h, proc_w)]
@@ -57,19 +81,15 @@ def predict(image):
     seg_map = seg_result.cpu().numpy()
     floor_mask = np.zeros((proc_h, proc_w), dtype=np.uint8)
-    # Debug: log unique classes found
     unique_classes = np.unique(seg_map)
     print(f"Detected classes: {[(int(c), id2label.get(c, '?')) for c in unique_classes]}")
-    print(f"Floor IDs: {FLOOR_IDS}")
     for class_id in FLOOR_IDS:
         floor_mask[seg_map == class_id] = 255
     mask_img = Image.fromarray(floor_mask).resize((orig_w, orig_h), Image.NEAREST)
-    # Depth estimation
     depth_inputs = depth_processor(images=image_resized, return_tensors="pt")
     depth_inputs = {k: v.to(device, dtype=torch.float16) if v.is_floating_point() else v.to(device) for k, v in depth_inputs.items()}
     depth_outputs = depth_model(**depth_inputs)
     depth_map = depth_outputs.predicted_depth.squeeze().cpu().numpy()
@@ -84,17 +104,74 @@ def predict(image):
     return mask_img, depth_img
-with gr.Blocks() as demo:
-    gr.Markdown("# Tile Visualizer - Segmentation API")
-    with gr.Row():
-        input_image = gr.Image(type="pil", label="Room photo")
-    with gr.Row():
-        mask_output = gr.Image(type="pil", label="Floor mask")
-        depth_output = gr.Image(type="pil", label="Depth map")
-    btn = gr.Button("Process")
-    btn.click(fn=predict, inputs=input_image, outputs=[mask_output, depth_output])
 app = demo.app

 import numpy as np
 import torch
 import gradio as gr
+from PIL import Image, ImageDraw
 from transformers import (
     AutoImageProcessor,
     Mask2FormerForUniversalSegmentation,
     AutoModelForDepthEstimation,
 )
+from diffusers import (
+    StableDiffusionXLControlNetInpaintPipeline,
+    ControlNetModel,
+    AutoencoderKL,
+)
+# ─── Segmentation + Depth models ─────────────────
 seg_processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-large-ade-semantic")
 seg_model = Mask2FormerForUniversalSegmentation.from_pretrained(
     "facebook/mask2former-swin-large-ade-semantic"
 )
 FLOOR_KEYWORDS = {'floor', 'flooring', 'rug', 'carpet', 'mat'}
 FLOOR_IDS = set()
 id2label = seg_model.config.id2label
     "depth-anything/Depth-Anything-V2-Large-hf", torch_dtype=torch.float16
 )
+# ─── SDXL + ControlNet Tile for AI rendering ─────
+print("Loading ControlNet Tile + SDXL inpainting pipeline...")  # start marker; the loads below run at module import
+controlnet = ControlNetModel.from_pretrained(
+    "xinsir/controlnet-tile-sdxl-1.0",
+    torch_dtype=torch.float16,  # same half-precision dtype as the VAE and pipeline below
+)
+vae = AutoencoderKL.from_pretrained(
+    "madebyollin/sdxl-vae-fp16-fix",  # NOTE(review): name suggests a VAE patched for fp16 use; confirm on the model card
+    torch_dtype=torch.float16,
+)
+inpaint_pipe = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
+    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+    controlnet=controlnet,  # the ControlNet Tile model loaded above
+    vae=vae,  # the separately loaded VAE is handed to the pipeline
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+inpaint_pipe.enable_model_cpu_offload()  # NOTE(review): diffusers model offloading; presumably keeps sub-models on CPU until used, confirm in diffusers docs
+print("Pipeline loaded.")  # end marker for the load sequence
+@spaces.GPU(duration=60)
 @torch.inference_mode()
 def predict(image):
     if image is None:
     device = seg_model.device
     seg_inputs = seg_processor(images=image_resized, return_tensors="pt")
     seg_inputs = {k: v.to(device) for k, v in seg_inputs.items()}
     seg_outputs = seg_model(**seg_inputs)
     seg_result = seg_processor.post_process_semantic_segmentation(
         seg_outputs, target_sizes=[(proc_h, proc_w)]
     seg_map = seg_result.cpu().numpy()
     floor_mask = np.zeros((proc_h, proc_w), dtype=np.uint8)
     unique_classes = np.unique(seg_map)
     print(f"Detected classes: {[(int(c), id2label.get(c, '?')) for c in unique_classes]}")
     for class_id in FLOOR_IDS:
         floor_mask[seg_map == class_id] = 255
     mask_img = Image.fromarray(floor_mask).resize((orig_w, orig_h), Image.NEAREST)
     depth_inputs = depth_processor(images=image_resized, return_tensors="pt")
     depth_inputs = {k: v.to(device, dtype=torch.float16) if v.is_floating_point() else v.to(device) for k, v in depth_inputs.items()}
     depth_outputs = depth_model(**depth_inputs)
     depth_map = depth_outputs.predicted_depth.squeeze().cpu().numpy()
     return mask_img, depth_img
+def create_tiled_control_image(tile_texture, width, height):
+    """Return a new ``width`` x ``height`` RGB image covered with ``tile_texture``.
+
+    Copies of the texture are pasted on a regular grid that starts at
+    ``(0, 0)`` and advances by the texture's own width and height until the
+    requested ``width`` and ``height`` are reached.
+    """
+    step_x, step_y = tile_texture.size
+    canvas = Image.new("RGB", (width, height))
+    # Top-left corner of every grid cell, enumerated row by row.
+    origins = [
+        (left, top)
+        for top in range(0, height, step_y)
+        for left in range(0, width, step_x)
+    ]
+    for origin in origins:
+        canvas.paste(tile_texture, origin)
+    return canvas
+@spaces.GPU(duration=120)
+@torch.inference_mode()
+def render_ai(room_image, tile_texture):
+    """Repaint the floor of a room photo with a tile texture via SDXL inpainting.
+
+    The floor mask is obtained from ``predict``. The room photo, the mask and
+    a control image made of repeated tiles are all brought to 1024x1024 and
+    passed to ``inpaint_pipe``; the first generated image is resized back to
+    the room photo's original dimensions.
+
+    Args:
+        room_image: PIL image of the room. Converted to RGB before rendering.
+        tile_texture: PIL image of one tile. Converted to RGB before tiling.
+
+    Returns:
+        A PIL image with the same width and height as ``room_image``.
+
+    Raises:
+        gr.Error: If ``room_image`` or ``tile_texture`` is ``None``.
+    """
+    if room_image is None or tile_texture is None:
+        raise gr.Error("Room image and tile texture are required")
+    # Step 1: Get floor mask. ``__wrapped__`` reaches the function underneath
+    # the outermost decorator applied to ``predict``.
+    # NOTE(review): presumably this avoids a nested @spaces.GPU request - confirm.
+    mask_img, _ = predict.__wrapped__(room_image)
+    # Uploads are not guaranteed to be RGB (PNG with alpha, palette images,
+    # grayscale). Normalise both inputs so LANCZOS resampling is applied to
+    # real pixel values and the pipeline always receives a 3-channel image.
+    room_rgb = room_image.convert("RGB")
+    tile_rgb = tile_texture.convert("RGB")
+    # Resize everything to 1024x1024 for SDXL
+    size = 1024
+    room_resized = room_rgb.resize((size, size), Image.LANCZOS)
+    # NEAREST avoids interpolated in-between values along the mask edges.
+    mask_resized = mask_img.resize((size, size), Image.NEAREST)
+    # Step 2: Create tiled control image from tile texture
+    tile_size = max(64, size // 8)
+    tile_resized = tile_rgb.resize((tile_size, tile_size), Image.LANCZOS)
+    control_image = create_tiled_control_image(tile_resized, size, size)
+    # Step 3: Run SDXL inpainting with ControlNet Tile
+    result = inpaint_pipe(
+        prompt="ceramic tile floor, tiled floor with repeating pattern, interior design photo, photorealistic",
+        negative_prompt="blurry, distorted, low quality, watermark, text",
+        image=room_resized,
+        mask_image=mask_resized,
+        control_image=control_image,
+        num_inference_steps=25,
+        guidance_scale=7.0,
+        controlnet_conditioning_scale=0.9,
+        strength=0.95,
+        # Fixed seed: the same inputs always start from the same noise.
+        generator=torch.Generator(device="cuda").manual_seed(42),
+    ).images[0]
+    # Resize back to original dimensions
+    result = result.resize(room_image.size, Image.LANCZOS)
+    return result
+with gr.Blocks() as demo:  # two-tab UI: one tab per backend function (predict, render_ai)
+    gr.Markdown("# Tile Visualizer API")
+    with gr.Tab("Segmentation"):
+        with gr.Row():
+            seg_input = gr.Image(type="pil", label="Room photo")  # type="pil": the handler receives a PIL image
+        with gr.Row():
+            mask_output = gr.Image(type="pil", label="Floor mask")
+            depth_output = gr.Image(type="pil", label="Depth map")
+        seg_btn = gr.Button("Segment")
+        seg_btn.click(fn=predict, inputs=seg_input, outputs=[mask_output, depth_output])  # predict returns (mask_img, depth_img), in that order
+    with gr.Tab("AI Render"):
+        with gr.Row():
+            render_room = gr.Image(type="pil", label="Room photo")
+            render_tile = gr.Image(type="pil", label="Tile texture")
+        render_output = gr.Image(type="pil", label="Result")
+        render_btn = gr.Button("Render")
+        render_btn.click(fn=render_ai, inputs=[render_room, render_tile], outputs=render_output)  # inputs map positionally to (room_image, tile_texture)
 app = demo.app  # module-level handle on the Blocks' app attribute

requirements.txt CHANGED Viewed

@@ -1,8 +1,10 @@
 torch
 torchvision
 transformers
 Pillow
 numpy
 gradio
 accelerate
 scipy

 torch
 torchvision
 transformers
+diffusers
 Pillow
 numpy
 gradio
 accelerate
 scipy
+safetensors