Spaces:

XciD
/

tile-visualizer-api

Sleeping

App Files Files Community

XciD HF Staff committed on 22 days ago

Commit

3ccb4f2

unverified ·

1 Parent(s): c457a51

feat: initial tile visualizer API with OneFormer + Depth Anything V2

Browse files

Files changed (3) hide show

README.md +5 -6
app.py +117 -0
requirements.txt +7 -0

README.md CHANGED Viewed

@@ -1,13 +1,12 @@
 ---
-title: Tile Visualizer Api
-emoji: 💻
-colorFrom: red
-colorTo: pink
 sdk: gradio
 sdk_version: 6.14.0
 python_version: '3.12'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Tile Visualizer API
+emoji: 🏠
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 6.14.0
 python_version: '3.12'
 app_file: app.py
 pinned: false
+hardware: zero-a10g
 ---

app.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import io
+import numpy as np
+import torch
+import gradio as gr
+from PIL import Image
+from transformers import (
+    OneFormerProcessor,
+    OneFormerForUniversalSegmentation,
+    AutoImageProcessor,
+    AutoModelForDepthEstimation,
+)
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # inference device: GPU when torch reports CUDA, otherwise CPU
+DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32  # half precision on GPU, full precision on CPU
+ADE20K_FLOOR_IDS = {3, 28}  # segmentation class ids counted as floor: 3=floor, 28=rug/carpet
+seg_processor = None  # segmentation processor; assigned by load_models()
+seg_model = None  # segmentation model; assigned by load_models()
+depth_processor = None  # depth image processor; assigned by load_models()
+depth_model = None  # depth-estimation model; assigned by load_models()
+def load_models():
+    global seg_processor, seg_model, depth_processor, depth_model
+    if seg_model is None:
+        seg_processor = OneFormerProcessor.from_pretrained(
+            "shi-labs/oneformer_ade20k_swin_large"
+        )
+        seg_model = OneFormerForUniversalSegmentation.from_pretrained(
+            "shi-labs/oneformer_ade20k_swin_large",
+            torch_dtype=DTYPE,
+        ).to(DEVICE)
+    if depth_model is None:
+        depth_processor = AutoImageProcessor.from_pretrained(
+            "depth-anything/Depth-Anything-V2-Large-hf"
+        )
+        depth_model = AutoModelForDepthEstimation.from_pretrained(
+            "depth-anything/Depth-Anything-V2-Large-hf",
+            torch_dtype=DTYPE,
+        ).to(DEVICE)
+@torch.inference_mode()
+def process_image(image: Image.Image):
+    """Takes a room photo, returns floor mask + depth map as images."""
+    if image is None:
+        raise gr.Error("No image provided")
+    load_models()
+    orig_w, orig_h = image.size
+    max_size = 1024
+    scale = min(1.0, max_size / max(orig_w, orig_h))
+    proc_w, proc_h = int(orig_w * scale), int(orig_h * scale)
+    image_resized = image.resize((proc_w, proc_h), Image.LANCZOS)
+    # --- Segmentation ---
+    seg_inputs = seg_processor(
+        images=image_resized, task_inputs=["semantic"], return_tensors="pt"
+    )
+    seg_inputs = {k: v.to(DEVICE, dtype=DTYPE) if v.dtype == torch.float32 else v.to(DEVICE) for k, v in seg_inputs.items()}
+    seg_outputs = seg_model(**seg_inputs)
+    seg_result = seg_processor.post_process_semantic_segmentation(
+        seg_outputs, target_sizes=[(proc_h, proc_w)]
+    )[0]
+    seg_map = seg_result.cpu().numpy()
+    floor_mask = np.zeros((proc_h, proc_w), dtype=np.uint8)
+    for class_id in ADE20K_FLOOR_IDS:
+        floor_mask[seg_map == class_id] = 255
+    # Resize mask to original dimensions
+    mask_img = Image.fromarray(floor_mask).resize((orig_w, orig_h), Image.NEAREST)
+    # --- Depth estimation ---
+    depth_inputs = depth_processor(images=image_resized, return_tensors="pt")
+    depth_inputs = {k: v.to(DEVICE, dtype=DTYPE) if v.dtype == torch.float32 else v.to(DEVICE) for k, v in depth_inputs.items()}
+    depth_outputs = depth_model(**depth_inputs)
+    depth_map = depth_outputs.predicted_depth.squeeze().cpu().numpy()
+    # Normalize to 0-255
+    depth_min, depth_max = depth_map.min(), depth_map.max()
+    if depth_max - depth_min > 0:
+        depth_norm = ((depth_map - depth_min) / (depth_max - depth_min) * 255).astype(np.uint8)
+    else:
+        depth_norm = np.zeros_like(depth_map, dtype=np.uint8)
+    depth_img = Image.fromarray(depth_norm).resize((orig_w, orig_h), Image.BILINEAR)
+    return mask_img, depth_img
+def predict(image):
+    mask, depth = process_image(image)
+    return mask, depth
+with gr.Blocks() as demo:
+    gr.Markdown("# Tile Visualizer - Segmentation API")
+    gr.Markdown("Upload a room photo to get floor mask + depth map.")
+    with gr.Row():
+        input_image = gr.Image(type="pil", label="Room photo")
+    with gr.Row():
+        mask_output = gr.Image(type="pil", label="Floor mask")
+        depth_output = gr.Image(type="pil", label="Depth map")
+    btn = gr.Button("Process")
+    btn.click(fn=predict, inputs=input_image, outputs=[mask_output, depth_output])
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+torch
+torchvision
+transformers
+Pillow
+numpy
+gradio
+accelerate