XciD HF Staff committed on
Commit
3ccb4f2
·
unverified ·
1 Parent(s): c457a51

feat: initial tile visualizer API with OneFormer + Depth Anything V2

Browse files
Files changed (3) hide show
  1. README.md +5 -6
  2. app.py +117 -0
  3. requirements.txt +7 -0
README.md CHANGED
@@ -1,13 +1,12 @@
1
  ---
2
- title: Tile Visualizer Api
3
- emoji: 💻
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 6.14.0
8
  python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
 
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Tile Visualizer API
3
+ emoji: 🏠
4
+ colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 6.14.0
8
  python_version: '3.12'
9
  app_file: app.py
10
  pinned: false
11
+ hardware: zero-a10g
12
  ---
 
 
app.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import numpy as np
3
+ import torch
4
+ import gradio as gr
5
+ from PIL import Image
6
+ from transformers import (
7
+ OneFormerProcessor,
8
+ OneFormerForUniversalSegmentation,
9
+ AutoImageProcessor,
10
+ AutoModelForDepthEstimation,
11
+ )
12
+
13
# Run on the GPU when PyTorch can see one at import time; otherwise use the CPU.
# NOTE(review): the Space README requests `hardware: zero-a10g` — confirm that
# CUDA is actually visible at this point on that hardware.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on CUDA, full precision on CPU.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

# Segmentation class ids that process_image() paints white in the floor mask.
ADE20K_FLOOR_IDS = {3, 28}  # 3=floor, 28=rug/carpet

# Filled in by load_models(); each stays None until that function has run.
seg_processor = None
seg_model = None
depth_processor = None
depth_model = None
22
+
23
+
24
def load_models():
    """Populate the module-level processors and models on first use.

    A processor/model pair is fetched with ``from_pretrained`` only while its
    model global is still ``None``. Each model is loaded with ``DTYPE`` weights
    and moved to ``DEVICE``. Once both models are set, calling this again does
    nothing.
    """
    global seg_processor, seg_model, depth_processor, depth_model

    seg_checkpoint = "shi-labs/oneformer_ade20k_swin_large"
    depth_checkpoint = "depth-anything/Depth-Anything-V2-Large-hf"

    if seg_model is None:
        seg_processor = OneFormerProcessor.from_pretrained(seg_checkpoint)
        segmenter = OneFormerForUniversalSegmentation.from_pretrained(
            seg_checkpoint, torch_dtype=DTYPE
        )
        seg_model = segmenter.to(DEVICE)

    if depth_model is None:
        depth_processor = AutoImageProcessor.from_pretrained(depth_checkpoint)
        estimator = AutoModelForDepthEstimation.from_pretrained(
            depth_checkpoint, torch_dtype=DTYPE
        )
        depth_model = estimator.to(DEVICE)
44
+
45
+
46
def _to_model_inputs(batch):
    """Move processor outputs to DEVICE, casting float32 tensors to DTYPE.

    Tensors of any other dtype are moved to DEVICE without a cast.
    """
    return {
        key: (
            value.to(DEVICE, dtype=DTYPE)
            if value.dtype == torch.float32
            else value.to(DEVICE)
        )
        for key, value in batch.items()
    }


def _scale_to_uint8(values):
    """Linearly map an array onto 0-255 and return it as uint8.

    The arithmetic runs in float32 regardless of the input dtype. An array
    without a finite, positive value range (constant, or containing NaN or
    infinite values) yields all zeros.
    """
    values = values.astype(np.float32)
    low, high = values.min(), values.max()
    span = high - low
    if not (np.isfinite(span) and span > 0):
        return np.zeros(values.shape, dtype=np.uint8)
    return ((values - low) / span * 255).astype(np.uint8)


@torch.inference_mode()
def process_image(image: Image.Image):
    """Segment the floor and estimate depth for a room photo.

    The photo is converted to RGB, downscaled so that its longer side is at
    most 1024 px, and passed through the segmentation and depth models. Both
    results are resized back to the input's original dimensions.

    Args:
        image: Input photo as a PIL image.

    Returns:
        A ``(mask_img, depth_img)`` tuple of single-channel 8-bit PIL images
        at the input's size. The mask is 255 where the predicted class id is
        in ``ADE20K_FLOOR_IDS`` and 0 elsewhere; the depth image is the
        predicted depth min-max normalised to 0-255.

    Raises:
        gr.Error: If ``image`` is ``None`` or has a zero-length side.
    """
    if image is None:
        raise gr.Error("No image provided")

    orig_w, orig_h = image.size
    if orig_w == 0 or orig_h == 0:
        # Guard the division below and the resize calls against empty images.
        raise gr.Error("Image is empty")

    load_models()

    # Normalise palette, grayscale and alpha images to plain RGB before they
    # reach the processors.
    if image.mode != "RGB":
        image = image.convert("RGB")

    max_size = 1024
    scale = min(1.0, max_size / max(orig_w, orig_h))
    # Clamp to 1 px: int() truncation can reach 0 for very elongated images.
    proc_w = max(1, int(orig_w * scale))
    proc_h = max(1, int(orig_h * scale))
    image_resized = image.resize((proc_w, proc_h), Image.LANCZOS)

    # --- Segmentation ---
    seg_inputs = _to_model_inputs(
        seg_processor(
            images=image_resized, task_inputs=["semantic"], return_tensors="pt"
        )
    )
    seg_outputs = seg_model(**seg_inputs)
    seg_result = seg_processor.post_process_semantic_segmentation(
        seg_outputs, target_sizes=[(proc_h, proc_w)]
    )[0]

    seg_map = seg_result.cpu().numpy()
    # np.isin needs a sequence; a set would be treated as a single object.
    is_floor = np.isin(seg_map, sorted(ADE20K_FLOOR_IDS))
    floor_mask = np.zeros(seg_map.shape, dtype=np.uint8)
    floor_mask[is_floor] = 255

    # Nearest-neighbour keeps the mask strictly binary when upscaling.
    mask_img = Image.fromarray(floor_mask).resize((orig_w, orig_h), Image.NEAREST)

    # --- Depth estimation ---
    depth_inputs = _to_model_inputs(
        depth_processor(images=image_resized, return_tensors="pt")
    )
    depth_outputs = depth_model(**depth_inputs)
    depth_map = depth_outputs.predicted_depth.squeeze().cpu().numpy()

    depth_norm = _scale_to_uint8(depth_map)
    depth_img = Image.fromarray(depth_norm).resize((orig_w, orig_h), Image.BILINEAR)

    return mask_img, depth_img
96
+
97
+
98
def predict(image):
    """Gradio click handler: return the floor mask and depth map for ``image``."""
    floor_mask, depth_map = process_image(image)
    return floor_mask, depth_map
101
+
102
+
103
# Build the Gradio UI: one input image, two output images and a button.
with gr.Blocks() as demo:
    gr.Markdown("# Tile Visualizer - Segmentation API")
    gr.Markdown("Upload a room photo to get floor mask + depth map.")

    # Components are created in layout order: the input row first, then the
    # row holding both outputs.
    with gr.Row():
        input_image = gr.Image(type="pil", label="Room photo")
    with gr.Row():
        mask_output = gr.Image(type="pil", label="Floor mask")
        depth_output = gr.Image(type="pil", label="Depth map")

    btn = gr.Button("Process")
    # A click passes the input image to predict() and fills both outputs.
    btn.click(fn=predict, inputs=input_image, outputs=[mask_output, depth_output])

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ transformers
4
+ Pillow
5
+ numpy
6
+ gradio
7
+ accelerate