Spaces:

FeilongTang
/

OneVision-Encoder-Codec-View

Running

FeilongTang committed on May 10

Commit

901e5ca

1 Parent(s): b4e26aa

Canvas: one per GOP group with IPPP (I-frame + P-frame patches)

Replaces the single square 'all selected patches' canvas with one
canvas per GOP group, mirroring the codec convention.

Per group (frame range s..e):
- frame s = I-frame: full image kept whole, occupies the top of the
canvas (the keyframe anchor).
- frames s+1..e = P-frames: only their saliency-selected patches go
into the canvas, packed below the I-frame in a wb-wide grid in
time-major raster order.
Canvas width is locked to wb*patch so the I-frame and the P-grid
align block-for-block, matching how LLaVA-OneVision tokenizes input.

UI
- gr.Image (single canvas) → gr.Gallery (N canvases, click any
thumbnail to enlarge), columns=2 rows=2, height=380.
- Caption per canvas: 'Group K/N · I-frame @ sampled #s + p P-frames'.
- Card subtitle explains the IPPP convention.

Run info JSON
- Drop 'canvas_resolution' (single scalar).
- Add 'canvases': [{index, size, group, structure}], 'n_canvases'.
- 'total_selected_patches' renamed to 'total_selected_patches_incl_i_frames'
since each I-frame contributes its full patch grid to the selected-patch count.

Files changed (1) hide show

app.py +105 -36

app.py CHANGED Viewed

@@ -447,29 +447,68 @@ def write_mp4(frames: List[np.ndarray], path: str, fps: float) -> None:
             proc.kill()
def pack_canvas(
    frames: List[np.ndarray], masks: List[np.ndarray], patch: int,
) -> Tuple[np.ndarray, int]:
    """Pack every selected patch into one near-square canvas image.

    Patches are gathered frame by frame (time order) and, within a frame,
    in raster order (row by row, left to right). They are then written
    into the canvas in that same order, filling it row by row. Slots that
    receive no patch stay white.

    Args:
        frames: Images indexed as ``frame[y, x, channel]``; patches are
            written into a 3-channel ``uint8`` canvas.
        masks: One 2-D selection mask per frame; a truthy cell ``(i, j)``
            selects the ``patch`` x ``patch`` block at block-row ``i`` and
            block-column ``j``. Frames and masks are paired with ``zip``,
            so extra items in the longer list are ignored.
        patch: Side length of a square patch, in pixels.

    Returns:
        ``(canvas, n)`` where ``n`` is the number of packed patches. The
        canvas is ``cn * patch`` pixels on each side with
        ``cn = ceil(sqrt(n))``. When nothing is selected the canvas is a
        single white patch and ``n`` is 0.
    """
    selected: List[np.ndarray] = []
    for frame, mask in zip(frames, masks):
        # Only blocks that fit entirely inside the frame can be packed; a
        # mask cell beyond that grid would slice a truncated patch that
        # cannot be assigned into a full canvas slot.
        rows_fit = frame.shape[0] // patch
        cols_fit = frame.shape[1] // patch
        usable = np.asarray(mask)[:rows_fit, :cols_fit]
        # argwhere yields indices in row-major order, i.e. raster scan.
        for i, j in np.argwhere(usable):
            # No copy needed here: assignment into the canvas copies.
            selected.append(
                frame[i * patch:(i + 1) * patch, j * patch:(j + 1) * patch]
            )

    n = len(selected)
    if n == 0:
        return np.full((patch, patch, 3), 255, dtype=np.uint8), 0

    cn = int(math.ceil(math.sqrt(n)))
    canvas = np.full((cn * patch, cn * patch, 3), 255, dtype=np.uint8)
    for k, block in enumerate(selected):
        ci, cj = divmod(k, cn)
        canvas[ci * patch:(ci + 1) * patch, cj * patch:(cj + 1) * patch] = block
    return canvas, n
 def make_charts(
@@ -594,14 +633,14 @@ def process(
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
-        return None, None, "Please upload a video.", None
     t0 = time.time()
     progress(0.05, desc="Reading metadata")
     meta = video_metadata(video_path)
     total = meta.get("total_frames") or 0
     if total <= 0:
-        return None, None, json.dumps(
             {"error": "Could not read frame count.", "metadata": meta},
             indent=2, ensure_ascii=False,
         ), None
@@ -631,7 +670,7 @@ def process(
         fids = sample_frame_ids(total, int(sample_frames))
     raw = decode_frames(video_path, fids)
     if not raw:
-        return None, None, json.dumps(
             {"error": "Failed to decode frames.", "metadata": meta},
             indent=2, ensure_ascii=False,
         ), None
@@ -682,10 +721,21 @@ def process(
     vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
     write_mp4(vis, vis_path, vis_fps)
-    progress(0.85, desc="Packing canvas")
-    canvas, n_selected = pack_canvas(resized, masks, int(patch_size))
-    canvas_path = os.path.join(out_dir, "canvas.png")
-    cv2.imwrite(canvas_path, canvas)
     hb, wb = grids[0].shape
     grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
@@ -732,8 +782,18 @@ def process(
         "resized_frame_size": f"{tw}x{th}",
         "patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
         "actual_selected_total": int(actual_selected),
-        "total_selected_patches": int(n_selected),
-        "canvas_resolution": f"{canvas.shape[1]}x{canvas.shape[0]}",
         "vis_video_fps": round(vis_fps, 2),
         "viz_mode": mode,
         "heatmap_alpha": float(heatmap_alpha) if mode != "selection" else None,
@@ -751,7 +811,7 @@ def process(
     progress(1.0, desc="Done")
     return (
-        vis_path, canvas_path,
         json.dumps(info, indent=2, ensure_ascii=False),
         chart_fig,
     )
@@ -1254,9 +1314,18 @@ with gr.Blocks(**_BLOCK_KW) as demo:
             with gr.Row():
                 with gr.Column(scale=1):
                     with gr.Group(elem_classes="ovc-card"):
-                        gr.Markdown("### Packed canvas")
-                        canvas_out = gr.Image(
-                            label="", show_label=False, height=320,
                         )
                 with gr.Column(scale=1):
                     with gr.Group(elem_classes="ovc-card"):

             proc.kill()
def pack_canvases_per_group(
    frames: List[np.ndarray],
    masks: List[np.ndarray],
    groups: List[Tuple[int, int]],
    patch: int,
) -> Tuple[List[np.ndarray], int]:
    """Build one canvas per GOP group, laid out as I-frame + P-frame patches.

    Within each group ``(s, e)`` (inclusive frame range):
      - frame ``s`` is the I-frame: its whole image, cropped to a whole
        number of patches, forms the top of the canvas.
      - frames ``s+1..e`` are P-frames: only their mask-selected patches
        are kept, packed below the I-frame in time order then raster
        order, in a grid that is ``wb`` patches wide.
    The canvas width is ``wb * patch`` (the I-frame's patch-grid width),
    so the I-frame and the P-grid below it share the same block columns.
    Unused slots in the last P-grid row stay white.

    Args:
        frames: Images indexed as ``frame[y, x, channel]``; the P-grid is
            allocated as 3-channel ``uint8``.
        masks: One 2-D selection mask per frame; a truthy cell ``(i, j)``
            selects the patch at block-row ``i``, block-column ``j``.
        groups: ``(start, end)`` frame-index pairs, end inclusive.
        patch: Side length of a square patch, in pixels.

    Returns:
        ``(canvases, total_selected)``. ``total_selected`` counts every
        I-frame as fully kept (``hb * wb`` patches) plus all packed
        P-frame patches. Groups whose start index is past the end of
        ``frames``, or whose I-frame is smaller than one patch, produce
        no canvas, so ``canvases`` may be shorter than ``groups``. If no
        group produces a canvas, a single white ``patch`` x ``patch``
        placeholder is returned.
    """
    canvases: List[np.ndarray] = []
    total_selected = 0
    # A P-frame needs both its image and its mask, so never index past
    # the shorter of the two lists.
    p_limit = min(len(frames), len(masks))

    for s, e in groups:
        if s >= len(frames):
            continue
        i_frame = frames[s]
        h, w = i_frame.shape[:2]
        hb, wb = h // patch, w // patch
        if hb == 0 or wb == 0:
            # Frame smaller than one patch: there is no grid to pack into
            # (and wb == 0 would divide by zero below).
            continue
        canvas_w = wb * patch
        # Crop to whole patches so the I-frame aligns with the P-grid.
        i_block = i_frame[: hb * patch, :canvas_w].copy()
        total_selected += hb * wb  # I-frame counts as fully kept.

        # Collect selected patches from the P-frames, time-major.
        p_patches: List[np.ndarray] = []
        for k in range(s + 1, min(e + 1, p_limit)):
            frame = frames[k]
            # Ignore mask cells outside this frame's whole-patch grid;
            # they would slice truncated patches that cannot be placed.
            rows_fit = frame.shape[0] // patch
            cols_fit = frame.shape[1] // patch
            usable = np.asarray(masks[k])[:rows_fit, :cols_fit]
            # argwhere yields indices in row-major (raster) order. Views
            # suffice here: assignment into the grid copies the pixels.
            p_patches.extend(
                frame[i * patch:(i + 1) * patch, j * patch:(j + 1) * patch]
                for i, j in np.argwhere(usable)
            )
        total_selected += len(p_patches)

        if not p_patches:
            canvases.append(i_block)
            continue

        # Lay P-patches in a wb-wide grid below the I-frame.
        rows = (len(p_patches) + wb - 1) // wb
        p_grid = np.full((rows * patch, canvas_w, 3), 255, dtype=np.uint8)
        for idx, block in enumerate(p_patches):
            r, c = divmod(idx, wb)
            p_grid[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = block
        canvases.append(np.vstack([i_block, p_grid]))

    if not canvases:
        canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
    return canvases, total_selected
 def make_charts(
     progress=gr.Progress(track_tqdm=False),
 ):
     if not video_path:
+        return None, [], "Please upload a video.", None
     t0 = time.time()
     progress(0.05, desc="Reading metadata")
     meta = video_metadata(video_path)
     total = meta.get("total_frames") or 0
     if total <= 0:
+        return None, [], json.dumps(
             {"error": "Could not read frame count.", "metadata": meta},
             indent=2, ensure_ascii=False,
         ), None
         fids = sample_frame_ids(total, int(sample_frames))
     raw = decode_frames(video_path, fids)
     if not raw:
+        return None, [], json.dumps(
             {"error": "Failed to decode frames.", "metadata": meta},
             indent=2, ensure_ascii=False,
         ), None
     vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
     write_mp4(vis, vis_path, vis_fps)
+    progress(0.85, desc="Packing canvases (one per GOP group)")
+    canvases, n_selected = pack_canvases_per_group(
+        resized, masks, groups, int(patch_size),
+    )
+    canvas_items: List[Tuple[str, str]] = []
+    for idx, canv in enumerate(canvases):
+        cp = os.path.join(out_dir, f"canvas_{idx:03d}.png")
+        cv2.imwrite(cp, canv)
+        s_idx, e_idx = groups[idx] if idx < len(groups) else (idx, idx)
+        n_p = max(0, e_idx - s_idx)  # number of P-frames in this group
+        caption = (
+            f"Group {idx + 1}/{len(canvases)} · I-frame @ sampled #{s_idx} "
+            f"+ {n_p} P-frame{'s' if n_p != 1 else ''}"
+        )
+        canvas_items.append((cp, caption))
     hb, wb = grids[0].shape
     grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
         "resized_frame_size": f"{tw}x{th}",
         "patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
         "actual_selected_total": int(actual_selected),
+        "total_selected_patches_incl_i_frames": int(n_selected),
+        "canvases": [
+            {
+                "index": i,
+                "size": f"{canvases[i].shape[1]}x{canvases[i].shape[0]}",
+                "group": list(groups[i]) if i < len(groups) else None,
+                "structure": "IPPP — first frame full (I), rest contribute "
+                             "only their selected patches (P).",
+            }
+            for i in range(len(canvases))
+        ],
+        "n_canvases": int(len(canvases)),
         "vis_video_fps": round(vis_fps, 2),
         "viz_mode": mode,
         "heatmap_alpha": float(heatmap_alpha) if mode != "selection" else None,
     progress(1.0, desc="Done")
     return (
+        vis_path, canvas_items,
         json.dumps(info, indent=2, ensure_ascii=False),
         chart_fig,
     )
             with gr.Row():
                 with gr.Column(scale=1):
                     with gr.Group(elem_classes="ovc-card"):
+                        gr.Markdown("### Packed canvases (one per GOP group)")
+                        gr.Markdown(
+                            "<small>Each canvas is one GOP group rendered in "
+                            "<b>IPPP order</b>: the group's first frame is the "
+                            "<b>I-frame</b> kept whole (top), followed by the "
+                            "<b>P-frame</b> selected patches packed below.</small>"
+                        )
+                        canvas_out = gr.Gallery(
+                            label="", show_label=False,
+                            columns=2, rows=2, height=380,
+                            object_fit="contain",
+                            preview=True,
                         )
                 with gr.Column(scale=1):
                     with gr.Group(elem_classes="ovc-card"):