Canvas: one per GOP group with IPPP (I-frame + P-frame patches)
Replaces the single square 'all selected patches' canvas with one
canvas per GOP group, mirroring the codec convention.
Per group (frame range s..e):
- frame s = I-frame: full image kept whole, occupies the top of the
canvas (the keyframe anchor).
- frames s+1..e = P-frames: only their saliency-selected patches go
into the canvas, packed below the I-frame in a wb-wide grid in
time-major raster order.
Canvas width is locked to wb*patch so the I-frame and the P-grid
align block-for-block, matching how LLaVA-OneVision tokenizes input.
UI
- gr.Image (single canvas) → gr.Gallery (N canvases, click any
thumbnail to enlarge), columns=2 rows=2, height=380.
- Caption per canvas: 'Group K/N · I-frame @ sampled #s + p P-frames'.
- Card subtitle explains the IPPP convention.
Run info JSON
- Drop 'canvas_resolution' (single scalar).
- Add 'canvases': [{index, size, group, structure}], 'n_canvases'.
- Rename 'total_selected_patches' to 'total_selected_patches_incl_i_frames',
since each I-frame contributes its full patch grid to that count.
|
@@ -447,29 +447,68 @@ def write_mp4(frames: List[np.ndarray], path: str, fps: float) -> None:
|
|
| 447 |
proc.kill()
|
| 448 |
|
| 449 |
|
| 450 |
-
def
|
| 451 |
-
frames: List[np.ndarray],
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 473 |
|
| 474 |
|
| 475 |
def make_charts(
|
|
@@ -594,14 +633,14 @@ def process(
|
|
| 594 |
progress=gr.Progress(track_tqdm=False),
|
| 595 |
):
|
| 596 |
if not video_path:
|
| 597 |
-
return None,
|
| 598 |
|
| 599 |
t0 = time.time()
|
| 600 |
progress(0.05, desc="Reading metadata")
|
| 601 |
meta = video_metadata(video_path)
|
| 602 |
total = meta.get("total_frames") or 0
|
| 603 |
if total <= 0:
|
| 604 |
-
return None,
|
| 605 |
{"error": "Could not read frame count.", "metadata": meta},
|
| 606 |
indent=2, ensure_ascii=False,
|
| 607 |
), None
|
|
@@ -631,7 +670,7 @@ def process(
|
|
| 631 |
fids = sample_frame_ids(total, int(sample_frames))
|
| 632 |
raw = decode_frames(video_path, fids)
|
| 633 |
if not raw:
|
| 634 |
-
return None,
|
| 635 |
{"error": "Failed to decode frames.", "metadata": meta},
|
| 636 |
indent=2, ensure_ascii=False,
|
| 637 |
), None
|
|
@@ -682,10 +721,21 @@ def process(
|
|
| 682 |
vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
|
| 683 |
write_mp4(vis, vis_path, vis_fps)
|
| 684 |
|
| 685 |
-
progress(0.85, desc="Packing
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 689 |
|
| 690 |
hb, wb = grids[0].shape
|
| 691 |
grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
|
|
@@ -732,8 +782,18 @@ def process(
|
|
| 732 |
"resized_frame_size": f"{tw}x{th}",
|
| 733 |
"patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
|
| 734 |
"actual_selected_total": int(actual_selected),
|
| 735 |
-
"
|
| 736 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
"vis_video_fps": round(vis_fps, 2),
|
| 738 |
"viz_mode": mode,
|
| 739 |
"heatmap_alpha": float(heatmap_alpha) if mode != "selection" else None,
|
|
@@ -751,7 +811,7 @@ def process(
|
|
| 751 |
|
| 752 |
progress(1.0, desc="Done")
|
| 753 |
return (
|
| 754 |
-
vis_path,
|
| 755 |
json.dumps(info, indent=2, ensure_ascii=False),
|
| 756 |
chart_fig,
|
| 757 |
)
|
|
@@ -1254,9 +1314,18 @@ with gr.Blocks(**_BLOCK_KW) as demo:
|
|
| 1254 |
with gr.Row():
|
| 1255 |
with gr.Column(scale=1):
|
| 1256 |
with gr.Group(elem_classes="ovc-card"):
|
| 1257 |
-
gr.Markdown("### Packed
|
| 1258 |
-
|
| 1259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1260 |
)
|
| 1261 |
with gr.Column(scale=1):
|
| 1262 |
with gr.Group(elem_classes="ovc-card"):
|
|
|
|
| 447 |
proc.kill()
|
| 448 |
|
| 449 |
|
| 450 |
+
def pack_canvases_per_group(
    frames: List[np.ndarray],
    masks: List[np.ndarray],
    groups: List[Tuple[int, int]],
    patch: int,
) -> Tuple[List[np.ndarray], int]:
    """Build one canvas per GOP group, laid out as I-frame + P-frame patches.

    Within each group ``(s, e)`` (inclusive frame indices):

    - frame ``s`` is the I-frame: its whole image, cropped to a multiple of
      ``patch``, forms the top of the canvas.
    - frames ``s+1..e`` are P-frames: only the patches whose mask cell is
      truthy are kept, packed below the I-frame in time-major, raster-scan
      order in a grid that is ``wb`` patches wide.

    The canvas width is locked to the I-frame's patch-grid width so the
    P-grid below aligns block-for-block with the I-frame.

    Args:
        frames: Frame images as ``(H, W, 3)`` arrays (the P-grid is allocated
            as 3-channel ``uint8``).
        masks: Per-frame 2-D selection masks, indexed ``[row, col]`` in patch
            units. Frames without a corresponding mask contribute no patches.
        groups: Inclusive ``(start, end)`` frame-index ranges.
        patch: Patch side length in pixels.

    Returns:
        ``(canvases, total_selected)`` where ``total_selected`` counts every
        I-frame's full patch grid plus all selected P-frame patches. Groups
        whose start index is past the end of ``frames`` are skipped, so
        ``canvases`` may be shorter than ``groups``. If no canvas could be
        built, a single white ``patch x patch`` placeholder is returned.
    """
    canvases: List[np.ndarray] = []
    total_selected = 0
    # A P-frame is usable only if both its image and its mask exist; bounding
    # by the shorter list avoids an IndexError on masks[k].
    n_usable = min(len(frames), len(masks))

    for s, e in groups:
        if s >= len(frames):
            continue
        i_frame = frames[s]
        h, w = i_frame.shape[:2]
        hb, wb = h // patch, w // patch
        canvas_w = wb * patch
        # Crop to whole patches (presumably a no-op when frames were already
        # resized to a multiple of patch upstream). Copy so the canvas never
        # aliases the caller's frame.
        i_block = i_frame[: hb * patch, :canvas_w].copy()
        total_selected += hb * wb  # I-frame counts as fully kept.

        # Selected patches from P-frames, time-major; np.argwhere yields the
        # truthy cells in row-major (raster) order. These are views: they are
        # copied when written into the grid below.
        p_patches = [
            frames[k][i * patch:(i + 1) * patch, j * patch:(j + 1) * patch]
            for k in range(s + 1, min(e + 1, n_usable))
            for i, j in np.argwhere(masks[k])
        ]
        total_selected += len(p_patches)

        if not p_patches:
            canvases.append(i_block)
            continue

        # Lay P-patches in a wb-wide grid below the I-frame; unused cells in
        # the last row stay white.
        rows = (len(p_patches) + wb - 1) // wb
        p_grid = np.full((rows * patch, canvas_w, 3), 255, dtype=np.uint8)
        for idx, p in enumerate(p_patches):
            r, c = divmod(idx, wb)
            p_grid[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = p

        canvases.append(np.vstack([i_block, p_grid]))

    if not canvases:
        canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
    return canvases, total_selected
|
| 512 |
|
| 513 |
|
| 514 |
def make_charts(
|
|
|
|
| 633 |
progress=gr.Progress(track_tqdm=False),
|
| 634 |
):
|
| 635 |
if not video_path:
|
| 636 |
+
return None, [], "Please upload a video.", None
|
| 637 |
|
| 638 |
t0 = time.time()
|
| 639 |
progress(0.05, desc="Reading metadata")
|
| 640 |
meta = video_metadata(video_path)
|
| 641 |
total = meta.get("total_frames") or 0
|
| 642 |
if total <= 0:
|
| 643 |
+
return None, [], json.dumps(
|
| 644 |
{"error": "Could not read frame count.", "metadata": meta},
|
| 645 |
indent=2, ensure_ascii=False,
|
| 646 |
), None
|
|
|
|
| 670 |
fids = sample_frame_ids(total, int(sample_frames))
|
| 671 |
raw = decode_frames(video_path, fids)
|
| 672 |
if not raw:
|
| 673 |
+
return None, [], json.dumps(
|
| 674 |
{"error": "Failed to decode frames.", "metadata": meta},
|
| 675 |
indent=2, ensure_ascii=False,
|
| 676 |
), None
|
|
|
|
| 721 |
vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
|
| 722 |
write_mp4(vis, vis_path, vis_fps)
|
| 723 |
|
| 724 |
+
progress(0.85, desc="Packing canvases (one per GOP group)")
|
| 725 |
+
canvases, n_selected = pack_canvases_per_group(
|
| 726 |
+
resized, masks, groups, int(patch_size),
|
| 727 |
+
)
|
| 728 |
+
canvas_items: List[Tuple[str, str]] = []
|
| 729 |
+
for idx, canv in enumerate(canvases):
|
| 730 |
+
cp = os.path.join(out_dir, f"canvas_{idx:03d}.png")
|
| 731 |
+
cv2.imwrite(cp, canv)
|
| 732 |
+
s_idx, e_idx = groups[idx] if idx < len(groups) else (idx, idx)
|
| 733 |
+
n_p = max(0, e_idx - s_idx) # number of P-frames in this group
|
| 734 |
+
caption = (
|
| 735 |
+
f"Group {idx + 1}/{len(canvases)} · I-frame @ sampled #{s_idx} "
|
| 736 |
+
f"+ {n_p} P-frame{'s' if n_p != 1 else ''}"
|
| 737 |
+
)
|
| 738 |
+
canvas_items.append((cp, caption))
|
| 739 |
|
| 740 |
hb, wb = grids[0].shape
|
| 741 |
grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
|
|
|
|
| 782 |
"resized_frame_size": f"{tw}x{th}",
|
| 783 |
"patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
|
| 784 |
"actual_selected_total": int(actual_selected),
|
| 785 |
+
"total_selected_patches_incl_i_frames": int(n_selected),
|
| 786 |
+
"canvases": [
|
| 787 |
+
{
|
| 788 |
+
"index": i,
|
| 789 |
+
"size": f"{canvases[i].shape[1]}x{canvases[i].shape[0]}",
|
| 790 |
+
"group": list(groups[i]) if i < len(groups) else None,
|
| 791 |
+
"structure": "IPPP — first frame full (I), rest contribute "
|
| 792 |
+
"only their selected patches (P).",
|
| 793 |
+
}
|
| 794 |
+
for i in range(len(canvases))
|
| 795 |
+
],
|
| 796 |
+
"n_canvases": int(len(canvases)),
|
| 797 |
"vis_video_fps": round(vis_fps, 2),
|
| 798 |
"viz_mode": mode,
|
| 799 |
"heatmap_alpha": float(heatmap_alpha) if mode != "selection" else None,
|
|
|
|
| 811 |
|
| 812 |
progress(1.0, desc="Done")
|
| 813 |
return (
|
| 814 |
+
vis_path, canvas_items,
|
| 815 |
json.dumps(info, indent=2, ensure_ascii=False),
|
| 816 |
chart_fig,
|
| 817 |
)
|
|
|
|
| 1314 |
with gr.Row():
|
| 1315 |
with gr.Column(scale=1):
|
| 1316 |
with gr.Group(elem_classes="ovc-card"):
|
| 1317 |
+
gr.Markdown("### Packed canvases (one per GOP group)")
|
| 1318 |
+
gr.Markdown(
|
| 1319 |
+
"<small>Each canvas is one GOP group rendered in "
|
| 1320 |
+
"<b>IPPP order</b>: the group's first frame is the "
|
| 1321 |
+
"<b>I-frame</b> kept whole (top), followed by the "
|
| 1322 |
+
"<b>P-frame</b> selected patches packed below.</small>"
|
| 1323 |
+
)
|
| 1324 |
+
canvas_out = gr.Gallery(
|
| 1325 |
+
label="", show_label=False,
|
| 1326 |
+
columns=2, rows=2, height=380,
|
| 1327 |
+
object_fit="contain",
|
| 1328 |
+
preview=True,
|
| 1329 |
)
|
| 1330 |
with gr.Column(scale=1):
|
| 1331 |
with gr.Group(elem_classes="ovc-card"):
|