FeilongTang committed on
Commit
901e5ca
·
1 Parent(s): b4e26aa

Canvas: one per GOP group with IPPP (I-frame + P-frame patches)

Browse files

Replaces the single square 'all selected patches' canvas with one
canvas per GOP group, mirroring the codec convention.

Per group (frame range s..e):
- frame s = I-frame: full image kept whole, occupies the top of the
canvas (the keyframe anchor).
- frames s+1..e = P-frames: only their saliency-selected patches go
into the canvas, packed below the I-frame in a wb-wide grid in
time-major raster order.
Canvas width is locked to wb*patch so the I-frame and the P-grid
align block-for-block, matching how LLaVA-OneVision tokenizes input.

UI
- gr.Image (single canvas) → gr.Gallery (N canvases, click any
thumbnail to enlarge), columns=2 rows=2, height=380.
- Caption per canvas: 'Group K/N · I-frame @ sampled #s + p P-frames'.
- Card subtitle explains the IPPP convention.

Run info JSON
- Drop 'canvas_resolution' (single scalar).
- Add 'canvases': [{index, size, group, structure}], 'n_canvases'.
- 'total_selected_patches' renamed to 'total_selected_patches_incl_i_frames'
since each I-frame contributes its full patch grid to that patch count.

Files changed (1) hide show
  1. app.py +105 -36
app.py CHANGED
@@ -447,29 +447,68 @@ def write_mp4(frames: List[np.ndarray], path: str, fps: float) -> None:
447
  proc.kill()
448
 
449
 
450
def pack_canvas(
    frames: List[np.ndarray], masks: List[np.ndarray], patch: int,
) -> Tuple[np.ndarray, int]:
    """Pack every selected patch of every frame into one near-square canvas.

    Frames are visited in order; inside a frame the selected cells are taken
    in raster (row-major) order. The tiles are then written, again in raster
    order, into a ``cn x cn`` grid of ``patch``-sized slots where
    ``cn = ceil(sqrt(n))``. Slots that receive no tile stay white (255).

    Args:
        frames: Images indexed as ``[row, col, channel]``; paired with
            ``masks`` via ``zip`` (extra entries on either side are ignored).
        masks: One 2-D selection grid per frame; a truthy cell ``(i, j)``
            selects the ``patch x patch`` tile whose top-left pixel is
            ``(i * patch, j * patch)``.
        patch: Side length of one tile, in pixels.

    Returns:
        ``(canvas, n)`` where ``n`` is the number of tiles packed. When
        nothing is selected the canvas is a single white ``patch x patch``
        tile and ``n`` is 0.
    """
    selected: List[np.ndarray] = []
    for frame, mask in zip(frames, masks):
        # Only cells lying completely inside the frame are usable. A cell
        # hanging over the edge would give a truncated tile that NumPy would
        # silently broadcast (or reject) when written into the canvas.
        full_rows = frame.shape[0] // patch
        full_cols = frame.shape[1] // patch
        # argwhere lists truthy cells in row-major order, i.e. raster scan.
        for i, j in np.argwhere(mask[:full_rows, :full_cols]):
            selected.append(
                frame[i * patch:(i + 1) * patch, j * patch:(j + 1) * patch]
            )

    n = len(selected)
    if n == 0:
        return np.full((patch, patch, 3), 255, dtype=np.uint8), 0

    cn = int(math.ceil(math.sqrt(n)))
    canvas = np.full((cn * patch, cn * patch, 3), 255, dtype=np.uint8)
    for k, tile in enumerate(selected):
        ci, cj = divmod(k, cn)
        # Assignment copies the pixels, so the views collected above never
        # alias the returned canvas.
        canvas[ci * patch:(ci + 1) * patch, cj * patch:(cj + 1) * patch] = tile
    return canvas, n
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
 
474
 
475
  def make_charts(
@@ -594,14 +633,14 @@ def process(
594
  progress=gr.Progress(track_tqdm=False),
595
  ):
596
  if not video_path:
597
- return None, None, "Please upload a video.", None
598
 
599
  t0 = time.time()
600
  progress(0.05, desc="Reading metadata")
601
  meta = video_metadata(video_path)
602
  total = meta.get("total_frames") or 0
603
  if total <= 0:
604
- return None, None, json.dumps(
605
  {"error": "Could not read frame count.", "metadata": meta},
606
  indent=2, ensure_ascii=False,
607
  ), None
@@ -631,7 +670,7 @@ def process(
631
  fids = sample_frame_ids(total, int(sample_frames))
632
  raw = decode_frames(video_path, fids)
633
  if not raw:
634
- return None, None, json.dumps(
635
  {"error": "Failed to decode frames.", "metadata": meta},
636
  indent=2, ensure_ascii=False,
637
  ), None
@@ -682,10 +721,21 @@ def process(
682
  vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
683
  write_mp4(vis, vis_path, vis_fps)
684
 
685
- progress(0.85, desc="Packing canvas")
686
- canvas, n_selected = pack_canvas(resized, masks, int(patch_size))
687
- canvas_path = os.path.join(out_dir, "canvas.png")
688
- cv2.imwrite(canvas_path, canvas)
 
 
 
 
 
 
 
 
 
 
 
689
 
690
  hb, wb = grids[0].shape
691
  grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
@@ -732,8 +782,18 @@ def process(
732
  "resized_frame_size": f"{tw}x{th}",
733
  "patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
734
  "actual_selected_total": int(actual_selected),
735
- "total_selected_patches": int(n_selected),
736
- "canvas_resolution": f"{canvas.shape[1]}x{canvas.shape[0]}",
 
 
 
 
 
 
 
 
 
 
737
  "vis_video_fps": round(vis_fps, 2),
738
  "viz_mode": mode,
739
  "heatmap_alpha": float(heatmap_alpha) if mode != "selection" else None,
@@ -751,7 +811,7 @@ def process(
751
 
752
  progress(1.0, desc="Done")
753
  return (
754
- vis_path, canvas_path,
755
  json.dumps(info, indent=2, ensure_ascii=False),
756
  chart_fig,
757
  )
@@ -1254,9 +1314,18 @@ with gr.Blocks(**_BLOCK_KW) as demo:
1254
  with gr.Row():
1255
  with gr.Column(scale=1):
1256
  with gr.Group(elem_classes="ovc-card"):
1257
- gr.Markdown("### Packed canvas")
1258
- canvas_out = gr.Image(
1259
- label="", show_label=False, height=320,
 
 
 
 
 
 
 
 
 
1260
  )
1261
  with gr.Column(scale=1):
1262
  with gr.Group(elem_classes="ovc-card"):
 
447
  proc.kill()
448
 
449
 
450
def pack_canvases_per_group(
    frames: List[np.ndarray],
    masks: List[np.ndarray],
    groups: List[Tuple[int, int]],
    patch: int,
) -> Tuple[List[np.ndarray], int]:
    """Build one canvas per GOP group, laid out as I-frame + P-frame patches.

    For a group covering frames ``s..e`` (inclusive):
      - frame ``s`` is the I-frame: the whole image, cropped to a whole
        number of patches, forms the top of the canvas;
      - frames ``s+1..e`` are P-frames: only their selected patches are
        kept, gathered in time order then raster order, and laid out below
        the I-frame in a grid that is ``wb`` patches wide.

    The canvas width equals the I-frame's patch-grid width (``wb * patch``)
    so the I-frame and the P-grid line up block for block. Unused slots in
    the last P-grid row stay white (255).

    Args:
        frames: Images indexed as ``[row, col, channel]``.
        masks: One 2-D selection grid per frame; a truthy cell ``(i, j)``
            selects the ``patch x patch`` tile at ``(i * patch, j * patch)``.
        groups: ``(start, end)`` frame-index pairs, end inclusive.
        patch: Side length of one tile, in pixels.

    Returns:
        ``(canvases, total_selected)``. ``total_selected`` counts every
        patch of each I-frame plus every packed P-frame patch. Groups whose
        start index is past the end of ``frames`` produce no canvas; if no
        canvas is produced at all, a single white ``patch x patch`` tile is
        returned so the list is never empty.
    """
    canvases: List[np.ndarray] = []
    total_selected = 0
    # A P-frame needs both its image and its mask; bound the scan by the
    # shorter list so a short ``masks`` cannot raise IndexError.
    n_usable = min(len(frames), len(masks))

    for s, e in groups:
        if s >= len(frames):
            continue

        i_frame = frames[s]
        h, w = i_frame.shape[:2]
        hb, wb = h // patch, w // patch
        canvas_w = wb * patch
        # Crop to whole patches so the I-frame aligns with the P-grid below.
        i_block = i_frame[: hb * patch, :canvas_w].copy()
        total_selected += hb * wb  # the I-frame counts as fully kept

        # Gather the selected tiles of the P-frames, time-major.
        p_patches: List[np.ndarray] = []
        for k in range(s + 1, min(e + 1, n_usable)):
            frame, mask = frames[k], masks[k]
            # Ignore mask cells that are not fully covered by the frame; a
            # truncated tile would otherwise be silently broadcast (or
            # rejected) when written into the grid.
            full_rows = frame.shape[0] // patch
            full_cols = frame.shape[1] // patch
            # argwhere lists truthy cells in row-major order (raster scan).
            for i, j in np.argwhere(mask[:full_rows, :full_cols]):
                p_patches.append(
                    frame[i * patch:(i + 1) * patch, j * patch:(j + 1) * patch]
                )
        total_selected += len(p_patches)

        if not p_patches:
            canvases.append(i_block)
            continue

        # Lay the P-patches out in a wb-wide grid below the I-frame.
        rows = (len(p_patches) + wb - 1) // wb
        p_grid = np.full((rows * patch, canvas_w, 3), 255, dtype=np.uint8)
        for idx, tile in enumerate(p_patches):
            r, c = divmod(idx, wb)
            p_grid[r * patch:(r + 1) * patch, c * patch:(c + 1) * patch] = tile

        canvases.append(np.vstack([i_block, p_grid]))

    if not canvases:
        canvases = [np.full((patch, patch, 3), 255, dtype=np.uint8)]
    return canvases, total_selected
512
 
513
 
514
  def make_charts(
 
633
  progress=gr.Progress(track_tqdm=False),
634
  ):
635
  if not video_path:
636
+ return None, [], "Please upload a video.", None
637
 
638
  t0 = time.time()
639
  progress(0.05, desc="Reading metadata")
640
  meta = video_metadata(video_path)
641
  total = meta.get("total_frames") or 0
642
  if total <= 0:
643
+ return None, [], json.dumps(
644
  {"error": "Could not read frame count.", "metadata": meta},
645
  indent=2, ensure_ascii=False,
646
  ), None
 
670
  fids = sample_frame_ids(total, int(sample_frames))
671
  raw = decode_frames(video_path, fids)
672
  if not raw:
673
+ return None, [], json.dumps(
674
  {"error": "Failed to decode frames.", "metadata": meta},
675
  indent=2, ensure_ascii=False,
676
  ), None
 
721
  vis_fps = max(2.0, min(8.0, (meta.get("fps") or 25.0) / 4.0))
722
  write_mp4(vis, vis_path, vis_fps)
723
 
724
+ progress(0.85, desc="Packing canvases (one per GOP group)")
725
+ canvases, n_selected = pack_canvases_per_group(
726
+ resized, masks, groups, int(patch_size),
727
+ )
728
+ canvas_items: List[Tuple[str, str]] = []
729
+ for idx, canv in enumerate(canvases):
730
+ cp = os.path.join(out_dir, f"canvas_{idx:03d}.png")
731
+ cv2.imwrite(cp, canv)
732
+ s_idx, e_idx = groups[idx] if idx < len(groups) else (idx, idx)
733
+ n_p = max(0, e_idx - s_idx) # number of P-frames in this group
734
+ caption = (
735
+ f"Group {idx + 1}/{len(canvases)} · I-frame @ sampled #{s_idx} "
736
+ f"+ {n_p} P-frame{'s' if n_p != 1 else ''}"
737
+ )
738
+ canvas_items.append((cp, caption))
739
 
740
  hb, wb = grids[0].shape
741
  grid_size = int(grids[0].shape[0] * grids[0].shape[1]) if grids else 0
 
782
  "resized_frame_size": f"{tw}x{th}",
783
  "patch_grid_per_frame": f"{hb}x{wb} = {hb * wb} patches",
784
  "actual_selected_total": int(actual_selected),
785
+ "total_selected_patches_incl_i_frames": int(n_selected),
786
+ "canvases": [
787
+ {
788
+ "index": i,
789
+ "size": f"{canvases[i].shape[1]}x{canvases[i].shape[0]}",
790
+ "group": list(groups[i]) if i < len(groups) else None,
791
+ "structure": "IPPP — first frame full (I), rest contribute "
792
+ "only their selected patches (P).",
793
+ }
794
+ for i in range(len(canvases))
795
+ ],
796
+ "n_canvases": int(len(canvases)),
797
  "vis_video_fps": round(vis_fps, 2),
798
  "viz_mode": mode,
799
  "heatmap_alpha": float(heatmap_alpha) if mode != "selection" else None,
 
811
 
812
  progress(1.0, desc="Done")
813
  return (
814
+ vis_path, canvas_items,
815
  json.dumps(info, indent=2, ensure_ascii=False),
816
  chart_fig,
817
  )
 
1314
  with gr.Row():
1315
  with gr.Column(scale=1):
1316
  with gr.Group(elem_classes="ovc-card"):
1317
+ gr.Markdown("### Packed canvases (one per GOP group)")
1318
+ gr.Markdown(
1319
+ "<small>Each canvas is one GOP group rendered in "
1320
+ "<b>IPPP order</b>: the group's first frame is the "
1321
+ "<b>I-frame</b> kept whole (top), followed by the "
1322
+ "<b>P-frame</b> selected patches packed below.</small>"
1323
+ )
1324
+ canvas_out = gr.Gallery(
1325
+ label="", show_label=False,
1326
+ columns=2, rows=2, height=380,
1327
+ object_fit="contain",
1328
+ preview=True,
1329
  )
1330
  with gr.Column(scale=1):
1331
  with gr.Group(elem_classes="ovc-card"):