| import os |
| import cv2 |
| import numpy as np |
| import torch |
| import matplotlib as mpl |
|
|
| from .video_utils import ( |
| read_video_frames, |
| resize_frames_to_long_side, |
| save_to_video, |
| add_overlay_text |
| ) |
| from typing import Optional, List, Tuple |
| from libs.models.mano_wrapper import MANO |
| from .render_utils import Renderer |
|
|
class Config:
    """
    Configuration class for file paths, parameters, and visual settings.

    Paths are initialized with default values but can be overridden by
    attributes of ``args`` (for example an ``argparse.Namespace``). An
    attribute that is missing *or* set to ``None`` falls back to the default,
    so an option that was declared without a default never produces a
    ``None`` path.
    """

    def __init__(self, args=None):
        def _arg(name, default):
            # getattr alone would return None for an attribute that exists
            # but was never set; treat that the same as "not provided".
            value = getattr(args, name, None)
            return default if value is None else value

        # --- Paths ---
        self.VIDEO_ROOT = _arg('video_root', 'data/examples/videos')
        self.LABEL_ROOT = _arg('label_root', 'data/examples/annotations')
        self.SAVE_PATH = _arg('save_path', 'data/examples/visualize')
        self.MANO_MODEL_PATH = _arg('mano_model_path', './weights/mano')

        # --- Rendering parameters ---
        self.RENDER_SIZE_LONG_SIDE = 480
        self.FPS = 15

        # --- Colormap names used for the time-coded ('full') trajectory ---
        self.LEFT_CMAP = "inferno"
        self.RIGHT_CMAP = "inferno"

        # --- Fixed per-hand mesh colors, three channels in [0, 1] ---
        self.LEFT_COLOR = np.array([0.6594, 0.6259, 0.7451])
        self.RIGHT_COLOR = np.array([0.4078, 0.4980, 0.7451])
|
|
|
|
class HandVisualizer:
    """
    Main class for loading data, configuring the renderer, and visualizing
    the hand episode, including mesh and trajectory.

    Rendering modes (see ``_render_hand_trajectory``):
        'cam'   - each output frame shows the hand mesh of that same frame.
        'first' - every trajectory step is drawn on top of the first video
                  frame, using the first frame's camera pose.
        'full'  - each output frame shows the remaining trajectory (current
                  step to the end), color-coded over time and alpha-blended.
                  Only enabled when ``render_gradual_traj`` is True.
    """

    def __init__(self, config: Config, render_gradual_traj: bool = False):
        self.config = config
        self.render_gradual_traj = render_gradual_traj
        self.all_modes = ['cam', 'first']
        if self.render_gradual_traj:
            self.all_modes = ['cam', 'full', 'first']

        # Load the MANO model on the GPU and cache its face indices.
        self.mano = MANO(model_path=self.config.MANO_MODEL_PATH).cuda()
        faces_right = torch.from_numpy(self.mano.faces).float().cuda()

        # The left hand reuses the same faces with two vertex indices swapped
        # (reversed winding), matching the x-mirroring applied to its vertices.
        self.faces_left = faces_right[:, [0, 2, 1]]
        self.faces_right = faces_right

    @staticmethod
    def _mesh_tensors(verts_camspace, color):
        """Return GPU tensors (vertices, per-vertex colors) for one hand mesh."""
        verts = torch.from_numpy(verts_camspace).float().cuda()
        # One color row per vertex; the vertex count is taken from the data
        # instead of being hard-coded.
        colors = torch.from_numpy(color).float().unsqueeze(0).repeat(verts.shape[0], 1).cuda()
        return verts, colors

    @staticmethod
    def _finalize_frame(img_overlay):
        """Convert a float overlay in [0, 1] to uint8 and swap its channel order."""
        final_frame = (img_overlay * 255).astype(np.uint8)
        return cv2.cvtColor(final_frame, cv2.COLOR_BGR2RGB)

    def _render_hand_trajectory(self, video_frames, hand_traj_wordspace, hand_mask, extrinsics, renderer: Renderer, mode: str):
        """
        Renders hand mesh for one frame or hand trajectory across multiple frames,
        depending on the mode ('cam', 'first', 'full').

        Args:
            video_frames: Sequence of frames, indexed per frame.
                NOTE(review): assumed to be uint8 HxWx3 arrays -- confirm.
            hand_traj_wordspace: Tuple ``(verts_left, verts_right)`` of
                world-space vertex arrays, indexed by frame.
            hand_mask: Tuple ``(left_mask, right_mask)``; a hand is drawn for
                frame ``k`` only when its mask entry is non-zero.
            extrinsics: Tuple ``(R_w2c, t_w2c)`` of per-frame world-to-camera
                rotations and translations.
            renderer: Object whose ``render(verts, faces, colors)`` returns
                ``(image, mask)``.
            mode: One of 'cam', 'first', 'full'.

        Returns:
            A list of uint8 frames. 'cam' and 'full' yield one frame per input
            frame; 'first' yields one frame per trajectory step, each drawn
            over the first input frame.

        Raises:
            ValueError: If ``mode`` is not a known rendering mode.
        """
        if mode not in ('cam', 'first', 'full'):
            raise ValueError(f'Unknown rendering mode: {mode}')

        verts_left_worldspace, verts_right_worldspace = hand_traj_wordspace
        left_hand_mask, right_hand_mask = hand_mask
        R_w2c, t_w2c = extrinsics

        num_total_frames = len(video_frames)
        all_save_frames = []

        if mode == 'full':
            # Time-coded colors sampled from the configured colormaps.
            left_colors, right_colors = generate_hand_colors(
                num_total_frames, self.config.LEFT_CMAP, self.config.RIGHT_CMAP
            )
        else:
            # Constant color per hand, tiled so it can be indexed by frame.
            left_colors = self.config.LEFT_COLOR[np.newaxis, :].repeat(num_total_frames, axis=0)
            right_colors = self.config.RIGHT_COLOR[np.newaxis, :].repeat(num_total_frames, axis=0)

        # 'first' only ever uses the first frame as background/camera; guard
        # against an empty clip so it does not index into nothing.
        num_loop_frames = min(1, num_total_frames) if mode == 'first' else num_total_frames

        for current_frame_idx in range(num_loop_frames):
            if mode != 'first':
                print(f'Processing frame {current_frame_idx + 1}/{num_loop_frames}', end='\r')

            # Window of trajectory steps drawn for this camera frame.
            if mode == 'cam':
                start_traj_idx, end_traj_idx = current_frame_idx, current_frame_idx + 1
            elif mode == 'first':
                start_traj_idx, end_traj_idx = 0, num_total_frames
            else:  # 'full'
                start_traj_idx, end_traj_idx = current_frame_idx, num_total_frames

            window_len = end_traj_idx - start_traj_idx
            if mode == 'full':
                # Blend weight ramps from 0.4 to 0.7 along the remaining trajectory.
                transparency = np.linspace(0.4, 0.7, window_len)
            else:
                transparency = [1.0] * window_len

            # Camera pose of the frame used as background.
            R_w2c_cur = R_w2c[current_frame_idx]
            t_w2c_cur = t_w2c[current_frame_idx]

            # Transform only the steps that are actually drawn for this frame
            # (previously the whole trajectory was transformed every iteration).
            verts_left_camspace = (
                R_w2c_cur @ verts_left_worldspace[start_traj_idx:end_traj_idx].transpose(0, 2, 1) + t_w2c_cur
            ).transpose(0, 2, 1)
            verts_right_camspace = (
                R_w2c_cur @ verts_right_worldspace[start_traj_idx:end_traj_idx].transpose(0, 2, 1) + t_w2c_cur
            ).transpose(0, 2, 1)

            curr_img_overlay = video_frames[current_frame_idx].astype(np.float32) / 255.0

            for traj_idx, kk in enumerate(range(start_traj_idx, end_traj_idx)):
                if mode == 'first':
                    print(f'Processing frame {traj_idx + 1}/{num_total_frames}', end='\r')
                    # Each step is drawn on a fresh copy of the first frame.
                    curr_img_overlay = video_frames[current_frame_idx].astype(np.float32) / 255.0

                transp_k = transparency[traj_idx]

                verts_list, faces_list, colors_list = [], [], []
                if left_hand_mask[kk] != 0:
                    verts, colors = self._mesh_tensors(verts_left_camspace[traj_idx], left_colors[kk])
                    verts_list.append(verts)
                    faces_list.append(self.faces_left)
                    colors_list.append(colors)
                if right_hand_mask[kk] != 0:
                    verts, colors = self._mesh_tensors(verts_right_camspace[traj_idx], right_colors[kk])
                    verts_list.append(verts)
                    faces_list.append(self.faces_right)
                    colors_list.append(colors)

                if verts_list:
                    rend, mask = renderer.render(verts_list, faces_list, colors_list)
                    # Reverse the channel order of the rendered image.
                    # NOTE(review): presumably RGB -> BGR to match the video
                    # frames -- confirm against Renderer.
                    rend = rend[..., ::-1]

                    color_mesh = rend.astype(np.float32) / 255.0
                    valid_mask = mask[..., None].astype(np.float32)

                    # Alpha-blend the mesh over the image inside the mask;
                    # pixels outside the mask are left untouched.
                    curr_img_overlay = (
                        curr_img_overlay[:, :, :3] * (1 - valid_mask) +
                        color_mesh[:, :, :3] * valid_mask * transp_k +
                        curr_img_overlay[:, :, :3] * valid_mask * (1 - transp_k)
                    )

                if mode == 'first':
                    # One output frame per trajectory step, even when no hand
                    # was drawn, so the frame count matches the other modes.
                    all_save_frames.append(self._finalize_frame(curr_img_overlay))

            if mode == 'cam' or mode == 'full':
                # One output frame per camera frame, after all steps of the
                # window have been composited.
                all_save_frames.append(self._finalize_frame(curr_img_overlay))

        print(f'Finished rendering with mode: {mode}')
        return all_save_frames

    def process_episode(self, episode_name: str):
        """
        Loads data and orchestrates the visualization process for a single episode.

        The episode is skipped (with a message) when its label file does not
        exist or when no video frames can be read. Otherwise every mode in
        ``self.all_modes`` is rendered, the results are concatenated side by
        side, captions are overlaid, and the video is written to
        ``<SAVE_PATH>/<episode_name>.mp4``.

        Args:
            episode_name: Name of the form ``<dataset>_<video>_<ep>_<id>``;
                the first token is taken as the dataset name and the last two
                tokens as the episode suffix.
        """
        print(f'\nProcessing episode: {episode_name}')

        # Derive the video file name by stripping the dataset prefix and the
        # episode suffix. Only the leading/trailing occurrence is removed, so a
        # video name that happens to contain the same text stays intact.
        name_parts = episode_name.split('_')
        dataset_name = name_parts[0]
        ep_name = name_parts[-2] + '_' + name_parts[-1]
        video_name = episode_name.removeprefix(f'{dataset_name}_').removesuffix(f'_{ep_name}')
        video_path = os.path.join(self.config.VIDEO_ROOT, f'{video_name}.mp4')
        label_path = os.path.join(self.config.LABEL_ROOT, episode_name + '.npy')

        if not os.path.exists(label_path):
            print(f'Episode file {label_path} does not exist, skipping...')
            return

        # NOTE(security): allow_pickle=True executes arbitrary pickled code;
        # only load annotation files from trusted sources.
        episode_info = np.load(label_path, allow_pickle=True).item()

        start_frame, end_frame = get_frame_interval(episode_info)
        R_w2c, t_w2c, normalized_intrinsics = get_camera_info(episode_info)
        caption_left, caption_right, hand_type = get_caption_info(episode_info)
        (verts_left_worldspace, left_hand_mask), (verts_right_worldspace, right_hand_mask) = \
            get_hand_labels(episode_info, self.mano)

        # Read the frames and always release the capture handle afterwards
        # (releasing an already released capture is harmless).
        cap = cv2.VideoCapture(video_path)
        try:
            video_frames = read_video_frames(cap, start_frame=start_frame, end_frame=end_frame, interval=1)
        finally:
            cap.release()

        if len(video_frames) == 0:
            print(f'No frames could be read from {video_path}, skipping...')
            return

        resize_video_frames = resize_frames_to_long_side(video_frames, self.config.RENDER_SIZE_LONG_SIDE)
        H, W, _ = resize_video_frames[0].shape

        # Scale the normalized intrinsics to the resized resolution.
        intrinsics_denorm = normalized_intrinsics.copy()
        intrinsics_denorm[0] *= W
        intrinsics_denorm[1] *= H
        fx_exo = intrinsics_denorm[0, 0]
        fy_exo = intrinsics_denorm[1, 1]

        renderer = Renderer(W, H, (fx_exo, fy_exo), 'cuda')

        hand_traj_wordspace = (verts_left_worldspace, verts_right_worldspace)
        hand_mask = (left_hand_mask, right_hand_mask)
        extrinsics = (R_w2c, t_w2c)

        # Render the clip once per mode.
        all_rendered_frames = [
            self._render_hand_trajectory(
                resize_video_frames,
                hand_traj_wordspace,
                hand_mask,
                extrinsics,
                renderer,
                mode=mode,
            )
            for mode in self.all_modes
        ]

        num_frames = len(all_rendered_frames[0])

        # The annotated ("primary") hand uses its first caption for the whole
        # clip; the other hand's caption is looked up per frame by interval.
        caption_primary = caption_right if hand_type == 'right' else caption_left
        caption_opposite = caption_left if hand_type == 'right' else caption_right
        opposite_intervals = [interval for _, interval in caption_opposite]
        overlay_text_primary = caption_primary[0][0]

        final_save_frames = []
        for frame_idx in range(num_frames):
            # Place the per-mode renderings side by side.
            curr_img_overlay = np.concatenate(
                [mode_frames[frame_idx] for mode_frames in all_rendered_frames],
                axis=1
            )

            opposite_idx = find_caption_index(frame_idx, opposite_intervals)
            overlay_text_opposite = caption_opposite[opposite_idx][0] if opposite_idx is not None else 'None.'

            overlay_text_full = generate_overlay_text(
                overlay_text_primary,
                overlay_text_opposite,
                hand_type
            )
            # The return value is ignored, as in the original call.
            # NOTE(review): presumably add_overlay_text draws in place -- confirm.
            add_overlay_text(curr_img_overlay, overlay_text_full)

            final_save_frames.append(curr_img_overlay)

        os.makedirs(self.config.SAVE_PATH, exist_ok=True)
        save_to_video(final_save_frames, f'{self.config.SAVE_PATH}/{episode_name}.mp4', fps=self.config.FPS)
        print(f'\nSuccessfully saved episode to {self.config.SAVE_PATH}/{episode_name}.mp4')
|
|
def find_caption_index(frame_index: int, intervals: list[tuple[int, int]]) -> Optional[int]:
    """
    Return the position of the first ``(start, end)`` interval that contains
    ``frame_index`` (both bounds inclusive), or ``None`` when no interval does.
    """
    containing = (
        position
        for position, (lower, upper) in enumerate(intervals)
        if lower <= frame_index <= upper
    )
    return next(containing, None)
|
|
def generate_hand_colors(T: int, left_cmap: str, right_cmap: str) -> tuple[np.ndarray, np.ndarray]:
    """
    Build per-frame RGB colors for the left and right hands.

    ``T`` evenly spaced positions between 0 and 0.95 are looked up in each
    named matplotlib colormap; the alpha channel is dropped, so each returned
    array has shape (T, 3).
    """
    sample_positions = np.linspace(0, 0.95, T)

    def _sample_rgb(cmap_name: str) -> np.ndarray:
        # Colormap lookup yields RGBA rows; keep only the first three channels.
        rgba = mpl.colormaps.get_cmap(cmap_name)(sample_positions)
        return rgba[:, :3]

    return _sample_rgb(left_cmap), _sample_rgb(right_cmap)
|
|
def get_frame_interval(episode_info: dict) -> tuple[int, int]:
    """
    Return ``(start, end)`` for the episode, where ``start`` is the first entry
    of ``episode_info['video_decode_frame']`` (inclusive) and ``end`` is one
    past its last entry (exclusive).
    """
    decoded = episode_info['video_decode_frame']
    first, last = decoded[0], decoded[-1]
    return first, last + 1
|
|
def normalize_camera_intrinsics(intrinsics: np.ndarray) -> np.ndarray:
    """
    Normalizes intrinsics based on the assumption that the principal point
    is at the image center (image size is 2*cx, 2*cy).

    Row 0 is divided by ``2 * cx`` and row 1 by ``2 * cy``, where ``cx`` and
    ``cy`` are read from ``intrinsics[0, 2]`` and ``intrinsics[1, 2]``.

    Args:
        intrinsics: Camera matrix. It is never modified; a copy is returned.
            Integer-typed input is promoted to float64 so the in-place
            division does not fail; floating dtypes are preserved.

    Returns:
        The normalized intrinsics as a new array.
    """
    normalized_intrinsics = np.array(intrinsics, copy=True)
    if not np.issubdtype(normalized_intrinsics.dtype, np.floating):
        # In-place true division cannot write back into an integer array.
        normalized_intrinsics = normalized_intrinsics.astype(np.float64)

    normalized_intrinsics[0] /= normalized_intrinsics[0, 2] * 2
    normalized_intrinsics[1] /= normalized_intrinsics[1, 2] * 2
    return normalized_intrinsics
|
|
def get_camera_info(episode_info: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the episode's world-to-camera extrinsics into rotation and
    translation parts and normalize its intrinsics.

    Returns ``(R_w2c, t_w2c, normalized_intrinsics)``. The rotation is the
    upper-left 3x3 block and the translation the first three rows of the last
    column (kept as a column) of each matrix in ``episode_info['extrinsics']``.
    All returned arrays are copies.
    """
    world_to_cam = episode_info['extrinsics']
    rotation = world_to_cam[:, :3, :3].copy()
    translation = world_to_cam[:, :3, 3:].copy()

    raw_intrinsics = episode_info['intrinsics'].copy()
    return rotation, translation, normalize_camera_intrinsics(raw_intrinsics)
|
|
def get_caption_info(episode_info: dict) -> tuple[list, list, str]:
    """
    Collect the caption lists for both hands plus the annotated hand type.

    Returns ``(caption_left, caption_right, hand_type)``. A hand whose caption
    list is missing or empty gets a single placeholder caption ``'None.'``
    spanning frames 0 to 10000 so that every frame is covered.
    """
    captions = episode_info['text']

    def _captions_for(side: str) -> list:
        found = captions.get(side, [])
        if found:
            return found
        return [['None.', (0, 10000)]]

    hand_type = episode_info['anno_type']
    caption_right = _captions_for('right')
    caption_left = _captions_for('left')
    return caption_left, caption_right, hand_type
|
|
def get_hand_labels(episode_info: dict, mano: MANO):
    """
    Run both hands' labels through the MANO model to get world-space vertices.

    Returns ``((verts_left, left_mask), (verts_right, right_mask))`` where each
    mask is the hand's ``'kept_frames'`` entry. The left hand is processed
    first, with ``is_left=True``.
    """
    # Look up both label dicts up front so a missing key fails before any
    # processing is done.
    labels_by_side = {side: episode_info[side] for side in ('left', 'right')}

    per_hand = {}
    for side, labels in labels_by_side.items():
        keep_mask = labels['kept_frames']
        verts, _ = process_single_hand_labels(labels, keep_mask, mano, is_left=(side == 'left'))
        per_hand[side] = (verts, keep_mask)

    return per_hand['left'], per_hand['right']
|
|
def process_single_hand_labels(hand_labels: dict, hand_mask: np.ndarray, mano: MANO, is_left: bool = False):
    """
    Helper function to compute MANO vertices for a single hand (left or right).

    Args:
        hand_labels: Dict providing 'transl_worldspace',
            'global_orient_worldspace', 'beta' and 'hand_pose' arrays.
            It is not modified.
            NOTE(review): 'hand_pose' is assumed to hold per-joint 3x3
            rotation matrices per frame -- confirm against the annotations.
        hand_mask: Per-frame validity mask; frames equal to 0 get an identity
            hand pose before the MANO forward pass.
        mano: MANO model, called on CUDA tensors.
        is_left: When True, the MANO output is mirrored along x.

    Returns:
        ``(verts_worldspace, wrist_joint)``: vertices rotated by the world
        wrist orientation about the wrist joint (joint 0) and translated to
        the world wrist position, plus the (possibly mirrored) joint 0 of the
        MANO output.
    """
    T = len(hand_mask)

    wrist_worldspace = hand_labels['transl_worldspace'].reshape(-1, 1, 3)
    wrist_orientation = hand_labels['global_orient_worldspace']
    beta = hand_labels['beta']
    pose = hand_labels['hand_pose']

    # Replace the pose of masked-out frames with identity rotations. Work on
    # a copy so the caller's annotation array is left untouched.
    invalid_frames = (np.asarray(hand_mask) == 0)
    if np.any(invalid_frames):
        identity = np.eye(3, dtype=pose.dtype)
        identity_block = np.broadcast_to(identity, (pose.shape[1], 3, 3))
        pose = pose.copy()
        pose[invalid_frames] = identity_block

    beta_torch = torch.from_numpy(beta).float().cuda().unsqueeze(0).repeat(T, 1)
    pose_torch = torch.from_numpy(pose).float().cuda()

    # The global orientation is applied manually below, so MANO is evaluated
    # with an identity global rotation.
    global_rot_placeholder = torch.eye(3).float().unsqueeze(0).unsqueeze(0).cuda().repeat(T, 1, 1, 1)

    # Inference only: no autograd graph is needed for the vertices.
    with torch.no_grad():
        mano_out = mano(betas=beta_torch, hand_pose=pose_torch, global_orient=global_rot_placeholder)

    verts = mano_out.vertices.cpu().numpy()
    joints = mano_out.joints.cpu().numpy()

    # Mirror along x for the left hand.
    if is_left:
        verts[:, :, 0] *= -1
        joints[:, :, 0] *= -1

    # Center on the wrist joint, rotate into world orientation, then move to
    # the world-space wrist position.
    verts_worldspace = (
        wrist_orientation @
        (verts - joints[:, 0][:, None]).transpose(0, 2, 1)
    ).transpose(0, 2, 1) + wrist_worldspace

    return verts_worldspace, joints[:, 0]
|
|
def generate_overlay_text(overlay_text: str, overlay_text_opposite: str, hand_type: str) -> str:
    """
    Build the ``'Left: ... | Right: ...'`` caption line.

    ``overlay_text`` belongs to the primary hand: it goes on the right side
    when ``hand_type`` is ``'right'`` and on the left side otherwise, with
    ``overlay_text_opposite`` filling the other slot.
    """
    if hand_type == 'right':
        left_text, right_text = overlay_text_opposite, overlay_text
    else:
        left_text, right_text = overlay_text, overlay_text_opposite
    return f'Left: {left_text} | Right: {right_text}'