Video-Text-to-Text
Transformers
Safetensors
English
Chinese
internvideo3
text-generation
video-understanding
multimodal
long-video
agent
custom_code
Instructions to use yanziang/InternVideo3-8B-Instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use yanziang/InternVideo3-8B-Instruct with Transformers:
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("yanziang/InternVideo3-8B-Instruct", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| # coding=utf-8 | |
| # Copyright 2025 The InternVideo Team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """Video processor class for InternVideo3.""" | |
| import math | |
| from typing import Optional, Union | |
| import numpy as np | |
| import torch | |
| from transformers.feature_extraction_utils import BatchFeature | |
| from transformers.image_utils import ChannelDimension, PILImageResampling, SizeDict, get_image_size | |
| from transformers.processing_utils import Unpack, VideosKwargs | |
| from transformers.utils import TensorType, logging | |
| from transformers.video_processing_utils import BaseVideoProcessor | |
| from transformers.video_utils import VideoMetadata, group_videos_by_shape, reorder_videos | |
| logger = logging.get_logger(__name__) | |
def smart_resize(
    num_frames: int,
    height: int,
    width: int,
    temporal_factor: int = 2,
    factor: int = 32,
    min_pixels: int = 128 * 128,
    max_pixels: int = 16 * 16 * 2 * 2 * 2 * 6144,
):
    """Pick a target (height, width) for a video so its volume fits a budget.

    Height and width are first rounded to the nearest multiple of ``factor``
    and the frame count to the nearest multiple of ``temporal_factor``. If the
    resulting ``frames * height * width`` volume is above ``max_pixels`` the
    spatial size is scaled down (flooring to a multiple of ``factor``, never
    below ``factor``); if it is below ``min_pixels`` the spatial size is
    scaled up (ceiling to a multiple of ``factor``). The frame count itself is
    never returned or changed.

    Args:
        num_frames: Number of frames in the video.
        height: Frame height in pixels.
        width: Frame width in pixels.
        temporal_factor: Granularity the frame count is rounded to.
        factor: Granularity the returned height and width are multiples of.
        min_pixels: Lower bound on the rounded ``t * h * w`` volume.
        max_pixels: Upper bound on the rounded ``t * h * w`` volume.

    Returns:
        Tuple ``(resized_height, resized_width)``.

    Raises:
        ValueError: If ``num_frames < temporal_factor``, if either spatial
            side is below ``factor``, or if the aspect ratio exceeds 200.
    """
    # Guard clauses: reject inputs that cannot be snapped onto the grid.
    if num_frames < temporal_factor:
        raise ValueError(f"t:{num_frames} must be larger than temporal_factor:{temporal_factor}")
    if height < factor or width < factor:
        raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
    aspect_ratio = max(height, width) / min(height, width)
    if aspect_ratio > 200:
        raise ValueError(f"absolute aspect ratio must be smaller than 200, got {aspect_ratio}")

    # Snap every axis onto its grid using Python's round().
    new_height = round(height / factor) * factor
    new_width = round(width / factor) * factor
    snapped_frames = round(num_frames / temporal_factor) * temporal_factor

    snapped_volume = snapped_frames * new_height * new_width
    # The scale factor is derived from the *unsnapped* input volume.
    source_volume = num_frames * height * width

    if snapped_volume > max_pixels:
        shrink = math.sqrt(source_volume / max_pixels)
        new_height = max(factor, math.floor(height / shrink / factor) * factor)
        new_width = max(factor, math.floor(width / shrink / factor) * factor)
    elif snapped_volume < min_pixels:
        grow = math.sqrt(min_pixels / source_volume)
        new_height = math.ceil(height * grow / factor) * factor
        new_width = math.ceil(width * grow / factor) * factor

    return new_height, new_width
class InternVideo3VideoProcessorInitKwargs(VideosKwargs):
    """Keyword arguments `InternVideo3VideoProcessor` accepts on top of `VideosKwargs`."""

    # Edge length of one spatial patch; the resized frame is divided by it to get the grid.
    patch_size: Optional[int]
    # Number of consecutive frames folded into one temporal patch.
    temporal_patch_size: Optional[int]
    # Number of neighbouring patches grouped per spatial axis when patches are laid out.
    merge_size: Optional[int]
    # Lower clamp applied to the number of sampled frames.
    min_frames: Optional[int]
    # Upper clamp applied to the number of sampled frames.
    max_frames: Optional[int]
class InternVideo3VideoProcessor(BaseVideoProcessor):
    """Video processor that converts videos into flattened patch sequences.

    For every video the processor (optionally) resizes the frames with
    `smart_resize`, rescales/normalizes them, pads the frame axis to a multiple
    of ``temporal_patch_size`` and cuts the result into patches. It returns a
    `BatchFeature` with ``pixel_values_videos`` (the flattened patches) and
    ``video_grid_thw`` (one ``[grid_t, grid_h, grid_w]`` row per video).
    """

    resample = PILImageResampling.BICUBIC
    # NOTE: despite the key names, these values are forwarded to `smart_resize`
    # as `min_pixels` / `max_pixels` and compared against frames * height * width.
    size = {"shortest_edge": 128 * 32 * 32, "longest_edge": 32 * 32 * 768}
    image_mean = [0.5, 0.5, 0.5]
    image_std = [0.5, 0.5, 0.5]
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True
    patch_size = 16
    temporal_patch_size = 2
    merge_size = 2
    fps = 2
    min_frames = 4
    max_frames = 768
    do_sample_frames = True
    valid_kwargs = InternVideo3VideoProcessorInitKwargs
    model_input_names = ["pixel_values_videos", "video_grid_thw"]

    def __init__(self, **kwargs: Unpack[InternVideo3VideoProcessorInitKwargs]):
        """Initialize the processor and validate the configured ``size``.

        Raises:
            ValueError: If ``size`` is set but lacks ``shortest_edge`` or
                ``longest_edge``.
        """
        super().__init__(**kwargs)
        if self.size is not None and (
            self.size.get("shortest_edge", None) is None or self.size.get("longest_edge", None) is None
        ):
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")

    def _further_process_kwargs(
        self,
        size: Optional[SizeDict] = None,
        **kwargs,
    ) -> dict:
        """Validate a per-call ``size`` override, then defer to the base class.

        Raises:
            ValueError: If ``size`` is given but lacks ``shortest_edge`` or
                ``longest_edge``.
        """
        if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
            raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
        return super()._further_process_kwargs(size=size, **kwargs)

    def sample_frames(
        self,
        metadata: VideoMetadata,
        num_frames: Optional[int] = None,
        fps: Optional[Union[int, float]] = None,
        **kwargs,
    ) -> np.ndarray:
        """Choose evenly spaced frame indices for a video.

        Args:
            metadata: Video metadata; ``total_num_frames`` and ``fps`` are read.
                If ``metadata.fps`` is ``None`` it is set to 24 in place.
            num_frames: Exact number of frames to sample. Mutually exclusive
                with ``fps``.
            fps: Target sampling rate in frames per second. Falls back to
                ``self.fps`` when not given.

        Returns:
            Integer array of ``num_frames`` indices spread evenly over
            ``[0, total_num_frames - 1]``.

        Raises:
            ValueError: If both ``num_frames`` and ``fps`` are passed.
        """
        if fps is not None and num_frames is not None:
            raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

        total_num_frames = metadata.total_num_frames
        fps = fps if fps is not None else self.fps

        if num_frames is None and fps is not None:
            if metadata.fps is None:
                metadata.fps = 24
                logger.warning_once(
                    "Asked to sample `fps` frames per second but no video metadata was provided. "
                    "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
                )
            # duration (s) * target fps, clamped to [min_frames, max_frames]
            # and never more than the frames that actually exist.
            num_frames = int(total_num_frames / metadata.fps * fps)
            num_frames = min(min(max(num_frames, self.min_frames), self.max_frames), total_num_frames)

        # Only reached when neither `fps`, `self.fps` nor `num_frames` is set.
        if num_frames is None:
            num_frames = min(max(total_num_frames, self.min_frames), self.max_frames)

        indices = np.linspace(0, total_num_frames - 1, num_frames).round().astype(int)
        return indices

    def _preprocess(
        self,
        videos: list[torch.Tensor],
        do_convert_rgb: bool = True,
        do_resize: bool = True,
        size: Optional[SizeDict] = None,
        interpolation: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255.0,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, list[float]]] = None,
        image_std: Optional[Union[float, list[float]]] = None,
        patch_size: Optional[int] = None,
        temporal_patch_size: Optional[int] = None,
        merge_size: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """Resize, normalize and patchify a list of videos.

        Args:
            videos: Videos as tensors; once grouped by shape each stack is
                unpacked as ``(batch, frames, channels, height, width)``.
            do_convert_rgb: Accepted for interface compatibility; not used here.
            do_resize: Whether to resize frames to the size chosen by
                `smart_resize`.
            size: Provides ``shortest_edge`` / ``longest_edge``, used as the
                min / max volume budget for `smart_resize`.
            interpolation: Resampling filter passed to ``self.resize``.
            do_rescale, rescale_factor, do_normalize, image_mean, image_std:
                Forwarded to ``self.rescale_and_normalize``.
            patch_size: Spatial patch edge length.
            temporal_patch_size: Number of frames per temporal patch.
            merge_size: Spatial grouping factor used when ordering patches.
            return_tensors: Tensor type for the returned `BatchFeature`.

        Returns:
            `BatchFeature` with ``pixel_values_videos`` (all videos' flattened
            patches concatenated along dim 0) and ``video_grid_thw``.
        """
        # Stage 1: resize, batched over videos that share a shape.
        grouped_videos, grouped_videos_index = group_videos_by_shape(videos)
        resized_videos_grouped = {}
        for shape, stacked_videos in grouped_videos.items():
            B, T, C, H, W = stacked_videos.shape
            if do_resize:
                resized_height, resized_width = smart_resize(
                    num_frames=T,
                    height=H,
                    width=W,
                    temporal_factor=temporal_patch_size,
                    factor=patch_size * merge_size,
                    min_pixels=size.shortest_edge,
                    max_pixels=size.longest_edge,
                )
                # Fold frames into the batch axis so every frame is resized in one call.
                stacked_videos = stacked_videos.view(B * T, C, H, W)
                stacked_videos = self.resize(
                    stacked_videos,
                    size=SizeDict(height=resized_height, width=resized_width),
                    interpolation=interpolation,
                )
                stacked_videos = stacked_videos.view(B, T, C, resized_height, resized_width)
            resized_videos_grouped[shape] = stacked_videos
        resized_videos = reorder_videos(resized_videos_grouped, grouped_videos_index)

        # Stage 2: normalize and patchify; regroup because resizing may have changed shapes.
        grouped_videos, grouped_videos_index = group_videos_by_shape(resized_videos)
        processed_videos_grouped = {}
        processed_grids = {}
        for shape, stacked_videos in grouped_videos.items():
            resized_height, resized_width = get_image_size(stacked_videos[0], channel_dim=ChannelDimension.FIRST)

            stacked_videos = self.rescale_and_normalize(
                stacked_videos, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            patches = stacked_videos

            # Pad the frame axis up to the next multiple of `temporal_patch_size`
            # by repeating the last frame. The pad length must be computed from
            # the remainder: a fixed `temporal_patch_size - 1` is only right when
            # the remainder is 1 and breaks the `view` below otherwise.
            pad_frames = -patches.shape[1] % temporal_patch_size
            if pad_frames:
                repeats = patches[:, -1:].repeat(1, pad_frames, 1, 1, 1)
                patches = torch.cat([patches, repeats], dim=1)

            batch_size, grid_t, channel = patches.shape[:3]
            grid_t = grid_t // temporal_patch_size
            grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

            # Split into (B, grid_t, tps, C, gh/ms, ms, ps, gw/ms, ms, ps).
            patches = patches.view(
                batch_size,
                grid_t,
                temporal_patch_size,
                channel,
                grid_h // merge_size,
                merge_size,
                patch_size,
                grid_w // merge_size,
                merge_size,
                patch_size,
            )
            # Reorder to (B, grid_t, gh/ms, gw/ms, ms, ms, C, tps, ps, ps) so the
            # patches of one merge window are adjacent in the flattened sequence.
            patches = patches.permute(0, 1, 4, 7, 5, 8, 3, 2, 6, 9)
            flatten_patches = patches.reshape(
                batch_size,
                grid_t * grid_h * grid_w,
                channel * temporal_patch_size * patch_size * patch_size,
            )

            processed_videos_grouped[shape] = flatten_patches
            processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size

        processed_videos = reorder_videos(processed_videos_grouped, grouped_videos_index)
        processed_grids = reorder_videos(processed_grids, grouped_videos_index)
        pixel_values_videos = torch.cat(processed_videos, dim=0)
        video_grid_thw = torch.tensor(processed_grids)

        data = {
            "pixel_values_videos": pixel_values_videos,
            "video_grid_thw": video_grid_thw,
        }
        return BatchFeature(data=data, tensor_type=return_tensors)
| __all__ = ["InternVideo3VideoProcessor"] | |