Robotics
Transformers
Safetensors
English
prts_qwen3_vl
feature-extraction
vision-language-action
vla
contrastive-reinforcement-learning
goal-conditioned-rl
qwen3-vl
prts
custom_code
Instructions to use TeleEmbodied/PRTS-4B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use TeleEmbodied/PRTS-4B with Transformers:
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("TeleEmbodied/PRTS-4B", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
| # Copyright 2025 TeleAI Rhodes Team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """Processor for PRTS built on Qwen3-VL (hub / trust_remote_code; no prts package required).""" | |
| from __future__ import annotations | |
| import logging | |
| from typing import Optional, Union | |
| import numpy as np | |
| import torch | |
| from transformers.feature_extraction_utils import BatchFeature | |
| from transformers.image_utils import ImageInput | |
| from transformers.processing_utils import ( | |
| ImagesKwargs, | |
| MultiModalData, | |
| ProcessingKwargs, | |
| ProcessorMixin, | |
| Unpack, | |
| VideosKwargs, | |
| ) | |
| from transformers.tokenization_utils_base import PreTokenizedInput, TextInput | |
| from transformers.utils.logging import get_logger | |
| from transformers.video_utils import VideoInput | |
# Special tokens that PRTS_Qwen3VLProcessor.__init__ registers on the tokenizer.
# The three ACTION_* tokens delimit / fill an action span in the prompt.
ACTION_START_TOKEN = "<|action_start|>"
ACTION_PLACEHOLDER_TOKEN = "<|action_pad|>"
ACTION_END_TOKEN = "<|action_end|>"
# NOTE(review): "CRL" presumably stands for contrastive reinforcement learning
# (goal / observation representation slots) — confirm against the model code.
CRL_GOAL_REPR_TOKEN = "<|goal_repr|>"
CRL_OBS_REPR_TOKEN = "<|obs_repr|>"
# Vision tokens; these are only looked up in the tokenizer, never added by this file.
VISION_START_TOKEN = "<|vision_start|>" # beginning of vision input
IMAGE_PLACEHOLDER_TOKEN = "<|image_pad|>" # image placeholder
VIDEO_PLACEHOLDER_TOKEN = "<|video_pad|>" # video placeholder

# Module logger obtained through the transformers logging helper.
logger = get_logger(__name__)
# Attach a stream handler only when the logger has none yet, so repeated
# imports of this module do not stack duplicate handlers.
if not logger.handlers:
    handler = logging.StreamHandler()
    # Only the handler's level is set here; the logger's own level is left untouched.
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
    logger.addHandler(handler)
class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
    """Video keyword arguments for the processor; adds no keys beyond `VideosKwargs`."""

    pass
class Qwen3VLImagesKwargs(ImagesKwargs):
    """Image keyword arguments for the processor, extending `ImagesKwargs` with patching options."""

    # NOTE(review): presumably lower / upper bounds on the resized image area in pixels —
    # confirm against the image processor.
    min_pixels: Optional[int]
    max_pixels: Optional[int]
    # NOTE(review): presumably the spatial / temporal patch sizes of the vision encoder — confirm.
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    # The processor divides patch counts by `merge_size**2` to get placeholder token counts.
    merge_size: Optional[int]
class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False):
    """Per-modality keyword arguments accepted by `PRTS_Qwen3VLProcessor.__call__`, with defaults."""

    images_kwargs: Qwen3VLImagesKwargs
    videos_kwargs: Qwen3VLVideosProcessorKwargs
    # Defaults handed to `_merge_kwargs` by the processor.
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_token_type_ids": False,
            "return_mm_token_type_ids": False,
        },
        # The processor reads `video_metadata` from the video processor output
        # to build timestamped video prompts, so metadata is requested by default.
        "videos_kwargs": {"return_metadata": True},
    }
class PRTS_Qwen3VLProcessor(ProcessorMixin):
    r"""
    Constructs a PRTS processor which wraps an image processor, a tokenizer and a video processor
    into a single processor.

    The class derives from `ProcessorMixin` directly (it does not inherit from `Qwen3VLProcessor`)
    while keeping Qwen3-VL's timestamp-based video prompt layout.

    What the processor does:
        - registers the PRTS action / CRL special tokens on the tokenizer at construction time
        - expands every image / video token in the prompt to the number of tokens implied by
          the corresponding `*_grid_thw` entry
        - prefixes each (merged) video frame with a `<T seconds>` timestamp when video metadata
          is available
        - passes `actions` through to the returned `BatchFeature` unchanged

    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        video_processor ([`Qwen3VLVideoProcessor`], *optional*):
            The video processor is a required input.
        chat_template (`str`, *optional*):
            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer", "video_processor"]
    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None,
                 chat_template=None, **kwargs):
        """Set up the sub-processors, resolve vision token ids and register PRTS special tokens.

        Extra `**kwargs` are accepted for call-site compatibility and are not used.
        """
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)

        # Prefer the token strings / ids the tokenizer advertises; fall back to the Qwen defaults.
        self.image_token = getattr(tokenizer, "image_token", IMAGE_PLACEHOLDER_TOKEN)
        self.video_token = getattr(tokenizer, "video_token", VIDEO_PLACEHOLDER_TOKEN)
        self.image_token_id = self._resolve_token_id(tokenizer, "image_token_id", self.image_token)
        self.video_token_id = self._resolve_token_id(tokenizer, "video_token_id", self.video_token)

        # Qwen3-VL vision span delimiters.
        self.vision_start_token = getattr(tokenizer, "vision_start_token", VISION_START_TOKEN)
        self.vision_end_token = getattr(tokenizer, "vision_end_token", "<|vision_end|>")
        self.vision_start_token_id = self._resolve_token_id(
            tokenizer, "vision_start_token_id", self.vision_start_token
        )
        self.vision_end_token_id = self._resolve_token_id(
            tokenizer, "vision_end_token_id", self.vision_end_token
        )

        # Register the PRTS-specific tokens; `add_tokens` reports how many were actually new.
        prts_special_tokens = [
            ACTION_START_TOKEN,
            ACTION_PLACEHOLDER_TOKEN,
            ACTION_END_TOKEN,
            CRL_GOAL_REPR_TOKEN,
            CRL_OBS_REPR_TOKEN,
        ]
        num_new_tokens = tokenizer.add_tokens(prts_special_tokens, special_tokens=True)
        logger.info("Added %s new special tokens to the tokenizer.", num_new_tokens)

        self.action_token = getattr(tokenizer, "action_token", ACTION_PLACEHOLDER_TOKEN)
        self.action_token_id = tokenizer.convert_tokens_to_ids(self.action_token)

        # Name -> id lookup for the tokens the model side needs; resolved after `add_tokens`
        # so the freshly registered tokens have ids.
        token_dict = {
            "action_start_token_id": ACTION_START_TOKEN,
            "action_token_id": ACTION_PLACEHOLDER_TOKEN,
            "vision_start_token_id": VISION_START_TOKEN,
            "image_token_id": IMAGE_PLACEHOLDER_TOKEN,
            "video_token_id": VIDEO_PLACEHOLDER_TOKEN,
            "crl_goal_repr_token_id": CRL_GOAL_REPR_TOKEN,
            "crl_obs_repr_token_id": CRL_OBS_REPR_TOKEN,
        }
        self.token_ids = {key: tokenizer.convert_tokens_to_ids(value) for key, value in token_dict.items()}

    @staticmethod
    def _resolve_token_id(tokenizer, attr_name, token):
        """Return `tokenizer.<attr_name>` when it is set, else look `token` up in the vocabulary.

        The check is `is not None` (not truthiness) so that a legitimate id of `0` is honoured.
        """
        token_id = getattr(tokenizer, attr_name, None)
        if token_id is not None:
            return token_id
        return tokenizer.convert_tokens_to_ids(token)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        videos: Optional[VideoInput] = None,
        actions: Optional[torch.Tensor] = None,
        **kwargs: Unpack[Qwen3VLProcessorKwargs],
    ) -> BatchFeature:
        """Prepare model inputs from text plus optional images, videos and actions.

        Args:
            images: Images forwarded to the image processor, or `None`.
            text: A prompt or list of prompts. Each image / video token in the prompts is
                expanded to match the corresponding entry of the processed grid sizes.
            videos: Videos forwarded to the video processor, or `None`.
            actions: Optional value stored as-is under the `"actions"` key of the output.
            **kwargs: Modality kwargs merged through `_merge_kwargs` with
                `Qwen3VLProcessorKwargs` defaults.

        Returns:
            A `BatchFeature` combining tokenizer, image processor and video processor outputs
            (and `actions` when given), converted according to `return_tensors`.
        """
        output_kwargs = self._merge_kwargs(
            Qwen3VLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        image_inputs = {}
        image_grid_thw = None
        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]

        videos_inputs = {}
        video_grid_thw = None
        video_metadata = None
        if videos is not None:
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            video_grid_thw = videos_inputs["video_grid_thw"]
            # Metadata is requested by default for prompt construction; it is only left in
            # the outputs when the caller passed `return_metadata` explicitly.
            if "return_metadata" not in kwargs:
                video_metadata = videos_inputs.pop("video_metadata", None)
            else:
                video_metadata = videos_inputs.get("video_metadata", None)

        # Work on a private list so the caller's prompts are never modified.
        text = list(text) if isinstance(text, list) else [text]

        if image_grid_thw is not None:
            self._expand_image_placeholders(text, image_grid_thw)
        if video_grid_thw is not None:
            self._expand_video_placeholders(text, video_grid_thw, video_metadata)

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

        if return_mm_token_type_ids:
            # Mark image-token positions with 1, everything else with 0.
            array_ids = np.array(text_inputs["input_ids"])
            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

        output_data = {**text_inputs, **image_inputs, **videos_inputs}
        if actions is not None:
            output_data["actions"] = actions
        return BatchFeature(data=output_data, tensor_type=return_tensors)

    def _expand_image_placeholders(self, text, image_grid_thw):
        """Expand each image token in `text` (in place) to one token per merged image patch."""
        merge_length = self.image_processor.merge_size**2
        index = 0
        for i, prompt in enumerate(text):
            while self.image_token in prompt:
                num_image_tokens = image_grid_thw[index].prod() // merge_length
                # A temporary marker keeps already-expanded tokens out of the `while` test.
                prompt = prompt.replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                index += 1
            text[i] = prompt.replace("<|placeholder|>", self.image_token)

    def _expand_video_placeholders(self, text, video_grid_thw, video_metadata):
        """Expand each video token in `text` (in place), with per-frame timestamps when possible."""
        merge_length = self.video_processor.merge_size**2
        wrapped_video_token = f"{self.vision_start_token}{self.video_token}{self.vision_end_token}"
        index = 0
        for i, prompt in enumerate(text):
            while self.video_token in prompt:
                grid_thw = video_grid_thw[index]
                if video_metadata is not None and index < len(video_metadata):
                    video_placeholder = self._build_timestamped_video_placeholder(
                        grid_thw, video_metadata[index], merge_length
                    )
                    # Replace the whole vision-wrapped token when present; otherwise the bare
                    # video token (the generated placeholder carries its own delimiters).
                    target = wrapped_video_token if wrapped_video_token in prompt else self.video_token
                    prompt = prompt.replace(target, video_placeholder, 1)
                else:
                    # No metadata for this video: plain expansion without timestamps.
                    num_video_tokens = grid_thw.prod() // merge_length
                    prompt = prompt.replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
                index += 1
            text[i] = prompt.replace("<|placeholder|>", self.video_token)

    def _build_timestamped_video_placeholder(self, grid_thw, metadata, merge_length):
        """Build `<T seconds><vision_start>…<vision_end>` repeated for every temporal grid step."""
        if metadata.fps is None:
            logger.warning_once(
                "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
                "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
                "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
            )
            metadata.fps = 24
        timestamps = self._calculate_timestamps(
            metadata.frames_indices,
            metadata.fps,
            self.video_processor.merge_size,
        )
        frame_seqlen = grid_thw[1:].prod() // merge_length
        frame_block = self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
        return "".join(
            f"<{timestamps[frame_idx]:.1f} seconds>{frame_block}" for frame_idx in range(grid_thw[0])
        )

    def _calculate_timestamps(self, indices: Union[list[int], np.ndarray], video_fps: float, merge_size: int = 2):
        """Return one timestamp (seconds) per group of `merge_size` frames.

        The frame indices are padded with their last value up to a multiple of `merge_size`,
        converted to seconds with `video_fps`, and each group is represented by the mean of
        its first and last timestamp.

        Args:
            indices: Sampled frame indices (list, array, or any sequence). Never modified.
            video_fps: Frames per second of the source video.
            merge_size: Number of consecutive frames merged into one timestamp.

        Returns:
            A list of floats, one per merged frame group.
        """
        # Always work on a copy: padding must not leak into the caller's metadata.
        indices = indices.tolist() if hasattr(indices, "tolist") else list(indices)
        remainder = len(indices) % merge_size
        if remainder:
            indices.extend([indices[-1]] * (merge_size - remainder))
        timestamps = [idx / video_fps for idx in indices]
        return [
            (timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)
        ]

    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
        """Compute placeholder token counts for the given image / video sizes.

        Args:
            image_sizes: Iterable of size tuples unpacked into
                `image_processor.get_number_of_image_patches`, or `None`.
            video_sizes: Iterable of size tuples unpacked into
                `video_processor.get_number_of_video_patches`, or `None`.
            **kwargs: Overrides applied on top of the class defaults for this call only.

        Returns:
            `MultiModalData` with `num_image_tokens` / `num_image_patches` and/or
            `num_video_tokens`, depending on which sizes were given.
        """
        vision_data = {}
        if image_sizes is not None:
            # Copy before updating: `_defaults` is shared class state and must not absorb
            # per-call overrides.
            images_kwargs = dict(Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {}))
            images_kwargs.update(kwargs)
            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        if video_sizes is not None:
            videos_kwargs = dict(Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {}))
            videos_kwargs.update(kwargs)
            merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size
            num_video_patches = [
                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
                for video_size in video_sizes
            ]
            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
            vision_data["num_video_tokens"] = num_video_tokens

        return MultiModalData(**vision_data)

    def set_action_tokenizer(self, action_tokenizer):
        """Attach an action tokenizer and register one `<|action_token_i|>` per action vocab entry.

        Sets `action_tokenizer`, `action_token_start_index`, `action_vocab_size` and
        `action_mapper` (token string -> tokenizer id) on the processor.
        """
        self.action_tokenizer = action_tokenizer
        prts_fast_action_tokens = [f"<|action_token_{i}|>" for i in range(action_tokenizer.vocab_size)]
        num_new_tokens = self.tokenizer.add_tokens(prts_fast_action_tokens, special_tokens=True)
        logger.info("Added %s FAST action tokens to the tokenizer.", num_new_tokens)
        self.action_token_start_index = self.tokenizer.convert_tokens_to_ids("<|action_token_0|>")
        self.action_vocab_size = action_tokenizer.vocab_size
        token_ids = self.tokenizer.convert_tokens_to_ids(prts_fast_action_tokens)
        # `strict=True` guards against a tokenizer returning a different number of ids.
        self.action_mapper = dict(zip(prts_fast_action_tokens, token_ids, strict=True))

    def preprocess_action(self, actions, **kwargs):
        """Placeholder hook for action preprocessing; always raises `NotImplementedError`."""
        raise NotImplementedError

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """Decode generated token ids to text via `tokenizer.batch_decode`.

        Args:
            generated_outputs: Token id sequences to decode.
            skip_special_tokens: Forwarded to `batch_decode`.
            clean_up_tokenization_spaces: Forwarded to `batch_decode`.
            **kwargs: Additional arguments forwarded to `batch_decode`.

        Returns:
            Whatever `tokenizer.batch_decode` returns for the given outputs.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    # NOTE(review): this is a plain method here; the `ProcessorMixin` base presumably exposes
    # `model_input_names` as a property — confirm whether `@property` was intended before
    # changing it, since that would alter how callers access it.
    def model_input_names(self):
        """Return tokenizer and image-processor input names, de-duplicated in first-seen order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
# Register the processor for auto-class loading (the module docstring describes this file
# as hub / trust_remote_code code).
PRTS_Qwen3VLProcessor.register_for_auto_class()
# Public API of this module.
__all__ = ["PRTS_Qwen3VLProcessor"]