Update modeling_moss_vl.py

#2
by CCCCyx - opened
Files changed (1) hide show
  1. modeling_moss_vl.py +1006 -1
modeling_moss_vl.py CHANGED
@@ -14,8 +14,11 @@
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
 
 
 
17
  from dataclasses import dataclass
18
- from typing import Any, Callable, Optional, Union, Tuple, List
19
 
20
  import torch
21
  import torch.nn as nn
@@ -26,6 +29,8 @@ from transformers import initialization as init
26
  from transformers.activations import ACT2FN
27
  from transformers.cache_utils import Cache, DynamicCache
28
  from transformers.generation import GenerationMixin
 
 
29
  from transformers.integrations import use_kernel_forward_from_hub
30
  from transformers.masking_utils import create_causal_mask
31
  from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
@@ -46,6 +51,59 @@ from .configuration_moss_vl import MossVLConfig, MossVLTextConfig, MossVLVisionC
46
  logger = logging.get_logger(__name__)
47
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  @dataclass
50
  class MossVLModelOutputWithPast(ModelOutput):
51
  """
@@ -2098,10 +2156,18 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2098
  config: MossVLConfig
2099
  _checkpoint_conversion_mapping = {}
2100
  accepts_loss_kwargs = False
 
 
 
 
 
 
 
2101
  def __init__(self, config):
2102
  super().__init__(config)
2103
  self.model = MossVLModel(config)
2104
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
 
2105
 
2106
  self.post_init()
2107
 
@@ -2333,6 +2399,945 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2333
 
2334
  return model_kwargs
2335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2336
 
2337
  __all__ = [
2338
  "MossVLVisionModel",
 
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
17
+ import copy
18
+ import queue
19
+ import threading
20
  from dataclasses import dataclass
21
+ from typing import Any, Callable, Dict, Optional, Union, Tuple, List
22
 
23
  import torch
24
  import torch.nn as nn
 
29
  from transformers.activations import ACT2FN
30
  from transformers.cache_utils import Cache, DynamicCache
31
  from transformers.generation import GenerationMixin
32
+ from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList
33
+ from transformers.generation.streamers import TextIteratorStreamer
34
  from transformers.integrations import use_kernel_forward_from_hub
35
  from transformers.masking_utils import create_causal_mask
36
  from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 
51
  logger = logging.get_logger(__name__)
52
 
53
 
54
+ _OFFLINE_SYSTEM_PROMPTS = {
55
+ "no_thinking": {
56
+ "text_image": "You are a helpful AI assistant. Respond to the user's request based on the provided text and/or images.",
57
+ "video": "You are a helpful AI assistant specializing in video analysis. Respond to the user's request based on the provided video content.",
58
+ },
59
+ "deep_thinking": {
60
+ "text_image": "A conversation between User and Assistant. The user makes a request, and the assistant responds to it based on the provided text and/or images. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <thinking></thinking> and <answer></answer> tags, respectively, i.e., <thinking>reasoning process here</thinking><answer>answer here</answer>.",
61
+ "video": "A conversation between User and Assistant specializing in video analysis. The user makes a request, and the assistant responds to it based on the provided video content. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <thinking></thinking> and <answer></answer> tags, respectively, i.e., <thinking>reasoning process here</thinking><answer>answer here</answer>.",
62
+ },
63
+ }
64
+
65
+
66
class _OfflineCancelStoppingCriteria(StoppingCriteria):
    """Stopping criterion that aborts generation once a cancel event fires."""

    def __init__(self, cancel_event: threading.Event):
        self.cancel_event = cancel_event

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Returning True tells ``generate`` to stop before the next step.
        return self.cancel_event.is_set()
72
+
73
+
74
class _OfflineQueueStreamer(TextIteratorStreamer):
    """Streamer that mirrors each finalized text chunk into an external queue.

    Chunks are also accumulated in ``collected_chunks`` so the full completion
    can be reconstructed after streaming finishes.
    """

    def __init__(self, tokenizer, output_text_queue: "queue.Queue[str]"):
        super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
        self.output_text_queue = output_text_queue
        self.collected_chunks: List[str] = []

    def on_finalized_text(self, text: str, stream_end: bool = False):
        # Empty chunks (e.g. the terminal flush) are not forwarded.
        if text:
            self.collected_chunks.append(text)
            self.output_text_queue.put(text)
        super().on_finalized_text(text, stream_end=stream_end)
85
+
86
+
87
+ _OFFLINE_THINKING_MODE_ALIASES = {
88
+ "no_thinking": "no_thinking",
89
+ "default": "no_thinking",
90
+ "standard": "no_thinking",
91
+ "deep_thinking": "deep_thinking",
92
+ "thinking": "deep_thinking",
93
+ "reasoning": "deep_thinking",
94
+ }
95
+
96
+ _OFFLINE_SYSTEM_PROMPT_TYPE_ALIASES = {
97
+ "text_image": "text_image",
98
+ "text-image": "text_image",
99
+ "image_text": "text_image",
100
+ "image-text": "text_image",
101
+ "text": "text_image",
102
+ "image": "text_image",
103
+ "video": "video",
104
+ }
105
+
106
+
107
  @dataclass
108
  class MossVLModelOutputWithPast(ModelOutput):
109
  """
 
2156
  config: MossVLConfig
2157
  _checkpoint_conversion_mapping = {}
2158
  accepts_loss_kwargs = False
2159
+
2160
+ @classmethod
2161
+ def build_offline_prepare_helper(cls):
2162
+ helper = cls.__new__(cls)
2163
+ helper._offline_processor_lock = threading.RLock()
2164
+ return helper
2165
+
2166
  def __init__(self, config):
2167
  super().__init__(config)
2168
  self.model = MossVLModel(config)
2169
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
2170
+ self._offline_processor_lock = threading.RLock()
2171
 
2172
  self.post_init()
2173
 
 
2399
 
2400
  return model_kwargs
2401
 
2402
+ # ==================== Offline generate orchestration ====================
2403
+ # The following helpers replicate the ``offline_*`` API exposed by the
2404
+ # previous release of ``modeling_moss_vl.py`` (transformers==4.57.1) so
2405
+ # external runners (e.g. ``inference/run_inference.py``) keep working
2406
+ # against this newer implementation without code changes. They ultimately
2407
+ # dispatch through ``self.generate(...)`` and the checkpoint's processor,
2408
+ # which keeps the decoding logic identical across versions.
2409
+
2410
+ @staticmethod
2411
+ def _offline_flatten_content_with_vision_tokens(content) -> str:
2412
+ if isinstance(content, str):
2413
+ return content
2414
+ if not isinstance(content, list):
2415
+ return str(content) if content else ""
2416
+
2417
+ parts = []
2418
+ for item in content:
2419
+ if isinstance(item, dict):
2420
+ if item.get("type") == "image" or "image" in item:
2421
+ parts.append("<|image|>")
2422
+ elif item.get("type") == "video" or "video" in item:
2423
+ parts.append("<|video|>")
2424
+ if "text" in item:
2425
+ parts.append(str(item["text"]))
2426
+ elif isinstance(item, str):
2427
+ parts.append(item)
2428
+ return "".join(parts)
2429
+
2430
+ @staticmethod
2431
+ def _offline_sanitize_prompt_text(processor, text: Any) -> str:
2432
+ if text is None:
2433
+ return ""
2434
+
2435
+ sanitized = str(text)
2436
+ replacements = [
2437
+ (getattr(processor, "image_placeholder", None), ""),
2438
+ (getattr(processor, "video_placeholder", None), ""),
2439
+ (getattr(processor, "image_token", None), ""),
2440
+ (getattr(processor, "video_token", None), ""),
2441
+ ]
2442
+ for needle, replacement in replacements:
2443
+ if needle:
2444
+ sanitized = sanitized.replace(needle, replacement)
2445
+ return sanitized.lstrip("\n")
2446
+
2447
+ def _offline_sanitize_message_content(self, processor, content: Any) -> Any:
2448
+ if isinstance(content, str):
2449
+ return self._offline_sanitize_prompt_text(processor, content)
2450
+ if not isinstance(content, list):
2451
+ return content
2452
+
2453
+ sanitized_items = []
2454
+ for item in content:
2455
+ if isinstance(item, dict):
2456
+ item_copy = dict(item)
2457
+ if "text" in item_copy:
2458
+ item_copy["text"] = self._offline_sanitize_prompt_text(processor, item_copy.get("text"))
2459
+ sanitized_items.append(item_copy)
2460
+ elif isinstance(item, str):
2461
+ sanitized_items.append(self._offline_sanitize_prompt_text(processor, item))
2462
+ else:
2463
+ sanitized_items.append(item)
2464
+ return sanitized_items
2465
+
2466
+ def _offline_prepare_messages(self, processor, query: Dict[str, Any]) -> List[Dict[str, Any]]:
2467
+ messages = query.get("messages")
2468
+ if messages:
2469
+ prepared_messages = []
2470
+ for message in messages:
2471
+ if not isinstance(message, dict):
2472
+ continue
2473
+ message_copy = dict(message)
2474
+ message_copy["content"] = self._offline_sanitize_message_content(
2475
+ processor,
2476
+ message_copy.get("content", ""),
2477
+ )
2478
+ prepared_messages.append(message_copy)
2479
+ if prepared_messages:
2480
+ return prepared_messages
2481
+
2482
+ prompt = self._offline_sanitize_prompt_text(processor, query.get("prompt", ""))
2483
+ images = list(query.get("images") or [])
2484
+ videos = list(query.get("videos") or [])
2485
+
2486
+ content = []
2487
+ for image in images:
2488
+ content.append({"type": "image", "image": image})
2489
+ for video in videos:
2490
+ content.append({"type": "video", "video": video})
2491
+ if prompt:
2492
+ content.append({"type": "text", "text": prompt.lstrip("\n")})
2493
+
2494
+ if not content:
2495
+ content = [{"type": "text", "text": ""}]
2496
+
2497
+ return [{"role": "user", "content": content}]
2498
+
2499
+ def _offline_prepare_input_text(self, processor, messages: List[Dict[str, Any]]) -> str:
2500
+ processed_messages = []
2501
+ for message in messages:
2502
+ message_copy = dict(message)
2503
+ message_copy["content"] = self._offline_flatten_content_with_vision_tokens(
2504
+ message_copy.get("content", "")
2505
+ )
2506
+ processed_messages.append(message_copy)
2507
+ return processor.apply_chat_template(
2508
+ processed_messages,
2509
+ tokenize=False,
2510
+ add_generation_prompt=True,
2511
+ )
2512
+
2513
+ @staticmethod
2514
+ def _offline_collect_media(messages: List[Dict[str, Any]]) -> tuple[List[Any], List[Any]]:
2515
+ all_images: List[Any] = []
2516
+ all_videos: List[Any] = []
2517
+
2518
+ for message in messages:
2519
+ content = message.get("content")
2520
+ if isinstance(content, list):
2521
+ for item in content:
2522
+ if not isinstance(item, dict):
2523
+ continue
2524
+ if item.get("type") == "image" or "image" in item:
2525
+ image = item.get("image") or item.get("image_url")
2526
+ if image is not None:
2527
+ all_images.append(image)
2528
+ elif item.get("type") == "video" or "video" in item:
2529
+ video = item.get("video")
2530
+ if video is not None:
2531
+ all_videos.append(video)
2532
+
2533
+ return all_images, all_videos
2534
+
2535
+ def _offline_build_processor_kwargs(
2536
+ self,
2537
+ input_text: Union[str, List[str]],
2538
+ all_images: List[Any],
2539
+ all_videos: List[Any],
2540
+ media_kwargs: Dict[str, Any],
2541
+ ) -> Dict[str, Any]:
2542
+ processor_kwargs: Dict[str, Any] = {
2543
+ "text": input_text,
2544
+ "images": all_images or None,
2545
+ "videos": all_videos or None,
2546
+ "return_tensors": "pt",
2547
+ "padding": False,
2548
+ }
2549
+
2550
+ if media_kwargs.get("min_pixels") is not None:
2551
+ processor_kwargs["min_pixels"] = media_kwargs["min_pixels"]
2552
+ if media_kwargs.get("max_pixels") is not None:
2553
+ processor_kwargs["max_pixels"] = media_kwargs["max_pixels"]
2554
+ if media_kwargs.get("video_fps") is not None:
2555
+ processor_kwargs["video_fps"] = media_kwargs["video_fps"]
2556
+
2557
+ min_frames = media_kwargs.get("min_frames", media_kwargs.get("video_minlen"))
2558
+ max_frames = media_kwargs.get("max_frames", media_kwargs.get("video_maxlen"))
2559
+ if min_frames is not None:
2560
+ processor_kwargs["min_frames"] = min_frames
2561
+ if max_frames is not None:
2562
+ processor_kwargs["max_frames"] = max_frames
2563
+
2564
+ return processor_kwargs
2565
+
2566
+ def _offline_run_processor(self, processor, processor_kwargs: Dict[str, Any], media_kwargs: Dict[str, Any]):
2567
+ image_proc = getattr(processor, "image_processor", None)
2568
+ video_proc = getattr(processor, "video_processor", None)
2569
+ modified_multi_image = False
2570
+ modified_video = False
2571
+
2572
+ with self._offline_processor_lock:
2573
+ try:
2574
+ multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2575
+ if multi_image_max_pixels is not None and image_proc is not None:
2576
+ orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2577
+ image_proc.multi_image_max_pixels = multi_image_max_pixels
2578
+ modified_multi_image = True
2579
+
2580
+ video_max_pixels = media_kwargs.get("video_max_pixels")
2581
+ if video_max_pixels is not None and video_proc is not None:
2582
+ orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2583
+ video_proc.video_max_pixels = video_max_pixels
2584
+ modified_video = True
2585
+
2586
+ inputs = processor(**processor_kwargs)
2587
+ finally:
2588
+ if modified_multi_image and image_proc is not None:
2589
+ image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2590
+ if modified_video and video_proc is not None:
2591
+ video_proc.video_max_pixels = orig_video_max_pixels
2592
+
2593
+ return inputs
2594
+
2595
+ def _offline_move_inputs_to_devices(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
2596
+ moved_inputs = dict(inputs)
2597
+ text_device = self.get_input_embeddings().weight.device
2598
+ vision_device = self.visual.patch_embed.proj.weight.device
2599
+ vision_input_keys = {"pixel_values", "grid_thw"}
2600
+
2601
+ for key, value in list(moved_inputs.items()):
2602
+ if not isinstance(value, torch.Tensor):
2603
+ continue
2604
+
2605
+ target_device = vision_device if key in vision_input_keys else text_device
2606
+ moved_value = value.to(target_device)
2607
+ if moved_value.dtype == torch.float32:
2608
+ moved_value = moved_value.to(torch.bfloat16)
2609
+ moved_inputs[key] = moved_value
2610
+
2611
+ return moved_inputs
2612
+
2613
+ @staticmethod
2614
+ def _offline_build_call_kwargs(generate_kwargs: Optional[Dict[str, Any]]) -> Dict[str, Any]:
2615
+ normalized_generate_kwargs = dict(generate_kwargs or {})
2616
+ max_new_tokens = normalized_generate_kwargs.pop("max_new_tokens", 1024)
2617
+ temperature = normalized_generate_kwargs.pop("temperature", 1.0)
2618
+ top_k = normalized_generate_kwargs.pop("top_k", 50)
2619
+ top_p = normalized_generate_kwargs.pop("top_p", 1.0)
2620
+ repetition_penalty = normalized_generate_kwargs.pop("repetition_penalty", 1.0)
2621
+ do_sample = normalized_generate_kwargs.pop("do_sample", False)
2622
+ # ``vision_chunked_length`` was used by the previous modeling file to
2623
+ # shard the visual tower at prefill time. The current forward path
2624
+ # processes the entire vision input in one go, so this flag is
2625
+ # intentionally accepted-and-ignored here for backward compatibility.
2626
+ normalized_generate_kwargs.pop("vision_chunked_length", None)
2627
+
2628
+ if temperature is None:
2629
+ temperature = 1.0
2630
+ if temperature <= 0:
2631
+ temperature = 1.0
2632
+ do_sample = False
2633
+
2634
+ return dict(
2635
+ max_new_tokens=max_new_tokens,
2636
+ temperature=temperature,
2637
+ top_k=top_k,
2638
+ top_p=top_p,
2639
+ repetition_penalty=repetition_penalty,
2640
+ do_sample=do_sample,
2641
+ **normalized_generate_kwargs,
2642
+ )
2643
+
2644
+ def offline_prepare_query_cpu(
2645
+ self,
2646
+ processor,
2647
+ query: Dict[str, Any],
2648
+ session_messages: Optional[List[Dict[str, Any]]] = None,
2649
+ *,
2650
+ padding: bool = False,
2651
+ ) -> Dict[str, Any]:
2652
+ current_session = session_messages or []
2653
+ if query.get("reset_session") or query.get("clear_history"):
2654
+ current_session = []
2655
+
2656
+ working_messages = self._offline_build_session_messages(
2657
+ processor,
2658
+ query,
2659
+ current_session,
2660
+ )
2661
+ input_text = self._offline_prepare_input_text(processor, working_messages)
2662
+ all_images, all_videos = self._offline_collect_media(working_messages)
2663
+ media_kwargs = dict(query.get("media_kwargs") or {})
2664
+ processor_kwargs = self._offline_build_processor_kwargs(
2665
+ input_text,
2666
+ all_images,
2667
+ all_videos,
2668
+ media_kwargs,
2669
+ )
2670
+ processor_kwargs["padding"] = padding
2671
+ inputs_cpu = self._offline_run_processor(processor, processor_kwargs, media_kwargs)
2672
+
2673
+ return {
2674
+ "inputs_cpu": inputs_cpu,
2675
+ "input_text": input_text,
2676
+ "working_messages": working_messages,
2677
+ "call_kwargs": self._offline_build_call_kwargs(query.get("generate_kwargs")),
2678
+ }
2679
+
2680
+ def _offline_prepare_inputs(self, processor, query: Dict[str, Any]):
2681
+ prepared = self.offline_prepare_query_cpu(processor, query)
2682
+ inputs = self._offline_move_inputs_to_devices(prepared["inputs_cpu"])
2683
+ return inputs, prepared["input_text"]
2684
+
2685
+ def offline_generate_from_prepared(self, processor, prepared: Dict[str, Any]) -> Dict[str, Any]:
2686
+ inputs = self._offline_move_inputs_to_devices(prepared["inputs_cpu"])
2687
+ input_seq_len = inputs["input_ids"].shape[1]
2688
+
2689
+ with torch.no_grad():
2690
+ outputs = self.generate(
2691
+ **inputs,
2692
+ **prepared["call_kwargs"],
2693
+ )
2694
+
2695
+ generated_tokens = outputs[:, input_seq_len:]
2696
+ decoded_texts = processor.batch_decode(generated_tokens, skip_special_tokens=True)
2697
+ text = decoded_texts[0] if decoded_texts else ""
2698
+
2699
+ return {
2700
+ "text": text,
2701
+ "input_text": prepared["input_text"],
2702
+ "messages": prepared["working_messages"],
2703
+ }
2704
+
2705
+ def _offline_build_session_messages(
2706
+ self,
2707
+ processor,
2708
+ query: Dict[str, Any],
2709
+ session_messages: List[Dict[str, Any]],
2710
+ ) -> List[Dict[str, Any]]:
2711
+ has_explicit_messages = bool(query.get("messages"))
2712
+ if has_explicit_messages and not query.get("append_messages_to_session", False):
2713
+ base_messages: List[Dict[str, Any]] = []
2714
+ else:
2715
+ base_messages = [dict(message) for message in session_messages]
2716
+
2717
+ turn_messages = self._offline_prepare_messages(processor, query)
2718
+ has_system_message = any(
2719
+ isinstance(message, dict) and message.get("role") == "system"
2720
+ for message in (base_messages + turn_messages)
2721
+ )
2722
+
2723
+ should_add_system_prompt = (
2724
+ query.get("use_default_system_prompt", False)
2725
+ or query.get("system_prompt") is not None
2726
+ or query.get("system_prompt_type") is not None
2727
+ or query.get("thinking_mode") is not None
2728
+ )
2729
+
2730
+ if not base_messages and not has_system_message and should_add_system_prompt:
2731
+ system_prompt = self._offline_resolve_system_prompt(query, turn_messages)
2732
+ if system_prompt is not None:
2733
+ base_messages.append({"role": "system", "content": system_prompt})
2734
+
2735
+ return base_messages + turn_messages
2736
+
2737
+ @staticmethod
2738
+ def _offline_query_contains_video(query: Dict[str, Any], messages: List[Dict[str, Any]]) -> bool:
2739
+ if query.get("videos"):
2740
+ return True
2741
+
2742
+ for message in messages:
2743
+ content = message.get("content") if isinstance(message, dict) else None
2744
+ if isinstance(content, list) and any(
2745
+ isinstance(item, dict) and (item.get("type") == "video" or "video" in item)
2746
+ for item in content
2747
+ ):
2748
+ return True
2749
+ return False
2750
+
2751
+ @staticmethod
2752
+ def _offline_normalize_thinking_mode(value: Optional[str]) -> str:
2753
+ if value is None:
2754
+ return "no_thinking"
2755
+
2756
+ normalized = _OFFLINE_THINKING_MODE_ALIASES.get(str(value).strip().lower())
2757
+ if normalized is None:
2758
+ allowed = ", ".join(sorted(set(_OFFLINE_THINKING_MODE_ALIASES.values())))
2759
+ raise ValueError(f"Unsupported thinking_mode: {value!r}. Supported values: {allowed}")
2760
+ return normalized
2761
+
2762
+ @staticmethod
2763
+ def _offline_normalize_system_prompt_type(value: Optional[str], has_video: bool) -> str:
2764
+ if value is None:
2765
+ return "video" if has_video else "text_image"
2766
+
2767
+ normalized_key = str(value).strip().lower().replace("/", "_").replace(" ", "_")
2768
+ while "__" in normalized_key:
2769
+ normalized_key = normalized_key.replace("__", "_")
2770
+
2771
+ normalized = _OFFLINE_SYSTEM_PROMPT_TYPE_ALIASES.get(normalized_key)
2772
+ if normalized is None:
2773
+ allowed = ", ".join(sorted(set(_OFFLINE_SYSTEM_PROMPT_TYPE_ALIASES.values())))
2774
+ raise ValueError(f"Unsupported system_prompt_type: {value!r}. Supported values: {allowed}")
2775
+ return normalized
2776
+
2777
+ def _offline_resolve_system_prompt(
2778
+ self,
2779
+ query: Dict[str, Any],
2780
+ turn_messages: List[Dict[str, Any]],
2781
+ ) -> Optional[str]:
2782
+ explicit_system_prompt = query.get("system_prompt")
2783
+ if explicit_system_prompt is not None:
2784
+ return str(explicit_system_prompt)
2785
+
2786
+ has_video = self._offline_query_contains_video(query, turn_messages)
2787
+ thinking_mode = self._offline_normalize_thinking_mode(query.get("thinking_mode"))
2788
+ system_prompt_type = self._offline_normalize_system_prompt_type(
2789
+ query.get("system_prompt_type"),
2790
+ has_video=has_video,
2791
+ )
2792
+ return _OFFLINE_SYSTEM_PROMPTS[thinking_mode][system_prompt_type]
2793
+
2794
+ @staticmethod
2795
+ def _offline_finalize_session_messages(
2796
+ working_messages: List[Dict[str, Any]],
2797
+ assistant_text: str,
2798
+ ) -> List[Dict[str, Any]]:
2799
+ next_messages = [dict(message) for message in working_messages]
2800
+ next_messages.append({"role": "assistant", "content": assistant_text})
2801
+ return next_messages
2802
+
2803
+ def _offline_prepare_generation(self, processor, query: Dict[str, Any]):
2804
+ prepared = self.offline_prepare_query_cpu(processor, query)
2805
+ inputs = self._offline_move_inputs_to_devices(prepared["inputs_cpu"])
2806
+ return inputs, prepared["input_text"], prepared["call_kwargs"]
2807
+
2808
+ @staticmethod
2809
+ def _offline_normalize_shared_mapping(
2810
+ values: List[Dict[str, Any]],
2811
+ mapping_name: str,
2812
+ ) -> Dict[str, Any]:
2813
+ normalized_values = [dict(value or {}) for value in values]
2814
+ if not normalized_values:
2815
+ return {}
2816
+
2817
+ all_keys = set()
2818
+ for value in normalized_values:
2819
+ all_keys.update(value.keys())
2820
+
2821
+ merged: Dict[str, Any] = {}
2822
+ mismatched_keys: List[str] = []
2823
+ for key in sorted(all_keys):
2824
+ unique_values = {repr(value.get(key)) for value in normalized_values}
2825
+ if len(unique_values) > 1:
2826
+ mismatched_keys.append(key)
2827
+ else:
2828
+ merged[key] = normalized_values[0].get(key)
2829
+
2830
+ if mismatched_keys:
2831
+ mismatch_text = ", ".join(mismatched_keys)
2832
+ raise ValueError(
2833
+ f"All batch queries must share the same {mapping_name}. "
2834
+ f"Mismatched keys: {mismatch_text}"
2835
+ )
2836
+ return merged
2837
+
2838
+ def _offline_prepare_batch_generation(
2839
+ self,
2840
+ processor,
2841
+ queries: List[Dict[str, Any]],
2842
+ session_states: Optional[List[List[Dict[str, Any]]]] = None,
2843
+ ):
2844
+ if not queries:
2845
+ raise ValueError("`queries` must contain at least one query.")
2846
+
2847
+ if session_states is None:
2848
+ session_states = [[] for _ in queries]
2849
+ elif len(session_states) != len(queries):
2850
+ raise ValueError("`session_states` must have the same length as `queries`.")
2851
+
2852
+ working_messages_list: List[List[Dict[str, Any]]] = []
2853
+ input_texts: List[str] = []
2854
+ all_images_per_query: List[List[Any]] = []
2855
+ all_videos_per_query: List[List[Any]] = []
2856
+
2857
+ for query, session_state in zip(queries, session_states):
2858
+ if not isinstance(query, dict):
2859
+ raise TypeError("Each batch query must be a dict.")
2860
+ if query.get("stop_offline_generate"):
2861
+ raise ValueError("`stop_offline_generate` is not supported in offline_batch_generate.")
2862
+ if query.get("stream_output", query.get("stream", False)):
2863
+ raise ValueError("Streaming is not supported in offline_batch_generate.")
2864
+ if query.get("cancel_current_generate") or query.get("stop_generation"):
2865
+ raise ValueError("Cancel / stop controls are not supported in offline_batch_generate.")
2866
+
2867
+ current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
2868
+ working_messages = self._offline_build_session_messages(
2869
+ processor,
2870
+ query,
2871
+ current_session,
2872
+ )
2873
+ working_messages_list.append(working_messages)
2874
+ input_texts.append(self._offline_prepare_input_text(processor, working_messages))
2875
+
2876
+ all_images, all_videos = self._offline_collect_media(working_messages)
2877
+ all_images_per_query.append(all_images)
2878
+ all_videos_per_query.append(all_videos)
2879
+
2880
+ media_kwargs = self._offline_normalize_shared_mapping(
2881
+ [query.get("media_kwargs") or {} for query in queries],
2882
+ mapping_name="media_kwargs",
2883
+ )
2884
+ processor_kwargs = self._offline_build_processor_kwargs(
2885
+ input_text=input_texts,
2886
+ all_images=[image for images in all_images_per_query for image in images],
2887
+ all_videos=[video for videos in all_videos_per_query for video in videos],
2888
+ media_kwargs=media_kwargs,
2889
+ )
2890
+ processor_kwargs["padding"] = True
2891
+
2892
+ tokenizer = getattr(processor, "tokenizer", None)
2893
+ orig_padding_side = None
2894
+
2895
+ if tokenizer is not None and hasattr(tokenizer, "padding_side"):
2896
+ orig_padding_side = tokenizer.padding_side
2897
+ tokenizer.padding_side = "left"
2898
+ try:
2899
+ inputs = self._offline_run_processor(processor, processor_kwargs, media_kwargs)
2900
+ finally:
2901
+ if tokenizer is not None and orig_padding_side is not None:
2902
+ tokenizer.padding_side = orig_padding_side
2903
+
2904
+ inputs = self._offline_move_inputs_to_devices(inputs)
2905
+
2906
+ generate_kwargs = self._offline_normalize_shared_mapping(
2907
+ [query.get("generate_kwargs") or {} for query in queries],
2908
+ mapping_name="generate_kwargs",
2909
+ )
2910
+ call_kwargs = self._offline_build_call_kwargs(generate_kwargs)
2911
+ return inputs, input_texts, working_messages_list, call_kwargs
2912
+
2913
+ def offline_batch_generate(
2914
+ self,
2915
+ processor,
2916
+ queries: List[Dict[str, Any]],
2917
+ session_states: Optional[List[List[Dict[str, Any]]]] = None,
2918
+ vision_chunked_length: int = 64,
2919
+ ) -> Dict[str, Any]:
2920
+ """
2921
+ Batch offline generation for multiple independent samples.
2922
+
2923
+ This method supports:
2924
+ - batched single-turn generation
2925
+ - batched multi-turn continuation through `session_states`
2926
+
2927
+ It intentionally does not support queue-style controls such as:
2928
+ - `stream_output`
2929
+ - `cancel_current_generate`
2930
+ - `stop_generation`
2931
+ - `stop_offline_generate`
2932
+ """
2933
+ if not queries:
2934
+ return {"results": [], "session_states": []}
2935
+
2936
+ prepared_queries = [dict(query) for query in queries]
2937
+ for query in prepared_queries:
2938
+ generate_kwargs = query.setdefault("generate_kwargs", {})
2939
+ generate_kwargs.setdefault("vision_chunked_length", vision_chunked_length)
2940
+ if session_states is None:
2941
+ session_states = [[] for _ in prepared_queries]
2942
+ elif len(session_states) != len(prepared_queries):
2943
+ raise ValueError("`session_states` must have the same length as `queries`.")
2944
+
2945
+ tokenizer = getattr(processor, "tokenizer", None)
2946
+ bucketed_indices: Dict[Any, List[int]] = {}
2947
+ for index, (query, session_state) in enumerate(zip(prepared_queries, session_states)):
2948
+ current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
2949
+ working_messages = self._offline_build_session_messages(processor, query, current_session)
2950
+ input_text = self._offline_prepare_input_text(processor, working_messages)
2951
+
2952
+ if tokenizer is not None:
2953
+ token_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
2954
+ bucket_key = len(token_ids)
2955
+ else:
2956
+ bucket_key = len(input_text)
2957
+ bucketed_indices.setdefault(bucket_key, []).append(index)
2958
+
2959
+ results: List[Optional[Dict[str, Any]]] = [None] * len(prepared_queries)
2960
+ next_session_states: List[Optional[List[Dict[str, Any]]]] = [None] * len(prepared_queries)
2961
+
2962
+ for bucket_indices in bucketed_indices.values():
2963
+ bucket_queries = [prepared_queries[index] for index in bucket_indices]
2964
+ bucket_session_states = [session_states[index] for index in bucket_indices]
2965
+ inputs, input_texts, working_messages_list, call_kwargs = self._offline_prepare_batch_generation(
2966
+ processor,
2967
+ bucket_queries,
2968
+ session_states=bucket_session_states,
2969
+ )
2970
+
2971
+ with torch.no_grad():
2972
+ outputs = self.generate(
2973
+ **inputs,
2974
+ **call_kwargs,
2975
+ )
2976
+
2977
+ input_seq_len = inputs["input_ids"].shape[1]
2978
+ generated_tokens = outputs[:, input_seq_len:]
2979
+ decoded_texts = processor.batch_decode(generated_tokens, skip_special_tokens=True)
2980
+
2981
+ for local_index, (query, input_text, working_messages, text) in enumerate(
2982
+ zip(bucket_queries, input_texts, working_messages_list, decoded_texts)
2983
+ ):
2984
+ original_index = bucket_indices[local_index]
2985
+ if query.get("persist_session", True):
2986
+ next_session_state = self._offline_finalize_session_messages(working_messages, text)
2987
+ else:
2988
+ next_session_state = working_messages
2989
+ next_session_states[original_index] = next_session_state
2990
+ results[original_index] = {
2991
+ "index": original_index,
2992
+ "text": text,
2993
+ "input_text": input_text,
2994
+ "messages": working_messages,
2995
+ }
2996
+
2997
+ return {
2998
+ "results": [item for item in results if item is not None],
2999
+ "session_states": [item for item in next_session_states if item is not None],
3000
+ }
3001
+
3002
+ def _offline_generate_one(self, processor, query: Dict[str, Any]) -> str:
3003
+ working_messages = self._offline_build_session_messages(processor, query, [])
3004
+ generation_query = dict(query)
3005
+ generation_query["messages"] = working_messages
3006
+ inputs, _, call_kwargs = self._offline_prepare_generation(processor, generation_query)
3007
+
3008
+ with torch.no_grad():
3009
+ outputs = self.generate(
3010
+ **inputs,
3011
+ **call_kwargs,
3012
+ )
3013
+
3014
+ new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
3015
+ return processor.decode(new_tokens, skip_special_tokens=True)
3016
+
3017
+ @staticmethod
3018
+ def _offline_capture_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
3019
+ if target is None or not overrides:
3020
+ return None
3021
+ return {name: copy.deepcopy(getattr(target, name)) for name in overrides}
3022
+
3023
+ @staticmethod
3024
+ def _offline_apply_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> None:
3025
+ if target is None or not overrides:
3026
+ return
3027
+ for name, value in overrides.items():
3028
+ setattr(target, name, copy.deepcopy(value))
3029
+
3030
+ @staticmethod
3031
+ def _offline_restore_processor_attrs(target, snapshot: Optional[Dict[str, Any]]) -> None:
3032
+ if target is None or snapshot is None:
3033
+ return
3034
+ for name, value in snapshot.items():
3035
+ setattr(target, name, copy.deepcopy(value))
3036
+
3037
+ def _offline_generate_one_with_processor_overrides(
3038
+ self,
3039
+ processor,
3040
+ query: Dict[str, Any],
3041
+ image_processor_overrides: Optional[Dict[str, Any]] = None,
3042
+ video_processor_overrides: Optional[Dict[str, Any]] = None,
3043
+ ) -> str:
3044
+ image_proc = getattr(processor, "image_processor", None)
3045
+ video_proc = getattr(processor, "video_processor", None)
3046
+ image_snapshot = self._offline_capture_processor_attrs(image_proc, image_processor_overrides)
3047
+ video_snapshot = self._offline_capture_processor_attrs(video_proc, video_processor_overrides)
3048
+
3049
+ with self._offline_processor_lock:
3050
+ try:
3051
+ self._offline_apply_processor_attrs(image_proc, image_processor_overrides)
3052
+ self._offline_apply_processor_attrs(video_proc, video_processor_overrides)
3053
+ return self._offline_generate_one(processor, query)
3054
+ finally:
3055
+ self._offline_restore_processor_attrs(image_proc, image_snapshot)
3056
+ self._offline_restore_processor_attrs(video_proc, video_snapshot)
3057
+
3058
    def offline_image_generate(
        self,
        processor,
        prompt: str,
        image: Any,
        *,
        shortest_edge: int = 4096,
        longest_edge: int = 16777216,
        multi_image_max_pixels: int = 201326592,
        patch_size: int = 16,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
        image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
        max_new_tokens: int = 1024,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 1.0,
        repetition_penalty: float = 1.0,
        do_sample: bool = False,
        vision_chunked_length: int = 64,
        thinking_mode: Optional[str] = None,
        system_prompt_type: Optional[str] = None,
        system_prompt: Optional[str] = None,
    ) -> str:
        """
        Single-image offline generation with explicit image preprocessor defaults.

        The default values mirror `preprocessor_config.json` so README examples can
        surface the full image preprocessing setup without requiring a batch wrapper.

        Returns the decoded response text for this single (prompt, image) turn.
        """
        # Assemble a one-turn offline query: `media_kwargs` drives preprocessing,
        # `generate_kwargs` is forwarded to `self.generate()` by the offline helpers.
        query: Dict[str, Any] = {
            "prompt": prompt,
            "images": [image],
            "videos": [],
            "media_kwargs": {
                "min_pixels": shortest_edge,
                "max_pixels": longest_edge,
                "multi_image_max_pixels": multi_image_max_pixels,
            },
            "generate_kwargs": {
                "max_new_tokens": max_new_tokens,
                "temperature": temperature,
                "top_k": top_k,
                "top_p": top_p,
                "repetition_penalty": repetition_penalty,
                "do_sample": do_sample,
                "vision_chunked_length": vision_chunked_length,
            },
        }
        # Optional keys are only set when provided, so downstream helpers can
        # apply their own defaults otherwise.
        if thinking_mode is not None:
            query["thinking_mode"] = thinking_mode
        if system_prompt_type is not None:
            query["system_prompt_type"] = system_prompt_type
        if system_prompt is not None:
            query["system_prompt"] = system_prompt

        # Applied temporarily to `processor.image_processor` for this call only
        # and restored afterwards by the overrides wrapper.
        image_processor_overrides = {
            "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
            "multi_image_max_pixels": multi_image_max_pixels,
            "patch_size": patch_size,
            "temporal_patch_size": temporal_patch_size,
            "merge_size": merge_size,
            "image_mean": list(image_mean) if image_mean is not None else None,
            "image_std": list(image_std) if image_std is not None else None,
        }
        return self._offline_generate_one_with_processor_overrides(
            processor,
            query,
            image_processor_overrides=image_processor_overrides,
        )
3129
+
3130
    def offline_video_generate(
        self,
        processor,
        prompt: str,
        video: Any,
        *,
        shortest_edge: int = 4096,
        longest_edge: int = 16777216,
        video_max_pixels: int = 201326592,
        patch_size: int = 16,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        video_fps: float = 1.0,
        min_frames: int = 1,
        max_frames: int = 256,
        num_extract_threads: int = 4,
        image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
        image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
        max_new_tokens: int = 1024,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 1.0,
        repetition_penalty: float = 1.0,
        do_sample: bool = False,
        vision_chunked_length: int = 64,
        thinking_mode: Optional[str] = None,
        system_prompt_type: Optional[str] = None,
        system_prompt: Optional[str] = None,
    ) -> str:
        """
        Single-video offline generation with explicit video preprocessor defaults.

        The default values mirror `video_preprocessor_config.json` so README examples
        can show a standalone video entry point with the effective preprocessing knobs.

        Returns the decoded response text for this single (prompt, video) turn.
        """
        # Assemble a one-turn offline query: `media_kwargs` drives frame
        # extraction/preprocessing, `generate_kwargs` is forwarded to `self.generate()`.
        query: Dict[str, Any] = {
            "prompt": prompt,
            "images": [],
            "videos": [video],
            "media_kwargs": {
                "min_pixels": shortest_edge,
                "max_pixels": longest_edge,
                "video_max_pixels": video_max_pixels,
                "video_fps": video_fps,
                "min_frames": min_frames,
                "max_frames": max_frames,
            },
            "generate_kwargs": {
                "max_new_tokens": max_new_tokens,
                "temperature": temperature,
                "top_k": top_k,
                "top_p": top_p,
                "repetition_penalty": repetition_penalty,
                "do_sample": do_sample,
                "vision_chunked_length": vision_chunked_length,
            },
        }
        # Optional keys are only set when provided, so downstream helpers can
        # apply their own defaults otherwise.
        if thinking_mode is not None:
            query["thinking_mode"] = thinking_mode
        if system_prompt_type is not None:
            query["system_prompt_type"] = system_prompt_type
        if system_prompt is not None:
            query["system_prompt"] = system_prompt

        # Applied temporarily to `processor.video_processor` for this call only
        # and restored afterwards by the overrides wrapper.
        video_processor_overrides = {
            "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
            "video_max_pixels": video_max_pixels,
            "patch_size": patch_size,
            "temporal_patch_size": temporal_patch_size,
            "merge_size": merge_size,
            "video_fps": video_fps,
            "min_frames": min_frames,
            "max_frames": max_frames,
            "num_extract_threads": num_extract_threads,
            "image_mean": list(image_mean) if image_mean is not None else None,
            "image_std": list(image_std) if image_std is not None else None,
        }
        return self._offline_generate_one_with_processor_overrides(
            processor,
            query,
            video_processor_overrides=video_processor_overrides,
        )
3212
+
3213
    def offline_generate(
        self,
        processor,
        new_queries: "queue.Queue[dict]",
        output_text_queue: "queue.Queue[str]",
        vision_chunked_length: int = 64,
    ) -> None:
        """
        HF-style offline inference wrapper aligned with the previous backend output path.

        This method intentionally reuses the checkpoint's existing processor and
        `generate()` flow so that outputs stay consistent with the old external
        backend inference implementation.

        Runs a blocking serve loop: pulls query dicts from `new_queries`, generates
        in a background thread (so cancel/control messages can be handled mid-turn),
        and pushes text plus `<|round_start|>`/`<|round_end|>` markers to
        `output_text_queue`. Returns only on a `stop_offline_generate` request.

        Supported query keys include:
        - `prompt` / `messages`
        - `images` / `videos`
        - `media_kwargs` / `generate_kwargs`
        - `thinking_mode` (`no_thinking` or `deep_thinking`, plus compatible aliases)
        - `system_prompt_type` (`text_image` or `video`, plus compatible aliases)
        - `system_prompt` for an explicit override
        - `stream_output` / `stream`
        - `reset_session` / `clear_history`
        - `cancel_current_generate` / `stop_generation` / `stop_offline_generate`
        """
        # Queries received while a generation is in flight are buffered and
        # served first on the next loop iteration.
        buffered_queries: List[Dict[str, Any]] = []
        # Multi-turn chat history carried across iterations of the serve loop.
        session_messages: List[Dict[str, Any]] = []

        while True:
            if buffered_queries:
                query = buffered_queries.pop(0)
            else:
                query = new_queries.get()
            # Non-dict items on the queue are ignored.
            if not isinstance(query, dict):
                continue

            if query.get("stop_offline_generate"):
                break

            if query.get("reset_session") or query.get("clear_history"):
                session_messages = []

            try:
                # Mutates the query in place so the default chunk length sticks.
                generate_kwargs = query.setdefault("generate_kwargs", {})
                generate_kwargs.setdefault("vision_chunked_length", vision_chunked_length)
                working_messages = self._offline_build_session_messages(
                    processor,
                    query,
                    session_messages,
                )

                generation_query = dict(query)
                generation_query["messages"] = working_messages
                inputs, input_text, call_kwargs = self._offline_prepare_generation(processor, generation_query)

                stream_output = bool(query.get("stream_output", query.get("stream", False)))
                # cancel_event is polled by the stopping criteria inside generate().
                cancel_event = threading.Event()
                stopping_criteria = StoppingCriteriaList([_OfflineCancelStoppingCriteria(cancel_event)])
                # Shared mailbox for the worker thread's result or exception.
                generation_state: Dict[str, Any] = {}

                if stream_output:
                    output_text_queue.put("<|round_start|>")
                    streamer = _OfflineQueueStreamer(getattr(processor, "tokenizer", processor), output_text_queue)
                else:
                    streamer = None

                def _run_generation():
                    # Runs on the worker thread; never raises — exceptions are
                    # stashed in generation_state and re-raised on the main loop.
                    try:
                        with torch.no_grad():
                            generation_state["outputs"] = self.generate(
                                **inputs,
                                stopping_criteria=stopping_criteria,
                                streamer=streamer,
                                **call_kwargs,
                            )
                    except Exception as exc:
                        generation_state["exception"] = exc

                worker = threading.Thread(target=_run_generation, daemon=True)
                worker.start()

                stop_conversation_after_turn = False
                # While generating, keep draining the input queue so cancel/stop
                # control messages take effect mid-turn; everything else is buffered.
                while worker.is_alive():
                    try:
                        control_query = new_queries.get(timeout=0.1)
                    except queue.Empty:
                        continue

                    if not isinstance(control_query, dict):
                        continue

                    if control_query.get("cancel_current_generate") or control_query.get("stop_generation"):
                        cancel_event.set()
                        stop_conversation_after_turn = stop_conversation_after_turn or control_query.get("stop_offline_generate", False)
                        continue

                    if control_query.get("stop_offline_generate"):
                        # Cancel the current turn, then exit the serve loop after it.
                        cancel_event.set()
                        stop_conversation_after_turn = True
                        continue

                    buffered_queries.append(control_query)

                worker.join()
                was_cancelled = cancel_event.is_set()

                if "exception" in generation_state:
                    raise generation_state["exception"]

                if stream_output and streamer is not None:
                    # Streaming mode: chunks were already pushed; join them only to
                    # persist the turn into the session below.
                    text = "".join(streamer.collected_chunks)
                else:
                    outputs = generation_state["outputs"]
                    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
                    text = processor.decode(new_tokens, skip_special_tokens=True)
                    output_text_queue.put(text)

                # Cancelled turns are dropped from history unless explicitly kept.
                if query.get("persist_session", True) and (not was_cancelled or query.get("persist_cancelled_turn", False)):
                    session_messages = self._offline_finalize_session_messages(working_messages, text)

                output_text_queue.put("<|round_end|>")

                if stop_conversation_after_turn:
                    break
            except Exception as exc:
                # Report the failure on the output path and keep the loop alive.
                output_text_queue.put(f"[ERROR] {exc}")
                output_text_queue.put("<|round_end|>")
3340
+
3341
 
3342
  __all__ = [
3343
  "MossVLVisionModel",