# src/model/processor.py
import logging

import numpy as np
import PIL.Image  # ensures PIL.Image is importable for the isinstance check below
import torch
from qwen_vl_utils import process_vision_info
from transformers.image_utils import ChannelDimension

from src.model.baseline_backbone.colpali import ColPaliProcessor
from src.model.baseline_backbone.internvideo2.modeling_internvideo2 import InternVideo2_Stage2
from src.model.baseline_backbone.llava_next import LlavaNextForConditionalGeneration
from src.model.baseline_backbone.phi3_v.modeling_phi3_v import Phi3VForCausalLM
from src.model.vlm_backbone.qwen2_vl import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from src.model.vlm_backbone.qwen2_vl_gp import Qwen2VLForConditionalGeneration_GP
from src.model.vlm_backbone.qwen2_vl_layer_prune import Qwen2VLForConditionalGeneration_LayerPrune
from src.model.vlm_backbone.qwen2_vl_tokenselection import \
    Qwen2VLForConditionalGeneration as Qwen2VLTokenSelectionForConditionalGeneration, \
    Qwen2VLProcessor as Qwen2VLTokenSelectionProcessor
from src.model.vlm_backbone.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
from src.model.vlm_backbone.qwen2_5_vl_gp import Qwen2_5_VLForConditionalGeneration_GP, Qwen2_5_VL_GP_Processor
from src.model.vlm_backbone.qwen2_5_vl_tokenselection import \
    Qwen2_5_VLForConditionalGeneration as Qwen2_5_VL_TokenSelectionForConditionalGeneration
from src.utils import print_master

logger = logging.getLogger(__name__)
PHI_IMAGE_TOKEN_MAX_INPUT_ID = int(1e9)
LLAVA_IMAGE_TOKEN_ID = 32000
PHI3V = 'phi3_v'
LLAVA_NEXT = 'llava_next'
QWEN2_VL_GP = 'qwen2_vl_gp'
QWEN2_5_VL_GP = 'qwen2_5_vl_gp'
QWEN2_VL_LayerPrune = 'qwen2_vl_layerprune'
QWEN2_VL = 'qwen2_vl'
QWEN2_5_VL = 'qwen2_5_vl'
QWEN2_VL_TOKENSELECTION = 'qwen2_vl_tokenselection'
QWEN2_5_VL_TOKENSELECTION = 'qwen2_5_vl_tokenselection'
INTERNVIDEO2 = 'internvideo2'
GME = 'gme' # QWEN2-VL
LamRA = 'lamra' # QWEN2-VL
LamRA_QWEN2_5 = 'lamra_qwen25' # QWEN2.5-VL
COLPALI = 'colpali' # PaliGemma-3B
E5_V = 'e5_v' # Llava_next
MODEL2BACKBONE = { # keys are from hf_config.model_type or manually added if not provided
'phi3_v': PHI3V,
'llava_next': LLAVA_NEXT,
'qwen2_vl_gp': QWEN2_VL_GP,
'qwen2_5_vl_gp': QWEN2_5_VL_GP,
'qwen2_vl_layerprune': QWEN2_VL_LayerPrune,
    'qwen2_vl': QWEN2_VL,
    'qwen2_5_vl': QWEN2_5_VL,
    'qwen2_vl_tokenselection': QWEN2_VL_TOKENSELECTION,
    'qwen2_5_vl_tokenselection': QWEN2_5_VL_TOKENSELECTION,
'internvideo2': INTERNVIDEO2,
'gme': GME,
'lamra': LamRA,
'lamra_qwen25': LamRA,
'colpali': COLPALI,
'e5_v': E5_V,
}
SUPPORTED_MODELS = set(MODEL2BACKBONE.keys())
VLM_IMAGE_TOKENS = {
PHI3V: "<|image_1|>",
LLAVA_NEXT: "<image>",
QWEN2_VL_GP: "<|image_pad|>",
QWEN2_5_VL_GP: "<|image_pad|>",
QWEN2_VL_LayerPrune: "<|image_pad|>",
QWEN2_VL: "<|image_pad|>",
QWEN2_5_VL: "<|image_pad|>",
QWEN2_VL_TOKENSELECTION: "<|image_pad|>",
QWEN2_5_VL_TOKENSELECTION: "<|image_pad|>",
GME: "<|image_pad|>",
LamRA: "<|image_pad|>",
LamRA_QWEN2_5: "<|image_pad|>",
INTERNVIDEO2: "",
COLPALI: "",
E5_V: "<image>",
}
VLM_VIDEO_TOKENS = {
LLAVA_NEXT: "<image>",
QWEN2_VL_GP: "<|video_pad|>",
QWEN2_5_VL_GP: "<|video_pad|>",
QWEN2_VL_LayerPrune: "<|video_pad|>",
QWEN2_VL: "<|video_pad|>",
QWEN2_5_VL: "<|video_pad|>",
QWEN2_VL_TOKENSELECTION: "<|video_pad|>",
QWEN2_5_VL_TOKENSELECTION: "<|video_pad|>",
GME: "<|video_pad|>",
LamRA: "<|video_pad|>",
LamRA_QWEN2_5: "<|video_pad|>",
INTERNVIDEO2: "",
COLPALI: "",
E5_V: "<image>",
}
backbone2model = {
PHI3V: Phi3VForCausalLM,
LLAVA_NEXT: LlavaNextForConditionalGeneration,
QWEN2_VL_GP: Qwen2VLForConditionalGeneration_GP,
QWEN2_5_VL_GP: Qwen2_5_VLForConditionalGeneration_GP,
"QWEN2_VL_LayerPrune": Qwen2VLForConditionalGeneration_LayerPrune,
QWEN2_VL: Qwen2VLForConditionalGeneration,
QWEN2_5_VL: Qwen2_5_VLForConditionalGeneration,
QWEN2_VL_TOKENSELECTION: Qwen2VLTokenSelectionForConditionalGeneration,
QWEN2_5_VL_TOKENSELECTION: Qwen2_5_VL_TokenSelectionForConditionalGeneration,
INTERNVIDEO2: InternVideo2_Stage2,
E5_V: LlavaNextForConditionalGeneration,
}
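# Illustrative lookups (comments only, not executed): a backbone key resolves the image
# placeholder token, the model class, and the input-processing function defined later in this module.
#   VLM_IMAGE_TOKENS[QWEN2_VL]        -> "<|image_pad|>"
#   backbone2model[QWEN2_VL]          -> Qwen2VLForConditionalGeneration
#   process_vlm_inputs_fns[QWEN2_VL]  -> Qwen2_VL_process_fn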
def load_processor(model_args, data_args=None):
"""
Load processor based on VLM backbone.
Note: due to this change, https://github.com/huggingface/transformers/commit/9215cc62d4366072aacafa4e44028c1ca187167b#diff-6505546ec5a9ab74b2ce6511681dd31194eb91e9fa3ce26282e487a5e61f9356L1102
"""
model_name_or_path = model_args.checkpoint_path if model_args.checkpoint_path else model_args.model_name
print_master(f'Loading processor from: {model_name_or_path}')
if model_args.model_backbone == PHI3V:
from src.model.baseline_backbone.phi3_v.processing_phi3_v import Phi3VProcessor
processor = Phi3VProcessor.from_pretrained(
model_name_or_path,
trust_remote_code=True,
num_crops=model_args.num_crops
)
processor.tokenizer.padding_side = "right"
elif model_args.model_backbone == LLAVA_NEXT:
from src.model.baseline_backbone.llava_next import LlavaNextProcessor
processor = LlavaNextProcessor.from_pretrained(
model_name_or_path,
trust_remote_code=True
)
elif model_args.model_backbone in [QWEN2_VL, GME, LamRA]:
from src.model.vlm_backbone.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
from src.model.vlm_backbone.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
from src.model.vlm_backbone.qwen2_vl.tokenization_qwen2_fast import Qwen2TokenizerFast
min_pixels, max_pixels = None, None
if data_args is not None:
min_pixels, max_pixels = data_args.resize_min_pixels, data_args.resize_max_pixels
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
image_processor = Qwen2VLImageProcessor.from_pretrained(model_name_or_path, size=size)
tokenizer = Qwen2TokenizerFast.from_pretrained(model_name_or_path)
processor = Qwen2VLProcessor.from_pretrained(
model_name_or_path,
image_processor=image_processor, tokenizer=tokenizer, size=size
)
elif model_args.model_backbone == QWEN2_VL_TOKENSELECTION:
from src.model.vlm_backbone.qwen2_vl_tokenselection.processing_qwen2_vl import Qwen2VLProcessor
from src.model.vlm_backbone.qwen2_vl_tokenselection.image_processing_qwen2_vl import Qwen2VLImageProcessor
from src.model.vlm_backbone.qwen2_vl_tokenselection.tokenization_qwen2_fast import Qwen2TokenizerFast
min_pixels, max_pixels = None, None
if data_args is not None:
min_pixels, max_pixels = data_args.resize_min_pixels, data_args.resize_max_pixels
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
image_processor = Qwen2VLImageProcessor.from_pretrained(model_name_or_path, size=size)
if data_args is not None:
image_processor.do_resize = data_args.resize_use_processor
image_processor.min_pixels = data_args.resize_min_pixels
image_processor.max_pixels = data_args.resize_max_pixels
tokenizer = Qwen2TokenizerFast.from_pretrained(model_name_or_path)
processor = Qwen2VLProcessor.from_pretrained(
model_name_or_path,
image_processor=image_processor, tokenizer=tokenizer, size=size,
uigraph_use=model_args.uigraph_use,
uigraph_diff=model_args.uigraph_diff, uigraph_rand=model_args.uigraph_rand,
uimask_ratio=model_args.uimask_ratio, uimask_rand=model_args.uimask_rand
)
elif model_args.model_backbone in [QWEN2_5_VL, LamRA_QWEN2_5]:
from src.model.vlm_backbone.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessor
from src.model.vlm_backbone.qwen2_5_vl.image_processing_qwen2_5_vl import Qwen2_5_VLImageProcessor
from src.model.vlm_backbone.qwen2_vl.tokenization_qwen2_fast import Qwen2TokenizerFast
min_pixels, max_pixels = None, None
if data_args is not None:
min_pixels, max_pixels = data_args.resize_min_pixels, data_args.resize_max_pixels
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels, "min_pixels": min_pixels, "max_pixels": max_pixels}
image_processor = Qwen2_5_VLImageProcessor.from_pretrained(model_name_or_path, size=size)
tokenizer = Qwen2TokenizerFast.from_pretrained(model_name_or_path)
processor = Qwen2_5_VLProcessor.from_pretrained(model_name_or_path, image_processor=image_processor, tokenizer=tokenizer)
elif model_args.model_backbone in [QWEN2_5_VL_GP]:
        # Use the GP-specific processor (supports normed_bboxes -> ref_token_masks)
max_pixels = getattr(data_args, "resize_max_pixels", 12845056) if data_args is not None else 12845056
processor = Qwen2_5_VL_GP_Processor.from_pretrained(
model_name_or_path,
max_pixels=max_pixels,
)
        # Tokenizer patch: ensure eos/pad tokens and left padding (consistent with the GP training setup)
tok = processor.tokenizer
if not hasattr(tok, "eos_token_id") or tok.eos_token_id is None:
tok.eos_token_id = tok.convert_tokens_to_ids(tok.eos_token)
if getattr(tok, "pad_token_id", None) is None:
tok.pad_token_id = tok.eos_token_id
tok.padding_side = "left"
        # Align with the existing resize settings, if provided
if data_args is not None:
try:
processor.image_processor.do_resize = data_args.resize_use_processor
processor.image_processor.min_pixels = data_args.resize_min_pixels
processor.image_processor.max_pixels = data_args.resize_max_pixels
size = {
"shortest_edge": data_args.resize_min_pixels,
"longest_edge": data_args.resize_max_pixels,
"min_pixels": data_args.resize_min_pixels,
"max_pixels": data_args.resize_max_pixels,
}
if hasattr(processor.image_processor, "size"):
processor.image_processor.size = size
except Exception as e:
print_master(f"[warn] set resize fields on GP image_processor failed: {e}")
elif model_args.model_backbone in [QWEN2_VL_GP]:
from src.model.vlm_backbone.qwen2_vl_gp.processing_qwen2_vl import Qwen2VLProcessor
from src.model.vlm_backbone.qwen2_vl_gp.image_processing_qwen2_vl import Qwen2VLImageProcessor
from src.model.vlm_backbone.qwen2_vl_gp.tokenization_qwen2_fast import Qwen2TokenizerFast
min_pixels, max_pixels = None, None
if data_args is not None:
min_pixels, max_pixels = data_args.resize_min_pixels, data_args.resize_max_pixels
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
image_processor = Qwen2VLImageProcessor.from_pretrained(model_name_or_path, size=size)
tokenizer = Qwen2TokenizerFast.from_pretrained(model_name_or_path)
processor = Qwen2VLProcessor.from_pretrained(
model_name_or_path,
image_processor=image_processor, tokenizer=tokenizer, size=size
)
processor.tokenizer.padding_side = "left"
elif model_args.model_backbone == QWEN2_5_VL_TOKENSELECTION:
# TODO: qwen2.5 token selection not working yet
from src.model.vlm_backbone.qwen2_5_vl_tokenselection.processing_qwen2_5_vl import Qwen2_5_VLProcessor
from src.model.vlm_backbone.qwen2_5_vl_tokenselection.image_processing_qwen2_5_vl import Qwen2_5_VLImageProcessor
from src.model.vlm_backbone.qwen2_vl_tokenselection.tokenization_qwen2_fast import Qwen2TokenizerFast
min_pixels, max_pixels = None, None
if data_args is not None:
min_pixels, max_pixels = data_args.resize_min_pixels, data_args.resize_max_pixels
size = {"shortest_edge": min_pixels, "longest_edge": max_pixels, "min_pixels": min_pixels, "max_pixels": max_pixels}
image_processor = Qwen2_5_VLImageProcessor.from_pretrained(model_name_or_path, size=size)
tokenizer = Qwen2TokenizerFast.from_pretrained(model_name_or_path)
processor = Qwen2_5_VLProcessor.from_pretrained(
model_name_or_path,
image_processor=image_processor, tokenizer=tokenizer,
uigraph_use=model_args.uigraph_use,
uigraph_diff=model_args.uigraph_diff, uigraph_rand=model_args.uigraph_rand,
uimask_ratio=model_args.uimask_ratio, uimask_rand=model_args.uimask_rand
)
elif model_args.model_backbone == INTERNVIDEO2:
return None
    elif model_args.model_backbone == COLPALI:
        processor = ColPaliProcessor.from_pretrained(model_args.model_name)
else:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained(
model_args.processor_name if model_args.processor_name else model_args.model_name,
trust_remote_code=True,
)
return processor
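# Illustrative usage of load_processor (a sketch, not executed here; the attribute names follow what
# load_processor reads above, and the model id / pixel budgets are example values):
#   model_args.model_name = "Qwen/Qwen2-VL-2B-Instruct"
#   model_args.model_backbone = QWEN2_VL
#   model_args.checkpoint_path = None
#   data_args.resize_min_pixels, data_args.resize_max_pixels = 256 * 28 * 28, 1280 * 28 * 28
#   processor = load_processor(model_args, data_args)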
def get_backbone_name(hf_config, model_type=None):
if model_type is not None:
setattr(hf_config, 'model_type', model_type)
    assert hf_config.model_type in SUPPORTED_MODELS, f"Unknown backbone name {hf_config.model_type}. Supported models are {SUPPORTED_MODELS}"
return MODEL2BACKBONE[hf_config.model_type]
def Llava_NEXT_process_fn(model_inputs: dict, processor, max_length=None):
# TODO: NOT FINISHED YET!
input_ids, pixel_values, image_sizes = [], [], []
texts, visual_inputs = model_inputs['text'], model_inputs['images']
image_exists = False
# 1. iterate each pair and process (since processors do not support batch processing)
for text, images in zip(texts, visual_inputs):
# in theory, each batch item should contain a list of frames, but we still check for exceptions here
# if no images as input (not likely to happen in mmeb pro cases)
if images is None or (type(images)==list and any(i is None for i in images)):
inputs = processor(images=None, text=text, return_tensors="np", max_length=max_length, truncation=True)
input_id = inputs["input_ids"].squeeze().tolist()
if isinstance(input_id, int):
# in case of empty string, only BOS is included
input_id = [input_id]
input_ids.append(input_id)
pixel_values.append(None)
image_sizes.append(None)
else:
image_exists = True
# in theory, valid images should be a list of frames
assert isinstance(images, list), f"images should be a list, but got {type(images)}"
inputs = processor(images=images, text=text, return_tensors="np", max_length=max_length, truncation=True)
input_ids.append(inputs["input_ids"].squeeze().tolist())
pixel_values.append(inputs['pixel_values'])
image_sizes.append(inputs['image_sizes'])
# 2. padding inputs
batch_encoding = processor.tokenizer.pad({'input_ids': input_ids}, return_tensors="pt")
input_ids, attention_mask = batch_encoding['input_ids'], batch_encoding['attention_mask']
inputs = {
'input_ids': input_ids.long(),
'attention_mask': attention_mask,
# 'texts': texts,
# 'images': visual_inputs,
}
image_exists = any([p is not None for p in pixel_values])
    if image_exists:
        # Stack per-sample arrays and flatten the (batch, num_patches) leading dims expected by LLaVA-Next.
        # This assumes a homogeneous batch; mixed image/text batches are not handled yet (see TODO above).
        pixel_values = torch.from_numpy(np.array(pixel_values)).float()
        pixel_values = pixel_values.reshape(-1, *pixel_values.shape[2:])
        image_sizes = torch.tensor(np.array(image_sizes)).long()
        image_sizes = image_sizes.reshape(-1, *image_sizes.shape[2:])
        inputs['pixel_values'] = pixel_values
        inputs['image_sizes'] = image_sizes
else:
inputs['pixel_values'] = torch.zeros(input_ids.shape[0], 1)
inputs['image_sizes'] = torch.ones(input_ids.shape[0], 1)
return inputs
def Phi3V_process_fn(model_inputs: dict, processor, max_length=None):
input_ids, pixel_values, image_sizes, image_grid_thw = [], [], [], []
texts, images = model_inputs['text'], model_inputs['images']
image_exists = False
# 1. iterate each pair and process (since processors do not support batch processing)
for text, image in zip(texts, images):
if image is None:
inputs = processor(text, None, return_tensors="np", max_length=max_length, truncation=True)
input_id = inputs["input_ids"].squeeze().tolist()
if isinstance(input_id, int):
# in case of empty string, only BOS is included
input_id = [input_id]
input_ids.append(input_id)
pixel_values.append(None)
image_sizes.append(None)
image_grid_thw.append(None)
else:
image_exists = True
inputs = processor(text=text, images=[image], return_tensors="np", max_length=max_length, truncation=True)
input_ids.append(inputs["input_ids"].squeeze().tolist())
pixel_values.append(inputs['pixel_values'])
if 'image_sizes' in inputs:
image_sizes.append(inputs['image_sizes'])
if 'image_grid_thw' in inputs:
image_grid_thw.append(inputs['image_grid_thw'])
# 2. padding inputs
batch_encoding = processor.tokenizer.pad({'input_ids': input_ids}, return_tensors="pt")
input_ids, attention_mask = batch_encoding['input_ids'], batch_encoding['attention_mask']
inputs = {
'input_ids': input_ids,
'attention_mask': attention_mask,
'texts': texts,
'images': images,
}
# 3. special postcare for mixed batch (examples w/ and w/o images in the same batch)
if image_exists:
# add them to inputs
inputs['pixel_values'] = pixel_values
inputs['image_sizes'] = image_sizes
else:
inputs['pixel_values'] = torch.zeros(input_ids.shape[0], 1)
inputs['image_sizes'] = torch.ones(input_ids.shape[0], 1)
return inputs
def Qwen2_VL_process_fn(model_inputs: dict, processor: Qwen2VLProcessor, max_length=None):
# TODO: set separate max_len for text/visual inputs, currently max_length is only applied to text-only data
input_ids, pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw = [], [], [], [], []
texts, visual_inputs = model_inputs['text'], model_inputs['images']
image_exists = False
vlm_image_token, vlm_video_token = VLM_IMAGE_TOKENS[QWEN2_VL], VLM_VIDEO_TOKENS[QWEN2_VL]
# 1. iterate each pair and process, since processors do not support processing for mixed batch (contains data w/ and w/o visual inputs)
for text, images in zip(texts, visual_inputs):
if images is None or (type(images)==list and any(i is None for i in images)):
# all images must be valid
inputs = processor(text=[text], images=None, return_tensors="np", max_length=max_length, truncation=True)
input_id = inputs["input_ids"].squeeze().tolist()
if isinstance(input_id, int):
# in case of empty string, only BOS is included
input_id = [input_id]
input_ids.append(input_id)
pixel_values.append(None)
image_grid_thw.append(None)
pixel_values_videos.append(None)
video_grid_thw.append(None)
else:
try:
if vlm_image_token in text:
if isinstance(images, PIL.Image.Image):
# images is a single image
images = [images]
for iid, image in enumerate(images):
                        # rare case in MMEB eval: upscale tiny images (either side < 28 px) to 56x56 so the image processor can patchify them
if image.size[0] < 28 or image.size[1] < 28:
image = image.resize((56, 56))
images[iid] = image
inputs = processor(text=[text], images=images, return_tensors="np", max_length=None, truncation=False, input_data_format=ChannelDimension.LAST)
elif vlm_video_token in text:
# TODO: check text/video data validity
inputs = processor(text=[text], videos=[images], return_tensors="np", max_length=None, truncation=False, input_data_format=ChannelDimension.LAST)
else:
raise NotImplementedError(f"No visual token found ({vlm_image_token} or {vlm_video_token}) in the text: {text}")
except Exception as e:
for i in images:
print(i.filename)
raise e
input_ids.append(inputs["input_ids"].squeeze().tolist())
if 'pixel_values' in inputs:
pixel_values.append(inputs['pixel_values'])
image_grid_thw.append(inputs['image_grid_thw'])
pixel_values_videos.append(None)
video_grid_thw.append(None)
else:
pixel_values.append(None)
image_grid_thw.append(None)
pixel_values_videos.append(inputs['pixel_values_videos'])
video_grid_thw.append(inputs['video_grid_thw'])
# 2. padding inputs
batch_encoding = processor.tokenizer.pad({'input_ids': input_ids}, return_tensors="pt")
input_ids, attention_mask = batch_encoding['input_ids'], batch_encoding['attention_mask']
# manually enforce long type due to:
# (1) [rank7]: RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)
# (2) [rank7]: File "/fsx/home/ruimeng/project/VLM2Vec/src/model.py", line 45, in _pooling
# [rank7]: reps = last_hidden_state[
# [rank7]: IndexError: tensors used as indices must be long, int, byte or bool tensors
inputs = {
'input_ids': input_ids.long(),
'attention_mask': attention_mask.long(),
'texts': texts,
'images': visual_inputs,
}
inputs['pixel_values'] = pixel_values
inputs['image_grid_thw'] = image_grid_thw
inputs['pixel_values_videos'] = pixel_values_videos
inputs['video_grid_thw'] = video_grid_thw
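    # NOTE: the visual fields above stay as per-sample lists (None for text-only items); they are not
    # stacked into batch tensors inside this function.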
return inputs
def Gme_process_fn(model_inputs: dict, processor: Qwen2VLProcessor, max_length=None):
inputs = {
'texts': model_inputs['text'],
'images': model_inputs['images'],
}
return inputs
def Qwen2_VL_TokenSelection_process_fn(model_inputs: dict, processor: Qwen2VLTokenSelectionProcessor, max_length=None):
# TODO: set separate max_len for text/visual inputs, currently max_length is only applied to text-only data
input_ids, pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw = [], [], [], [], []
patch_pos, select_mask = [], []
texts, visual_inputs = model_inputs['text'], model_inputs['images']
image_exists = False
# 1. iterate each pair and process (since processors do not support batch processing)
for text, images in zip(texts, visual_inputs):
if images is None or (type(images)==list and any(i is None for i in images)):
# all images must be valid
inputs = processor(text=[text], images=None, return_tensors="np", max_length=max_length, truncation=True)
input_id = inputs["input_ids"].squeeze().tolist()
if isinstance(input_id, int):
# in case of empty string, only BOS is included
input_id = [input_id]
input_ids.append(input_id)
pixel_values.append(None)
image_grid_thw.append(None)
patch_pos.append(None)
select_mask.append(None)
pixel_values_videos.append(None)
video_grid_thw.append(None)
else:
image_exists = True
            # TODO: currently handles either images or video frames per sample; mixed image + video data is not supported
if VLM_IMAGE_TOKENS[QWEN2_VL] in text:
inputs = processor(text=[text], images=[images], return_tensors="np", max_length=None, truncation=False, input_data_format=ChannelDimension.LAST)
elif VLM_VIDEO_TOKENS[QWEN2_VL] in text:
assert len(images) > 1, f"Video data must have more than 1 frame, got {len(images)}"
inputs = processor(text=[text], videos=[images], return_tensors="np", max_length=None, truncation=False, input_data_format=ChannelDimension.LAST)
else:
raise NotImplementedError(f"Unsupported visual token in text: {text}")
input_ids.append(inputs["input_ids"].squeeze().tolist())
if 'pixel_values' in inputs:
pixel_values.append(inputs['pixel_values'])
image_grid_thw.append(inputs['image_grid_thw'])
pixel_values_videos.append(None)
video_grid_thw.append(None)
if 'patch_pos' in inputs:
patch_pos.append(inputs['patch_pos'])
if 'select_mask' in inputs:
select_mask.append(inputs['select_mask'])
else:
pixel_values.append(None)
image_grid_thw.append(None)
patch_pos.append(None)
select_mask.append(None)
pixel_values_videos.append(inputs['pixel_values_videos'])
video_grid_thw.append(inputs['video_grid_thw'])
# 2. padding inputs
batch_encoding = processor.tokenizer.pad({'input_ids': input_ids}, return_tensors="pt")
input_ids, attention_mask = batch_encoding['input_ids'], batch_encoding['attention_mask']
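    # For text-only samples, patch_pos is filled with -1 and select_mask with all-True placeholders,
    # then every entry is right-padded to the tokenized sequence length so the batch can be concatenated.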
if image_exists:
if patch_pos:
patch_pos_shape_for_padding = list(v.shape for v in patch_pos if v is not None)[0]
key_tmp = [torch.from_numpy(v) if v is not None else (torch.zeros(patch_pos_shape_for_padding) - 1) for v in patch_pos]
max_length = input_ids.size(1)
padded_key = [torch.nn.functional.pad(pos, (0, max_length - pos.size(1)), value=-1) for pos in key_tmp]
patch_pos = torch.cat(padded_key, dim=0)
if select_mask:
select_mask_shape_for_padding = list(v.shape for v in select_mask if v is not None)[0]
key_tmp = [torch.from_numpy(v) if v is not None else torch.ones(select_mask_shape_for_padding).bool() for v in select_mask]
max_length = input_ids.size(1)
padded_key = [torch.nn.functional.pad(pos, (0, max_length - pos.size(1)), value=True) for pos in key_tmp]
select_mask = torch.cat(padded_key, dim=0)
# manually enforce long type due to:
# (1) [rank7]: RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)
# (2) [rank7]: File "/fsx/home/ruimeng/project/VLM2Vec/src/model.py", line 45, in _pooling
# [rank7]: reps = last_hidden_state[
# [rank7]: IndexError: tensors used as indices must be long, int, byte or bool tensors
inputs = {
'input_ids': input_ids.long(),
'attention_mask': attention_mask.long()
}
inputs['pixel_values'] = pixel_values
inputs['image_grid_thw'] = image_grid_thw
inputs['pixel_values_videos'] = pixel_values_videos
inputs['video_grid_thw'] = video_grid_thw
inputs['patch_pos'] = patch_pos
inputs['select_mask'] = select_mask
return inputs
def _strip_image_tokens_from_text(txt: str, model_backbone: str) -> str:
if not isinstance(txt, str):
return ""
img_tok = VLM_IMAGE_TOKENS.get(model_backbone, None)
vid_tok = VLM_VIDEO_TOKENS.get(model_backbone, None)
if img_tok and img_tok in txt:
txt = txt.replace(img_tok, "").strip()
if vid_tok and vid_tok in txt:
txt = txt.replace(vid_tok, "").strip()
return txt
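# Example: _strip_image_tokens_from_text("<|image_pad|> Click the search button", QWEN2_5_VL_GP)
# returns "Click the search button".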
def Qwen2_5_VL_GP_chat_process_fn(model_inputs: dict, processor, max_length=None):
"""
支持输入:
model_inputs = {'text': [str], 'images': [List[PIL] or None], 'bboxes': Optional[List[List[xyxy]]]}
行为:
- 构造 chat messages: [{"role":"user","content":[{"type":"image","image":...}, {"type":"text","text":...}]}]
- 归一化 bbox(如果有);若无则 [[0,0,1,1]]
- processor(..., normed_bboxes=...) -> 返回 ref_token_masks
"""
texts = model_inputs["text"]
images_batch = model_inputs["images"]
bboxes_batch = model_inputs.get("bboxes", None)
messages, normed_bboxes, image_counts = [], [], []
for i, (txt, imgs) in enumerate(zip(texts, images_batch)):
txt = _strip_image_tokens_from_text(txt, QWEN2_5_VL_GP)
content = []
if imgs is not None:
assert isinstance(imgs, list), f"Expect a list of PIL images, got {type(imgs)}"
for im in imgs:
content.append({"type": "image", "image": im})
image_counts.append(len(imgs))
else:
image_counts.append(0)
content.append({"type": "text", "text": txt})
messages.append([{"role": "user", "content": content}])
if bboxes_batch is not None and bboxes_batch[i] and imgs is not None and len(imgs) > 0:
W, H = imgs[0].size
nb = []
for (x1, y1, x2, y2) in bboxes_batch[i]:
nb.append([x1 / W, y1 / H, x2 / W, y2 / H])
normed_bboxes.append(nb)
else:
normed_bboxes.append([[0.0, 0.0, 1.0, 1.0]])
text_inputs = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
image_inputs, video_inputs = process_vision_info(messages)
out = processor(
text=text_inputs,
images=image_inputs,
videos=video_inputs,
normed_bboxes=normed_bboxes,
padding=True,
return_tensors="pt",
)
out["texts"] = texts
out["images"] = images_batch
out["image_counts"] = torch.tensor(image_counts, dtype=torch.long) # 新增
merge_len = 1
try:
merge_len = int(getattr(processor.image_processor, "merge_size", 1)) ** 2
except Exception:
pass
out["image_counts"] = torch.tensor(image_counts, dtype=torch.long) # 每样本图片数 [B]
out["merge_length"] = torch.tensor(merge_len, dtype=torch.long) # 常量(窗口除数)
return out
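# Illustrative call (hypothetical data; `screenshot` is a PIL.Image and the bbox is in pixel xyxy):
#   batch = Qwen2_5_VL_GP_chat_process_fn(
#       {"text": ["Click the search button"], "images": [[screenshot]], "bboxes": [[(10, 20, 110, 60)]]},
#       processor,  # the GP processor returned by load_processor for QWEN2_5_VL_GP
#   )
#   # batch carries the processor outputs (incl. ref_token_masks) plus texts/images/image_counts/merge_length.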
def InternVL_process_fn(model_inputs: dict, processor, max_length=None):
# TODO not working yet
input_ids, pixel_values, image_sizes, image_grid_thw = [], [], [], []
texts, images = model_inputs['text'], model_inputs['images']
image_exists = False
# 1. iterate each pair and process (since processors do not support batch processing)
for text, image in zip(texts, images):
if image is None:
inputs = processor(text, None, return_tensors="np", max_length=max_length, truncation=True)
input_id = inputs["input_ids"].squeeze().tolist()
if isinstance(input_id, int):
# in case of empty string, only BOS is included
input_id = [input_id]
input_ids.append(input_id)
pixel_values.append(None)
image_sizes.append(None)
image_grid_thw.append(None)
else:
image_exists = True
inputs = processor(text=text, images=[image], return_tensors="np", max_length=max_length, truncation=True)
input_ids.append(inputs["input_ids"].squeeze().tolist())
pixel_values.append(inputs['pixel_values'])
if 'image_sizes' in inputs:
image_sizes.append(inputs['image_sizes'])
if 'image_grid_thw' in inputs:
image_grid_thw.append(inputs['image_grid_thw'])
# 2. padding inputs
batch_encoding = processor.tokenizer.pad({'input_ids': input_ids}, return_tensors="pt")
input_ids, attention_mask = batch_encoding['input_ids'], batch_encoding['attention_mask']
inputs = {
'input_ids': input_ids,
'attention_mask': attention_mask,
'texts': texts,
'images': images,
}
# 3. special postcare for mixed batch (examples w/ and w/o images in the same batch)
if image_exists:
# add them to inputs
inputs['pixel_values'] = pixel_values
inputs['image_sizes'] = image_sizes
else:
inputs['pixel_values'] = torch.zeros(input_ids.shape[0], 1)
inputs['image_sizes'] = torch.ones(input_ids.shape[0], 1)
return inputs
def ColPali_process_fn(model_inputs: dict, processor, max_length=None):
texts, images = model_inputs['text'], model_inputs['images']
if images is None or all(i is None for i in images):
inputs = processor.process_queries(texts)
else:
inputs = processor.process_images(images)
return inputs
def InternVideo2_process_fn(model_inputs: dict, processor, max_length=None):
if all(x is None for x in model_inputs["images"]):
# Text side
from src.model.baseline_backbone.internvideo2.modeling_internvideo2 import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
inputs = tokenizer(
model_inputs["text"],
padding="max_length",
truncation=True,
max_length=40,
return_tensors="pt")
else:
# Video side
from torchvision import transforms
preprocess = transforms.Compose([
transforms.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
transforms.Resize((224, 224)), # Resize to 224x224
transforms.ToTensor(), # Convert from PIL image to tensor (C, H, W)
transforms.Normalize(mean=[0.485, 0.456, 0.406], # ImageNet mean
std=[0.229, 0.224, 0.225]) # ImageNet std
])
frame_list = model_inputs["images"]
        # ensure the visual input has exactly 4 frames per sample
# Case 1: frame_list is flat (not a list of lists), e.g., [PIL, PIL, ...]
if type(frame_list[0]) is not list:
frame_list = [[img.copy() for _ in range(4)] for img in frame_list]
# Case 2: frame_list is already a list of lists, ensure each has exactly 4 images
elif type(frame_list[0]) is list and len(frame_list[0]) != 4:
new_list = []
for frames in frame_list:
if len(frames) < 4:
frames = frames + [frames[-1].copy() for _ in range(4 - len(frames))]
elif len(frames) > 4:
# Sample 4 indices uniformly across the sequence
indices = np.linspace(0, len(frames) - 1, num=4, dtype=int)
frames = [frames[i] for i in indices]
new_list.append(frames)
frame_list = new_list
pixel_values = [
torch.stack([preprocess(img) for img in frames], dim=0) # (num_frames, C, H, W)
for frames in frame_list
]
pixel_values = torch.stack(pixel_values, dim=0) # (B, num_frames, C, H, W)
inputs = {'pixel_values': pixel_values}
return inputs
def e5_v_prompt_template(text, add_video_token, add_image_token):
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'
if text is not None and add_video_token is False and add_image_token is False: # only text
prompt = llama3_template.format('{}\nSummary above sentence in one word: '.format(text))
if text is None and add_video_token: # only video
prompt = llama3_template.format('<image>\nSummary above video in one word: ')
if text is None and add_image_token: # only image
prompt = llama3_template.format('<image>\nSummary above image in one word: ')
if text is not None and add_video_token: # video + text
prompt = llama3_template.format('<image>\n{}\nSummary above video and text in one word: '.format(text))
if text is not None and add_image_token:
prompt = llama3_template.format('<image>\n{}\nSummary above image and text in one word: '.format(text))
return prompt
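# Example: e5_v_prompt_template("a photo of a dog", add_video_token=False, add_image_token=True) yields
# the llama3 template with the user turn "<image>\na photo of a dog\nSummary above image and text in one word: ".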
PROMPT_TEMPLATE_DICT = {
"e5_v": e5_v_prompt_template,
}
def process_input_text(instruction, model_backbone, text=None, add_video_token=False, add_image_token=False):
    # Formulate the input text from the instruction, the optional text, and the backbone's special tokens.
    # TBD: reorganize the hard-coded handling of baselines such as internvideo2
    if model_backbone == INTERNVIDEO2:
return text
elif model_backbone in [GME, LamRA, LamRA_QWEN2_5]:
if text:
return instruction + " " + text # GME and LamRA do not need special tokens
else:
return instruction + " "
elif model_backbone == E5_V:
return PROMPT_TEMPLATE_DICT[model_backbone](text, add_video_token, add_image_token)
prompt = instruction
if text:
prompt = prompt + " " + text
if add_video_token:
video_token = VLM_VIDEO_TOKENS[model_backbone]
prompt = video_token + " " + prompt
if add_image_token:
image_token = VLM_IMAGE_TOKENS[model_backbone]
prompt = image_token + " " + prompt
return prompt
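# Example: process_input_text("Represent the given news image.", QWEN2_VL, text="flooded street", add_image_token=True)
# returns "<|image_pad|> Represent the given news image. flooded street".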
process_vlm_inputs_fns = {
PHI3V: Phi3V_process_fn,
LLAVA_NEXT: Llava_NEXT_process_fn,
QWEN2_VL_GP: Qwen2_VL_process_fn,
QWEN2_5_VL_GP: Qwen2_VL_process_fn,
QWEN2_VL_LayerPrune: Qwen2_VL_process_fn,
QWEN2_VL: Qwen2_VL_process_fn,
QWEN2_5_VL: Qwen2_VL_process_fn,
QWEN2_VL_TOKENSELECTION: Qwen2_VL_TokenSelection_process_fn,
QWEN2_5_VL_TOKENSELECTION: Qwen2_VL_TokenSelection_process_fn,
INTERNVIDEO2: InternVideo2_process_fn,
GME: Gme_process_fn,
LamRA: Gme_process_fn,
LamRA_QWEN2_5: Gme_process_fn,
COLPALI: ColPali_process_fn,
E5_V: Llava_NEXT_process_fn,
}
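

if __name__ == "__main__":
    # Minimal smoke test (illustrative only; the instruction/text strings are made-up examples).
    # It exercises only the pure-string helpers, so no checkpoints or GPUs are needed, although
    # importing this module still requires the backbone packages imported at the top.
    dummy_cfg = type("DummyConfig", (), {"model_type": "qwen2_vl"})()
    print_master(get_backbone_name(dummy_cfg))  # -> 'qwen2_vl'
    print_master(process_input_text(
        "Represent the given image for retrieval.", QWEN2_VL,
        text="a red bicycle", add_image_token=True,
    ))  # -> '<|image_pad|> Represent the given image for retrieval. a red bicycle'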