acsfid committed on
Commit
d96cc49
·
verified ·
1 Parent(s): d00ea0a

Upload PaddleOCR-VL split vision encoder artifacts

Browse files
Files changed (1) hide show
  1. model/extracted_vision_encoder.py +4 -1
model/extracted_vision_encoder.py CHANGED
@@ -22,6 +22,7 @@ FULL_VISUAL_PREFIX = "visual."
22
  FULL_PROJECTOR_PREFIX = "mlp_AR."
23
  STANDALONE_VISUAL_PREFIX = "visual."
24
  STANDALONE_PROJECTOR_PREFIX = "projector."
 
25
 
26
 
27
  def _read_json(path: Union[str, Path]) -> Dict[str, Any]:
@@ -360,7 +361,9 @@ class PaddleOCRVLVisionTower(torch.nn.Module):
360
  ) -> Dict[str, Any]:
361
  image_processor = image_processor or PaddleOCRVLImageProcessor(
362
  patch_size=self.config.vision_config.patch_size,
363
- temporal_patch_size=self.config.vision_config.temporal_patch_size,
 
 
364
  merge_size=self.config.vision_config.spatial_merge_size,
365
  )
366
  encoded: BatchFeature = image_processor(
 
22
  FULL_PROJECTOR_PREFIX = "mlp_AR."
23
  STANDALONE_VISUAL_PREFIX = "visual."
24
  STANDALONE_PROJECTOR_PREFIX = "projector."
25
+ IMAGE_PROCESSOR_TEMPORAL_PATCH_SIZE = 1
26
 
27
 
28
  def _read_json(path: Union[str, Path]) -> Dict[str, Any]:
 
361
  ) -> Dict[str, Any]:
362
  image_processor = image_processor or PaddleOCRVLImageProcessor(
363
  patch_size=self.config.vision_config.patch_size,
364
+ # The current image preprocessing implementation is image-only and asserts
365
+ # `temporal_patch_size == 1`, even though the vision model config may store 2.
366
+ temporal_patch_size=IMAGE_PROCESSOR_TEMPORAL_PATCH_SIZE,
367
  merge_size=self.config.vision_config.spatial_merge_size,
368
  )
369
  encoded: BatchFeature = image_processor(