Upload PaddleOCR-VL split vision encoder artifacts
Browse files
model/extracted_vision_encoder.py
CHANGED
|
@@ -22,6 +22,7 @@ FULL_VISUAL_PREFIX = "visual."
|
|
| 22 |
FULL_PROJECTOR_PREFIX = "mlp_AR."
|
| 23 |
STANDALONE_VISUAL_PREFIX = "visual."
|
| 24 |
STANDALONE_PROJECTOR_PREFIX = "projector."
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def _read_json(path: Union[str, Path]) -> Dict[str, Any]:
|
|
@@ -360,7 +361,9 @@ class PaddleOCRVLVisionTower(torch.nn.Module):
|
|
| 360 |
) -> Dict[str, Any]:
|
| 361 |
image_processor = image_processor or PaddleOCRVLImageProcessor(
|
| 362 |
patch_size=self.config.vision_config.patch_size,
|
| 363 |
-
|
|
|
|
|
|
|
| 364 |
merge_size=self.config.vision_config.spatial_merge_size,
|
| 365 |
)
|
| 366 |
encoded: BatchFeature = image_processor(
|
|
|
|
| 22 |
FULL_PROJECTOR_PREFIX = "mlp_AR."
|
| 23 |
STANDALONE_VISUAL_PREFIX = "visual."
|
| 24 |
STANDALONE_PROJECTOR_PREFIX = "projector."
|
| 25 |
+
IMAGE_PROCESSOR_TEMPORAL_PATCH_SIZE = 1
|
| 26 |
|
| 27 |
|
| 28 |
def _read_json(path: Union[str, Path]) -> Dict[str, Any]:
|
|
|
|
| 361 |
) -> Dict[str, Any]:
|
| 362 |
image_processor = image_processor or PaddleOCRVLImageProcessor(
|
| 363 |
patch_size=self.config.vision_config.patch_size,
|
| 364 |
+
# The current image preprocessing implementation is image-only and asserts
|
| 365 |
+
# `temporal_patch_size == 1`, even though the vision model config may store 2.
|
| 366 |
+
temporal_patch_size=IMAGE_PROCESSOR_TEMPORAL_PATCH_SIZE,
|
| 367 |
merge_size=self.config.vision_config.spatial_merge_size,
|
| 368 |
)
|
| 369 |
encoded: BatchFeature = image_processor(
|