{ "architectures": [ "VFMMultiFrameTransformer" ], "chosen_layers": [ 4, 11, 17, 23 ], "ffn_layer": "mlp", "geometry_aggregator": false, "geometry_aggregator_layer": 6, "grounding_ratio": 0.5, "hidden_act": "gelu", "hidden_size": 1024, "image_aggregator": false, "image_aggregator_layer": 6, "image_size": 224, "image_ssl": { "compute_precision": { "sharding_strategy": "SHARD_GRAD_OP" }, "crops": { "local_crops_number": 2 }, "dino": { "force_weight_norm": false, "global_ignore_diagonal": true, "head_bottleneck_dim": 256, "head_hidden_dim": 2048, "head_n_prototypes": 65536, "head_nlayers": 3, "head_norm_last_layer": false, "koleo_distributed_replicas": 0, "koleo_loss_distributed": false, "koleo_loss_weight": 0.1, "koleo_topk": 1, "local_loss_weight_schedule": { "end": 0.5, "peak": 0.5, "start": 0.5, "warmup_epochs": 0 }, "loss_weight": 1.0, "reweight_dino_local_loss": false }, "distillation": { "checkpoint_path": "", "enabled": false, "full_cfg_path": "" }, "gram": { "ckpt": null, "compute_stats": false, "ema_teacher": false, "global_teacher_resize_antialias": false, "global_teacher_resize_method": "bicubic", "img_level": true, "it_first_update": 0, "it_load_ema_teacher": -1, "loss_weight": 1.0, "loss_weight_schedule": null, "max_updates": null, "normalized": true, "remove_neg": false, "remove_only_teacher_neg": false, "rep_update": true, "tokens_used": "all", "update_frequency": 50000, "use_loss": true }, "ibot": { "force_masking_even_with_zero_weight": false, "head_bottleneck_dim": 256, "head_hidden_dim": 2048, "head_n_prototypes": 65536, "head_nlayers": 3, "head_norm_last_layer": false, "loss_weight": 1.0, "mask_random_circular_shift": false, "mask_ratio_min_max": [ 0.1, 0.5 ], "mask_sample_probability": 0.5, "separate_head": true }, "multidistillation": { "enabled": false }, "train": { "centering": "sinkhorn_knopp" } }, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-06, "mlp_ratio": 4.0, "mm_projector_type": "mlp2x_gelu", "model_type": "vfm", "num_attention_heads": 16, "num_channels": 3, "num_experts": 8, "num_frames": 16, "patch_embed_name": "dinov3_vitl16_torch", "patch_size": 16, "top_k": 2, "torch_dtype": "float32", "transformers_version": "4.52.3", "upcycle_to_moe": false, "video_aggregator": true, "video_aggregator_layer": 24 }