| { | |
| "architectures": [ | |
| "VFMMultiFrameTransformer" | |
| ], | |
| "chosen_layers": [ | |
| 4, | |
| 11, | |
| 17, | |
| 23 | |
| ], | |
| "ffn_layer": "mlp", | |
| "geometry_aggregator": false, | |
| "geometry_aggregator_layer": 6, | |
| "grounding_ratio": 0.5, | |
| "hidden_act": "gelu", | |
| "hidden_size": 1024, | |
| "image_aggregator": false, | |
| "image_aggregator_layer": 6, | |
| "image_size": 224, | |
| "image_ssl": { | |
| "compute_precision": { | |
| "sharding_strategy": "SHARD_GRAD_OP" | |
| }, | |
| "crops": { | |
| "local_crops_number": 2 | |
| }, | |
| "dino": { | |
| "force_weight_norm": false, | |
| "global_ignore_diagonal": true, | |
| "head_bottleneck_dim": 256, | |
| "head_hidden_dim": 2048, | |
| "head_n_prototypes": 65536, | |
| "head_nlayers": 3, | |
| "head_norm_last_layer": false, | |
| "koleo_distributed_replicas": 0, | |
| "koleo_loss_distributed": false, | |
| "koleo_loss_weight": 0.1, | |
| "koleo_topk": 1, | |
| "local_loss_weight_schedule": { | |
| "end": 0.5, | |
| "peak": 0.5, | |
| "start": 0.5, | |
| "warmup_epochs": 0 | |
| }, | |
| "loss_weight": 1.0, | |
| "reweight_dino_local_loss": false | |
| }, | |
| "distillation": { | |
| "checkpoint_path": "", | |
| "enabled": false, | |
| "full_cfg_path": "" | |
| }, | |
| "gram": { | |
| "ckpt": null, | |
| "compute_stats": false, | |
| "ema_teacher": false, | |
| "global_teacher_resize_antialias": false, | |
| "global_teacher_resize_method": "bicubic", | |
| "img_level": true, | |
| "it_first_update": 0, | |
| "it_load_ema_teacher": -1, | |
| "loss_weight": 1.0, | |
| "loss_weight_schedule": null, | |
| "max_updates": null, | |
| "normalized": true, | |
| "remove_neg": false, | |
| "remove_only_teacher_neg": false, | |
| "rep_update": true, | |
| "tokens_used": "all", | |
| "update_frequency": 50000, | |
| "use_loss": true | |
| }, | |
| "ibot": { | |
| "force_masking_even_with_zero_weight": false, | |
| "head_bottleneck_dim": 256, | |
| "head_hidden_dim": 2048, | |
| "head_n_prototypes": 65536, | |
| "head_nlayers": 3, | |
| "head_norm_last_layer": false, | |
| "loss_weight": 1.0, | |
| "mask_random_circular_shift": false, | |
| "mask_ratio_min_max": [ | |
| 0.1, | |
| 0.5 | |
| ], | |
| "mask_sample_probability": 0.5, | |
| "separate_head": true | |
| }, | |
| "multidistillation": { | |
| "enabled": false | |
| }, | |
| "train": { | |
| "centering": "sinkhorn_knopp" | |
| } | |
| }, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 3072, | |
| "layer_norm_eps": 1e-06, | |
| "mlp_ratio": 4.0, | |
| "mm_projector_type": "mlp2x_gelu", | |
| "model_type": "vfm", | |
| "num_attention_heads": 16, | |
| "num_channels": 3, | |
| "num_experts": 8, | |
| "num_frames": 16, | |
| "patch_embed_name": "dinov3_vitl16_torch", | |
| "patch_size": 16, | |
| "top_k": 2, | |
| "torch_dtype": "float32", | |
| "transformers_version": "4.52.3", | |
| "upcycle_to_moe": false, | |
| "video_aggregator": true, | |
| "video_aggregator_layer": 24 | |
| } | |