{
"architectures": [
"VFMMultiFrameTransformer"
],
"chosen_layers": [
4,
11,
17,
23
],
"ffn_layer": "mlp",
"geometry_aggregator": false,
"geometry_aggregator_layer": 6,
"grounding_ratio": 0.5,
"hidden_act": "gelu",
"hidden_size": 1024,
"image_aggregator": false,
"image_aggregator_layer": 6,
"image_size": 224,
"image_ssl": {
"compute_precision": {
"sharding_strategy": "SHARD_GRAD_OP"
},
"crops": {
"local_crops_number": 2
},
"dino": {
"force_weight_norm": false,
"global_ignore_diagonal": true,
"head_bottleneck_dim": 256,
"head_hidden_dim": 2048,
"head_n_prototypes": 65536,
"head_nlayers": 3,
"head_norm_last_layer": false,
"koleo_distributed_replicas": 0,
"koleo_loss_distributed": false,
"koleo_loss_weight": 0.1,
"koleo_topk": 1,
"local_loss_weight_schedule": {
"end": 0.5,
"peak": 0.5,
"start": 0.5,
"warmup_epochs": 0
},
"loss_weight": 1.0,
"reweight_dino_local_loss": false
},
"distillation": {
"checkpoint_path": "",
"enabled": false,
"full_cfg_path": ""
},
"gram": {
"ckpt": null,
"compute_stats": false,
"ema_teacher": false,
"global_teacher_resize_antialias": false,
"global_teacher_resize_method": "bicubic",
"img_level": true,
"it_first_update": 0,
"it_load_ema_teacher": -1,
"loss_weight": 1.0,
"loss_weight_schedule": null,
"max_updates": null,
"normalized": true,
"remove_neg": false,
"remove_only_teacher_neg": false,
"rep_update": true,
"tokens_used": "all",
"update_frequency": 50000,
"use_loss": true
},
"ibot": {
"force_masking_even_with_zero_weight": false,
"head_bottleneck_dim": 256,
"head_hidden_dim": 2048,
"head_n_prototypes": 65536,
"head_nlayers": 3,
"head_norm_last_layer": false,
"loss_weight": 1.0,
"mask_random_circular_shift": false,
"mask_ratio_min_max": [
0.1,
0.5
],
"mask_sample_probability": 0.5,
"separate_head": true
},
"multidistillation": {
"enabled": false
},
"train": {
"centering": "sinkhorn_knopp"
}
},
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-06,
"mlp_ratio": 4.0,
"mm_projector_type": "mlp2x_gelu",
"model_type": "vfm",
"num_attention_heads": 16,
"num_channels": 3,
"num_experts": 8,
"num_frames": 16,
"patch_embed_name": "dinov3_vitl16_torch",
"patch_size": 16,
"top_k": 2,
"torch_dtype": "float32",
"transformers_version": "4.52.3",
"upcycle_to_moe": false,
"video_aggregator": true,
"video_aggregator_layer": 24
}