DashLuuu committed on
Commit
603aea8
·
verified ·
1 Parent(s): 948e268

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
args.json ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_dir": "/data/songshuo.lu/ms-swift/output/v10-20260426-233812",
3
+ "per_device_train_batch_size": 1,
4
+ "num_train_epochs": 3.0,
5
+ "max_steps": -1,
6
+ "learning_rate": 1e-05,
7
+ "lr_scheduler_type": "cosine",
8
+ "lr_scheduler_kwargs": null,
9
+ "warmup_steps": 50.0,
10
+ "optim": "adamw_torch_fused",
11
+ "optim_args": null,
12
+ "weight_decay": 0.1,
13
+ "adam_beta1": 0.9,
14
+ "adam_beta2": 0.95,
15
+ "adam_epsilon": 1e-08,
16
+ "optim_target_modules": null,
17
+ "gradient_accumulation_steps": 16,
18
+ "average_tokens_across_devices": true,
19
+ "max_grad_norm": 1.0,
20
+ "label_smoothing_factor": 0.0,
21
+ "bf16": true,
22
+ "fp16": false,
23
+ "bf16_full_eval": false,
24
+ "fp16_full_eval": false,
25
+ "tf32": null,
26
+ "gradient_checkpointing": true,
27
+ "gradient_checkpointing_kwargs": null,
28
+ "torch_compile": false,
29
+ "torch_compile_backend": null,
30
+ "torch_compile_mode": null,
31
+ "use_liger_kernel": false,
32
+ "liger_kernel_config": null,
33
+ "use_cache": false,
34
+ "neftune_noise_alpha": null,
35
+ "torch_empty_cache_steps": null,
36
+ "auto_find_batch_size": false,
37
+ "logging_strategy": "steps",
38
+ "logging_steps": 1,
39
+ "logging_first_step": true,
40
+ "log_on_each_node": true,
41
+ "logging_nan_inf_filter": true,
42
+ "include_num_input_tokens_seen": false,
43
+ "log_level": "passive",
44
+ "log_level_replica": "warning",
45
+ "disable_tqdm": null,
46
+ "report_to": [
47
+ "tensorboard"
48
+ ],
49
+ "run_name": "/data/songshuo.lu/ms-swift/output/v10-20260426-233812",
50
+ "project": "huggingface",
51
+ "trackio_space_id": "trackio",
52
+ "eval_strategy": "steps",
53
+ "eval_steps": 2000.0,
54
+ "eval_delay": 0,
55
+ "per_device_eval_batch_size": 1,
56
+ "prediction_loss_only": false,
57
+ "eval_on_start": false,
58
+ "eval_do_concat_batches": true,
59
+ "eval_use_gather_object": false,
60
+ "eval_accumulation_steps": null,
61
+ "include_for_metrics": [],
62
+ "batch_eval_metrics": false,
63
+ "save_only_model": true,
64
+ "save_strategy": "steps",
65
+ "save_steps": 50.0,
66
+ "save_on_each_node": false,
67
+ "save_total_limit": null,
68
+ "enable_jit_checkpoint": false,
69
+ "push_to_hub": false,
70
+ "hub_token": null,
71
+ "hub_private_repo": null,
72
+ "hub_model_id": null,
73
+ "hub_strategy": "every_save",
74
+ "hub_always_push": false,
75
+ "hub_revision": null,
76
+ "load_best_model_at_end": false,
77
+ "metric_for_best_model": "loss",
78
+ "greater_is_better": false,
79
+ "ignore_data_skip": false,
80
+ "restore_callback_states_from_checkpoint": false,
81
+ "full_determinism": false,
82
+ "seed": 42,
83
+ "data_seed": 42,
84
+ "use_cpu": false,
85
+ "accelerator_config": {
86
+ "dispatch_batches": false
87
+ },
88
+ "parallelism_config": null,
89
+ "dataloader_drop_last": false,
90
+ "dataloader_num_workers": 32,
91
+ "dataloader_pin_memory": true,
92
+ "dataloader_persistent_workers": false,
93
+ "dataloader_prefetch_factor": null,
94
+ "remove_unused_columns": true,
95
+ "label_names": null,
96
+ "train_sampling_strategy": "random",
97
+ "length_column_name": "length",
98
+ "ddp_find_unused_parameters": null,
99
+ "ddp_bucket_cap_mb": null,
100
+ "ddp_broadcast_buffers": null,
101
+ "ddp_backend": null,
102
+ "ddp_timeout": 18000000,
103
+ "fsdp": [],
104
+ "fsdp_config": null,
105
+ "deepspeed": {
106
+ "fp16": {
107
+ "enabled": "auto",
108
+ "loss_scale": 0,
109
+ "loss_scale_window": 1000,
110
+ "initial_scale_power": 16,
111
+ "hysteresis": 2,
112
+ "min_loss_scale": 1
113
+ },
114
+ "bf16": {
115
+ "enabled": "auto"
116
+ },
117
+ "zero_optimization": {
118
+ "stage": 3,
119
+ "offload_optimizer": {
120
+ "device": "cpu",
121
+ "pin_memory": true
122
+ },
123
+ "offload_param": {
124
+ "device": "cpu",
125
+ "pin_memory": true
126
+ },
127
+ "overlap_comm": false,
128
+ "contiguous_gradients": true,
129
+ "sub_group_size": 1000000000.0,
130
+ "reduce_bucket_size": "auto",
131
+ "stage3_prefetch_bucket_size": "auto",
132
+ "stage3_param_persistence_threshold": "auto",
133
+ "stage3_max_live_parameters": 1000000000.0,
134
+ "stage3_max_reuse_distance": 1000000000.0,
135
+ "stage3_gather_16bit_weights_on_model_save": true
136
+ },
137
+ "gradient_accumulation_steps": "auto",
138
+ "gradient_clipping": "auto",
139
+ "steps_per_print": 2000,
140
+ "train_batch_size": "auto",
141
+ "train_micro_batch_size_per_gpu": "auto",
142
+ "wall_clock_breakdown": false
143
+ },
144
+ "debug": null,
145
+ "skip_memory_metrics": true,
146
+ "do_train": false,
147
+ "do_eval": false,
148
+ "do_predict": false,
149
+ "resume_from_checkpoint": null,
150
+ "warmup_ratio": null,
151
+ "logging_dir": "/data/songshuo.lu/ms-swift/output/v10-20260426-233812/runs",
152
+ "local_rank": 0,
153
+ "sortish_sampler": false,
154
+ "predict_with_generate": false,
155
+ "generation_max_length": null,
156
+ "generation_num_beams": null,
157
+ "generation_config": null,
158
+ "tuner_backend": "peft",
159
+ "vit_gradient_checkpointing": false,
160
+ "router_aux_loss_coef": 0.0,
161
+ "enable_dft_loss": false,
162
+ "enable_channel_loss": false,
163
+ "safe_serialization": true,
164
+ "max_shard_size": "5GB",
165
+ "check_model": true,
166
+ "acc_strategy": "token",
167
+ "train_dataloader_shuffle": true,
168
+ "group_by_length": false,
169
+ "max_epochs": null,
170
+ "aligner_lr": null,
171
+ "vit_lr": null,
172
+ "use_logits_to_keep": null,
173
+ "ds3_gather_for_generation": true,
174
+ "resume_only_model": false,
175
+ "optimizer": null,
176
+ "loss_type": null,
177
+ "eval_metric": null,
178
+ "callbacks": [],
179
+ "early_stop_interval": null,
180
+ "eval_use_evalscope": false,
181
+ "eval_dataset": [],
182
+ "eval_dataset_args": null,
183
+ "eval_limit": null,
184
+ "eval_generation_config": null,
185
+ "extra_eval_args": null,
186
+ "tuner_type": "full",
187
+ "use_galore": false,
188
+ "galore_target_modules": null,
189
+ "galore_rank": 128,
190
+ "galore_update_proj_gap": 50,
191
+ "galore_scale": 1.0,
192
+ "galore_proj_type": "std",
193
+ "galore_optim_per_parameter": false,
194
+ "galore_with_embedding": false,
195
+ "galore_quantization": false,
196
+ "galore_proj_quant": false,
197
+ "galore_proj_bits": 4,
198
+ "galore_proj_group_size": 256,
199
+ "galore_cos_threshold": 0.4,
200
+ "galore_gamma_proj": 2,
201
+ "galore_queue_size": 5,
202
+ "lisa_activated_layers": 0,
203
+ "lisa_step_interval": 20,
204
+ "use_flash_ckpt": false,
205
+ "use_ray": false,
206
+ "ray_exp_name": null,
207
+ "device_groups": null,
208
+ "model": "/data/models/Qwen3.6-27B",
209
+ "model_type": "qwen3_5",
210
+ "model_revision": null,
211
+ "task_type": "causal_lm",
212
+ "torch_dtype": "bfloat16",
213
+ "attn_impl": "flash_attention_2",
214
+ "experts_impl": null,
215
+ "new_special_tokens": [],
216
+ "num_labels": null,
217
+ "problem_type": null,
218
+ "rope_scaling": null,
219
+ "device_map": null,
220
+ "max_memory": {},
221
+ "max_model_len": null,
222
+ "local_repo_path": null,
223
+ "init_strategy": null,
224
+ "template": "qwen3_5",
225
+ "system": null,
226
+ "max_length": 36000,
227
+ "truncation_strategy": "delete",
228
+ "max_pixels": null,
229
+ "agent_template": null,
230
+ "norm_bbox": null,
231
+ "use_chat_template": true,
232
+ "padding_side": "right",
233
+ "padding_free": false,
234
+ "loss_scale": "last_round+ignore_empty_think",
235
+ "sequence_parallel_size": 1,
236
+ "template_backend": "swift",
237
+ "response_prefix": null,
238
+ "enable_thinking": null,
239
+ "add_non_thinking_prefix": true,
240
+ "dataset": [
241
+ "/data/songshuo.lu/datas/sft-v26/general_thinking_5k.jsonl",
242
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_kernelbench_l2l3_variants.jsonl",
243
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_l3_v1.jsonl",
244
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_l3_v2.jsonl",
245
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_l3_v3.jsonl",
246
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_l3_v4.jsonl",
247
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_l3.jsonl",
248
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_no_bias.jsonl",
249
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_v1.jsonl",
250
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_v3.jsonl",
251
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_short_prompt_mt_v1.jsonl",
252
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_short_prompt_mt_v2.jsonl",
253
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_short_prompt_no_bias_st_v2.jsonl",
254
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_qwen3_235b_distill_robust_strict_v2.jsonl",
255
+ "/data/songshuo.lu/datas/sft-v26/cuda_data_ds_v3_distill_torch_op_v1.jsonl",
256
+ "/data/songshuo.lu/datas/sft-v26/operator_hard_related_qa.jsonl"
257
+ ],
258
+ "val_dataset": [
259
+ "/data/songshuo.lu/datas/sft-v26/cuda_agent_6k_k2.5_distill_robust_strict_mt_l3_v1.jsonl#50"
260
+ ],
261
+ "cached_dataset": [],
262
+ "cached_val_dataset": [],
263
+ "split_dataset_ratio": 0.0,
264
+ "dataset_num_proc": 32,
265
+ "load_from_cache_file": false,
266
+ "dataset_shuffle": true,
267
+ "val_dataset_shuffle": false,
268
+ "streaming": false,
269
+ "interleave_prob": null,
270
+ "stopping_strategy": "first_exhausted",
271
+ "shuffle_buffer_size": 1000,
272
+ "download_mode": "reuse_dataset_if_exists",
273
+ "columns": {},
274
+ "strict": false,
275
+ "model_name": [
276
+ "MusaChat-9B-v1"
277
+ ],
278
+ "model_author": [
279
+ "DashLuuu"
280
+ ],
281
+ "custom_dataset_info": [],
282
+ "quant_method": null,
283
+ "quant_bits": null,
284
+ "hqq_axis": null,
285
+ "bnb_4bit_compute_dtype": "bfloat16",
286
+ "bnb_4bit_quant_type": "nf4",
287
+ "bnb_4bit_use_double_quant": true,
288
+ "bnb_4bit_quant_storage": null,
289
+ "max_new_tokens": 64,
290
+ "temperature": 0.0,
291
+ "top_k": null,
292
+ "top_p": null,
293
+ "repetition_penalty": null,
294
+ "num_beams": 1,
295
+ "stream": false,
296
+ "stop_words": [],
297
+ "logprobs": false,
298
+ "top_logprobs": null,
299
+ "structured_outputs_regex": null,
300
+ "adapters": [],
301
+ "external_plugins": [],
302
+ "custom_register_path": [],
303
+ "model_kwargs": {},
304
+ "load_args": false,
305
+ "load_data_args": false,
306
+ "packing": false,
307
+ "packing_length": null,
308
+ "packing_num_proc": 1,
309
+ "lazy_tokenize": true,
310
+ "use_hf": false,
311
+ "ignore_args_error": false,
312
+ "use_swift_lora": false,
313
+ "freeze_parameters": [
314
+ "model.visual",
315
+ "model.visual.merger"
316
+ ],
317
+ "freeze_parameters_regex": null,
318
+ "freeze_parameters_ratio": 0.0,
319
+ "trainable_parameters": [],
320
+ "trainable_parameters_regex": null,
321
+ "freeze_llm": false,
322
+ "freeze_vit": true,
323
+ "freeze_aligner": true,
324
+ "target_modules": [
325
+ "all-linear"
326
+ ],
327
+ "target_regex": null,
328
+ "target_parameters": null,
329
+ "modules_to_save": [],
330
+ "lora_rank": 8,
331
+ "lora_alpha": 32,
332
+ "lora_dropout": 0.05,
333
+ "lora_bias": "none",
334
+ "lora_dtype": null,
335
+ "lorap_lr_ratio": null,
336
+ "use_rslora": false,
337
+ "use_dora": false,
338
+ "lora_ga_batch_size": 2,
339
+ "lora_ga_iters": 2,
340
+ "lora_ga_max_length": 1024,
341
+ "lora_ga_direction": "ArB2r",
342
+ "lora_ga_scale": "stable",
343
+ "lora_ga_stable_gamma": 16,
344
+ "init_weights": true,
345
+ "fourier_n_frequency": 2000,
346
+ "fourier_scaling": 300.0,
347
+ "boft_block_size": 4,
348
+ "boft_block_num": 0,
349
+ "boft_n_butterfly_factor": 1,
350
+ "boft_dropout": 0.0,
351
+ "vera_rank": 256,
352
+ "vera_projection_prng_key": 0,
353
+ "vera_dropout": 0.0,
354
+ "vera_d_initial": 0.1,
355
+ "adapter_act": "gelu",
356
+ "adapter_length": 128,
357
+ "adalora_target_r": 8,
358
+ "adalora_init_r": 12,
359
+ "adalora_tinit": 0,
360
+ "adalora_tfinal": 0,
361
+ "adalora_deltaT": 1,
362
+ "adalora_beta1": 0.85,
363
+ "adalora_beta2": 0.85,
364
+ "adalora_orth_reg_weight": 0.5,
365
+ "llamapro_num_new_blocks": 4,
366
+ "llamapro_num_groups": null,
367
+ "reft_layer_key": null,
368
+ "reft_layers": null,
369
+ "reft_rank": 4,
370
+ "reft_intervention_type": "LoreftIntervention",
371
+ "reft_args": null,
372
+ "swanlab_token": null,
373
+ "swanlab_project": "ms-swift",
374
+ "swanlab_workspace": null,
375
+ "swanlab_exp_name": null,
376
+ "swanlab_notification_method": null,
377
+ "swanlab_webhook_url": null,
378
+ "swanlab_secret": null,
379
+ "swanlab_sender_email": null,
380
+ "swanlab_receiver_email": null,
381
+ "swanlab_smtp_server": null,
382
+ "swanlab_smtp_port": null,
383
+ "swanlab_email_language": "zh",
384
+ "swanlab_mode": "cloud",
385
+ "add_version": true,
386
+ "create_checkpoint_symlink": false,
387
+ "zero_hpz_partition_size": null,
388
+ "deepspeed_autotp_size": null,
389
+ "swift_version": "4.2.0.dev0",
390
+ "ckpt_dir": null,
391
+ "rank": 0,
392
+ "global_world_size": 8,
393
+ "local_world_size": 8,
394
+ "model_suffix": "Qwen3.6-27B",
395
+ "model_info": "ModelInfo(model_type='qwen3_5', model_dir='/data/models/Qwen3.6-27B', torch_dtype=torch.bfloat16, max_model_len=262144, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
396
+ "model_meta": "ModelMeta(model_type='qwen3_5', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3.5-0.8B', hf_model_id='Qwen/Qwen3.5-0.8B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-2B', hf_model_id='Qwen/Qwen3.5-2B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-4B', hf_model_id='Qwen/Qwen3.5-4B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-9B', hf_model_id='Qwen/Qwen3.5-9B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-27B', hf_model_id='Qwen/Qwen3.5-27B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-27B-FP8', hf_model_id='Qwen/Qwen3.5-27B-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-0.8B-Base', hf_model_id='Qwen/Qwen3.5-0.8B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-2B-Base', hf_model_id='Qwen/Qwen3.5-2B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-4B-Base', hf_model_id='Qwen/Qwen3.5-4B-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.5-9B-Base', hf_model_id='Qwen/Qwen3.5-9B-Base', model_path=None, ms_revision=None, hf_revision=None)], template='qwen3_5', ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3.6-27B', hf_model_id='Qwen/Qwen3.6-27B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3.6-27B-FP8', hf_model_id='Qwen/Qwen3.6-27B-FP8', model_path=None, ms_revision=None, hf_revision=None)], template='qwen3_5', ignore_patterns=None, requires=None, tags=[])], loader=<class 'swift.model.models.qwen.Qwen3_5Loader'>, template='qwen3_5', model_arch=MultiModelKeys(arch_name='qwen2_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, 
attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger'], vision_tower=['model.visual'], generator=[]), mcore_model_type=None, architectures=['Qwen3_5ForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=['transformers>=5.0.0.dev', 'qwen_vl_utils>=0.0.14', 'decord'], tags=[])",
397
+ "model_dir": "/data/models/Qwen3.6-27B",
398
+ "template_meta": "QwenTemplateMeta(template_type='qwen3_5', prefix=[], prompt=['<|im_start|>user\\n{{QUERY}}<|im_end|>\\n<|im_start|>assistant\\n'], chat_sep=['<|im_end|>\\n'], suffix=['<|im_end|>\\n'], template_cls=<class 'swift.template.templates.qwen.Qwen3_5Template'>, system_prefix=['<|im_start|>system\\n{{SYSTEM}}<|im_end|>\\n'], default_system=None, auto_add_bos=False, stop_words=['<|endoftext|>'], agent_template='qwen3_5', is_thinking=True, thinking_prefix='<think>\\n', non_thinking_prefix='<think>\\n\\n</think>\\n\\n', history_thinking_prefix='')",
399
+ "_val_dataset_exists": true,
400
+ "hub": "<class 'swift.hub.hub.MSHub'>",
401
+ "evaluation_strategy": "steps",
402
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/data/songshuo.lu/ms-swift/output/v10-20260426-233812', per_device_train_batch_size=1, num_train_epochs=3.0, max_steps=-1, learning_rate=1e-05, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs=None, warmup_steps=50.0, optim=<OptimizerNames.ADAMW_TORCH_FUSED: 'adamw_torch_fused'>, optim_args=None, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, optim_target_modules=None, gradient_accumulation_steps=16, average_tokens_across_devices=None, max_grad_norm=1.0, label_smoothing_factor=0.0, bf16=True, fp16=False, bf16_full_eval=False, fp16_full_eval=False, tf32=None, gradient_checkpointing=True, gradient_checkpointing_kwargs=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, use_liger_kernel=False, liger_kernel_config=None, use_cache=False, neftune_noise_alpha=None, torch_empty_cache_steps=None, auto_find_batch_size=False, logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_steps=1, logging_first_step=True, log_on_each_node=True, logging_nan_inf_filter=True, include_num_input_tokens_seen=None, log_level='passive', log_level_replica='warning', disable_tqdm=False, report_to=['tensorboard'], run_name='/data/songshuo.lu/ms-swift/output/v10-20260426-233812', project='huggingface', trackio_space_id='trackio', eval_strategy=<IntervalStrategy.STEPS: 'steps'>, eval_steps=2000, eval_delay=0, per_device_eval_batch_size=1, prediction_loss_only=False, eval_on_start=False, eval_do_concat_batches=True, eval_use_gather_object=False, eval_accumulation_steps=None, include_for_metrics=[], batch_eval_metrics=False, save_only_model=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=50, save_on_each_node=False, save_total_limit=None, enable_jit_checkpoint=False, push_to_hub=False, hub_token=None, hub_private_repo=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_always_push=False, hub_revision=None, 
load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, restore_callback_states_from_checkpoint=False, full_determinism=False, seed=42, data_seed=42, use_cpu=False, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, dataloader_drop_last=False, dataloader_num_workers=32, dataloader_pin_memory=True, dataloader_persistent_workers=False, dataloader_prefetch_factor=2, remove_unused_columns=False, label_names=None, train_sampling_strategy='random', length_column_name='length', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, ddp_backend=None, ddp_timeout=18000000, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 'offload_param': {'device': 'cpu', 'pin_memory': True}, 'overlap_comm': False, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, debug=[], skip_memory_metrics=True, do_train=False, do_eval=True, do_predict=False, resume_from_checkpoint=None, warmup_ratio=None, 
logging_dir='/data/songshuo.lu/ms-swift/output/v10-20260426-233812/runs', local_rank=0, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=False, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, safe_serialization=True, max_shard_size='5GB', check_model=True, acc_strategy='token', train_dataloader_shuffle=True, group_by_length=False, max_epochs=None, aligner_lr=None, vit_lr=None, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer=None, loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='full', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)"
403
+ }
chat_template.jinja ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set image_count = namespace(value=0) %}
2
+ {%- set video_count = namespace(value=0) %}
3
+ {%- macro render_content(content, do_vision_count, is_system_content=false) %}
4
+ {%- if content is string %}
5
+ {{- content }}
6
+ {%- elif content is iterable and content is not mapping %}
7
+ {%- for item in content %}
8
+ {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
9
+ {%- if is_system_content %}
10
+ {{- raise_exception('System message cannot contain images.') }}
11
+ {%- endif %}
12
+ {%- if do_vision_count %}
13
+ {%- set image_count.value = image_count.value + 1 %}
14
+ {%- endif %}
15
+ {%- if add_vision_id %}
16
+ {{- 'Picture ' ~ image_count.value ~ ': ' }}
17
+ {%- endif %}
18
+ {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
19
+ {%- elif 'video' in item or item.type == 'video' %}
20
+ {%- if is_system_content %}
21
+ {{- raise_exception('System message cannot contain videos.') }}
22
+ {%- endif %}
23
+ {%- if do_vision_count %}
24
+ {%- set video_count.value = video_count.value + 1 %}
25
+ {%- endif %}
26
+ {%- if add_vision_id %}
27
+ {{- 'Video ' ~ video_count.value ~ ': ' }}
28
+ {%- endif %}
29
+ {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
30
+ {%- elif 'text' in item %}
31
+ {{- item.text }}
32
+ {%- else %}
33
+ {{- raise_exception('Unexpected item type in content.') }}
34
+ {%- endif %}
35
+ {%- endfor %}
36
+ {%- elif content is none or content is undefined %}
37
+ {{- '' }}
38
+ {%- else %}
39
+ {{- raise_exception('Unexpected content type.') }}
40
+ {%- endif %}
41
+ {%- endmacro %}
42
+ {%- if not messages %}
43
+ {{- raise_exception('No messages provided.') }}
44
+ {%- endif %}
45
+ {%- if tools and tools is iterable and tools is not mapping %}
46
+ {{- '<|im_start|>system\n' }}
47
+ {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
48
+ {%- for tool in tools %}
49
+ {{- "\n" }}
50
+ {{- tool | tojson }}
51
+ {%- endfor %}
52
+ {{- "\n</tools>" }}
53
+ {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
54
+ {%- if messages[0].role == 'system' %}
55
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
56
+ {%- if content %}
57
+ {{- '\n\n' + content }}
58
+ {%- endif %}
59
+ {%- endif %}
60
+ {{- '<|im_end|>\n' }}
61
+ {%- else %}
62
+ {%- if messages[0].role == 'system' %}
63
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
64
+ {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
65
+ {%- endif %}
66
+ {%- endif %}
67
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
68
+ {%- for message in messages[::-1] %}
69
+ {%- set index = (messages|length - 1) - loop.index0 %}
70
+ {%- if ns.multi_step_tool and message.role == "user" %}
71
+ {%- set content = render_content(message.content, false)|trim %}
72
+ {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
73
+ {%- set ns.multi_step_tool = false %}
74
+ {%- set ns.last_query_index = index %}
75
+ {%- endif %}
76
+ {%- endif %}
77
+ {%- endfor %}
78
+ {%- if ns.multi_step_tool %}
79
+ {{- raise_exception('No user query found in messages.') }}
80
+ {%- endif %}
81
+ {%- for message in messages %}
82
+ {%- set content = render_content(message.content, true)|trim %}
83
+ {%- if message.role == "system" %}
84
+ {%- if not loop.first %}
85
+ {{- raise_exception('System message must be at the beginning.') }}
86
+ {%- endif %}
87
+ {%- elif message.role == "user" %}
88
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
89
+ {%- elif message.role == "assistant" %}
90
+ {%- set reasoning_content = '' %}
91
+ {%- if message.reasoning_content is string %}
92
+ {%- set reasoning_content = message.reasoning_content %}
93
+ {%- else %}
94
+ {%- if '</think>' in content %}
95
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
96
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
97
+ {%- endif %}
98
+ {%- endif %}
99
+ {%- set reasoning_content = reasoning_content|trim %}
100
+ {%- if (preserve_thinking is defined and preserve_thinking is true) or (loop.index0 > ns.last_query_index) %}
101
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
102
+ {%- else %}
103
+ {{- '<|im_start|>' + message.role + '\n' + content }}
104
+ {%- endif %}
105
+ {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
106
+ {%- for tool_call in message.tool_calls %}
107
+ {%- if tool_call.function is defined %}
108
+ {%- set tool_call = tool_call.function %}
109
+ {%- endif %}
110
+ {%- if loop.first %}
111
+ {%- if content|trim %}
112
+ {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
113
+ {%- else %}
114
+ {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
115
+ {%- endif %}
116
+ {%- else %}
117
+ {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
118
+ {%- endif %}
119
+ {%- if tool_call.arguments is defined %}
120
+ {%- for args_name, args_value in tool_call.arguments|items %}
121
+ {{- '<parameter=' + args_name + '>\n' }}
122
+ {%- set args_value = args_value | string if args_value is string else args_value | tojson | safe %}
123
+ {{- args_value }}
124
+ {{- '\n</parameter>\n' }}
125
+ {%- endfor %}
126
+ {%- endif %}
127
+ {{- '</function>\n</tool_call>' }}
128
+ {%- endfor %}
129
+ {%- endif %}
130
+ {{- '<|im_end|>\n' }}
131
+ {%- elif message.role == "tool" %}
132
+ {%- if loop.previtem and loop.previtem.role != "tool" %}
133
+ {{- '<|im_start|>user' }}
134
+ {%- endif %}
135
+ {{- '\n<tool_response>\n' }}
136
+ {{- content }}
137
+ {{- '\n</tool_response>' }}
138
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
139
+ {{- '<|im_end|>\n' }}
140
+ {%- elif loop.last %}
141
+ {{- '<|im_end|>\n' }}
142
+ {%- endif %}
143
+ {%- else %}
144
+ {{- raise_exception('Unexpected message role.') }}
145
+ {%- endif %}
146
+ {%- endfor %}
147
+ {%- if add_generation_prompt %}
148
+ {{- '<|im_start|>assistant\n' }}
149
+ {%- if enable_thinking is defined and enable_thinking is false %}
150
+ {{- '<think>\n\n</think>\n\n' }}
151
+ {%- else %}
152
+ {{- '<think>\n' }}
153
+ {%- endif %}
154
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3_5ForConditionalGeneration"
4
+ ],
5
+ "bos_token_id": null,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 248046,
8
+ "hidden_size": 5120,
9
+ "image_token_id": 248056,
10
+ "language_model_only": false,
11
+ "model_type": "qwen3_5",
12
+ "pad_token_id": 248044,
13
+ "text_config": {
14
+ "attention_bias": false,
15
+ "attention_dropout": 0.0,
16
+ "attn_output_gate": true,
17
+ "bos_token_id": 248044,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 248044,
20
+ "full_attention_interval": 4,
21
+ "head_dim": 256,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 5120,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 17408,
26
+ "layer_types": [
27
+ "linear_attention",
28
+ "linear_attention",
29
+ "linear_attention",
30
+ "full_attention",
31
+ "linear_attention",
32
+ "linear_attention",
33
+ "linear_attention",
34
+ "full_attention",
35
+ "linear_attention",
36
+ "linear_attention",
37
+ "linear_attention",
38
+ "full_attention",
39
+ "linear_attention",
40
+ "linear_attention",
41
+ "linear_attention",
42
+ "full_attention",
43
+ "linear_attention",
44
+ "linear_attention",
45
+ "linear_attention",
46
+ "full_attention",
47
+ "linear_attention",
48
+ "linear_attention",
49
+ "linear_attention",
50
+ "full_attention",
51
+ "linear_attention",
52
+ "linear_attention",
53
+ "linear_attention",
54
+ "full_attention",
55
+ "linear_attention",
56
+ "linear_attention",
57
+ "linear_attention",
58
+ "full_attention",
59
+ "linear_attention",
60
+ "linear_attention",
61
+ "linear_attention",
62
+ "full_attention",
63
+ "linear_attention",
64
+ "linear_attention",
65
+ "linear_attention",
66
+ "full_attention",
67
+ "linear_attention",
68
+ "linear_attention",
69
+ "linear_attention",
70
+ "full_attention",
71
+ "linear_attention",
72
+ "linear_attention",
73
+ "linear_attention",
74
+ "full_attention",
75
+ "linear_attention",
76
+ "linear_attention",
77
+ "linear_attention",
78
+ "full_attention",
79
+ "linear_attention",
80
+ "linear_attention",
81
+ "linear_attention",
82
+ "full_attention",
83
+ "linear_attention",
84
+ "linear_attention",
85
+ "linear_attention",
86
+ "full_attention",
87
+ "linear_attention",
88
+ "linear_attention",
89
+ "linear_attention",
90
+ "full_attention"
91
+ ],
92
+ "linear_conv_kernel_dim": 4,
93
+ "linear_key_head_dim": 128,
94
+ "linear_num_key_heads": 16,
95
+ "linear_num_value_heads": 48,
96
+ "linear_value_head_dim": 128,
97
+ "mamba_ssm_dtype": "float32",
98
+ "max_position_embeddings": 262144,
99
+ "model_type": "qwen3_5_text",
100
+ "mtp_num_hidden_layers": 1,
101
+ "mtp_use_dedicated_embeddings": false,
102
+ "num_attention_heads": 24,
103
+ "num_hidden_layers": 64,
104
+ "num_key_value_heads": 4,
105
+ "output_gate_type": "swish",
106
+ "pad_token_id": 248044,
107
+ "partial_rotary_factor": 0.25,
108
+ "rms_norm_eps": 1e-06,
109
+ "rope_parameters": {
110
+ "mrope_interleaved": true,
111
+ "mrope_section": [
112
+ 11,
113
+ 11,
114
+ 10
115
+ ],
116
+ "partial_rotary_factor": 0.25,
117
+ "rope_theta": 10000000,
118
+ "rope_type": "default"
119
+ },
120
+ "tie_word_embeddings": false,
121
+ "use_cache": false,
122
+ "vocab_size": 248320
123
+ },
124
+ "tie_word_embeddings": false,
125
+ "transformers_version": "5.2.0",
126
+ "use_cache": false,
127
+ "video_token_id": 248057,
128
+ "vision_config": {
129
+ "deepstack_visual_indexes": [],
130
+ "depth": 27,
131
+ "dtype": "bfloat16",
132
+ "hidden_act": "gelu_pytorch_tanh",
133
+ "hidden_size": 1152,
134
+ "in_channels": 3,
135
+ "initializer_range": 0.02,
136
+ "intermediate_size": 4304,
137
+ "model_type": "qwen3_5",
138
+ "num_heads": 16,
139
+ "num_position_embeddings": 2304,
140
+ "out_hidden_size": 5120,
141
+ "patch_size": 16,
142
+ "spatial_merge_size": 2,
143
+ "temporal_patch_size": 2
144
+ },
145
+ "vision_end_token_id": 248054,
146
+ "vision_start_token_id": 248053
147
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 248044,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 248046,
6
+ 248044
7
+ ],
8
+ "pad_token_id": 248044,
9
+ "temperature": 1.0,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "5.2.0"
13
+ }
model-00001-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a10361f0790772d7c613e59ca58e4f6837807edc21e59c3b8d1704837a701c97
3
+ size 2542796928
model-00002-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53b5eef4108b5ca68e7acd3df22ae10579a7236957b02c223b9c14f82d60d68f
3
+ size 4842451920
model-00003-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fc370e2fa68866704bfd67ed730d2f7d47e694d21174a328e694061ac557dd1
3
+ size 4965227944
model-00004-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14bebc6966ec03ad9bb665bf97be107d8f84b1983049a97923095000625f74a6
3
+ size 4912819264
model-00005-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:474832f396cdc08c589cbc7a08b0ee602c994b78c6a2db71d45148e76268382b
3
+ size 4986198544
model-00006-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6362465eaaf59d3aa5c0ad552b403161475bd14aaac1ce97412591968670ab4a
3
+ size 4912819320
model-00007-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d63e5bbe8fb142884193564f37a8a43f174e7715e4b44e2acc4f62bd10ebdaa3
3
+ size 4932703272
model-00008-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2683f0032247a35c5619215642019ffa22eca07a106c7ea57ac83ab8a3e2d49c
3
+ size 4966314576
model-00009-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acc8090ea3d96a4221470e255a3075614d5ee87537c826ef5c3fdb7e0cf97f6b
3
+ size 4964162248
model-00010-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d2d91120d86b2539d31db794750e02cf7de4536b5176783f6ac300971c6e32a
3
+ size 4933789824
model-00011-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8055858d4e03d46afa05473eb25ef339b074543319922bab02bd81d7793c366
3
+ size 4965228032
model-00012-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fd3e1364be23487c83d3fc435b051ab13e9015fcd341491386b24bf98e8f569
3
+ size 2789094896
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 16777216,
4
+ "shortest_edge": 65536
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "image_processor_type": "Qwen2VLImageProcessorFast"
21
+ }
processor_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "data_format": "channels_first",
4
+ "do_convert_rgb": true,
5
+ "do_normalize": true,
6
+ "do_rescale": true,
7
+ "do_resize": true,
8
+ "image_mean": [
9
+ 0.5,
10
+ 0.5,
11
+ 0.5
12
+ ],
13
+ "image_processor_type": "Qwen2VLImageProcessorFast",
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "merge_size": 2,
20
+ "patch_size": 16,
21
+ "resample": 3,
22
+ "rescale_factor": 0.00392156862745098,
23
+ "size": {
24
+ "longest_edge": 16777216,
25
+ "shortest_edge": 65536
26
+ },
27
+ "temporal_patch_size": 2
28
+ },
29
+ "processor_class": "Qwen3VLProcessor",
30
+ "video_processor": {
31
+ "data_format": "channels_first",
32
+ "default_to_square": true,
33
+ "do_convert_rgb": true,
34
+ "do_normalize": true,
35
+ "do_rescale": true,
36
+ "do_resize": true,
37
+ "do_sample_frames": true,
38
+ "fps": 2,
39
+ "image_mean": [
40
+ 0.5,
41
+ 0.5,
42
+ 0.5
43
+ ],
44
+ "image_std": [
45
+ 0.5,
46
+ 0.5,
47
+ 0.5
48
+ ],
49
+ "max_frames": 768,
50
+ "merge_size": 2,
51
+ "min_frames": 4,
52
+ "patch_size": 16,
53
+ "resample": 3,
54
+ "rescale_factor": 0.00392156862745098,
55
+ "return_metadata": false,
56
+ "size": {
57
+ "longest_edge": 25165824,
58
+ "shortest_edge": 4096
59
+ },
60
+ "temporal_patch_size": 2,
61
+ "video_processor_type": "Qwen3VLVideoProcessor"
62
+ }
63
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
3
+ size 19989343
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "audio_bos_token": "<|audio_start|>",
4
+ "audio_eos_token": "<|audio_end|>",
5
+ "audio_token": "<|audio_pad|>",
6
+ "backend": "tokenizers",
7
+ "bos_token": null,
8
+ "clean_up_tokenization_spaces": false,
9
+ "eos_token": "<|im_end|>",
10
+ "errors": "replace",
11
+ "image_token": "<|image_pad|>",
12
+ "is_local": true,
13
+ "model_max_length": 262144,
14
+ "model_specific_special_tokens": {
15
+ "audio_bos_token": "<|audio_start|>",
16
+ "audio_eos_token": "<|audio_end|>",
17
+ "audio_token": "<|audio_pad|>",
18
+ "image_token": "<|image_pad|>",
19
+ "video_token": "<|video_pad|>",
20
+ "vision_bos_token": "<|vision_start|>",
21
+ "vision_eos_token": "<|vision_end|>"
22
+ },
23
+ "pad_token": "<|endoftext|>",
24
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
25
+ "processor_class": "Qwen3VLProcessor",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null,
29
+ "video_token": "<|video_pad|>",
30
+ "vision_bos_token": "<|vision_start|>",
31
+ "vision_eos_token": "<|vision_end|>"
32
+ }
trainer_state.json ADDED
@@ -0,0 +1,2434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.1588377723970944,
6
+ "eval_steps": 2000,
7
+ "global_step": 300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00387409200968523,
14
+ "grad_norm": 1.598986029624939,
15
+ "learning_rate": 2.0000000000000002e-07,
16
+ "loss": 0.40103477239608765,
17
+ "step": 1,
18
+ "token_acc": 0.8705013179702646
19
+ },
20
+ {
21
+ "epoch": 0.00774818401937046,
22
+ "grad_norm": 1.988427758216858,
23
+ "learning_rate": 4.0000000000000003e-07,
24
+ "loss": 0.4314175248146057,
25
+ "step": 2,
26
+ "token_acc": 0.8610088406262493
27
+ },
28
+ {
29
+ "epoch": 0.01162227602905569,
30
+ "grad_norm": 1.6525965929031372,
31
+ "learning_rate": 6.000000000000001e-07,
32
+ "loss": 0.41751521825790405,
33
+ "step": 3,
34
+ "token_acc": 0.8659394954574845
35
+ },
36
+ {
37
+ "epoch": 0.01549636803874092,
38
+ "grad_norm": 1.3594496250152588,
39
+ "learning_rate": 8.000000000000001e-07,
40
+ "loss": 0.39516761898994446,
41
+ "step": 4,
42
+ "token_acc": 0.8712739341656057
43
+ },
44
+ {
45
+ "epoch": 0.01937046004842615,
46
+ "grad_norm": 1.4459697008132935,
47
+ "learning_rate": 1.0000000000000002e-06,
48
+ "loss": 0.41443824768066406,
49
+ "step": 5,
50
+ "token_acc": 0.8673064711013153
51
+ },
52
+ {
53
+ "epoch": 0.02324455205811138,
54
+ "grad_norm": 1.165871024131775,
55
+ "learning_rate": 1.2000000000000002e-06,
56
+ "loss": 0.3951181471347809,
57
+ "step": 6,
58
+ "token_acc": 0.8717731277799119
59
+ },
60
+ {
61
+ "epoch": 0.02711864406779661,
62
+ "grad_norm": 1.150416374206543,
63
+ "learning_rate": 1.4000000000000001e-06,
64
+ "loss": 0.40562719106674194,
65
+ "step": 7,
66
+ "token_acc": 0.8683618627898853
67
+ },
68
+ {
69
+ "epoch": 0.03099273607748184,
70
+ "grad_norm": 0.7621377110481262,
71
+ "learning_rate": 1.6000000000000001e-06,
72
+ "loss": 0.4054454565048218,
73
+ "step": 8,
74
+ "token_acc": 0.8672108063124587
75
+ },
76
+ {
77
+ "epoch": 0.03486682808716707,
78
+ "grad_norm": 0.588590681552887,
79
+ "learning_rate": 1.8000000000000001e-06,
80
+ "loss": 0.383542001247406,
81
+ "step": 9,
82
+ "token_acc": 0.8732824386699718
83
+ },
84
+ {
85
+ "epoch": 0.0387409200968523,
86
+ "grad_norm": 0.5067570805549622,
87
+ "learning_rate": 2.0000000000000003e-06,
88
+ "loss": 0.3769374489784241,
89
+ "step": 10,
90
+ "token_acc": 0.8749500487669789
91
+ },
92
+ {
93
+ "epoch": 0.04261501210653753,
94
+ "grad_norm": 0.6109248995780945,
95
+ "learning_rate": 2.2e-06,
96
+ "loss": 0.3687226176261902,
97
+ "step": 11,
98
+ "token_acc": 0.8778429629931872
99
+ },
100
+ {
101
+ "epoch": 0.04648910411622276,
102
+ "grad_norm": 0.6168301701545715,
103
+ "learning_rate": 2.4000000000000003e-06,
104
+ "loss": 0.3631238639354706,
105
+ "step": 12,
106
+ "token_acc": 0.8792909317747671
107
+ },
108
+ {
109
+ "epoch": 0.05036319612590799,
110
+ "grad_norm": 0.5205990076065063,
111
+ "learning_rate": 2.6e-06,
112
+ "loss": 0.37530872225761414,
113
+ "step": 13,
114
+ "token_acc": 0.8747995859550826
115
+ },
116
+ {
117
+ "epoch": 0.05423728813559322,
118
+ "grad_norm": 0.4970836639404297,
119
+ "learning_rate": 2.8000000000000003e-06,
120
+ "loss": 0.33857205510139465,
121
+ "step": 14,
122
+ "token_acc": 0.8863650931395268
123
+ },
124
+ {
125
+ "epoch": 0.05811138014527845,
126
+ "grad_norm": 0.4103075861930847,
127
+ "learning_rate": 3e-06,
128
+ "loss": 0.38399845361709595,
129
+ "step": 15,
130
+ "token_acc": 0.8722473100295478
131
+ },
132
+ {
133
+ "epoch": 0.06198547215496368,
134
+ "grad_norm": 0.505113959312439,
135
+ "learning_rate": 3.2000000000000003e-06,
136
+ "loss": 0.37927311658859253,
137
+ "step": 16,
138
+ "token_acc": 0.8732506907722828
139
+ },
140
+ {
141
+ "epoch": 0.06585956416464891,
142
+ "grad_norm": 0.4578634202480316,
143
+ "learning_rate": 3.4000000000000005e-06,
144
+ "loss": 0.388744592666626,
145
+ "step": 17,
146
+ "token_acc": 0.8707925977418891
147
+ },
148
+ {
149
+ "epoch": 0.06973365617433414,
150
+ "grad_norm": 0.40881460905075073,
151
+ "learning_rate": 3.6000000000000003e-06,
152
+ "loss": 0.37862884998321533,
153
+ "step": 18,
154
+ "token_acc": 0.8738148420049672
155
+ },
156
+ {
157
+ "epoch": 0.07360774818401937,
158
+ "grad_norm": 0.3267415165901184,
159
+ "learning_rate": 3.8000000000000005e-06,
160
+ "loss": 0.3523765206336975,
161
+ "step": 19,
162
+ "token_acc": 0.8821479488850912
163
+ },
164
+ {
165
+ "epoch": 0.0774818401937046,
166
+ "grad_norm": 0.3520510196685791,
167
+ "learning_rate": 4.000000000000001e-06,
168
+ "loss": 0.37575048208236694,
169
+ "step": 20,
170
+ "token_acc": 0.8746806805808569
171
+ },
172
+ {
173
+ "epoch": 0.08135593220338982,
174
+ "grad_norm": 0.3177695870399475,
175
+ "learning_rate": 4.2000000000000004e-06,
176
+ "loss": 0.3877210021018982,
177
+ "step": 21,
178
+ "token_acc": 0.8709381583839385
179
+ },
180
+ {
181
+ "epoch": 0.08523002421307506,
182
+ "grad_norm": 0.3101595640182495,
183
+ "learning_rate": 4.4e-06,
184
+ "loss": 0.35647860169410706,
185
+ "step": 22,
186
+ "token_acc": 0.8802609194999448
187
+ },
188
+ {
189
+ "epoch": 0.0891041162227603,
190
+ "grad_norm": 0.42295873165130615,
191
+ "learning_rate": 4.600000000000001e-06,
192
+ "loss": 0.34535130858421326,
193
+ "step": 23,
194
+ "token_acc": 0.8842312960154491
195
+ },
196
+ {
197
+ "epoch": 0.09297820823244551,
198
+ "grad_norm": 0.38459983468055725,
199
+ "learning_rate": 4.800000000000001e-06,
200
+ "loss": 0.3480440676212311,
201
+ "step": 24,
202
+ "token_acc": 0.8830844934941354
203
+ },
204
+ {
205
+ "epoch": 0.09685230024213075,
206
+ "grad_norm": 0.3167020082473755,
207
+ "learning_rate": 5e-06,
208
+ "loss": 0.3617573082447052,
209
+ "step": 25,
210
+ "token_acc": 0.8794729562611736
211
+ },
212
+ {
213
+ "epoch": 0.10072639225181598,
214
+ "grad_norm": 0.3235217332839966,
215
+ "learning_rate": 5.2e-06,
216
+ "loss": 0.34485846757888794,
217
+ "step": 26,
218
+ "token_acc": 0.8849654381719892
219
+ },
220
+ {
221
+ "epoch": 0.10460048426150122,
222
+ "grad_norm": 0.33688801527023315,
223
+ "learning_rate": 5.400000000000001e-06,
224
+ "loss": 0.325369268655777,
225
+ "step": 27,
226
+ "token_acc": 0.8904570911619978
227
+ },
228
+ {
229
+ "epoch": 0.10847457627118644,
230
+ "grad_norm": 0.28384602069854736,
231
+ "learning_rate": 5.600000000000001e-06,
232
+ "loss": 0.3820268213748932,
233
+ "step": 28,
234
+ "token_acc": 0.8722670041260794
235
+ },
236
+ {
237
+ "epoch": 0.11234866828087167,
238
+ "grad_norm": 0.2726050019264221,
239
+ "learning_rate": 5.8e-06,
240
+ "loss": 0.34821516275405884,
241
+ "step": 29,
242
+ "token_acc": 0.8829695430808375
243
+ },
244
+ {
245
+ "epoch": 0.1162227602905569,
246
+ "grad_norm": 0.2613418698310852,
247
+ "learning_rate": 6e-06,
248
+ "loss": 0.3505156636238098,
249
+ "step": 30,
250
+ "token_acc": 0.8820213661332177
251
+ },
252
+ {
253
+ "epoch": 0.12009685230024213,
254
+ "grad_norm": 0.27066054940223694,
255
+ "learning_rate": 6.200000000000001e-06,
256
+ "loss": 0.3500295877456665,
257
+ "step": 31,
258
+ "token_acc": 0.8819775128328553
259
+ },
260
+ {
261
+ "epoch": 0.12397094430992736,
262
+ "grad_norm": 0.2605418562889099,
263
+ "learning_rate": 6.4000000000000006e-06,
264
+ "loss": 0.32833147048950195,
265
+ "step": 32,
266
+ "token_acc": 0.8892601629599358
267
+ },
268
+ {
269
+ "epoch": 0.12784503631961258,
270
+ "grad_norm": 0.2576088607311249,
271
+ "learning_rate": 6.600000000000001e-06,
272
+ "loss": 0.3447936475276947,
273
+ "step": 33,
274
+ "token_acc": 0.8835445537223737
275
+ },
276
+ {
277
+ "epoch": 0.13171912832929783,
278
+ "grad_norm": 0.2707255482673645,
279
+ "learning_rate": 6.800000000000001e-06,
280
+ "loss": 0.352622389793396,
281
+ "step": 34,
282
+ "token_acc": 0.8808578896779464
283
+ },
284
+ {
285
+ "epoch": 0.13559322033898305,
286
+ "grad_norm": 0.23704984784126282,
287
+ "learning_rate": 7e-06,
288
+ "loss": 0.34251606464385986,
289
+ "step": 35,
290
+ "token_acc": 0.8839590527934595
291
+ },
292
+ {
293
+ "epoch": 0.13946731234866827,
294
+ "grad_norm": 0.2552218735218048,
295
+ "learning_rate": 7.2000000000000005e-06,
296
+ "loss": 0.36937713623046875,
297
+ "step": 36,
298
+ "token_acc": 0.8746555562093041
299
+ },
300
+ {
301
+ "epoch": 0.14334140435835352,
302
+ "grad_norm": 0.25926339626312256,
303
+ "learning_rate": 7.4e-06,
304
+ "loss": 0.37243181467056274,
305
+ "step": 37,
306
+ "token_acc": 0.8742657147624016
307
+ },
308
+ {
309
+ "epoch": 0.14721549636803874,
310
+ "grad_norm": 0.25272250175476074,
311
+ "learning_rate": 7.600000000000001e-06,
312
+ "loss": 0.3371140956878662,
313
+ "step": 38,
314
+ "token_acc": 0.8851879286597788
315
+ },
316
+ {
317
+ "epoch": 0.15108958837772396,
318
+ "grad_norm": 0.2262120097875595,
319
+ "learning_rate": 7.800000000000002e-06,
320
+ "loss": 0.32758837938308716,
321
+ "step": 39,
322
+ "token_acc": 0.8883092864316684
323
+ },
324
+ {
325
+ "epoch": 0.1549636803874092,
326
+ "grad_norm": 0.26067835092544556,
327
+ "learning_rate": 8.000000000000001e-06,
328
+ "loss": 0.32051679491996765,
329
+ "step": 40,
330
+ "token_acc": 0.8908219532219895
331
+ },
332
+ {
333
+ "epoch": 0.15883777239709443,
334
+ "grad_norm": 0.22696885466575623,
335
+ "learning_rate": 8.2e-06,
336
+ "loss": 0.34018558263778687,
337
+ "step": 41,
338
+ "token_acc": 0.8843387459744694
339
+ },
340
+ {
341
+ "epoch": 0.16271186440677965,
342
+ "grad_norm": 0.2458319216966629,
343
+ "learning_rate": 8.400000000000001e-06,
344
+ "loss": 0.3157382607460022,
345
+ "step": 42,
346
+ "token_acc": 0.8923707458363505
347
+ },
348
+ {
349
+ "epoch": 0.1665859564164649,
350
+ "grad_norm": 0.23234310746192932,
351
+ "learning_rate": 8.6e-06,
352
+ "loss": 0.32486584782600403,
353
+ "step": 43,
354
+ "token_acc": 0.8898832391328527
355
+ },
356
+ {
357
+ "epoch": 0.17046004842615012,
358
+ "grad_norm": 0.24149972200393677,
359
+ "learning_rate": 8.8e-06,
360
+ "loss": 0.3565906286239624,
361
+ "step": 44,
362
+ "token_acc": 0.8786636478836652
363
+ },
364
+ {
365
+ "epoch": 0.17433414043583534,
366
+ "grad_norm": 0.23454472422599792,
367
+ "learning_rate": 9e-06,
368
+ "loss": 0.34243613481521606,
369
+ "step": 45,
370
+ "token_acc": 0.8836981353220466
371
+ },
372
+ {
373
+ "epoch": 0.1782082324455206,
374
+ "grad_norm": 0.22611235082149506,
375
+ "learning_rate": 9.200000000000002e-06,
376
+ "loss": 0.3169807493686676,
377
+ "step": 46,
378
+ "token_acc": 0.8913642111117898
379
+ },
380
+ {
381
+ "epoch": 0.1820823244552058,
382
+ "grad_norm": 0.2332201898097992,
383
+ "learning_rate": 9.4e-06,
384
+ "loss": 0.3335682153701782,
385
+ "step": 47,
386
+ "token_acc": 0.8853106607331619
387
+ },
388
+ {
389
+ "epoch": 0.18595641646489103,
390
+ "grad_norm": 0.26498886942863464,
391
+ "learning_rate": 9.600000000000001e-06,
392
+ "loss": 0.3396722674369812,
393
+ "step": 48,
394
+ "token_acc": 0.884526213547764
395
+ },
396
+ {
397
+ "epoch": 0.18983050847457628,
398
+ "grad_norm": 0.29751622676849365,
399
+ "learning_rate": 9.800000000000001e-06,
400
+ "loss": 0.3544740676879883,
401
+ "step": 49,
402
+ "token_acc": 0.8790830507178009
403
+ },
404
+ {
405
+ "epoch": 0.1937046004842615,
406
+ "grad_norm": 0.24125243723392487,
407
+ "learning_rate": 1e-05,
408
+ "loss": 0.3444434702396393,
409
+ "step": 50,
410
+ "token_acc": 0.8833375152943368
411
+ },
412
+ {
413
+ "epoch": 0.19757869249394674,
414
+ "grad_norm": 0.23450158536434174,
415
+ "learning_rate": 9.999953315763929e-06,
416
+ "loss": 0.34759777784347534,
417
+ "step": 51,
418
+ "token_acc": 0.8809645656414854
419
+ },
420
+ {
421
+ "epoch": 0.20145278450363197,
422
+ "grad_norm": 0.24415536224842072,
423
+ "learning_rate": 9.999813263927483e-06,
424
+ "loss": 0.3302762508392334,
425
+ "step": 52,
426
+ "token_acc": 0.8872595593874244
427
+ },
428
+ {
429
+ "epoch": 0.20532687651331719,
430
+ "grad_norm": 0.23792694509029388,
431
+ "learning_rate": 9.999579847105947e-06,
432
+ "loss": 0.3057291805744171,
433
+ "step": 53,
434
+ "token_acc": 0.8958096559669801
435
+ },
436
+ {
437
+ "epoch": 0.20920096852300243,
438
+ "grad_norm": 0.24918483197689056,
439
+ "learning_rate": 9.999253069658074e-06,
440
+ "loss": 0.3550814390182495,
441
+ "step": 54,
442
+ "token_acc": 0.8789014457104403
443
+ },
444
+ {
445
+ "epoch": 0.21307506053268765,
446
+ "grad_norm": 0.24681781232357025,
447
+ "learning_rate": 9.99883293768601e-06,
448
+ "loss": 0.329832524061203,
449
+ "step": 55,
450
+ "token_acc": 0.8862846605616155
451
+ },
452
+ {
453
+ "epoch": 0.21694915254237288,
454
+ "grad_norm": 0.25197944045066833,
455
+ "learning_rate": 9.998319459035168e-06,
456
+ "loss": 0.3133784532546997,
457
+ "step": 56,
458
+ "token_acc": 0.8929686000759445
459
+ },
460
+ {
461
+ "epoch": 0.22082324455205812,
462
+ "grad_norm": 0.29595333337783813,
463
+ "learning_rate": 9.997712643294093e-06,
464
+ "loss": 0.3238765597343445,
465
+ "step": 57,
466
+ "token_acc": 0.8900121095092376
467
+ },
468
+ {
469
+ "epoch": 0.22469733656174334,
470
+ "grad_norm": 0.2436024248600006,
471
+ "learning_rate": 9.997012501794273e-06,
472
+ "loss": 0.3283236622810364,
473
+ "step": 58,
474
+ "token_acc": 0.887762605178964
475
+ },
476
+ {
477
+ "epoch": 0.22857142857142856,
478
+ "grad_norm": 0.23041026294231415,
479
+ "learning_rate": 9.996219047609943e-06,
480
+ "loss": 0.3104722797870636,
481
+ "step": 59,
482
+ "token_acc": 0.8931121325749851
483
+ },
484
+ {
485
+ "epoch": 0.2324455205811138,
486
+ "grad_norm": 0.237432062625885,
487
+ "learning_rate": 9.995332295557818e-06,
488
+ "loss": 0.30940210819244385,
489
+ "step": 60,
490
+ "token_acc": 0.8942070394423697
491
+ },
492
+ {
493
+ "epoch": 0.23631961259079903,
494
+ "grad_norm": 0.23901380598545074,
495
+ "learning_rate": 9.994352262196839e-06,
496
+ "loss": 0.32523292303085327,
497
+ "step": 61,
498
+ "token_acc": 0.8885503611348168
499
+ },
500
+ {
501
+ "epoch": 0.24019370460048425,
502
+ "grad_norm": 0.27438339591026306,
503
+ "learning_rate": 9.993278965827844e-06,
504
+ "loss": 0.3501031994819641,
505
+ "step": 62,
506
+ "token_acc": 0.8796217252529412
507
+ },
508
+ {
509
+ "epoch": 0.2440677966101695,
510
+ "grad_norm": 0.23662753403186798,
511
+ "learning_rate": 9.992112426493247e-06,
512
+ "loss": 0.32605987787246704,
513
+ "step": 63,
514
+ "token_acc": 0.8890396653634925
515
+ },
516
+ {
517
+ "epoch": 0.24794188861985472,
518
+ "grad_norm": 0.2232031375169754,
519
+ "learning_rate": 9.990852665976648e-06,
520
+ "loss": 0.3196948170661926,
521
+ "step": 64,
522
+ "token_acc": 0.8907072739748918
523
+ },
524
+ {
525
+ "epoch": 0.25181598062953997,
526
+ "grad_norm": 0.2665523886680603,
527
+ "learning_rate": 9.989499707802424e-06,
528
+ "loss": 0.33278700709342957,
529
+ "step": 65,
530
+ "token_acc": 0.8863953116150797
531
+ },
532
+ {
533
+ "epoch": 0.25569007263922516,
534
+ "grad_norm": 0.23870785534381866,
535
+ "learning_rate": 9.988053577235306e-06,
536
+ "loss": 0.351688951253891,
537
+ "step": 66,
538
+ "token_acc": 0.879823584223047
539
+ },
540
+ {
541
+ "epoch": 0.2595641646489104,
542
+ "grad_norm": 0.24755656719207764,
543
+ "learning_rate": 9.986514301279894e-06,
544
+ "loss": 0.31553030014038086,
545
+ "step": 67,
546
+ "token_acc": 0.8921622627267041
547
+ },
548
+ {
549
+ "epoch": 0.26343825665859566,
550
+ "grad_norm": 0.23198164999485016,
551
+ "learning_rate": 9.984881908680157e-06,
552
+ "loss": 0.3355843424797058,
553
+ "step": 68,
554
+ "token_acc": 0.8848336232927391
555
+ },
556
+ {
557
+ "epoch": 0.26731234866828085,
558
+ "grad_norm": 0.2461438924074173,
559
+ "learning_rate": 9.983156429918895e-06,
560
+ "loss": 0.3341342508792877,
561
+ "step": 69,
562
+ "token_acc": 0.8856444439357114
563
+ },
564
+ {
565
+ "epoch": 0.2711864406779661,
566
+ "grad_norm": 0.22579748928546906,
567
+ "learning_rate": 9.981337897217171e-06,
568
+ "loss": 0.3188900947570801,
569
+ "step": 70,
570
+ "token_acc": 0.8906781387812132
571
+ },
572
+ {
573
+ "epoch": 0.27506053268765135,
574
+ "grad_norm": 0.24103567004203796,
575
+ "learning_rate": 9.979426344533712e-06,
576
+ "loss": 0.3240354061126709,
577
+ "step": 71,
578
+ "token_acc": 0.8889305949367731
579
+ },
580
+ {
581
+ "epoch": 0.27893462469733654,
582
+ "grad_norm": 0.23146985471248627,
583
+ "learning_rate": 9.977421807564264e-06,
584
+ "loss": 0.3256258964538574,
585
+ "step": 72,
586
+ "token_acc": 0.8886470476040884
587
+ },
588
+ {
589
+ "epoch": 0.2828087167070218,
590
+ "grad_norm": 0.2992205023765564,
591
+ "learning_rate": 9.97532432374094e-06,
592
+ "loss": 0.3162704110145569,
593
+ "step": 73,
594
+ "token_acc": 0.8912486582241803
595
+ },
596
+ {
597
+ "epoch": 0.28668280871670704,
598
+ "grad_norm": 0.2314625382423401,
599
+ "learning_rate": 9.973133932231514e-06,
600
+ "loss": 0.33748123049736023,
601
+ "step": 74,
602
+ "token_acc": 0.8834313251000246
603
+ },
604
+ {
605
+ "epoch": 0.29055690072639223,
606
+ "grad_norm": 0.23197512328624725,
607
+ "learning_rate": 9.970850673938684e-06,
608
+ "loss": 0.3105667233467102,
609
+ "step": 75,
610
+ "token_acc": 0.8935043208256486
611
+ },
612
+ {
613
+ "epoch": 0.2944309927360775,
614
+ "grad_norm": 0.2275022268295288,
615
+ "learning_rate": 9.96847459149932e-06,
616
+ "loss": 0.3327932357788086,
617
+ "step": 76,
618
+ "token_acc": 0.8861917159302386
619
+ },
620
+ {
621
+ "epoch": 0.2983050847457627,
622
+ "grad_norm": 0.2508430778980255,
623
+ "learning_rate": 9.966005729283658e-06,
624
+ "loss": 0.32548677921295166,
625
+ "step": 77,
626
+ "token_acc": 0.8882381273480396
627
+ },
628
+ {
629
+ "epoch": 0.3021791767554479,
630
+ "grad_norm": 0.5134550333023071,
631
+ "learning_rate": 9.963444133394478e-06,
632
+ "loss": 0.3120523691177368,
633
+ "step": 78,
634
+ "token_acc": 0.8919503736696569
635
+ },
636
+ {
637
+ "epoch": 0.30605326876513317,
638
+ "grad_norm": 0.21315379440784454,
639
+ "learning_rate": 9.960789851666237e-06,
640
+ "loss": 0.3215460181236267,
641
+ "step": 79,
642
+ "token_acc": 0.8896002985397907
643
+ },
644
+ {
645
+ "epoch": 0.3099273607748184,
646
+ "grad_norm": 0.23902781307697296,
647
+ "learning_rate": 9.958042933664186e-06,
648
+ "loss": 0.33162713050842285,
649
+ "step": 80,
650
+ "token_acc": 0.8866171518838251
651
+ },
652
+ {
653
+ "epoch": 0.3138014527845036,
654
+ "grad_norm": 0.24128590524196625,
655
+ "learning_rate": 9.955203430683425e-06,
656
+ "loss": 0.3268725574016571,
657
+ "step": 81,
658
+ "token_acc": 0.8882163748841388
659
+ },
660
+ {
661
+ "epoch": 0.31767554479418886,
662
+ "grad_norm": 0.24751782417297363,
663
+ "learning_rate": 9.952271395747969e-06,
664
+ "loss": 0.3100839853286743,
665
+ "step": 82,
666
+ "token_acc": 0.893085253361785
667
+ },
668
+ {
669
+ "epoch": 0.3215496368038741,
670
+ "grad_norm": 0.23644764721393585,
671
+ "learning_rate": 9.949246883609743e-06,
672
+ "loss": 0.32995104789733887,
673
+ "step": 83,
674
+ "token_acc": 0.8866222032237766
675
+ },
676
+ {
677
+ "epoch": 0.3254237288135593,
678
+ "grad_norm": 0.232451930642128,
679
+ "learning_rate": 9.94612995074756e-06,
680
+ "loss": 0.31272488832473755,
681
+ "step": 84,
682
+ "token_acc": 0.8926265473810503
683
+ },
684
+ {
685
+ "epoch": 0.32929782082324455,
686
+ "grad_norm": 0.21610639989376068,
687
+ "learning_rate": 9.942920655366075e-06,
688
+ "loss": 0.302722692489624,
689
+ "step": 85,
690
+ "token_acc": 0.8952372082627079
691
+ },
692
+ {
693
+ "epoch": 0.3331719128329298,
694
+ "grad_norm": 0.24474947154521942,
695
+ "learning_rate": 9.939619057394687e-06,
696
+ "loss": 0.31238657236099243,
697
+ "step": 86,
698
+ "token_acc": 0.8932181956136864
699
+ },
700
+ {
701
+ "epoch": 0.337046004842615,
702
+ "grad_norm": 0.22313052415847778,
703
+ "learning_rate": 9.936225218486428e-06,
704
+ "loss": 0.30595749616622925,
705
+ "step": 87,
706
+ "token_acc": 0.8942476419229949
707
+ },
708
+ {
709
+ "epoch": 0.34092009685230024,
710
+ "grad_norm": 0.25018593668937683,
711
+ "learning_rate": 9.93273920201681e-06,
712
+ "loss": 0.34218600392341614,
713
+ "step": 88,
714
+ "token_acc": 0.8826220754003523
715
+ },
716
+ {
717
+ "epoch": 0.3447941888619855,
718
+ "grad_norm": 0.21603761613368988,
719
+ "learning_rate": 9.929161073082636e-06,
720
+ "loss": 0.26845768094062805,
721
+ "step": 89,
722
+ "token_acc": 0.9068716054841073
723
+ },
724
+ {
725
+ "epoch": 0.3486682808716707,
726
+ "grad_norm": 0.22996748983860016,
727
+ "learning_rate": 9.925490898500796e-06,
728
+ "loss": 0.32508569955825806,
729
+ "step": 90,
730
+ "token_acc": 0.8884358725254423
731
+ },
732
+ {
733
+ "epoch": 0.3525423728813559,
734
+ "grad_norm": 0.3635949194431305,
735
+ "learning_rate": 9.921728746807008e-06,
736
+ "loss": 0.34217730164527893,
737
+ "step": 91,
738
+ "token_acc": 0.8833008019688547
739
+ },
740
+ {
741
+ "epoch": 0.3564164648910412,
742
+ "grad_norm": 0.22128325700759888,
743
+ "learning_rate": 9.917874688254542e-06,
744
+ "loss": 0.32345396280288696,
745
+ "step": 92,
746
+ "token_acc": 0.8889643834760571
747
+ },
748
+ {
749
+ "epoch": 0.36029055690072637,
750
+ "grad_norm": 0.24601417779922485,
751
+ "learning_rate": 9.913928794812909e-06,
752
+ "loss": 0.3252776265144348,
753
+ "step": 93,
754
+ "token_acc": 0.8881070006006884
755
+ },
756
+ {
757
+ "epoch": 0.3641646489104116,
758
+ "grad_norm": 0.23473182320594788,
759
+ "learning_rate": 9.90989114016652e-06,
760
+ "loss": 0.33626118302345276,
761
+ "step": 94,
762
+ "token_acc": 0.8841867411739727
763
+ },
764
+ {
765
+ "epoch": 0.36803874092009686,
766
+ "grad_norm": 0.22333025932312012,
767
+ "learning_rate": 9.905761799713302e-06,
768
+ "loss": 0.34545931220054626,
769
+ "step": 95,
770
+ "token_acc": 0.8803537032594166
771
+ },
772
+ {
773
+ "epoch": 0.37191283292978206,
774
+ "grad_norm": 0.21172457933425903,
775
+ "learning_rate": 9.901540850563295e-06,
776
+ "loss": 0.3074107766151428,
777
+ "step": 96,
778
+ "token_acc": 0.8944196156632918
779
+ },
780
+ {
781
+ "epoch": 0.3757869249394673,
782
+ "grad_norm": 0.2134028971195221,
783
+ "learning_rate": 9.89722837153722e-06,
784
+ "loss": 0.2957490086555481,
785
+ "step": 97,
786
+ "token_acc": 0.8978778618134635
787
+ },
788
+ {
789
+ "epoch": 0.37966101694915255,
790
+ "grad_norm": 0.2610202729701996,
791
+ "learning_rate": 9.892824443164987e-06,
792
+ "loss": 0.3412560224533081,
793
+ "step": 98,
794
+ "token_acc": 0.8829380073969748
795
+ },
796
+ {
797
+ "epoch": 0.38353510895883774,
798
+ "grad_norm": 0.25488367676734924,
799
+ "learning_rate": 9.88832914768421e-06,
800
+ "loss": 0.3430347442626953,
801
+ "step": 99,
802
+ "token_acc": 0.8815459290145207
803
+ },
804
+ {
805
+ "epoch": 0.387409200968523,
806
+ "grad_norm": 0.22882606089115143,
807
+ "learning_rate": 9.883742569038663e-06,
808
+ "loss": 0.33350762724876404,
809
+ "step": 100,
810
+ "token_acc": 0.8861422500817198
811
+ },
812
+ {
813
+ "epoch": 0.39128329297820824,
814
+ "grad_norm": 0.304647833108902,
815
+ "learning_rate": 9.879064792876717e-06,
816
+ "loss": 0.31420135498046875,
817
+ "step": 101,
818
+ "token_acc": 0.8915588172822687
819
+ },
820
+ {
821
+ "epoch": 0.3951573849878935,
822
+ "grad_norm": 0.22871072590351105,
823
+ "learning_rate": 9.874295906549728e-06,
824
+ "loss": 0.3116581439971924,
825
+ "step": 102,
826
+ "token_acc": 0.8917020548921253
827
+ },
828
+ {
829
+ "epoch": 0.3990314769975787,
830
+ "grad_norm": 0.2979466915130615,
831
+ "learning_rate": 9.869435999110428e-06,
832
+ "loss": 0.3145788013935089,
833
+ "step": 103,
834
+ "token_acc": 0.8916011830301528
835
+ },
836
+ {
837
+ "epoch": 0.40290556900726393,
838
+ "grad_norm": 0.20779502391815186,
839
+ "learning_rate": 9.864485161311242e-06,
840
+ "loss": 0.3070036768913269,
841
+ "step": 104,
842
+ "token_acc": 0.8938107647266995
843
+ },
844
+ {
845
+ "epoch": 0.4067796610169492,
846
+ "grad_norm": 0.2354535311460495,
847
+ "learning_rate": 9.859443485602603e-06,
848
+ "loss": 0.32298558950424194,
849
+ "step": 105,
850
+ "token_acc": 0.8882189451059107
851
+ },
852
+ {
853
+ "epoch": 0.41065375302663437,
854
+ "grad_norm": 0.22240500152111053,
855
+ "learning_rate": 9.85431106613122e-06,
856
+ "loss": 0.3104989528656006,
857
+ "step": 106,
858
+ "token_acc": 0.8923007628162216
859
+ },
860
+ {
861
+ "epoch": 0.4145278450363196,
862
+ "grad_norm": 0.21981710195541382,
863
+ "learning_rate": 9.849087998738328e-06,
864
+ "loss": 0.3237101435661316,
865
+ "step": 107,
866
+ "token_acc": 0.8879955719309623
867
+ },
868
+ {
869
+ "epoch": 0.41840193704600487,
870
+ "grad_norm": 0.2649724781513214,
871
+ "learning_rate": 9.84377438095789e-06,
872
+ "loss": 0.323306679725647,
873
+ "step": 108,
874
+ "token_acc": 0.8889382382835521
875
+ },
876
+ {
877
+ "epoch": 0.42227602905569006,
878
+ "grad_norm": 0.2193301022052765,
879
+ "learning_rate": 9.838370312014783e-06,
880
+ "loss": 0.31488102674484253,
881
+ "step": 109,
882
+ "token_acc": 0.8910646836196473
883
+ },
884
+ {
885
+ "epoch": 0.4261501210653753,
886
+ "grad_norm": 0.21842491626739502,
887
+ "learning_rate": 9.832875892822937e-06,
888
+ "loss": 0.3206183910369873,
889
+ "step": 110,
890
+ "token_acc": 0.8890832728771944
891
+ },
892
+ {
893
+ "epoch": 0.43002421307506056,
894
+ "grad_norm": 0.2456243336200714,
895
+ "learning_rate": 9.827291225983458e-06,
896
+ "loss": 0.3201240301132202,
897
+ "step": 111,
898
+ "token_acc": 0.8904148288428204
899
+ },
900
+ {
901
+ "epoch": 0.43389830508474575,
902
+ "grad_norm": 0.21340763568878174,
903
+ "learning_rate": 9.821616415782708e-06,
904
+ "loss": 0.29961007833480835,
905
+ "step": 112,
906
+ "token_acc": 0.8965660205577574
907
+ },
908
+ {
909
+ "epoch": 0.437772397094431,
910
+ "grad_norm": 0.2308902144432068,
911
+ "learning_rate": 9.815851568190358e-06,
912
+ "loss": 0.3107410669326782,
913
+ "step": 113,
914
+ "token_acc": 0.8927536025516888
915
+ },
916
+ {
917
+ "epoch": 0.44164648910411625,
918
+ "grad_norm": 0.2292374223470688,
919
+ "learning_rate": 9.80999679085741e-06,
920
+ "loss": 0.3277205228805542,
921
+ "step": 114,
922
+ "token_acc": 0.886787084498464
923
+ },
924
+ {
925
+ "epoch": 0.44552058111380144,
926
+ "grad_norm": 0.21509671211242676,
927
+ "learning_rate": 9.80405219311419e-06,
928
+ "loss": 0.3161908984184265,
929
+ "step": 115,
930
+ "token_acc": 0.8916077261448497
931
+ },
932
+ {
933
+ "epoch": 0.4493946731234867,
934
+ "grad_norm": 0.20529279112815857,
935
+ "learning_rate": 9.798017885968295e-06,
936
+ "loss": 0.29131007194519043,
937
+ "step": 116,
938
+ "token_acc": 0.8990066361086406
939
+ },
940
+ {
941
+ "epoch": 0.45326876513317194,
942
+ "grad_norm": 0.24888373911380768,
943
+ "learning_rate": 9.791893982102537e-06,
944
+ "loss": 0.31967025995254517,
945
+ "step": 117,
946
+ "token_acc": 0.8899925908756566
947
+ },
948
+ {
949
+ "epoch": 0.45714285714285713,
950
+ "grad_norm": 0.22014780342578888,
951
+ "learning_rate": 9.785680595872824e-06,
952
+ "loss": 0.31103435158729553,
953
+ "step": 118,
954
+ "token_acc": 0.8928936680571538
955
+ },
956
+ {
957
+ "epoch": 0.4610169491525424,
958
+ "grad_norm": 0.21783359348773956,
959
+ "learning_rate": 9.77937784330603e-06,
960
+ "loss": 0.307749480009079,
961
+ "step": 119,
962
+ "token_acc": 0.8931600584652736
963
+ },
964
+ {
965
+ "epoch": 0.4648910411622276,
966
+ "grad_norm": 0.2104286551475525,
967
+ "learning_rate": 9.772985842097832e-06,
968
+ "loss": 0.31199365854263306,
969
+ "step": 120,
970
+ "token_acc": 0.8926850259294361
971
+ },
972
+ {
973
+ "epoch": 0.4687651331719128,
974
+ "grad_norm": 0.21124128997325897,
975
+ "learning_rate": 9.766504711610507e-06,
976
+ "loss": 0.3170176148414612,
977
+ "step": 121,
978
+ "token_acc": 0.8906264477918435
979
+ },
980
+ {
981
+ "epoch": 0.47263922518159807,
982
+ "grad_norm": 0.23777632415294647,
983
+ "learning_rate": 9.759934572870706e-06,
984
+ "loss": 0.3052697777748108,
985
+ "step": 122,
986
+ "token_acc": 0.894442848003123
987
+ },
988
+ {
989
+ "epoch": 0.4765133171912833,
990
+ "grad_norm": 0.2527632713317871,
991
+ "learning_rate": 9.753275548567192e-06,
992
+ "loss": 0.3045836091041565,
993
+ "step": 123,
994
+ "token_acc": 0.8951105518605069
995
+ },
996
+ {
997
+ "epoch": 0.4803874092009685,
998
+ "grad_norm": 0.20530211925506592,
999
+ "learning_rate": 9.74652776304855e-06,
1000
+ "loss": 0.3366113305091858,
1001
+ "step": 124,
1002
+ "token_acc": 0.8836434912892324
1003
+ },
1004
+ {
1005
+ "epoch": 0.48426150121065376,
1006
+ "grad_norm": 0.26673150062561035,
1007
+ "learning_rate": 9.739691342320866e-06,
1008
+ "loss": 0.311764121055603,
1009
+ "step": 125,
1010
+ "token_acc": 0.8910826454277961
1011
+ },
1012
+ {
1013
+ "epoch": 0.488135593220339,
1014
+ "grad_norm": 0.2245185822248459,
1015
+ "learning_rate": 9.732766414045368e-06,
1016
+ "loss": 0.31055164337158203,
1017
+ "step": 126,
1018
+ "token_acc": 0.8926098098046538
1019
+ },
1020
+ {
1021
+ "epoch": 0.4920096852300242,
1022
+ "grad_norm": 0.2143883854150772,
1023
+ "learning_rate": 9.725753107536053e-06,
1024
+ "loss": 0.33499595522880554,
1025
+ "step": 127,
1026
+ "token_acc": 0.8840534260641282
1027
+ },
1028
+ {
1029
+ "epoch": 0.49588377723970944,
1030
+ "grad_norm": 0.22163285315036774,
1031
+ "learning_rate": 9.718651553757266e-06,
1032
+ "loss": 0.31920328736305237,
1033
+ "step": 128,
1034
+ "token_acc": 0.8901271163419964
1035
+ },
1036
+ {
1037
+ "epoch": 0.4997578692493947,
1038
+ "grad_norm": 0.2143898904323578,
1039
+ "learning_rate": 9.711461885321247e-06,
1040
+ "loss": 0.3301286995410919,
1041
+ "step": 129,
1042
+ "token_acc": 0.8853363916795757
1043
+ },
1044
+ {
1045
+ "epoch": 0.5036319612590799,
1046
+ "grad_norm": 0.24990734457969666,
1047
+ "learning_rate": 9.704184236485672e-06,
1048
+ "loss": 0.3278159201145172,
1049
+ "step": 130,
1050
+ "token_acc": 0.8874620923082561
1051
+ },
1052
+ {
1053
+ "epoch": 0.5075060532687651,
1054
+ "grad_norm": 0.22136539220809937,
1055
+ "learning_rate": 9.696818743151128e-06,
1056
+ "loss": 0.3319326937198639,
1057
+ "step": 131,
1058
+ "token_acc": 0.885009570455441
1059
+ },
1060
+ {
1061
+ "epoch": 0.5113801452784503,
1062
+ "grad_norm": 0.2669275999069214,
1063
+ "learning_rate": 9.68936554285859e-06,
1064
+ "loss": 0.3023684620857239,
1065
+ "step": 132,
1066
+ "token_acc": 0.8951259709956582
1067
+ },
1068
+ {
1069
+ "epoch": 0.5152542372881356,
1070
+ "grad_norm": 0.21833708882331848,
1071
+ "learning_rate": 9.68182477478684e-06,
1072
+ "loss": 0.3089104890823364,
1073
+ "step": 133,
1074
+ "token_acc": 0.8930920187299416
1075
+ },
1076
+ {
1077
+ "epoch": 0.5191283292978208,
1078
+ "grad_norm": 0.21197167038917542,
1079
+ "learning_rate": 9.67419657974988e-06,
1080
+ "loss": 0.3144392967224121,
1081
+ "step": 134,
1082
+ "token_acc": 0.8910884224709107
1083
+ },
1084
+ {
1085
+ "epoch": 0.5230024213075061,
1086
+ "grad_norm": 0.21434499323368073,
1087
+ "learning_rate": 9.66648110019429e-06,
1088
+ "loss": 0.3246540427207947,
1089
+ "step": 135,
1090
+ "token_acc": 0.8876412650671648
1091
+ },
1092
+ {
1093
+ "epoch": 0.5268765133171913,
1094
+ "grad_norm": 0.20343148708343506,
1095
+ "learning_rate": 9.658678480196579e-06,
1096
+ "loss": 0.315585196018219,
1097
+ "step": 136,
1098
+ "token_acc": 0.8905443269970013
1099
+ },
1100
+ {
1101
+ "epoch": 0.5307506053268766,
1102
+ "grad_norm": 0.23613257706165314,
1103
+ "learning_rate": 9.650788865460487e-06,
1104
+ "loss": 0.3131225109100342,
1105
+ "step": 137,
1106
+ "token_acc": 0.8912192170846405
1107
+ },
1108
+ {
1109
+ "epoch": 0.5346246973365617,
1110
+ "grad_norm": 0.4212075471878052,
1111
+ "learning_rate": 9.642812403314272e-06,
1112
+ "loss": 0.29884475469589233,
1113
+ "step": 138,
1114
+ "token_acc": 0.8966553773404051
1115
+ },
1116
+ {
1117
+ "epoch": 0.538498789346247,
1118
+ "grad_norm": 0.20193685591220856,
1119
+ "learning_rate": 9.634749242707948e-06,
1120
+ "loss": 0.26036083698272705,
1121
+ "step": 139,
1122
+ "token_acc": 0.9091038865111504
1123
+ },
1124
+ {
1125
+ "epoch": 0.5423728813559322,
1126
+ "grad_norm": 0.2208104431629181,
1127
+ "learning_rate": 9.626599534210514e-06,
1128
+ "loss": 0.33184394240379333,
1129
+ "step": 140,
1130
+ "token_acc": 0.8853617134142299
1131
+ },
1132
+ {
1133
+ "epoch": 0.5462469733656174,
1134
+ "grad_norm": 0.22493727505207062,
1135
+ "learning_rate": 9.618363430007134e-06,
1136
+ "loss": 0.31208667159080505,
1137
+ "step": 141,
1138
+ "token_acc": 0.8917024215686027
1139
+ },
1140
+ {
1141
+ "epoch": 0.5501210653753027,
1142
+ "grad_norm": 0.23963193595409393,
1143
+ "learning_rate": 9.610041083896304e-06,
1144
+ "loss": 0.33588868379592896,
1145
+ "step": 142,
1146
+ "token_acc": 0.883973627021253
1147
+ },
1148
+ {
1149
+ "epoch": 0.553995157384988,
1150
+ "grad_norm": 0.21784453094005585,
1151
+ "learning_rate": 9.60163265128697e-06,
1152
+ "loss": 0.3231375813484192,
1153
+ "step": 143,
1154
+ "token_acc": 0.8887875239014834
1155
+ },
1156
+ {
1157
+ "epoch": 0.5578692493946731,
1158
+ "grad_norm": 0.22835847735404968,
1159
+ "learning_rate": 9.593138289195634e-06,
1160
+ "loss": 0.3210199773311615,
1161
+ "step": 144,
1162
+ "token_acc": 0.8890582816354493
1163
+ },
1164
+ {
1165
+ "epoch": 0.5617433414043583,
1166
+ "grad_norm": 0.2136555314064026,
1167
+ "learning_rate": 9.584558156243418e-06,
1168
+ "loss": 0.3372665047645569,
1169
+ "step": 145,
1170
+ "token_acc": 0.8839793357706921
1171
+ },
1172
+ {
1173
+ "epoch": 0.5656174334140436,
1174
+ "grad_norm": 0.20598500967025757,
1175
+ "learning_rate": 9.575892412653102e-06,
1176
+ "loss": 0.30844664573669434,
1177
+ "step": 146,
1178
+ "token_acc": 0.8926156654585412
1179
+ },
1180
+ {
1181
+ "epoch": 0.5694915254237288,
1182
+ "grad_norm": 0.2522714138031006,
1183
+ "learning_rate": 9.567141220246136e-06,
1184
+ "loss": 0.36702272295951843,
1185
+ "step": 147,
1186
+ "token_acc": 0.8734296301671142
1187
+ },
1188
+ {
1189
+ "epoch": 0.5733656174334141,
1190
+ "grad_norm": 0.21975038945674896,
1191
+ "learning_rate": 9.55830474243961e-06,
1192
+ "loss": 0.32784411311149597,
1193
+ "step": 148,
1194
+ "token_acc": 0.8871756189192851
1195
+ },
1196
+ {
1197
+ "epoch": 0.5772397094430993,
1198
+ "grad_norm": 0.21233901381492615,
1199
+ "learning_rate": 9.549383144243213e-06,
1200
+ "loss": 0.2944122850894928,
1201
+ "step": 149,
1202
+ "token_acc": 0.8987453672884691
1203
+ },
1204
+ {
1205
+ "epoch": 0.5811138014527845,
1206
+ "grad_norm": 0.2199799120426178,
1207
+ "learning_rate": 9.540376592256142e-06,
1208
+ "loss": 0.3299463987350464,
1209
+ "step": 150,
1210
+ "token_acc": 0.8859144839374592
1211
+ },
1212
+ {
1213
+ "epoch": 0.5849878934624697,
1214
+ "grad_norm": 0.19698019325733185,
1215
+ "learning_rate": 9.531285254663997e-06,
1216
+ "loss": 0.3030051589012146,
1217
+ "step": 151,
1218
+ "token_acc": 0.8951707294894029
1219
+ },
1220
+ {
1221
+ "epoch": 0.588861985472155,
1222
+ "grad_norm": 0.22306668758392334,
1223
+ "learning_rate": 9.522109301235637e-06,
1224
+ "loss": 0.29752516746520996,
1225
+ "step": 152,
1226
+ "token_acc": 0.8966012679857996
1227
+ },
1228
+ {
1229
+ "epoch": 0.5927360774818402,
1230
+ "grad_norm": 0.21317337453365326,
1231
+ "learning_rate": 9.512848903320017e-06,
1232
+ "loss": 0.3052118122577667,
1233
+ "step": 153,
1234
+ "token_acc": 0.8944324633814714
1235
+ },
1236
+ {
1237
+ "epoch": 0.5966101694915255,
1238
+ "grad_norm": 0.2120915800333023,
1239
+ "learning_rate": 9.503504233842973e-06,
1240
+ "loss": 0.29761528968811035,
1241
+ "step": 154,
1242
+ "token_acc": 0.8966406260468731
1243
+ },
1244
+ {
1245
+ "epoch": 0.6004842615012107,
1246
+ "grad_norm": 0.23525090515613556,
1247
+ "learning_rate": 9.494075467304007e-06,
1248
+ "loss": 0.3034532070159912,
1249
+ "step": 155,
1250
+ "token_acc": 0.8944926637860167
1251
+ },
1252
+ {
1253
+ "epoch": 0.6043583535108958,
1254
+ "grad_norm": 0.2095353752374649,
1255
+ "learning_rate": 9.484562779773027e-06,
1256
+ "loss": 0.2903788089752197,
1257
+ "step": 156,
1258
+ "token_acc": 0.8990560027078014
1259
+ },
1260
+ {
1261
+ "epoch": 0.6082324455205811,
1262
+ "grad_norm": 0.23741677403450012,
1263
+ "learning_rate": 9.474966348887055e-06,
1264
+ "loss": 0.31467512249946594,
1265
+ "step": 157,
1266
+ "token_acc": 0.8904583329757747
1267
+ },
1268
+ {
1269
+ "epoch": 0.6121065375302663,
1270
+ "grad_norm": 0.2259555608034134,
1271
+ "learning_rate": 9.465286353846905e-06,
1272
+ "loss": 0.3404577374458313,
1273
+ "step": 158,
1274
+ "token_acc": 0.8826165622063978
1275
+ },
1276
+ {
1277
+ "epoch": 0.6159806295399516,
1278
+ "grad_norm": 0.2183879017829895,
1279
+ "learning_rate": 9.455522975413846e-06,
1280
+ "loss": 0.2766571640968323,
1281
+ "step": 159,
1282
+ "token_acc": 0.9038809421418853
1283
+ },
1284
+ {
1285
+ "epoch": 0.6198547215496368,
1286
+ "grad_norm": 0.22651784121990204,
1287
+ "learning_rate": 9.445676395906226e-06,
1288
+ "loss": 0.29638129472732544,
1289
+ "step": 160,
1290
+ "token_acc": 0.8970113168662065
1291
+ },
1292
+ {
1293
+ "epoch": 0.6237288135593221,
1294
+ "grad_norm": 0.22088395059108734,
1295
+ "learning_rate": 9.435746799196061e-06,
1296
+ "loss": 0.3023075759410858,
1297
+ "step": 161,
1298
+ "token_acc": 0.8946665593674712
1299
+ },
1300
+ {
1301
+ "epoch": 0.6276029055690072,
1302
+ "grad_norm": 0.21526560187339783,
1303
+ "learning_rate": 9.425734370705606e-06,
1304
+ "loss": 0.28661438822746277,
1305
+ "step": 162,
1306
+ "token_acc": 0.9002787847728345
1307
+ },
1308
+ {
1309
+ "epoch": 0.6314769975786925,
1310
+ "grad_norm": 0.23334769904613495,
1311
+ "learning_rate": 9.415639297403891e-06,
1312
+ "loss": 0.31685301661491394,
1313
+ "step": 163,
1314
+ "token_acc": 0.890886748080584
1315
+ },
1316
+ {
1317
+ "epoch": 0.6353510895883777,
1318
+ "grad_norm": 0.200165793299675,
1319
+ "learning_rate": 9.40546176780323e-06,
1320
+ "loss": 0.30981898307800293,
1321
+ "step": 164,
1322
+ "token_acc": 0.8924871164982372
1323
+ },
1324
+ {
1325
+ "epoch": 0.639225181598063,
1326
+ "grad_norm": 0.20800836384296417,
1327
+ "learning_rate": 9.395201971955701e-06,
1328
+ "loss": 0.3162352740764618,
1329
+ "step": 165,
1330
+ "token_acc": 0.8910434805285766
1331
+ },
1332
+ {
1333
+ "epoch": 0.6430992736077482,
1334
+ "grad_norm": 0.20923736691474915,
1335
+ "learning_rate": 9.384860101449598e-06,
1336
+ "loss": 0.32208406925201416,
1337
+ "step": 166,
1338
+ "token_acc": 0.8880633815629819
1339
+ },
1340
+ {
1341
+ "epoch": 0.6469733656174335,
1342
+ "grad_norm": 0.1986808031797409,
1343
+ "learning_rate": 9.374436349405847e-06,
1344
+ "loss": 0.28397923707962036,
1345
+ "step": 167,
1346
+ "token_acc": 0.9012052212352475
1347
+ },
1348
+ {
1349
+ "epoch": 0.6508474576271186,
1350
+ "grad_norm": 0.21215273439884186,
1351
+ "learning_rate": 9.36393091047441e-06,
1352
+ "loss": 0.3066609799861908,
1353
+ "step": 168,
1354
+ "token_acc": 0.894593303584187
1355
+ },
1356
+ {
1357
+ "epoch": 0.6547215496368038,
1358
+ "grad_norm": 0.20804037153720856,
1359
+ "learning_rate": 9.353343980830644e-06,
1360
+ "loss": 0.3097017705440521,
1361
+ "step": 169,
1362
+ "token_acc": 0.8926308156125992
1363
+ },
1364
+ {
1365
+ "epoch": 0.6585956416464891,
1366
+ "grad_norm": 0.20328834652900696,
1367
+ "learning_rate": 9.342675758171638e-06,
1368
+ "loss": 0.3010105490684509,
1369
+ "step": 170,
1370
+ "token_acc": 0.8950560660129195
1371
+ },
1372
+ {
1373
+ "epoch": 0.6624697336561743,
1374
+ "grad_norm": 0.2051060050725937,
1375
+ "learning_rate": 9.331926441712522e-06,
1376
+ "loss": 0.3019353151321411,
1377
+ "step": 171,
1378
+ "token_acc": 0.8949745506999682
1379
+ },
1380
+ {
1381
+ "epoch": 0.6663438256658596,
1382
+ "grad_norm": 0.24043123424053192,
1383
+ "learning_rate": 9.32109623218275e-06,
1384
+ "loss": 0.3116442859172821,
1385
+ "step": 172,
1386
+ "token_acc": 0.8915558784861239
1387
+ },
1388
+ {
1389
+ "epoch": 0.6702179176755448,
1390
+ "grad_norm": 0.21520181000232697,
1391
+ "learning_rate": 9.310185331822338e-06,
1392
+ "loss": 0.31186142563819885,
1393
+ "step": 173,
1394
+ "token_acc": 0.8917585320277845
1395
+ },
1396
+ {
1397
+ "epoch": 0.67409200968523,
1398
+ "grad_norm": 0.21344298124313354,
1399
+ "learning_rate": 9.299193944378112e-06,
1400
+ "loss": 0.3273160755634308,
1401
+ "step": 174,
1402
+ "token_acc": 0.886418268420563
1403
+ },
1404
+ {
1405
+ "epoch": 0.6779661016949152,
1406
+ "grad_norm": 0.20224156975746155,
1407
+ "learning_rate": 9.28812227509988e-06,
1408
+ "loss": 0.31608837842941284,
1409
+ "step": 175,
1410
+ "token_acc": 0.8894536504933755
1411
+ },
1412
+ {
1413
+ "epoch": 0.6818401937046005,
1414
+ "grad_norm": 0.2154257595539093,
1415
+ "learning_rate": 9.27697053073661e-06,
1416
+ "loss": 0.34367692470550537,
1417
+ "step": 176,
1418
+ "token_acc": 0.8811017511710314
1419
+ },
1420
+ {
1421
+ "epoch": 0.6857142857142857,
1422
+ "grad_norm": 0.22003678977489471,
1423
+ "learning_rate": 9.26573891953257e-06,
1424
+ "loss": 0.3205263018608093,
1425
+ "step": 177,
1426
+ "token_acc": 0.8893210947921869
1427
+ },
1428
+ {
1429
+ "epoch": 0.689588377723971,
1430
+ "grad_norm": 0.21449677646160126,
1431
+ "learning_rate": 9.254427651223434e-06,
1432
+ "loss": 0.28666430711746216,
1433
+ "step": 178,
1434
+ "token_acc": 0.9003720788020833
1435
+ },
1436
+ {
1437
+ "epoch": 0.6934624697336562,
1438
+ "grad_norm": 0.22110596299171448,
1439
+ "learning_rate": 9.243036937032373e-06,
1440
+ "loss": 0.3156067728996277,
1441
+ "step": 179,
1442
+ "token_acc": 0.8902597783694092
1443
+ },
1444
+ {
1445
+ "epoch": 0.6973365617433414,
1446
+ "grad_norm": 0.19700580835342407,
1447
+ "learning_rate": 9.2315669896661e-06,
1448
+ "loss": 0.28897273540496826,
1449
+ "step": 180,
1450
+ "token_acc": 0.8994499889336349
1451
+ },
1452
+ {
1453
+ "epoch": 0.7012106537530266,
1454
+ "grad_norm": 0.21460606157779694,
1455
+ "learning_rate": 9.220018023310908e-06,
1456
+ "loss": 0.31268295645713806,
1457
+ "step": 181,
1458
+ "token_acc": 0.8918378520876847
1459
+ },
1460
+ {
1461
+ "epoch": 0.7050847457627119,
1462
+ "grad_norm": 0.21692436933517456,
1463
+ "learning_rate": 9.208390253628667e-06,
1464
+ "loss": 0.28844964504241943,
1465
+ "step": 182,
1466
+ "token_acc": 0.8997311485616448
1467
+ },
1468
+ {
1469
+ "epoch": 0.7089588377723971,
1470
+ "grad_norm": 0.201703280210495,
1471
+ "learning_rate": 9.196683897752794e-06,
1472
+ "loss": 0.32861441373825073,
1473
+ "step": 183,
1474
+ "token_acc": 0.8854774295445417
1475
+ },
1476
+ {
1477
+ "epoch": 0.7128329297820823,
1478
+ "grad_norm": 3.976747751235962,
1479
+ "learning_rate": 9.184899174284201e-06,
1480
+ "loss": 0.33475255966186523,
1481
+ "step": 184,
1482
+ "token_acc": 0.8836819705392365
1483
+ },
1484
+ {
1485
+ "epoch": 0.7167070217917676,
1486
+ "grad_norm": 0.24247053265571594,
1487
+ "learning_rate": 9.173036303287215e-06,
1488
+ "loss": 0.3366454243659973,
1489
+ "step": 185,
1490
+ "token_acc": 0.8833432089980459
1491
+ },
1492
+ {
1493
+ "epoch": 0.7205811138014527,
1494
+ "grad_norm": 0.2282845675945282,
1495
+ "learning_rate": 9.16109550628546e-06,
1496
+ "loss": 0.2812536656856537,
1497
+ "step": 186,
1498
+ "token_acc": 0.9027706860502607
1499
+ },
1500
+ {
1501
+ "epoch": 0.724455205811138,
1502
+ "grad_norm": 0.2282128632068634,
1503
+ "learning_rate": 9.149077006257734e-06,
1504
+ "loss": 0.3136906027793884,
1505
+ "step": 187,
1506
+ "token_acc": 0.8912536222754189
1507
+ },
1508
+ {
1509
+ "epoch": 0.7283292978208232,
1510
+ "grad_norm": 0.20751290023326874,
1511
+ "learning_rate": 9.136981027633834e-06,
1512
+ "loss": 0.29636135697364807,
1513
+ "step": 188,
1514
+ "token_acc": 0.8974463288547996
1515
+ },
1516
+ {
1517
+ "epoch": 0.7322033898305085,
1518
+ "grad_norm": 0.23192144930362701,
1519
+ "learning_rate": 9.124807796290366e-06,
1520
+ "loss": 0.3046882152557373,
1521
+ "step": 189,
1522
+ "token_acc": 0.8943812414560115
1523
+ },
1524
+ {
1525
+ "epoch": 0.7360774818401937,
1526
+ "grad_norm": 0.221333310008049,
1527
+ "learning_rate": 9.112557539546535e-06,
1528
+ "loss": 0.32960376143455505,
1529
+ "step": 190,
1530
+ "token_acc": 0.8860915000599271
1531
+ },
1532
+ {
1533
+ "epoch": 0.739951573849879,
1534
+ "grad_norm": 0.1981872171163559,
1535
+ "learning_rate": 9.100230486159893e-06,
1536
+ "loss": 0.32151421904563904,
1537
+ "step": 191,
1538
+ "token_acc": 0.888598638535205
1539
+ },
1540
+ {
1541
+ "epoch": 0.7438256658595641,
1542
+ "grad_norm": 0.2172573357820511,
1543
+ "learning_rate": 9.087826866322065e-06,
1544
+ "loss": 0.3255336880683899,
1545
+ "step": 192,
1546
+ "token_acc": 0.8864367509340579
1547
+ },
1548
+ {
1549
+ "epoch": 0.7476997578692494,
1550
+ "grad_norm": 0.21215571463108063,
1551
+ "learning_rate": 9.075346911654456e-06,
1552
+ "loss": 0.30505236983299255,
1553
+ "step": 193,
1554
+ "token_acc": 0.8936060377931436
1555
+ },
1556
+ {
1557
+ "epoch": 0.7515738498789346,
1558
+ "grad_norm": 0.21355277299880981,
1559
+ "learning_rate": 9.062790855203932e-06,
1560
+ "loss": 0.3349328637123108,
1561
+ "step": 194,
1562
+ "token_acc": 0.8847527625851099
1563
+ },
1564
+ {
1565
+ "epoch": 0.7554479418886199,
1566
+ "grad_norm": 0.20415301620960236,
1567
+ "learning_rate": 9.050158931438451e-06,
1568
+ "loss": 0.3010273873806,
1569
+ "step": 195,
1570
+ "token_acc": 0.8946901896914337
1571
+ },
1572
+ {
1573
+ "epoch": 0.7593220338983051,
1574
+ "grad_norm": 0.2100018560886383,
1575
+ "learning_rate": 9.037451376242696e-06,
1576
+ "loss": 0.3295148015022278,
1577
+ "step": 196,
1578
+ "token_acc": 0.8861214255925314
1579
+ },
1580
+ {
1581
+ "epoch": 0.7631961259079904,
1582
+ "grad_norm": 0.21248096227645874,
1583
+ "learning_rate": 9.024668426913671e-06,
1584
+ "loss": 0.2901475727558136,
1585
+ "step": 197,
1586
+ "token_acc": 0.8984891018269412
1587
+ },
1588
+ {
1589
+ "epoch": 0.7670702179176755,
1590
+ "grad_norm": 0.20735451579093933,
1591
+ "learning_rate": 9.011810322156269e-06,
1592
+ "loss": 0.3123668134212494,
1593
+ "step": 198,
1594
+ "token_acc": 0.8911118341790296
1595
+ },
1596
+ {
1597
+ "epoch": 0.7709443099273607,
1598
+ "grad_norm": 0.2119433879852295,
1599
+ "learning_rate": 8.998877302078803e-06,
1600
+ "loss": 0.30766892433166504,
1601
+ "step": 199,
1602
+ "token_acc": 0.8930650097673094
1603
+ },
1604
+ {
1605
+ "epoch": 0.774818401937046,
1606
+ "grad_norm": 0.20151817798614502,
1607
+ "learning_rate": 8.985869608188545e-06,
1608
+ "loss": 0.294528067111969,
1609
+ "step": 200,
1610
+ "token_acc": 0.8973507748438794
1611
+ },
1612
+ {
1613
+ "epoch": 0.7786924939467312,
1614
+ "grad_norm": 0.20979715883731842,
1615
+ "learning_rate": 8.97278748338719e-06,
1616
+ "loss": 0.3116077184677124,
1617
+ "step": 201,
1618
+ "token_acc": 0.8916578293780434
1619
+ },
1620
+ {
1621
+ "epoch": 0.7825665859564165,
1622
+ "grad_norm": 0.21114560961723328,
1623
+ "learning_rate": 8.95963117196634e-06,
1624
+ "loss": 0.31117022037506104,
1625
+ "step": 202,
1626
+ "token_acc": 0.8922739117136779
1627
+ },
1628
+ {
1629
+ "epoch": 0.7864406779661017,
1630
+ "grad_norm": 0.2028111070394516,
1631
+ "learning_rate": 8.946400919602933e-06,
1632
+ "loss": 0.2925041913986206,
1633
+ "step": 203,
1634
+ "token_acc": 0.8979599612123477
1635
+ },
1636
+ {
1637
+ "epoch": 0.790314769975787,
1638
+ "grad_norm": 0.19873376190662384,
1639
+ "learning_rate": 8.933096973354665e-06,
1640
+ "loss": 0.3335387706756592,
1641
+ "step": 204,
1642
+ "token_acc": 0.8845781124549695
1643
+ },
1644
+ {
1645
+ "epoch": 0.7941888619854721,
1646
+ "grad_norm": 0.20865830779075623,
1647
+ "learning_rate": 8.919719581655357e-06,
1648
+ "loss": 0.3048374652862549,
1649
+ "step": 205,
1650
+ "token_acc": 0.8941424666394205
1651
+ },
1652
+ {
1653
+ "epoch": 0.7980629539951574,
1654
+ "grad_norm": 0.21847450733184814,
1655
+ "learning_rate": 8.906268994310339e-06,
1656
+ "loss": 0.30148929357528687,
1657
+ "step": 206,
1658
+ "token_acc": 0.8948231645494126
1659
+ },
1660
+ {
1661
+ "epoch": 0.8019370460048426,
1662
+ "grad_norm": 0.23447921872138977,
1663
+ "learning_rate": 8.892745462491763e-06,
1664
+ "loss": 0.3076891005039215,
1665
+ "step": 207,
1666
+ "token_acc": 0.8940680143003497
1667
+ },
1668
+ {
1669
+ "epoch": 0.8058111380145279,
1670
+ "grad_norm": 0.2047218531370163,
1671
+ "learning_rate": 8.879149238733932e-06,
1672
+ "loss": 0.2903471291065216,
1673
+ "step": 208,
1674
+ "token_acc": 0.8996930000967329
1675
+ },
1676
+ {
1677
+ "epoch": 0.8096852300242131,
1678
+ "grad_norm": 0.3560882806777954,
1679
+ "learning_rate": 8.865480576928578e-06,
1680
+ "loss": 0.2734353840351105,
1681
+ "step": 209,
1682
+ "token_acc": 0.9038816908230364
1683
+ },
1684
+ {
1685
+ "epoch": 0.8135593220338984,
1686
+ "grad_norm": 0.22588837146759033,
1687
+ "learning_rate": 8.851739732320109e-06,
1688
+ "loss": 0.30820316076278687,
1689
+ "step": 210,
1690
+ "token_acc": 0.8928903081404425
1691
+ },
1692
+ {
1693
+ "epoch": 0.8174334140435835,
1694
+ "grad_norm": 0.19928814470767975,
1695
+ "learning_rate": 8.83792696150086e-06,
1696
+ "loss": 0.30705487728118896,
1697
+ "step": 211,
1698
+ "token_acc": 0.8931717351449738
1699
+ },
1700
+ {
1701
+ "epoch": 0.8213075060532687,
1702
+ "grad_norm": 0.23134565353393555,
1703
+ "learning_rate": 8.824042522406295e-06,
1704
+ "loss": 0.3144133687019348,
1705
+ "step": 212,
1706
+ "token_acc": 0.8904542748607169
1707
+ },
1708
+ {
1709
+ "epoch": 0.825181598062954,
1710
+ "grad_norm": 0.20952780544757843,
1711
+ "learning_rate": 8.810086674310184e-06,
1712
+ "loss": 0.3166520595550537,
1713
+ "step": 213,
1714
+ "token_acc": 0.8902617260259249
1715
+ },
1716
+ {
1717
+ "epoch": 0.8290556900726392,
1718
+ "grad_norm": 0.21133121848106384,
1719
+ "learning_rate": 8.796059677819773e-06,
1720
+ "loss": 0.31384018063545227,
1721
+ "step": 214,
1722
+ "token_acc": 0.8909493414116798
1723
+ },
1724
+ {
1725
+ "epoch": 0.8329297820823245,
1726
+ "grad_norm": 0.3206462264060974,
1727
+ "learning_rate": 8.781961794870903e-06,
1728
+ "loss": 0.30939990282058716,
1729
+ "step": 215,
1730
+ "token_acc": 0.8926290243396312
1731
+ },
1732
+ {
1733
+ "epoch": 0.8368038740920097,
1734
+ "grad_norm": 0.21380406618118286,
1735
+ "learning_rate": 8.767793288723137e-06,
1736
+ "loss": 0.3126541078090668,
1737
+ "step": 216,
1738
+ "token_acc": 0.8918149018414423
1739
+ },
1740
+ {
1741
+ "epoch": 0.8406779661016949,
1742
+ "grad_norm": 0.2241922914981842,
1743
+ "learning_rate": 8.753554423954828e-06,
1744
+ "loss": 0.32906076312065125,
1745
+ "step": 217,
1746
+ "token_acc": 0.8866828065863777
1747
+ },
1748
+ {
1749
+ "epoch": 0.8445520581113801,
1750
+ "grad_norm": 0.19776619970798492,
1751
+ "learning_rate": 8.739245466458187e-06,
1752
+ "loss": 0.28062158823013306,
1753
+ "step": 218,
1754
+ "token_acc": 0.9022684784065322
1755
+ },
1756
+ {
1757
+ "epoch": 0.8484261501210654,
1758
+ "grad_norm": 0.2141999900341034,
1759
+ "learning_rate": 8.72486668343431e-06,
1760
+ "loss": 0.3276277184486389,
1761
+ "step": 219,
1762
+ "token_acc": 0.8861141792995992
1763
+ },
1764
+ {
1765
+ "epoch": 0.8523002421307506,
1766
+ "grad_norm": 0.2332129180431366,
1767
+ "learning_rate": 8.7104183433882e-06,
1768
+ "loss": 0.3168509304523468,
1769
+ "step": 220,
1770
+ "token_acc": 0.8899989570826125
1771
+ },
1772
+ {
1773
+ "epoch": 0.8561743341404359,
1774
+ "grad_norm": 0.2141677886247635,
1775
+ "learning_rate": 8.695900716123744e-06,
1776
+ "loss": 0.3259914219379425,
1777
+ "step": 221,
1778
+ "token_acc": 0.8866733094194235
1779
+ },
1780
+ {
1781
+ "epoch": 0.8600484261501211,
1782
+ "grad_norm": 0.20929858088493347,
1783
+ "learning_rate": 8.681314072738678e-06,
1784
+ "loss": 0.2776751220226288,
1785
+ "step": 222,
1786
+ "token_acc": 0.9029569916163804
1787
+ },
1788
+ {
1789
+ "epoch": 0.8639225181598063,
1790
+ "grad_norm": 0.26802197098731995,
1791
+ "learning_rate": 8.666658685619523e-06,
1792
+ "loss": 0.3192378282546997,
1793
+ "step": 223,
1794
+ "token_acc": 0.8888524656782731
1795
+ },
1796
+ {
1797
+ "epoch": 0.8677966101694915,
1798
+ "grad_norm": 0.19303195178508759,
1799
+ "learning_rate": 8.651934828436497e-06,
1800
+ "loss": 0.2820873260498047,
1801
+ "step": 224,
1802
+ "token_acc": 0.9010663601046539
1803
+ },
1804
+ {
1805
+ "epoch": 0.8716707021791767,
1806
+ "grad_norm": 0.20784462988376617,
1807
+ "learning_rate": 8.637142776138415e-06,
1808
+ "loss": 0.2850268483161926,
1809
+ "step": 225,
1810
+ "token_acc": 0.9003609394726915
1811
+ },
1812
+ {
1813
+ "epoch": 0.875544794188862,
1814
+ "grad_norm": 0.2194257229566574,
1815
+ "learning_rate": 8.622282804947537e-06,
1816
+ "loss": 0.31484997272491455,
1817
+ "step": 226,
1818
+ "token_acc": 0.8909253202507496
1819
+ },
1820
+ {
1821
+ "epoch": 0.8794188861985472,
1822
+ "grad_norm": 0.21197804808616638,
1823
+ "learning_rate": 8.607355192354425e-06,
1824
+ "loss": 0.3072202801704407,
1825
+ "step": 227,
1826
+ "token_acc": 0.8929364556285221
1827
+ },
1828
+ {
1829
+ "epoch": 0.8832929782082325,
1830
+ "grad_norm": 0.19514977931976318,
1831
+ "learning_rate": 8.592360217112759e-06,
1832
+ "loss": 0.31343895196914673,
1833
+ "step": 228,
1834
+ "token_acc": 0.8909144611151198
1835
+ },
1836
+ {
1837
+ "epoch": 0.8871670702179176,
1838
+ "grad_norm": 0.2198445200920105,
1839
+ "learning_rate": 8.57729815923412e-06,
1840
+ "loss": 0.31176120042800903,
1841
+ "step": 229,
1842
+ "token_acc": 0.8916788161998124
1843
+ },
1844
+ {
1845
+ "epoch": 0.8910411622276029,
1846
+ "grad_norm": 0.20297633111476898,
1847
+ "learning_rate": 8.562169299982776e-06,
1848
+ "loss": 0.30840498208999634,
1849
+ "step": 230,
1850
+ "token_acc": 0.8921534903182912
1851
+ },
1852
+ {
1853
+ "epoch": 0.8949152542372881,
1854
+ "grad_norm": 0.21356205642223358,
1855
+ "learning_rate": 8.546973921870421e-06,
1856
+ "loss": 0.3210839629173279,
1857
+ "step": 231,
1858
+ "token_acc": 0.8882864775840541
1859
+ },
1860
+ {
1861
+ "epoch": 0.8987893462469734,
1862
+ "grad_norm": 0.21405935287475586,
1863
+ "learning_rate": 8.531712308650904e-06,
1864
+ "loss": 0.3006952702999115,
1865
+ "step": 232,
1866
+ "token_acc": 0.8953128142705267
1867
+ },
1868
+ {
1869
+ "epoch": 0.9026634382566586,
1870
+ "grad_norm": 0.21220295131206512,
1871
+ "learning_rate": 8.516384745314926e-06,
1872
+ "loss": 0.33272668719291687,
1873
+ "step": 233,
1874
+ "token_acc": 0.8845533899027282
1875
+ },
1876
+ {
1877
+ "epoch": 0.9065375302663439,
1878
+ "grad_norm": 0.19546008110046387,
1879
+ "learning_rate": 8.50099151808472e-06,
1880
+ "loss": 0.26581257581710815,
1881
+ "step": 234,
1882
+ "token_acc": 0.9067262813046539
1883
+ },
1884
+ {
1885
+ "epoch": 0.910411622276029,
1886
+ "grad_norm": 0.2057773917913437,
1887
+ "learning_rate": 8.485532914408712e-06,
1888
+ "loss": 0.2936754524707794,
1889
+ "step": 235,
1890
+ "token_acc": 0.8980145512690381
1891
+ },
1892
+ {
1893
+ "epoch": 0.9142857142857143,
1894
+ "grad_norm": 0.21968601644039154,
1895
+ "learning_rate": 8.470009222956138e-06,
1896
+ "loss": 0.2990136742591858,
1897
+ "step": 236,
1898
+ "token_acc": 0.8944779048351311
1899
+ },
1900
+ {
1901
+ "epoch": 0.9181598062953995,
1902
+ "grad_norm": 0.22149494290351868,
1903
+ "learning_rate": 8.45442073361167e-06,
1904
+ "loss": 0.29907599091529846,
1905
+ "step": 237,
1906
+ "token_acc": 0.8953804266415489
1907
+ },
1908
+ {
1909
+ "epoch": 0.9220338983050848,
1910
+ "grad_norm": 0.18807418644428253,
1911
+ "learning_rate": 8.438767737469995e-06,
1912
+ "loss": 0.2596169412136078,
1913
+ "step": 238,
1914
+ "token_acc": 0.9094668271985952
1915
+ },
1916
+ {
1917
+ "epoch": 0.92590799031477,
1918
+ "grad_norm": 0.2053857445716858,
1919
+ "learning_rate": 8.42305052683038e-06,
1920
+ "loss": 0.320443719625473,
1921
+ "step": 239,
1922
+ "token_acc": 0.8882472950063495
1923
+ },
1924
+ {
1925
+ "epoch": 0.9297820823244553,
1926
+ "grad_norm": 0.19474725425243378,
1927
+ "learning_rate": 8.407269395191216e-06,
1928
+ "loss": 0.29054853320121765,
1929
+ "step": 240,
1930
+ "token_acc": 0.8986681898213217
1931
+ },
1932
+ {
1933
+ "epoch": 0.9336561743341404,
1934
+ "grad_norm": 0.22415153682231903,
1935
+ "learning_rate": 8.391424637244528e-06,
1936
+ "loss": 0.29720863699913025,
1937
+ "step": 241,
1938
+ "token_acc": 0.8967865758573351
1939
+ },
1940
+ {
1941
+ "epoch": 0.9375302663438256,
1942
+ "grad_norm": 0.20295462012290955,
1943
+ "learning_rate": 8.375516548870489e-06,
1944
+ "loss": 0.3213497996330261,
1945
+ "step": 242,
1946
+ "token_acc": 0.8888211973402874
1947
+ },
1948
+ {
1949
+ "epoch": 0.9414043583535109,
1950
+ "grad_norm": 0.235239177942276,
1951
+ "learning_rate": 8.359545427131876e-06,
1952
+ "loss": 0.31140708923339844,
1953
+ "step": 243,
1954
+ "token_acc": 0.8917541696945803
1955
+ },
1956
+ {
1957
+ "epoch": 0.9452784503631961,
1958
+ "grad_norm": 0.21419954299926758,
1959
+ "learning_rate": 8.343511570268541e-06,
1960
+ "loss": 0.3142154812812805,
1961
+ "step": 244,
1962
+ "token_acc": 0.890589961402836
1963
+ },
1964
+ {
1965
+ "epoch": 0.9491525423728814,
1966
+ "grad_norm": 0.20498663187026978,
1967
+ "learning_rate": 8.327415277691824e-06,
1968
+ "loss": 0.3464815020561218,
1969
+ "step": 245,
1970
+ "token_acc": 0.8797665540392294
1971
+ },
1972
+ {
1973
+ "epoch": 0.9530266343825666,
1974
+ "grad_norm": 0.20611073076725006,
1975
+ "learning_rate": 8.311256849978974e-06,
1976
+ "loss": 0.31497207283973694,
1977
+ "step": 246,
1978
+ "token_acc": 0.889790752866034
1979
+ },
1980
+ {
1981
+ "epoch": 0.9569007263922518,
1982
+ "grad_norm": 0.21447882056236267,
1983
+ "learning_rate": 8.295036588867533e-06,
1984
+ "loss": 0.28588759899139404,
1985
+ "step": 247,
1986
+ "token_acc": 0.8993494375908707
1987
+ },
1988
+ {
1989
+ "epoch": 0.960774818401937,
1990
+ "grad_norm": 0.21430622041225433,
1991
+ "learning_rate": 8.278754797249702e-06,
1992
+ "loss": 0.3209206461906433,
1993
+ "step": 248,
1994
+ "token_acc": 0.8878057052632179
1995
+ },
1996
+ {
1997
+ "epoch": 0.9646489104116223,
1998
+ "grad_norm": 0.1971716433763504,
1999
+ "learning_rate": 8.262411779166681e-06,
2000
+ "loss": 0.29577910900115967,
2001
+ "step": 249,
2002
+ "token_acc": 0.8970768255184925
2003
+ },
2004
+ {
2005
+ "epoch": 0.9685230024213075,
2006
+ "grad_norm": 0.20728042721748352,
2007
+ "learning_rate": 8.246007839802997e-06,
2008
+ "loss": 0.3149109482765198,
2009
+ "step": 250,
2010
+ "token_acc": 0.8904120076852685
2011
+ },
2012
+ {
2013
+ "epoch": 0.9723970944309928,
2014
+ "grad_norm": 0.23157289624214172,
2015
+ "learning_rate": 8.229543285480797e-06,
2016
+ "loss": 0.3057391047477722,
2017
+ "step": 251,
2018
+ "token_acc": 0.8943966929583815
2019
+ },
2020
+ {
2021
+ "epoch": 0.976271186440678,
2022
+ "grad_norm": 0.21818409860134125,
2023
+ "learning_rate": 8.213018423654144e-06,
2024
+ "loss": 0.3090881109237671,
2025
+ "step": 252,
2026
+ "token_acc": 0.8931029437419457
2027
+ },
2028
+ {
2029
+ "epoch": 0.9801452784503631,
2030
+ "grad_norm": 0.20345434546470642,
2031
+ "learning_rate": 8.196433562903252e-06,
2032
+ "loss": 0.2966330051422119,
2033
+ "step": 253,
2034
+ "token_acc": 0.8959465166900704
2035
+ },
2036
+ {
2037
+ "epoch": 0.9840193704600484,
2038
+ "grad_norm": 0.203868567943573,
2039
+ "learning_rate": 8.179789012928747e-06,
2040
+ "loss": 0.2893424928188324,
2041
+ "step": 254,
2042
+ "token_acc": 0.8989887993032385
2043
+ },
2044
+ {
2045
+ "epoch": 0.9878934624697336,
2046
+ "grad_norm": 0.20835842192173004,
2047
+ "learning_rate": 8.163085084545867e-06,
2048
+ "loss": 0.29561957716941833,
2049
+ "step": 255,
2050
+ "token_acc": 0.897130295078995
2051
+ },
2052
+ {
2053
+ "epoch": 0.9917675544794189,
2054
+ "grad_norm": 0.2602974772453308,
2055
+ "learning_rate": 8.146322089678668e-06,
2056
+ "loss": 0.33309951424598694,
2057
+ "step": 256,
2058
+ "token_acc": 0.8842519179704944
2059
+ },
2060
+ {
2061
+ "epoch": 0.9956416464891041,
2062
+ "grad_norm": 0.1993730664253235,
2063
+ "learning_rate": 8.129500341354192e-06,
2064
+ "loss": 0.32513946294784546,
2065
+ "step": 257,
2066
+ "token_acc": 0.8869922494628838
2067
+ },
2068
+ {
2069
+ "epoch": 0.9995157384987894,
2070
+ "grad_norm": 0.2033330649137497,
2071
+ "learning_rate": 8.11262015369663e-06,
2072
+ "loss": 0.29512181878089905,
2073
+ "step": 258,
2074
+ "token_acc": 0.8968425014801387
2075
+ },
2076
+ {
2077
+ "epoch": 1.0,
2078
+ "grad_norm": 0.6673643589019775,
2079
+ "learning_rate": 8.095681841921441e-06,
2080
+ "loss": 0.28728920221328735,
2081
+ "step": 259,
2082
+ "token_acc": 0.9003083713758805
2083
+ },
2084
+ {
2085
+ "epoch": 1.0038740920096851,
2086
+ "grad_norm": 0.32744893431663513,
2087
+ "learning_rate": 8.07868572232949e-06,
2088
+ "loss": 0.269972562789917,
2089
+ "step": 260,
2090
+ "token_acc": 0.9038492097273063
2091
+ },
2092
+ {
2093
+ "epoch": 1.0077481840193705,
2094
+ "grad_norm": 0.2596898376941681,
2095
+ "learning_rate": 8.061632112301122e-06,
2096
+ "loss": 0.2655790150165558,
2097
+ "step": 261,
2098
+ "token_acc": 0.9053338855906853
2099
+ },
2100
+ {
2101
+ "epoch": 1.0116222760290556,
2102
+ "grad_norm": 0.2612839639186859,
2103
+ "learning_rate": 8.044521330290235e-06,
2104
+ "loss": 0.2887282967567444,
2105
+ "step": 262,
2106
+ "token_acc": 0.8971828029711167
2107
+ },
2108
+ {
2109
+ "epoch": 1.015496368038741,
2110
+ "grad_norm": 0.2769652009010315,
2111
+ "learning_rate": 8.027353695818345e-06,
2112
+ "loss": 0.26126527786254883,
2113
+ "step": 263,
2114
+ "token_acc": 0.9065780969019781
2115
+ },
2116
+ {
2117
+ "epoch": 1.0193704600484261,
2118
+ "grad_norm": 0.27929142117500305,
2119
+ "learning_rate": 8.010129529468614e-06,
2120
+ "loss": 0.27868735790252686,
2121
+ "step": 264,
2122
+ "token_acc": 0.9001419249114798
2123
+ },
2124
+ {
2125
+ "epoch": 1.0232445520581113,
2126
+ "grad_norm": 0.23997750878334045,
2127
+ "learning_rate": 7.992849152879857e-06,
2128
+ "loss": 0.2831759750843048,
2129
+ "step": 265,
2130
+ "token_acc": 0.899304001670737
2131
+ },
2132
+ {
2133
+ "epoch": 1.0271186440677966,
2134
+ "grad_norm": 0.25313815474510193,
2135
+ "learning_rate": 7.97551288874055e-06,
2136
+ "loss": 0.27934202551841736,
2137
+ "step": 266,
2138
+ "token_acc": 0.9004498805562496
2139
+ },
2140
+ {
2141
+ "epoch": 1.0309927360774818,
2142
+ "grad_norm": 0.23287494480609894,
2143
+ "learning_rate": 7.95812106078279e-06,
2144
+ "loss": 0.26112881302833557,
2145
+ "step": 267,
2146
+ "token_acc": 0.9065508038300509
2147
+ },
2148
+ {
2149
+ "epoch": 1.0348668280871671,
2150
+ "grad_norm": 0.22660091519355774,
2151
+ "learning_rate": 7.940673993776258e-06,
2152
+ "loss": 0.2504875063896179,
2153
+ "step": 268,
2154
+ "token_acc": 0.9097140867981872
2155
+ },
2156
+ {
2157
+ "epoch": 1.0387409200968523,
2158
+ "grad_norm": 0.2266615480184555,
2159
+ "learning_rate": 7.923172013522153e-06,
2160
+ "loss": 0.25760790705680847,
2161
+ "step": 269,
2162
+ "token_acc": 0.9073963735109954
2163
+ },
2164
+ {
2165
+ "epoch": 1.0426150121065376,
2166
+ "grad_norm": 0.22593924403190613,
2167
+ "learning_rate": 7.905615446847107e-06,
2168
+ "loss": 0.28686419129371643,
2169
+ "step": 270,
2170
+ "token_acc": 0.8976161305002275
2171
+ },
2172
+ {
2173
+ "epoch": 1.0464891041162228,
2174
+ "grad_norm": 0.2425071895122528,
2175
+ "learning_rate": 7.888004621597079e-06,
2176
+ "loss": 0.2573948800563812,
2177
+ "step": 271,
2178
+ "token_acc": 0.907380557815819
2179
+ },
2180
+ {
2181
+ "epoch": 1.050363196125908,
2182
+ "grad_norm": 0.23996935784816742,
2183
+ "learning_rate": 7.87033986663124e-06,
2184
+ "loss": 0.2808932065963745,
2185
+ "step": 272,
2186
+ "token_acc": 0.8994914728045711
2187
+ },
2188
+ {
2189
+ "epoch": 1.0542372881355933,
2190
+ "grad_norm": 0.25931164622306824,
2191
+ "learning_rate": 7.852621511815825e-06,
2192
+ "loss": 0.26375657320022583,
2193
+ "step": 273,
2194
+ "token_acc": 0.9051297163863579
2195
+ },
2196
+ {
2197
+ "epoch": 1.0581113801452784,
2198
+ "grad_norm": 0.20594951510429382,
2199
+ "learning_rate": 7.834849888017979e-06,
2200
+ "loss": 0.23789554834365845,
2201
+ "step": 274,
2202
+ "token_acc": 0.9142479611743739
2203
+ },
2204
+ {
2205
+ "epoch": 1.0619854721549637,
2206
+ "grad_norm": 0.23315519094467163,
2207
+ "learning_rate": 7.817025327099574e-06,
2208
+ "loss": 0.24684631824493408,
2209
+ "step": 275,
2210
+ "token_acc": 0.9110874200426439
2211
+ },
2212
+ {
2213
+ "epoch": 1.0658595641646489,
2214
+ "grad_norm": 0.2189839482307434,
2215
+ "learning_rate": 7.799148161911013e-06,
2216
+ "loss": 0.2684437334537506,
2217
+ "step": 276,
2218
+ "token_acc": 0.9041172254519392
2219
+ },
2220
+ {
2221
+ "epoch": 1.0697336561743342,
2222
+ "grad_norm": 0.21298226714134216,
2223
+ "learning_rate": 7.781218726285014e-06,
2224
+ "loss": 0.2720562815666199,
2225
+ "step": 277,
2226
+ "token_acc": 0.9027445373018297
2227
+ },
2228
+ {
2229
+ "epoch": 1.0736077481840194,
2230
+ "grad_norm": 0.21282611787319183,
2231
+ "learning_rate": 7.763237355030384e-06,
2232
+ "loss": 0.2579670548439026,
2233
+ "step": 278,
2234
+ "token_acc": 0.9080073119376767
2235
+ },
2236
+ {
2237
+ "epoch": 1.0774818401937045,
2238
+ "grad_norm": 0.21488887071609497,
2239
+ "learning_rate": 7.745204383925753e-06,
2240
+ "loss": 0.2742394804954529,
2241
+ "step": 279,
2242
+ "token_acc": 0.9015262545209174
2243
+ },
2244
+ {
2245
+ "epoch": 1.0813559322033899,
2246
+ "grad_norm": 0.19826629757881165,
2247
+ "learning_rate": 7.727120149713313e-06,
2248
+ "loss": 0.23731666803359985,
2249
+ "step": 280,
2250
+ "token_acc": 0.9146603883445988
2251
+ },
2252
+ {
2253
+ "epoch": 1.085230024213075,
2254
+ "grad_norm": 0.20840346813201904,
2255
+ "learning_rate": 7.708984990092528e-06,
2256
+ "loss": 0.22673961520195007,
2257
+ "step": 281,
2258
+ "token_acc": 0.9184409845576723
2259
+ },
2260
+ {
2261
+ "epoch": 1.0891041162227604,
2262
+ "grad_norm": 0.21199366450309753,
2263
+ "learning_rate": 7.690799243713825e-06,
2264
+ "loss": 0.2788952887058258,
2265
+ "step": 282,
2266
+ "token_acc": 0.9002122640890617
2267
+ },
2268
+ {
2269
+ "epoch": 1.0929782082324455,
2270
+ "grad_norm": 0.23963455855846405,
2271
+ "learning_rate": 7.672563250172278e-06,
2272
+ "loss": 0.2703215479850769,
2273
+ "step": 283,
2274
+ "token_acc": 0.902904561306835
2275
+ },
2276
+ {
2277
+ "epoch": 1.0968523002421307,
2278
+ "grad_norm": 0.20739565789699554,
2279
+ "learning_rate": 7.654277350001255e-06,
2280
+ "loss": 0.2556743621826172,
2281
+ "step": 284,
2282
+ "token_acc": 0.9087778504769448
2283
+ },
2284
+ {
2285
+ "epoch": 1.100726392251816,
2286
+ "grad_norm": 0.3205340504646301,
2287
+ "learning_rate": 7.635941884666072e-06,
2288
+ "loss": 0.2660865783691406,
2289
+ "step": 285,
2290
+ "token_acc": 0.9052546447746934
2291
+ },
2292
+ {
2293
+ "epoch": 1.1046004842615011,
2294
+ "grad_norm": 0.20611628890037537,
2295
+ "learning_rate": 7.617557196557601e-06,
2296
+ "loss": 0.2590142488479614,
2297
+ "step": 286,
2298
+ "token_acc": 0.9070821077566713
2299
+ },
2300
+ {
2301
+ "epoch": 1.1084745762711865,
2302
+ "grad_norm": 0.1932753622531891,
2303
+ "learning_rate": 7.599123628985894e-06,
2304
+ "loss": 0.2396095246076584,
2305
+ "step": 287,
2306
+ "token_acc": 0.9135842317299648
2307
+ },
2308
+ {
2309
+ "epoch": 1.1123486682808716,
2310
+ "grad_norm": 0.21151748299598694,
2311
+ "learning_rate": 7.580641526173758e-06,
2312
+ "loss": 0.2544936537742615,
2313
+ "step": 288,
2314
+ "token_acc": 0.9088854539111634
2315
+ },
2316
+ {
2317
+ "epoch": 1.116222760290557,
2318
+ "grad_norm": 0.1992950737476349,
2319
+ "learning_rate": 7.5621112332503325e-06,
2320
+ "loss": 0.2544850707054138,
2321
+ "step": 289,
2322
+ "token_acc": 0.9090426161294457
2323
+ },
2324
+ {
2325
+ "epoch": 1.1200968523002421,
2326
+ "grad_norm": 0.20908565819263458,
2327
+ "learning_rate": 7.543533096244644e-06,
2328
+ "loss": 0.2762412428855896,
2329
+ "step": 290,
2330
+ "token_acc": 0.9013541447063986
2331
+ },
2332
+ {
2333
+ "epoch": 1.1239709443099273,
2334
+ "grad_norm": 0.2157965451478958,
2335
+ "learning_rate": 7.524907462079149e-06,
2336
+ "loss": 0.25533056259155273,
2337
+ "step": 291,
2338
+ "token_acc": 0.9080176353704462
2339
+ },
2340
+ {
2341
+ "epoch": 1.1278450363196126,
2342
+ "grad_norm": 0.19141145050525665,
2343
+ "learning_rate": 7.506234678563248e-06,
2344
+ "loss": 0.2362717241048813,
2345
+ "step": 292,
2346
+ "token_acc": 0.9155038610363999
2347
+ },
2348
+ {
2349
+ "epoch": 1.1317191283292978,
2350
+ "grad_norm": 0.21533732116222382,
2351
+ "learning_rate": 7.487515094386792e-06,
2352
+ "loss": 0.23099368810653687,
2353
+ "step": 293,
2354
+ "token_acc": 0.9173202498403009
2355
+ },
2356
+ {
2357
+ "epoch": 1.1355932203389831,
2358
+ "grad_norm": 0.20129309594631195,
2359
+ "learning_rate": 7.468749059113578e-06,
2360
+ "loss": 0.26144838333129883,
2361
+ "step": 294,
2362
+ "token_acc": 0.9057641431815713
2363
+ },
2364
+ {
2365
+ "epoch": 1.1394673123486683,
2366
+ "grad_norm": 0.3953739404678345,
2367
+ "learning_rate": 7.449936923174813e-06,
2368
+ "loss": 0.2557257413864136,
2369
+ "step": 295,
2370
+ "token_acc": 0.9087617787160037
2371
+ },
2372
+ {
2373
+ "epoch": 1.1433414043583534,
2374
+ "grad_norm": 0.21214410662651062,
2375
+ "learning_rate": 7.431079037862575e-06,
2376
+ "loss": 0.27983057498931885,
2377
+ "step": 296,
2378
+ "token_acc": 0.8996573827559394
2379
+ },
2380
+ {
2381
+ "epoch": 1.1472154963680388,
2382
+ "grad_norm": 0.20280665159225464,
2383
+ "learning_rate": 7.412175755323254e-06,
2384
+ "loss": 0.2772400677204132,
2385
+ "step": 297,
2386
+ "token_acc": 0.9010093723967251
2387
+ },
2388
+ {
2389
+ "epoch": 1.151089588377724,
2390
+ "grad_norm": 0.21776501834392548,
2391
+ "learning_rate": 7.39322742855097e-06,
2392
+ "loss": 0.24517808854579926,
2393
+ "step": 298,
2394
+ "token_acc": 0.9120538077359621
2395
+ },
2396
+ {
2397
+ "epoch": 1.1549636803874093,
2398
+ "grad_norm": 0.21630938351154327,
2399
+ "learning_rate": 7.374234411380987e-06,
2400
+ "loss": 0.2736694812774658,
2401
+ "step": 299,
2402
+ "token_acc": 0.9020631116999458
2403
+ },
2404
+ {
2405
+ "epoch": 1.1588377723970944,
2406
+ "grad_norm": 0.19338402152061462,
2407
+ "learning_rate": 7.355197058483103e-06,
2408
+ "loss": 0.24092288315296173,
2409
+ "step": 300,
2410
+ "token_acc": 0.9133508019967492
2411
+ }
2412
+ ],
2413
+ "logging_steps": 1,
2414
+ "max_steps": 777,
2415
+ "num_input_tokens_seen": 0,
2416
+ "num_train_epochs": 3,
2417
+ "save_steps": 50,
2418
+ "stateful_callbacks": {
2419
+ "TrainerControl": {
2420
+ "args": {
2421
+ "should_epoch_stop": false,
2422
+ "should_evaluate": false,
2423
+ "should_log": false,
2424
+ "should_save": true,
2425
+ "should_training_stop": false
2426
+ },
2427
+ "attributes": {}
2428
+ }
2429
+ },
2430
+ "total_flos": 1.1233780174946304e+16,
2431
+ "train_batch_size": 1,
2432
+ "trial_name": null,
2433
+ "trial_params": null
2434
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82fc3e8dde96239190e8d51b6089ff008851cd3d6cf4700f6dc0f566e10a996c
3
+ size 8977
video_preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 25165824,
4
+ "shortest_edge": 4096
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "video_processor_type": "Qwen3VLVideoProcessor"
21
+ }