Training in progress, step 5500, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fcb80f83cde4a31bb60c1fd7260ffe3f7e16f618b67202dd29fd631a03093894
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:deac3ee60db6adb45d1da1976f4f679efdf8206065175afc58ada5c695ccf6a5
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f4f36f1c6d7eb84c738a082911123d4e08f6356fc8093bb45612eb211d0cfe74
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4508,6 +4508,456 @@
|
|
| 4508 |
"mean_token_accuracy": 0.800259780883789,
|
| 4509 |
"num_tokens": 5541015.0,
|
| 4510 |
"step": 5000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4511 |
}
|
| 4512 |
],
|
| 4513 |
"logging_steps": 10,
|
|
@@ -4527,7 +4977,7 @@
|
|
| 4527 |
"attributes": {}
|
| 4528 |
}
|
| 4529 |
},
|
| 4530 |
-
"total_flos":
|
| 4531 |
"train_batch_size": 8,
|
| 4532 |
"trial_name": null,
|
| 4533 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.1082006850695145,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 5500,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4508 |
"mean_token_accuracy": 0.800259780883789,
|
| 4509 |
"num_tokens": 5541015.0,
|
| 4510 |
"step": 5000
|
| 4511 |
+
},
|
| 4512 |
+
{
|
| 4513 |
+
"epoch": 1.0094700785815032,
|
| 4514 |
+
"grad_norm": 9.875,
|
| 4515 |
+
"learning_rate": 1.3271542749680975e-05,
|
| 4516 |
+
"loss": 0.9383,
|
| 4517 |
+
"mean_token_accuracy": 0.7706878125667572,
|
| 4518 |
+
"num_tokens": 5553090.0,
|
| 4519 |
+
"step": 5010
|
| 4520 |
+
},
|
| 4521 |
+
{
|
| 4522 |
+
"epoch": 1.011484988917993,
|
| 4523 |
+
"grad_norm": 12.0625,
|
| 4524 |
+
"learning_rate": 1.3258110014104373e-05,
|
| 4525 |
+
"loss": 0.8471,
|
| 4526 |
+
"mean_token_accuracy": 0.7942995607852936,
|
| 4527 |
+
"num_tokens": 5564220.0,
|
| 4528 |
+
"step": 5020
|
| 4529 |
+
},
|
| 4530 |
+
{
|
| 4531 |
+
"epoch": 1.0134998992544832,
|
| 4532 |
+
"grad_norm": 10.375,
|
| 4533 |
+
"learning_rate": 1.3244677278527773e-05,
|
| 4534 |
+
"loss": 0.8018,
|
| 4535 |
+
"mean_token_accuracy": 0.7939584195613861,
|
| 4536 |
+
"num_tokens": 5574966.0,
|
| 4537 |
+
"step": 5030
|
| 4538 |
+
},
|
| 4539 |
+
{
|
| 4540 |
+
"epoch": 1.0155148095909732,
|
| 4541 |
+
"grad_norm": 12.625,
|
| 4542 |
+
"learning_rate": 1.3231244542951174e-05,
|
| 4543 |
+
"loss": 0.9348,
|
| 4544 |
+
"mean_token_accuracy": 0.7773958921432496,
|
| 4545 |
+
"num_tokens": 5586140.0,
|
| 4546 |
+
"step": 5040
|
| 4547 |
+
},
|
| 4548 |
+
{
|
| 4549 |
+
"epoch": 1.0175297199274633,
|
| 4550 |
+
"grad_norm": 9.625,
|
| 4551 |
+
"learning_rate": 1.3217811807374572e-05,
|
| 4552 |
+
"loss": 0.8882,
|
| 4553 |
+
"mean_token_accuracy": 0.7792610108852387,
|
| 4554 |
+
"num_tokens": 5597440.0,
|
| 4555 |
+
"step": 5050
|
| 4556 |
+
},
|
| 4557 |
+
{
|
| 4558 |
+
"epoch": 1.0195446302639533,
|
| 4559 |
+
"grad_norm": 11.75,
|
| 4560 |
+
"learning_rate": 1.3204379071797973e-05,
|
| 4561 |
+
"loss": 0.7882,
|
| 4562 |
+
"mean_token_accuracy": 0.8046412229537964,
|
| 4563 |
+
"num_tokens": 5609321.0,
|
| 4564 |
+
"step": 5060
|
| 4565 |
+
},
|
| 4566 |
+
{
|
| 4567 |
+
"epoch": 1.0215595406004432,
|
| 4568 |
+
"grad_norm": 9.4375,
|
| 4569 |
+
"learning_rate": 1.3190946336221373e-05,
|
| 4570 |
+
"loss": 0.8062,
|
| 4571 |
+
"mean_token_accuracy": 0.7952991247177124,
|
| 4572 |
+
"num_tokens": 5619194.0,
|
| 4573 |
+
"step": 5070
|
| 4574 |
+
},
|
| 4575 |
+
{
|
| 4576 |
+
"epoch": 1.0235744509369333,
|
| 4577 |
+
"grad_norm": 12.4375,
|
| 4578 |
+
"learning_rate": 1.3177513600644774e-05,
|
| 4579 |
+
"loss": 0.921,
|
| 4580 |
+
"mean_token_accuracy": 0.7800089240074157,
|
| 4581 |
+
"num_tokens": 5631065.0,
|
| 4582 |
+
"step": 5080
|
| 4583 |
+
},
|
| 4584 |
+
{
|
| 4585 |
+
"epoch": 1.0255893612734233,
|
| 4586 |
+
"grad_norm": 10.875,
|
| 4587 |
+
"learning_rate": 1.3164080865068171e-05,
|
| 4588 |
+
"loss": 0.799,
|
| 4589 |
+
"mean_token_accuracy": 0.8071331679821014,
|
| 4590 |
+
"num_tokens": 5642580.0,
|
| 4591 |
+
"step": 5090
|
| 4592 |
+
},
|
| 4593 |
+
{
|
| 4594 |
+
"epoch": 1.0276042716099134,
|
| 4595 |
+
"grad_norm": 10.1875,
|
| 4596 |
+
"learning_rate": 1.3150648129491572e-05,
|
| 4597 |
+
"loss": 0.7776,
|
| 4598 |
+
"mean_token_accuracy": 0.8046740829944611,
|
| 4599 |
+
"num_tokens": 5651910.0,
|
| 4600 |
+
"step": 5100
|
| 4601 |
+
},
|
| 4602 |
+
{
|
| 4603 |
+
"epoch": 1.0296191819464033,
|
| 4604 |
+
"grad_norm": 14.0,
|
| 4605 |
+
"learning_rate": 1.3137215393914972e-05,
|
| 4606 |
+
"loss": 0.8056,
|
| 4607 |
+
"mean_token_accuracy": 0.8012421131134033,
|
| 4608 |
+
"num_tokens": 5663726.0,
|
| 4609 |
+
"step": 5110
|
| 4610 |
+
},
|
| 4611 |
+
{
|
| 4612 |
+
"epoch": 1.0316340922828935,
|
| 4613 |
+
"grad_norm": 11.5,
|
| 4614 |
+
"learning_rate": 1.3123782658338371e-05,
|
| 4615 |
+
"loss": 0.7681,
|
| 4616 |
+
"mean_token_accuracy": 0.8097535610198975,
|
| 4617 |
+
"num_tokens": 5675558.0,
|
| 4618 |
+
"step": 5120
|
| 4619 |
+
},
|
| 4620 |
+
{
|
| 4621 |
+
"epoch": 1.0336490026193834,
|
| 4622 |
+
"grad_norm": 11.1875,
|
| 4623 |
+
"learning_rate": 1.3110349922761772e-05,
|
| 4624 |
+
"loss": 0.8813,
|
| 4625 |
+
"mean_token_accuracy": 0.7838487148284912,
|
| 4626 |
+
"num_tokens": 5687969.0,
|
| 4627 |
+
"step": 5130
|
| 4628 |
+
},
|
| 4629 |
+
{
|
| 4630 |
+
"epoch": 1.0356639129558736,
|
| 4631 |
+
"grad_norm": 9.3125,
|
| 4632 |
+
"learning_rate": 1.3096917187185172e-05,
|
| 4633 |
+
"loss": 0.9072,
|
| 4634 |
+
"mean_token_accuracy": 0.7834949135780335,
|
| 4635 |
+
"num_tokens": 5700354.0,
|
| 4636 |
+
"step": 5140
|
| 4637 |
+
},
|
| 4638 |
+
{
|
| 4639 |
+
"epoch": 1.0376788232923635,
|
| 4640 |
+
"grad_norm": 14.6875,
|
| 4641 |
+
"learning_rate": 1.3083484451608571e-05,
|
| 4642 |
+
"loss": 0.903,
|
| 4643 |
+
"mean_token_accuracy": 0.7816505491733551,
|
| 4644 |
+
"num_tokens": 5711090.0,
|
| 4645 |
+
"step": 5150
|
| 4646 |
+
},
|
| 4647 |
+
{
|
| 4648 |
+
"epoch": 1.0396937336288534,
|
| 4649 |
+
"grad_norm": 8.9375,
|
| 4650 |
+
"learning_rate": 1.3070051716031971e-05,
|
| 4651 |
+
"loss": 0.7961,
|
| 4652 |
+
"mean_token_accuracy": 0.8029458582401275,
|
| 4653 |
+
"num_tokens": 5721667.0,
|
| 4654 |
+
"step": 5160
|
| 4655 |
+
},
|
| 4656 |
+
{
|
| 4657 |
+
"epoch": 1.0417086439653436,
|
| 4658 |
+
"grad_norm": 10.8125,
|
| 4659 |
+
"learning_rate": 1.305661898045537e-05,
|
| 4660 |
+
"loss": 0.8394,
|
| 4661 |
+
"mean_token_accuracy": 0.7979920387268067,
|
| 4662 |
+
"num_tokens": 5733015.0,
|
| 4663 |
+
"step": 5170
|
| 4664 |
+
},
|
| 4665 |
+
{
|
| 4666 |
+
"epoch": 1.0437235543018335,
|
| 4667 |
+
"grad_norm": 11.1875,
|
| 4668 |
+
"learning_rate": 1.304318624487877e-05,
|
| 4669 |
+
"loss": 0.8749,
|
| 4670 |
+
"mean_token_accuracy": 0.7899072051048279,
|
| 4671 |
+
"num_tokens": 5743473.0,
|
| 4672 |
+
"step": 5180
|
| 4673 |
+
},
|
| 4674 |
+
{
|
| 4675 |
+
"epoch": 1.0457384646383237,
|
| 4676 |
+
"grad_norm": 11.0625,
|
| 4677 |
+
"learning_rate": 1.302975350930217e-05,
|
| 4678 |
+
"loss": 0.8553,
|
| 4679 |
+
"mean_token_accuracy": 0.7900504052639008,
|
| 4680 |
+
"num_tokens": 5754579.0,
|
| 4681 |
+
"step": 5190
|
| 4682 |
+
},
|
| 4683 |
+
{
|
| 4684 |
+
"epoch": 1.0477533749748136,
|
| 4685 |
+
"grad_norm": 10.9375,
|
| 4686 |
+
"learning_rate": 1.301632077372557e-05,
|
| 4687 |
+
"loss": 0.8735,
|
| 4688 |
+
"mean_token_accuracy": 0.7891764640808105,
|
| 4689 |
+
"num_tokens": 5765340.0,
|
| 4690 |
+
"step": 5200
|
| 4691 |
+
},
|
| 4692 |
+
{
|
| 4693 |
+
"epoch": 1.0497682853113037,
|
| 4694 |
+
"grad_norm": 9.0,
|
| 4695 |
+
"learning_rate": 1.300288803814897e-05,
|
| 4696 |
+
"loss": 0.7709,
|
| 4697 |
+
"mean_token_accuracy": 0.8050879895687103,
|
| 4698 |
+
"num_tokens": 5775710.0,
|
| 4699 |
+
"step": 5210
|
| 4700 |
+
},
|
| 4701 |
+
{
|
| 4702 |
+
"epoch": 1.0517831956477937,
|
| 4703 |
+
"grad_norm": 18.25,
|
| 4704 |
+
"learning_rate": 1.298945530257237e-05,
|
| 4705 |
+
"loss": 0.7335,
|
| 4706 |
+
"mean_token_accuracy": 0.8071872234344483,
|
| 4707 |
+
"num_tokens": 5785996.0,
|
| 4708 |
+
"step": 5220
|
| 4709 |
+
},
|
| 4710 |
+
{
|
| 4711 |
+
"epoch": 1.0537981059842838,
|
| 4712 |
+
"grad_norm": 13.375,
|
| 4713 |
+
"learning_rate": 1.297602256699577e-05,
|
| 4714 |
+
"loss": 0.877,
|
| 4715 |
+
"mean_token_accuracy": 0.7817419946193696,
|
| 4716 |
+
"num_tokens": 5796629.0,
|
| 4717 |
+
"step": 5230
|
| 4718 |
+
},
|
| 4719 |
+
{
|
| 4720 |
+
"epoch": 1.0558130163207737,
|
| 4721 |
+
"grad_norm": 11.1875,
|
| 4722 |
+
"learning_rate": 1.296258983141917e-05,
|
| 4723 |
+
"loss": 0.7858,
|
| 4724 |
+
"mean_token_accuracy": 0.8022194325923919,
|
| 4725 |
+
"num_tokens": 5806790.0,
|
| 4726 |
+
"step": 5240
|
| 4727 |
+
},
|
| 4728 |
+
{
|
| 4729 |
+
"epoch": 1.0578279266572637,
|
| 4730 |
+
"grad_norm": 11.1875,
|
| 4731 |
+
"learning_rate": 1.2949157095842568e-05,
|
| 4732 |
+
"loss": 0.8409,
|
| 4733 |
+
"mean_token_accuracy": 0.7854238629341126,
|
| 4734 |
+
"num_tokens": 5818974.0,
|
| 4735 |
+
"step": 5250
|
| 4736 |
+
},
|
| 4737 |
+
{
|
| 4738 |
+
"epoch": 1.0598428369937538,
|
| 4739 |
+
"grad_norm": 13.0,
|
| 4740 |
+
"learning_rate": 1.2935724360265968e-05,
|
| 4741 |
+
"loss": 0.7023,
|
| 4742 |
+
"mean_token_accuracy": 0.8206122577190399,
|
| 4743 |
+
"num_tokens": 5828962.0,
|
| 4744 |
+
"step": 5260
|
| 4745 |
+
},
|
| 4746 |
+
{
|
| 4747 |
+
"epoch": 1.0618577473302437,
|
| 4748 |
+
"grad_norm": 12.75,
|
| 4749 |
+
"learning_rate": 1.2922291624689369e-05,
|
| 4750 |
+
"loss": 0.8116,
|
| 4751 |
+
"mean_token_accuracy": 0.7957081377506257,
|
| 4752 |
+
"num_tokens": 5840475.0,
|
| 4753 |
+
"step": 5270
|
| 4754 |
+
},
|
| 4755 |
+
{
|
| 4756 |
+
"epoch": 1.063872657666734,
|
| 4757 |
+
"grad_norm": 12.375,
|
| 4758 |
+
"learning_rate": 1.290885888911277e-05,
|
| 4759 |
+
"loss": 0.876,
|
| 4760 |
+
"mean_token_accuracy": 0.7848715245723724,
|
| 4761 |
+
"num_tokens": 5851626.0,
|
| 4762 |
+
"step": 5280
|
| 4763 |
+
},
|
| 4764 |
+
{
|
| 4765 |
+
"epoch": 1.0658875680032238,
|
| 4766 |
+
"grad_norm": 12.1875,
|
| 4767 |
+
"learning_rate": 1.2895426153536168e-05,
|
| 4768 |
+
"loss": 0.8648,
|
| 4769 |
+
"mean_token_accuracy": 0.7879779160022735,
|
| 4770 |
+
"num_tokens": 5861745.0,
|
| 4771 |
+
"step": 5290
|
| 4772 |
+
},
|
| 4773 |
+
{
|
| 4774 |
+
"epoch": 1.067902478339714,
|
| 4775 |
+
"grad_norm": 11.5625,
|
| 4776 |
+
"learning_rate": 1.2881993417959569e-05,
|
| 4777 |
+
"loss": 0.7807,
|
| 4778 |
+
"mean_token_accuracy": 0.8065967261791229,
|
| 4779 |
+
"num_tokens": 5871744.0,
|
| 4780 |
+
"step": 5300
|
| 4781 |
+
},
|
| 4782 |
+
{
|
| 4783 |
+
"epoch": 1.069917388676204,
|
| 4784 |
+
"grad_norm": 11.875,
|
| 4785 |
+
"learning_rate": 1.286856068238297e-05,
|
| 4786 |
+
"loss": 0.8184,
|
| 4787 |
+
"mean_token_accuracy": 0.7950898349285126,
|
| 4788 |
+
"num_tokens": 5882570.0,
|
| 4789 |
+
"step": 5310
|
| 4790 |
+
},
|
| 4791 |
+
{
|
| 4792 |
+
"epoch": 1.071932299012694,
|
| 4793 |
+
"grad_norm": 12.125,
|
| 4794 |
+
"learning_rate": 1.2855127946806366e-05,
|
| 4795 |
+
"loss": 0.7624,
|
| 4796 |
+
"mean_token_accuracy": 0.8084113836288452,
|
| 4797 |
+
"num_tokens": 5893477.0,
|
| 4798 |
+
"step": 5320
|
| 4799 |
+
},
|
| 4800 |
+
{
|
| 4801 |
+
"epoch": 1.073947209349184,
|
| 4802 |
+
"grad_norm": 12.0625,
|
| 4803 |
+
"learning_rate": 1.2841695211229767e-05,
|
| 4804 |
+
"loss": 0.8525,
|
| 4805 |
+
"mean_token_accuracy": 0.8004627406597138,
|
| 4806 |
+
"num_tokens": 5906228.0,
|
| 4807 |
+
"step": 5330
|
| 4808 |
+
},
|
| 4809 |
+
{
|
| 4810 |
+
"epoch": 1.075962119685674,
|
| 4811 |
+
"grad_norm": 10.1875,
|
| 4812 |
+
"learning_rate": 1.2828262475653167e-05,
|
| 4813 |
+
"loss": 0.7381,
|
| 4814 |
+
"mean_token_accuracy": 0.815189528465271,
|
| 4815 |
+
"num_tokens": 5917163.0,
|
| 4816 |
+
"step": 5340
|
| 4817 |
+
},
|
| 4818 |
+
{
|
| 4819 |
+
"epoch": 1.077977030022164,
|
| 4820 |
+
"grad_norm": 13.4375,
|
| 4821 |
+
"learning_rate": 1.2814829740076568e-05,
|
| 4822 |
+
"loss": 0.8192,
|
| 4823 |
+
"mean_token_accuracy": 0.7983390390872955,
|
| 4824 |
+
"num_tokens": 5927959.0,
|
| 4825 |
+
"step": 5350
|
| 4826 |
+
},
|
| 4827 |
+
{
|
| 4828 |
+
"epoch": 1.079991940358654,
|
| 4829 |
+
"grad_norm": 11.125,
|
| 4830 |
+
"learning_rate": 1.2801397004499967e-05,
|
| 4831 |
+
"loss": 0.8847,
|
| 4832 |
+
"mean_token_accuracy": 0.7825915396213532,
|
| 4833 |
+
"num_tokens": 5938684.0,
|
| 4834 |
+
"step": 5360
|
| 4835 |
+
},
|
| 4836 |
+
{
|
| 4837 |
+
"epoch": 1.0820068506951441,
|
| 4838 |
+
"grad_norm": 11.625,
|
| 4839 |
+
"learning_rate": 1.2787964268923367e-05,
|
| 4840 |
+
"loss": 0.8451,
|
| 4841 |
+
"mean_token_accuracy": 0.7878111064434051,
|
| 4842 |
+
"num_tokens": 5948765.0,
|
| 4843 |
+
"step": 5370
|
| 4844 |
+
},
|
| 4845 |
+
{
|
| 4846 |
+
"epoch": 1.084021761031634,
|
| 4847 |
+
"grad_norm": 13.0,
|
| 4848 |
+
"learning_rate": 1.2774531533346768e-05,
|
| 4849 |
+
"loss": 0.7971,
|
| 4850 |
+
"mean_token_accuracy": 0.8030431568622589,
|
| 4851 |
+
"num_tokens": 5960108.0,
|
| 4852 |
+
"step": 5380
|
| 4853 |
+
},
|
| 4854 |
+
{
|
| 4855 |
+
"epoch": 1.0860366713681242,
|
| 4856 |
+
"grad_norm": 10.625,
|
| 4857 |
+
"learning_rate": 1.2761098797770167e-05,
|
| 4858 |
+
"loss": 0.8786,
|
| 4859 |
+
"mean_token_accuracy": 0.7854897439479828,
|
| 4860 |
+
"num_tokens": 5972007.0,
|
| 4861 |
+
"step": 5390
|
| 4862 |
+
},
|
| 4863 |
+
{
|
| 4864 |
+
"epoch": 1.0880515817046141,
|
| 4865 |
+
"grad_norm": 10.0625,
|
| 4866 |
+
"learning_rate": 1.2747666062193567e-05,
|
| 4867 |
+
"loss": 0.8395,
|
| 4868 |
+
"mean_token_accuracy": 0.7956344962120057,
|
| 4869 |
+
"num_tokens": 5983211.0,
|
| 4870 |
+
"step": 5400
|
| 4871 |
+
},
|
| 4872 |
+
{
|
| 4873 |
+
"epoch": 1.090066492041104,
|
| 4874 |
+
"grad_norm": 10.5625,
|
| 4875 |
+
"learning_rate": 1.2734233326616968e-05,
|
| 4876 |
+
"loss": 0.9274,
|
| 4877 |
+
"mean_token_accuracy": 0.7794575989246368,
|
| 4878 |
+
"num_tokens": 5995219.0,
|
| 4879 |
+
"step": 5410
|
| 4880 |
+
},
|
| 4881 |
+
{
|
| 4882 |
+
"epoch": 1.0920814023775942,
|
| 4883 |
+
"grad_norm": 13.1875,
|
| 4884 |
+
"learning_rate": 1.2720800591040365e-05,
|
| 4885 |
+
"loss": 0.8251,
|
| 4886 |
+
"mean_token_accuracy": 0.802078241109848,
|
| 4887 |
+
"num_tokens": 6006324.0,
|
| 4888 |
+
"step": 5420
|
| 4889 |
+
},
|
| 4890 |
+
{
|
| 4891 |
+
"epoch": 1.0940963127140841,
|
| 4892 |
+
"grad_norm": 14.0625,
|
| 4893 |
+
"learning_rate": 1.2707367855463765e-05,
|
| 4894 |
+
"loss": 0.8402,
|
| 4895 |
+
"mean_token_accuracy": 0.7896000027656556,
|
| 4896 |
+
"num_tokens": 6017542.0,
|
| 4897 |
+
"step": 5430
|
| 4898 |
+
},
|
| 4899 |
+
{
|
| 4900 |
+
"epoch": 1.0961112230505743,
|
| 4901 |
+
"grad_norm": 11.8125,
|
| 4902 |
+
"learning_rate": 1.2693935119887166e-05,
|
| 4903 |
+
"loss": 0.8307,
|
| 4904 |
+
"mean_token_accuracy": 0.7981148719787597,
|
| 4905 |
+
"num_tokens": 6027523.0,
|
| 4906 |
+
"step": 5440
|
| 4907 |
+
},
|
| 4908 |
+
{
|
| 4909 |
+
"epoch": 1.0981261333870642,
|
| 4910 |
+
"grad_norm": 9.6875,
|
| 4911 |
+
"learning_rate": 1.2680502384310566e-05,
|
| 4912 |
+
"loss": 0.866,
|
| 4913 |
+
"mean_token_accuracy": 0.7834112644195557,
|
| 4914 |
+
"num_tokens": 6038697.0,
|
| 4915 |
+
"step": 5450
|
| 4916 |
+
},
|
| 4917 |
+
{
|
| 4918 |
+
"epoch": 1.1001410437235544,
|
| 4919 |
+
"grad_norm": 11.0625,
|
| 4920 |
+
"learning_rate": 1.2667069648733965e-05,
|
| 4921 |
+
"loss": 0.793,
|
| 4922 |
+
"mean_token_accuracy": 0.7983521819114685,
|
| 4923 |
+
"num_tokens": 6049813.0,
|
| 4924 |
+
"step": 5460
|
| 4925 |
+
},
|
| 4926 |
+
{
|
| 4927 |
+
"epoch": 1.1021559540600443,
|
| 4928 |
+
"grad_norm": 12.0625,
|
| 4929 |
+
"learning_rate": 1.2653636913157366e-05,
|
| 4930 |
+
"loss": 0.7633,
|
| 4931 |
+
"mean_token_accuracy": 0.811886590719223,
|
| 4932 |
+
"num_tokens": 6060176.0,
|
| 4933 |
+
"step": 5470
|
| 4934 |
+
},
|
| 4935 |
+
{
|
| 4936 |
+
"epoch": 1.1041708643965344,
|
| 4937 |
+
"grad_norm": 12.875,
|
| 4938 |
+
"learning_rate": 1.2640204177580766e-05,
|
| 4939 |
+
"loss": 0.8755,
|
| 4940 |
+
"mean_token_accuracy": 0.7823013424873352,
|
| 4941 |
+
"num_tokens": 6069957.0,
|
| 4942 |
+
"step": 5480
|
| 4943 |
+
},
|
| 4944 |
+
{
|
| 4945 |
+
"epoch": 1.1061857747330244,
|
| 4946 |
+
"grad_norm": 12.6875,
|
| 4947 |
+
"learning_rate": 1.2626771442004164e-05,
|
| 4948 |
+
"loss": 0.8468,
|
| 4949 |
+
"mean_token_accuracy": 0.7942144453525544,
|
| 4950 |
+
"num_tokens": 6080224.0,
|
| 4951 |
+
"step": 5490
|
| 4952 |
+
},
|
| 4953 |
+
{
|
| 4954 |
+
"epoch": 1.1082006850695145,
|
| 4955 |
+
"grad_norm": 10.5,
|
| 4956 |
+
"learning_rate": 1.2613338706427564e-05,
|
| 4957 |
+
"loss": 0.8926,
|
| 4958 |
+
"mean_token_accuracy": 0.7852272689342499,
|
| 4959 |
+
"num_tokens": 6091516.0,
|
| 4960 |
+
"step": 5500
|
| 4961 |
}
|
| 4962 |
],
|
| 4963 |
"logging_steps": 10,
|
|
|
|
| 4977 |
"attributes": {}
|
| 4978 |
}
|
| 4979 |
},
|
| 4980 |
+
"total_flos": 7364465716629504.0,
|
| 4981 |
"train_batch_size": 8,
|
| 4982 |
"trial_name": null,
|
| 4983 |
"trial_params": null
|