Instructions to use rovdetection/code-1b-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use rovdetection/code-1b-instruct with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("rovdetection/code-1b-instruct", dtype="auto") - Notebooks
- Google Colab
- Kaggle
Training in progress, step 5000, checkpoint
Browse files
last-checkpoint/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 9446744
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e72d16b2e050107874bda34c32842693cb03183fe37e99259fd5f4499db55d7
|
| 3 |
size 9446744
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4879947
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c9789ca8345d90dcacc80a1a783b43cb333b05712d5ff9f32742adcdad67703
|
| 3 |
size 4879947
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6c65cbc045dd0d2fc61664c618dc95af09df46ef33dca72fb52e607162f7cd0
|
| 3 |
size 14917
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14917
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b52aae8bdee498050d557f981556359d1fd46a65c7057f7ff5253cd2856e123
|
| 3 |
size 14917
|
last-checkpoint/scaler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1383
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:82ad8990572ad11a824b7db276c8af49c179ca7e7724b4e6906cd0ae480a80a8
|
| 3 |
size 1383
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ed92728e8486ac6f40cff2848582530afa1f43adb61e60cafa8617d08778617
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -4508,6 +4508,506 @@
|
|
| 4508 |
"mean_token_accuracy": 0.6680058591067791,
|
| 4509 |
"num_tokens": 26735542.0,
|
| 4510 |
"step": 4500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4511 |
}
|
| 4512 |
],
|
| 4513 |
"logging_steps": 10,
|
|
@@ -4522,12 +5022,12 @@
|
|
| 4522 |
"should_evaluate": false,
|
| 4523 |
"should_log": false,
|
| 4524 |
"should_save": true,
|
| 4525 |
-
"should_training_stop":
|
| 4526 |
},
|
| 4527 |
"attributes": {}
|
| 4528 |
}
|
| 4529 |
},
|
| 4530 |
-
"total_flos": 2.
|
| 4531 |
"train_batch_size": 2,
|
| 4532 |
"trial_name": null,
|
| 4533 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 8.591446378680422,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 5000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 4508 |
"mean_token_accuracy": 0.6680058591067791,
|
| 4509 |
"num_tokens": 26735542.0,
|
| 4510 |
"step": 4500
|
| 4511 |
+
},
|
| 4512 |
+
{
|
| 4513 |
+
"entropy": 1.7587152615189552,
|
| 4514 |
+
"epoch": 7.749623898560069,
|
| 4515 |
+
"grad_norm": 0.8131846189498901,
|
| 4516 |
+
"learning_rate": 1.9640000000000002e-05,
|
| 4517 |
+
"loss": 1.798016357421875,
|
| 4518 |
+
"mean_token_accuracy": 0.6655693002045154,
|
| 4519 |
+
"num_tokens": 26796245.0,
|
| 4520 |
+
"step": 4510
|
| 4521 |
+
},
|
| 4522 |
+
{
|
| 4523 |
+
"entropy": 1.7238084524869919,
|
| 4524 |
+
"epoch": 7.766817107242639,
|
| 4525 |
+
"grad_norm": 0.8774024248123169,
|
| 4526 |
+
"learning_rate": 1.924e-05,
|
| 4527 |
+
"loss": 1.7398443222045898,
|
| 4528 |
+
"mean_token_accuracy": 0.6723451249301433,
|
| 4529 |
+
"num_tokens": 26852843.0,
|
| 4530 |
+
"step": 4520
|
| 4531 |
+
},
|
| 4532 |
+
{
|
| 4533 |
+
"entropy": 1.8012757793068885,
|
| 4534 |
+
"epoch": 7.78401031592521,
|
| 4535 |
+
"grad_norm": 0.881601095199585,
|
| 4536 |
+
"learning_rate": 1.8840000000000003e-05,
|
| 4537 |
+
"loss": 1.851584243774414,
|
| 4538 |
+
"mean_token_accuracy": 0.6612551022320986,
|
| 4539 |
+
"num_tokens": 26912327.0,
|
| 4540 |
+
"step": 4530
|
| 4541 |
+
},
|
| 4542 |
+
{
|
| 4543 |
+
"entropy": 1.7035338878631592,
|
| 4544 |
+
"epoch": 7.8012035246077795,
|
| 4545 |
+
"grad_norm": 0.8460244536399841,
|
| 4546 |
+
"learning_rate": 1.8440000000000003e-05,
|
| 4547 |
+
"loss": 1.7524948120117188,
|
| 4548 |
+
"mean_token_accuracy": 0.6760960537940264,
|
| 4549 |
+
"num_tokens": 26971076.0,
|
| 4550 |
+
"step": 4540
|
| 4551 |
+
},
|
| 4552 |
+
{
|
| 4553 |
+
"entropy": 1.6795054778456688,
|
| 4554 |
+
"epoch": 7.81839673329035,
|
| 4555 |
+
"grad_norm": 0.7720061540603638,
|
| 4556 |
+
"learning_rate": 1.804e-05,
|
| 4557 |
+
"loss": 1.70491943359375,
|
| 4558 |
+
"mean_token_accuracy": 0.6768644891679287,
|
| 4559 |
+
"num_tokens": 27031120.0,
|
| 4560 |
+
"step": 4550
|
| 4561 |
+
},
|
| 4562 |
+
{
|
| 4563 |
+
"entropy": 1.775759120285511,
|
| 4564 |
+
"epoch": 7.835589941972921,
|
| 4565 |
+
"grad_norm": 0.8407703638076782,
|
| 4566 |
+
"learning_rate": 1.764e-05,
|
| 4567 |
+
"loss": 1.8208852767944337,
|
| 4568 |
+
"mean_token_accuracy": 0.6638765886425972,
|
| 4569 |
+
"num_tokens": 27089926.0,
|
| 4570 |
+
"step": 4560
|
| 4571 |
+
},
|
| 4572 |
+
{
|
| 4573 |
+
"entropy": 1.7749223679304122,
|
| 4574 |
+
"epoch": 7.852783150655491,
|
| 4575 |
+
"grad_norm": 0.8033788204193115,
|
| 4576 |
+
"learning_rate": 1.724e-05,
|
| 4577 |
+
"loss": 1.8128280639648438,
|
| 4578 |
+
"mean_token_accuracy": 0.6697524327784776,
|
| 4579 |
+
"num_tokens": 27155776.0,
|
| 4580 |
+
"step": 4570
|
| 4581 |
+
},
|
| 4582 |
+
{
|
| 4583 |
+
"entropy": 1.7019891321659089,
|
| 4584 |
+
"epoch": 7.869976359338062,
|
| 4585 |
+
"grad_norm": 0.8756063580513,
|
| 4586 |
+
"learning_rate": 1.684e-05,
|
| 4587 |
+
"loss": 1.752833366394043,
|
| 4588 |
+
"mean_token_accuracy": 0.6720911644399166,
|
| 4589 |
+
"num_tokens": 27213676.0,
|
| 4590 |
+
"step": 4580
|
| 4591 |
+
},
|
| 4592 |
+
{
|
| 4593 |
+
"entropy": 1.7089907452464104,
|
| 4594 |
+
"epoch": 7.8871695680206315,
|
| 4595 |
+
"grad_norm": 0.8547044396400452,
|
| 4596 |
+
"learning_rate": 1.644e-05,
|
| 4597 |
+
"loss": 1.7329090118408204,
|
| 4598 |
+
"mean_token_accuracy": 0.6730512753129005,
|
| 4599 |
+
"num_tokens": 27273812.0,
|
| 4600 |
+
"step": 4590
|
| 4601 |
+
},
|
| 4602 |
+
{
|
| 4603 |
+
"entropy": 1.8000069722533225,
|
| 4604 |
+
"epoch": 7.904362776703202,
|
| 4605 |
+
"grad_norm": 0.8191949725151062,
|
| 4606 |
+
"learning_rate": 1.604e-05,
|
| 4607 |
+
"loss": 1.8508378982543945,
|
| 4608 |
+
"mean_token_accuracy": 0.6602330446243286,
|
| 4609 |
+
"num_tokens": 27334482.0,
|
| 4610 |
+
"step": 4600
|
| 4611 |
+
},
|
| 4612 |
+
{
|
| 4613 |
+
"entropy": 1.6531485810875892,
|
| 4614 |
+
"epoch": 7.921555985385773,
|
| 4615 |
+
"grad_norm": 0.7952063679695129,
|
| 4616 |
+
"learning_rate": 1.5640000000000003e-05,
|
| 4617 |
+
"loss": 1.6732818603515625,
|
| 4618 |
+
"mean_token_accuracy": 0.6840143203735352,
|
| 4619 |
+
"num_tokens": 27390777.0,
|
| 4620 |
+
"step": 4610
|
| 4621 |
+
},
|
| 4622 |
+
{
|
| 4623 |
+
"entropy": 1.7451679170131684,
|
| 4624 |
+
"epoch": 7.938749194068343,
|
| 4625 |
+
"grad_norm": 0.7736355066299438,
|
| 4626 |
+
"learning_rate": 1.5240000000000001e-05,
|
| 4627 |
+
"loss": 1.836105728149414,
|
| 4628 |
+
"mean_token_accuracy": 0.6631482250988483,
|
| 4629 |
+
"num_tokens": 27452458.0,
|
| 4630 |
+
"step": 4620
|
| 4631 |
+
},
|
| 4632 |
+
{
|
| 4633 |
+
"entropy": 1.6219932287931442,
|
| 4634 |
+
"epoch": 7.955942402750914,
|
| 4635 |
+
"grad_norm": 0.7429597973823547,
|
| 4636 |
+
"learning_rate": 1.4840000000000002e-05,
|
| 4637 |
+
"loss": 1.6252763748168946,
|
| 4638 |
+
"mean_token_accuracy": 0.6922797068953515,
|
| 4639 |
+
"num_tokens": 27510793.0,
|
| 4640 |
+
"step": 4630
|
| 4641 |
+
},
|
| 4642 |
+
{
|
| 4643 |
+
"entropy": 1.7097622737288476,
|
| 4644 |
+
"epoch": 7.9731356114334835,
|
| 4645 |
+
"grad_norm": 0.7546749114990234,
|
| 4646 |
+
"learning_rate": 1.444e-05,
|
| 4647 |
+
"loss": 1.7529830932617188,
|
| 4648 |
+
"mean_token_accuracy": 0.6756818048655987,
|
| 4649 |
+
"num_tokens": 27570434.0,
|
| 4650 |
+
"step": 4640
|
| 4651 |
+
},
|
| 4652 |
+
{
|
| 4653 |
+
"entropy": 1.7681476891040802,
|
| 4654 |
+
"epoch": 7.990328820116054,
|
| 4655 |
+
"grad_norm": 0.8919919729232788,
|
| 4656 |
+
"learning_rate": 1.4040000000000001e-05,
|
| 4657 |
+
"loss": 1.8469413757324218,
|
| 4658 |
+
"mean_token_accuracy": 0.6651480123400688,
|
| 4659 |
+
"num_tokens": 27632017.0,
|
| 4660 |
+
"step": 4650
|
| 4661 |
+
},
|
| 4662 |
+
{
|
| 4663 |
+
"entropy": 1.7464849283168842,
|
| 4664 |
+
"epoch": 8.006877283473028,
|
| 4665 |
+
"grad_norm": 0.8629288077354431,
|
| 4666 |
+
"learning_rate": 1.364e-05,
|
| 4667 |
+
"loss": 1.7770162582397462,
|
| 4668 |
+
"mean_token_accuracy": 0.6717489861048661,
|
| 4669 |
+
"num_tokens": 27687721.0,
|
| 4670 |
+
"step": 4660
|
| 4671 |
+
},
|
| 4672 |
+
{
|
| 4673 |
+
"entropy": 1.733792708069086,
|
| 4674 |
+
"epoch": 8.024070492155598,
|
| 4675 |
+
"grad_norm": 0.8012450337409973,
|
| 4676 |
+
"learning_rate": 1.324e-05,
|
| 4677 |
+
"loss": 1.7535259246826171,
|
| 4678 |
+
"mean_token_accuracy": 0.6781957261264324,
|
| 4679 |
+
"num_tokens": 27748609.0,
|
| 4680 |
+
"step": 4670
|
| 4681 |
+
},
|
| 4682 |
+
{
|
| 4683 |
+
"entropy": 1.673891542851925,
|
| 4684 |
+
"epoch": 8.041263700838169,
|
| 4685 |
+
"grad_norm": 0.8763530850410461,
|
| 4686 |
+
"learning_rate": 1.2839999999999999e-05,
|
| 4687 |
+
"loss": 1.7353546142578125,
|
| 4688 |
+
"mean_token_accuracy": 0.6773874297738075,
|
| 4689 |
+
"num_tokens": 27805200.0,
|
| 4690 |
+
"step": 4680
|
| 4691 |
+
},
|
| 4692 |
+
{
|
| 4693 |
+
"entropy": 1.6245143353939056,
|
| 4694 |
+
"epoch": 8.05845690952074,
|
| 4695 |
+
"grad_norm": 0.7880796194076538,
|
| 4696 |
+
"learning_rate": 1.244e-05,
|
| 4697 |
+
"loss": 1.6489152908325195,
|
| 4698 |
+
"mean_token_accuracy": 0.6891307681798935,
|
| 4699 |
+
"num_tokens": 27866189.0,
|
| 4700 |
+
"step": 4690
|
| 4701 |
+
},
|
| 4702 |
+
{
|
| 4703 |
+
"entropy": 1.7772031486034394,
|
| 4704 |
+
"epoch": 8.07565011820331,
|
| 4705 |
+
"grad_norm": 0.894481360912323,
|
| 4706 |
+
"learning_rate": 1.204e-05,
|
| 4707 |
+
"loss": 1.8237220764160156,
|
| 4708 |
+
"mean_token_accuracy": 0.6645158022642136,
|
| 4709 |
+
"num_tokens": 27929040.0,
|
| 4710 |
+
"step": 4700
|
| 4711 |
+
},
|
| 4712 |
+
{
|
| 4713 |
+
"entropy": 1.6911936491727828,
|
| 4714 |
+
"epoch": 8.09284332688588,
|
| 4715 |
+
"grad_norm": 0.8212205171585083,
|
| 4716 |
+
"learning_rate": 1.164e-05,
|
| 4717 |
+
"loss": 1.718613624572754,
|
| 4718 |
+
"mean_token_accuracy": 0.6778515942394734,
|
| 4719 |
+
"num_tokens": 27989259.0,
|
| 4720 |
+
"step": 4710
|
| 4721 |
+
},
|
| 4722 |
+
{
|
| 4723 |
+
"entropy": 1.7341958984732628,
|
| 4724 |
+
"epoch": 8.110036535568451,
|
| 4725 |
+
"grad_norm": 0.8757619261741638,
|
| 4726 |
+
"learning_rate": 1.124e-05,
|
| 4727 |
+
"loss": 1.83496150970459,
|
| 4728 |
+
"mean_token_accuracy": 0.67105031311512,
|
| 4729 |
+
"num_tokens": 28051037.0,
|
| 4730 |
+
"step": 4720
|
| 4731 |
+
},
|
| 4732 |
+
{
|
| 4733 |
+
"entropy": 1.6540620133280755,
|
| 4734 |
+
"epoch": 8.127229744251022,
|
| 4735 |
+
"grad_norm": 0.6871177554130554,
|
| 4736 |
+
"learning_rate": 1.084e-05,
|
| 4737 |
+
"loss": 1.6868721008300782,
|
| 4738 |
+
"mean_token_accuracy": 0.6824644193053245,
|
| 4739 |
+
"num_tokens": 28117218.0,
|
| 4740 |
+
"step": 4730
|
| 4741 |
+
},
|
| 4742 |
+
{
|
| 4743 |
+
"entropy": 1.7760244339704514,
|
| 4744 |
+
"epoch": 8.144422952933592,
|
| 4745 |
+
"grad_norm": 0.8672593832015991,
|
| 4746 |
+
"learning_rate": 1.0440000000000002e-05,
|
| 4747 |
+
"loss": 1.8467548370361329,
|
| 4748 |
+
"mean_token_accuracy": 0.6605620160698891,
|
| 4749 |
+
"num_tokens": 28176643.0,
|
| 4750 |
+
"step": 4740
|
| 4751 |
+
},
|
| 4752 |
+
{
|
| 4753 |
+
"entropy": 1.6998422421514987,
|
| 4754 |
+
"epoch": 8.16161616161616,
|
| 4755 |
+
"grad_norm": 0.9853087663650513,
|
| 4756 |
+
"learning_rate": 1.004e-05,
|
| 4757 |
+
"loss": 1.7283611297607422,
|
| 4758 |
+
"mean_token_accuracy": 0.6775359824299813,
|
| 4759 |
+
"num_tokens": 28234550.0,
|
| 4760 |
+
"step": 4750
|
| 4761 |
+
},
|
| 4762 |
+
{
|
| 4763 |
+
"entropy": 1.7665151111781596,
|
| 4764 |
+
"epoch": 8.178809370298731,
|
| 4765 |
+
"grad_norm": 0.8272210955619812,
|
| 4766 |
+
"learning_rate": 9.640000000000001e-06,
|
| 4767 |
+
"loss": 1.8442218780517579,
|
| 4768 |
+
"mean_token_accuracy": 0.6675057601183653,
|
| 4769 |
+
"num_tokens": 28292004.0,
|
| 4770 |
+
"step": 4760
|
| 4771 |
+
},
|
| 4772 |
+
{
|
| 4773 |
+
"entropy": 1.7351939789950848,
|
| 4774 |
+
"epoch": 8.196002578981302,
|
| 4775 |
+
"grad_norm": 0.8758223652839661,
|
| 4776 |
+
"learning_rate": 9.24e-06,
|
| 4777 |
+
"loss": 1.7823253631591798,
|
| 4778 |
+
"mean_token_accuracy": 0.6717655852437019,
|
| 4779 |
+
"num_tokens": 28351089.0,
|
| 4780 |
+
"step": 4770
|
| 4781 |
+
},
|
| 4782 |
+
{
|
| 4783 |
+
"entropy": 1.7320286817848682,
|
| 4784 |
+
"epoch": 8.213195787663873,
|
| 4785 |
+
"grad_norm": 0.8538162708282471,
|
| 4786 |
+
"learning_rate": 8.840000000000002e-06,
|
| 4787 |
+
"loss": 1.758108139038086,
|
| 4788 |
+
"mean_token_accuracy": 0.6750058546662331,
|
| 4789 |
+
"num_tokens": 28411108.0,
|
| 4790 |
+
"step": 4780
|
| 4791 |
+
},
|
| 4792 |
+
{
|
| 4793 |
+
"entropy": 1.7250167533755303,
|
| 4794 |
+
"epoch": 8.230388996346443,
|
| 4795 |
+
"grad_norm": 0.8055081963539124,
|
| 4796 |
+
"learning_rate": 8.44e-06,
|
| 4797 |
+
"loss": 1.7342365264892579,
|
| 4798 |
+
"mean_token_accuracy": 0.6727670766413212,
|
| 4799 |
+
"num_tokens": 28469910.0,
|
| 4800 |
+
"step": 4790
|
| 4801 |
+
},
|
| 4802 |
+
{
|
| 4803 |
+
"entropy": 1.6715928614139557,
|
| 4804 |
+
"epoch": 8.247582205029014,
|
| 4805 |
+
"grad_norm": 0.8282851576805115,
|
| 4806 |
+
"learning_rate": 8.040000000000001e-06,
|
| 4807 |
+
"loss": 1.7284685134887696,
|
| 4808 |
+
"mean_token_accuracy": 0.6803247310221195,
|
| 4809 |
+
"num_tokens": 28528732.0,
|
| 4810 |
+
"step": 4800
|
| 4811 |
+
},
|
| 4812 |
+
{
|
| 4813 |
+
"entropy": 1.7717369854450227,
|
| 4814 |
+
"epoch": 8.264775413711584,
|
| 4815 |
+
"grad_norm": 0.7199074029922485,
|
| 4816 |
+
"learning_rate": 7.64e-06,
|
| 4817 |
+
"loss": 1.8089387893676758,
|
| 4818 |
+
"mean_token_accuracy": 0.6684400778263807,
|
| 4819 |
+
"num_tokens": 28591231.0,
|
| 4820 |
+
"step": 4810
|
| 4821 |
+
},
|
| 4822 |
+
{
|
| 4823 |
+
"entropy": 1.6829568967223167,
|
| 4824 |
+
"epoch": 8.281968622394155,
|
| 4825 |
+
"grad_norm": 0.8212400674819946,
|
| 4826 |
+
"learning_rate": 7.240000000000001e-06,
|
| 4827 |
+
"loss": 1.6901424407958985,
|
| 4828 |
+
"mean_token_accuracy": 0.6812582932412624,
|
| 4829 |
+
"num_tokens": 28651538.0,
|
| 4830 |
+
"step": 4820
|
| 4831 |
+
},
|
| 4832 |
+
{
|
| 4833 |
+
"entropy": 1.7792557999491692,
|
| 4834 |
+
"epoch": 8.299161831076725,
|
| 4835 |
+
"grad_norm": 0.8251553773880005,
|
| 4836 |
+
"learning_rate": 6.840000000000001e-06,
|
| 4837 |
+
"loss": 1.8440101623535157,
|
| 4838 |
+
"mean_token_accuracy": 0.6635224357247352,
|
| 4839 |
+
"num_tokens": 28713818.0,
|
| 4840 |
+
"step": 4830
|
| 4841 |
+
},
|
| 4842 |
+
{
|
| 4843 |
+
"entropy": 1.6888219453394413,
|
| 4844 |
+
"epoch": 8.316355039759294,
|
| 4845 |
+
"grad_norm": 0.799067497253418,
|
| 4846 |
+
"learning_rate": 6.44e-06,
|
| 4847 |
+
"loss": 1.7452951431274415,
|
| 4848 |
+
"mean_token_accuracy": 0.6766478583216667,
|
| 4849 |
+
"num_tokens": 28771713.0,
|
| 4850 |
+
"step": 4840
|
| 4851 |
+
},
|
| 4852 |
+
{
|
| 4853 |
+
"entropy": 1.6663143932819366,
|
| 4854 |
+
"epoch": 8.333548248441865,
|
| 4855 |
+
"grad_norm": 0.7468796968460083,
|
| 4856 |
+
"learning_rate": 6.040000000000001e-06,
|
| 4857 |
+
"loss": 1.6975286483764649,
|
| 4858 |
+
"mean_token_accuracy": 0.6818139903247357,
|
| 4859 |
+
"num_tokens": 28833584.0,
|
| 4860 |
+
"step": 4850
|
| 4861 |
+
},
|
| 4862 |
+
{
|
| 4863 |
+
"entropy": 1.736840507388115,
|
| 4864 |
+
"epoch": 8.350741457124435,
|
| 4865 |
+
"grad_norm": 0.9168211817741394,
|
| 4866 |
+
"learning_rate": 5.64e-06,
|
| 4867 |
+
"loss": 1.8019765853881835,
|
| 4868 |
+
"mean_token_accuracy": 0.6729365028440952,
|
| 4869 |
+
"num_tokens": 28891158.0,
|
| 4870 |
+
"step": 4860
|
| 4871 |
+
},
|
| 4872 |
+
{
|
| 4873 |
+
"entropy": 1.7159839145839215,
|
| 4874 |
+
"epoch": 8.367934665807006,
|
| 4875 |
+
"grad_norm": 0.8348814249038696,
|
| 4876 |
+
"learning_rate": 5.240000000000001e-06,
|
| 4877 |
+
"loss": 1.7910118103027344,
|
| 4878 |
+
"mean_token_accuracy": 0.67631860896945,
|
| 4879 |
+
"num_tokens": 28948026.0,
|
| 4880 |
+
"step": 4870
|
| 4881 |
+
},
|
| 4882 |
+
{
|
| 4883 |
+
"entropy": 1.7169093780219555,
|
| 4884 |
+
"epoch": 8.385127874489577,
|
| 4885 |
+
"grad_norm": 0.8493881821632385,
|
| 4886 |
+
"learning_rate": 4.84e-06,
|
| 4887 |
+
"loss": 1.7167430877685548,
|
| 4888 |
+
"mean_token_accuracy": 0.6753393478691578,
|
| 4889 |
+
"num_tokens": 29005197.0,
|
| 4890 |
+
"step": 4880
|
| 4891 |
+
},
|
| 4892 |
+
{
|
| 4893 |
+
"entropy": 1.6801239594817161,
|
| 4894 |
+
"epoch": 8.402321083172147,
|
| 4895 |
+
"grad_norm": 0.8069011569023132,
|
| 4896 |
+
"learning_rate": 4.440000000000001e-06,
|
| 4897 |
+
"loss": 1.6674100875854492,
|
| 4898 |
+
"mean_token_accuracy": 0.681441531330347,
|
| 4899 |
+
"num_tokens": 29062454.0,
|
| 4900 |
+
"step": 4890
|
| 4901 |
+
},
|
| 4902 |
+
{
|
| 4903 |
+
"entropy": 1.7267012923955918,
|
| 4904 |
+
"epoch": 8.419514291854718,
|
| 4905 |
+
"grad_norm": 0.8063756823539734,
|
| 4906 |
+
"learning_rate": 4.04e-06,
|
| 4907 |
+
"loss": 1.7544673919677733,
|
| 4908 |
+
"mean_token_accuracy": 0.6745367147028446,
|
| 4909 |
+
"num_tokens": 29121055.0,
|
| 4910 |
+
"step": 4900
|
| 4911 |
+
},
|
| 4912 |
+
{
|
| 4913 |
+
"entropy": 1.6062462359666825,
|
| 4914 |
+
"epoch": 8.436707500537288,
|
| 4915 |
+
"grad_norm": 0.8285024762153625,
|
| 4916 |
+
"learning_rate": 3.6400000000000003e-06,
|
| 4917 |
+
"loss": 1.6273128509521484,
|
| 4918 |
+
"mean_token_accuracy": 0.690464211255312,
|
| 4919 |
+
"num_tokens": 29176963.0,
|
| 4920 |
+
"step": 4910
|
| 4921 |
+
},
|
| 4922 |
+
{
|
| 4923 |
+
"entropy": 1.7958560451865195,
|
| 4924 |
+
"epoch": 8.453900709219859,
|
| 4925 |
+
"grad_norm": 0.8202657103538513,
|
| 4926 |
+
"learning_rate": 3.24e-06,
|
| 4927 |
+
"loss": 1.8311897277832032,
|
| 4928 |
+
"mean_token_accuracy": 0.661663169786334,
|
| 4929 |
+
"num_tokens": 29235880.0,
|
| 4930 |
+
"step": 4920
|
| 4931 |
+
},
|
| 4932 |
+
{
|
| 4933 |
+
"entropy": 1.665907260030508,
|
| 4934 |
+
"epoch": 8.47109391790243,
|
| 4935 |
+
"grad_norm": 0.8672494292259216,
|
| 4936 |
+
"learning_rate": 2.8400000000000003e-06,
|
| 4937 |
+
"loss": 1.6878423690795898,
|
| 4938 |
+
"mean_token_accuracy": 0.6819184564054013,
|
| 4939 |
+
"num_tokens": 29295823.0,
|
| 4940 |
+
"step": 4930
|
| 4941 |
+
},
|
| 4942 |
+
{
|
| 4943 |
+
"entropy": 1.7426866918802262,
|
| 4944 |
+
"epoch": 8.488287126584998,
|
| 4945 |
+
"grad_norm": 0.8398126363754272,
|
| 4946 |
+
"learning_rate": 2.4400000000000004e-06,
|
| 4947 |
+
"loss": 1.810443115234375,
|
| 4948 |
+
"mean_token_accuracy": 0.6639036998152733,
|
| 4949 |
+
"num_tokens": 29355386.0,
|
| 4950 |
+
"step": 4940
|
| 4951 |
+
},
|
| 4952 |
+
{
|
| 4953 |
+
"entropy": 1.6938614405691623,
|
| 4954 |
+
"epoch": 8.505480335267569,
|
| 4955 |
+
"grad_norm": 0.7652584314346313,
|
| 4956 |
+
"learning_rate": 2.0400000000000004e-06,
|
| 4957 |
+
"loss": 1.7690727233886718,
|
| 4958 |
+
"mean_token_accuracy": 0.6737098075449467,
|
| 4959 |
+
"num_tokens": 29414966.0,
|
| 4960 |
+
"step": 4950
|
| 4961 |
+
},
|
| 4962 |
+
{
|
| 4963 |
+
"entropy": 1.7538506165146828,
|
| 4964 |
+
"epoch": 8.52267354395014,
|
| 4965 |
+
"grad_norm": 0.8389163017272949,
|
| 4966 |
+
"learning_rate": 1.6400000000000002e-06,
|
| 4967 |
+
"loss": 1.8067062377929688,
|
| 4968 |
+
"mean_token_accuracy": 0.6728679880499839,
|
| 4969 |
+
"num_tokens": 29472960.0,
|
| 4970 |
+
"step": 4960
|
| 4971 |
+
},
|
| 4972 |
+
{
|
| 4973 |
+
"entropy": 1.7591105610132218,
|
| 4974 |
+
"epoch": 8.53986675263271,
|
| 4975 |
+
"grad_norm": 0.8280366063117981,
|
| 4976 |
+
"learning_rate": 1.24e-06,
|
| 4977 |
+
"loss": 1.7855098724365235,
|
| 4978 |
+
"mean_token_accuracy": 0.6670263484120369,
|
| 4979 |
+
"num_tokens": 29531300.0,
|
| 4980 |
+
"step": 4970
|
| 4981 |
+
},
|
| 4982 |
+
{
|
| 4983 |
+
"entropy": 1.6825189530849456,
|
| 4984 |
+
"epoch": 8.55705996131528,
|
| 4985 |
+
"grad_norm": 0.8177328109741211,
|
| 4986 |
+
"learning_rate": 8.4e-07,
|
| 4987 |
+
"loss": 1.731926727294922,
|
| 4988 |
+
"mean_token_accuracy": 0.6818420931696891,
|
| 4989 |
+
"num_tokens": 29591290.0,
|
| 4990 |
+
"step": 4980
|
| 4991 |
+
},
|
| 4992 |
+
{
|
| 4993 |
+
"entropy": 1.7112577512860299,
|
| 4994 |
+
"epoch": 8.574253169997851,
|
| 4995 |
+
"grad_norm": 0.8413036465644836,
|
| 4996 |
+
"learning_rate": 4.4e-07,
|
| 4997 |
+
"loss": 1.7446353912353516,
|
| 4998 |
+
"mean_token_accuracy": 0.6750271447002888,
|
| 4999 |
+
"num_tokens": 29646086.0,
|
| 5000 |
+
"step": 4990
|
| 5001 |
+
},
|
| 5002 |
+
{
|
| 5003 |
+
"entropy": 1.7419164210557938,
|
| 5004 |
+
"epoch": 8.591446378680422,
|
| 5005 |
+
"grad_norm": 0.9462088346481323,
|
| 5006 |
+
"learning_rate": 4e-08,
|
| 5007 |
+
"loss": 1.7870445251464844,
|
| 5008 |
+
"mean_token_accuracy": 0.666933435574174,
|
| 5009 |
+
"num_tokens": 29704815.0,
|
| 5010 |
+
"step": 5000
|
| 5011 |
}
|
| 5012 |
],
|
| 5013 |
"logging_steps": 10,
|
|
|
|
| 5022 |
"should_evaluate": false,
|
| 5023 |
"should_log": false,
|
| 5024 |
"should_save": true,
|
| 5025 |
+
"should_training_stop": true
|
| 5026 |
},
|
| 5027 |
"attributes": {}
|
| 5028 |
}
|
| 5029 |
},
|
| 5030 |
+
"total_flos": 2.438188209453138e+17,
|
| 5031 |
"train_batch_size": 2,
|
| 5032 |
"trial_name": null,
|
| 5033 |
"trial_params": null
|