rovdetection commited on
Commit
a83e93d
·
verified ·
1 Parent(s): 5620747

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b732646b1016d0368b94920529e0e03c133894ca8756d67e145a97d90d254777
3
  size 9446744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e72d16b2e050107874bda34c32842693cb03183fe37e99259fd5f4499db55d7
3
  size 9446744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1728e885cf58302b2e8ae68b6c9f146637db471aa0ed43e5c883bad6235443e
3
  size 4879947
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c9789ca8345d90dcacc80a1a783b43cb333b05712d5ff9f32742adcdad67703
3
  size 4879947
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16de339ad05cf2ba88ca8586907951353749d574c9326b3098589fb0f62ac32e
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6c65cbc045dd0d2fc61664c618dc95af09df46ef33dca72fb52e607162f7cd0
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2cefe33faabb000e8f719c6f02e0099d6289469d78aca45133006441981cd323
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b52aae8bdee498050d557f981556359d1fd46a65c7057f7ff5253cd2856e123
3
  size 14917
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b62db0ba9861d9ab63380744e79a287faa461a1bf55700140a411fe1e976f1cd
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ad8990572ad11a824b7db276c8af49c179ca7e7724b4e6906cd0ae480a80a8
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b41aa0c086667ab13fd1c3da2f8b431d894c7368cafdbcdd2e5351f4800eddf8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed92728e8486ac6f40cff2848582530afa1f43adb61e60cafa8617d08778617
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 7.732430689877498,
6
  "eval_steps": 500,
7
- "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4508,6 +4508,506 @@
4508
  "mean_token_accuracy": 0.6680058591067791,
4509
  "num_tokens": 26735542.0,
4510
  "step": 4500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4511
  }
4512
  ],
4513
  "logging_steps": 10,
@@ -4522,12 +5022,12 @@
4522
  "should_evaluate": false,
4523
  "should_log": false,
4524
  "should_save": true,
4525
- "should_training_stop": false
4526
  },
4527
  "attributes": {}
4528
  }
4529
  },
4530
- "total_flos": 2.19451190411264e+17,
4531
  "train_batch_size": 2,
4532
  "trial_name": null,
4533
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 8.591446378680422,
6
  "eval_steps": 500,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4508
  "mean_token_accuracy": 0.6680058591067791,
4509
  "num_tokens": 26735542.0,
4510
  "step": 4500
4511
+ },
4512
+ {
4513
+ "entropy": 1.7587152615189552,
4514
+ "epoch": 7.749623898560069,
4515
+ "grad_norm": 0.8131846189498901,
4516
+ "learning_rate": 1.9640000000000002e-05,
4517
+ "loss": 1.798016357421875,
4518
+ "mean_token_accuracy": 0.6655693002045154,
4519
+ "num_tokens": 26796245.0,
4520
+ "step": 4510
4521
+ },
4522
+ {
4523
+ "entropy": 1.7238084524869919,
4524
+ "epoch": 7.766817107242639,
4525
+ "grad_norm": 0.8774024248123169,
4526
+ "learning_rate": 1.924e-05,
4527
+ "loss": 1.7398443222045898,
4528
+ "mean_token_accuracy": 0.6723451249301433,
4529
+ "num_tokens": 26852843.0,
4530
+ "step": 4520
4531
+ },
4532
+ {
4533
+ "entropy": 1.8012757793068885,
4534
+ "epoch": 7.78401031592521,
4535
+ "grad_norm": 0.881601095199585,
4536
+ "learning_rate": 1.8840000000000003e-05,
4537
+ "loss": 1.851584243774414,
4538
+ "mean_token_accuracy": 0.6612551022320986,
4539
+ "num_tokens": 26912327.0,
4540
+ "step": 4530
4541
+ },
4542
+ {
4543
+ "entropy": 1.7035338878631592,
4544
+ "epoch": 7.8012035246077795,
4545
+ "grad_norm": 0.8460244536399841,
4546
+ "learning_rate": 1.8440000000000003e-05,
4547
+ "loss": 1.7524948120117188,
4548
+ "mean_token_accuracy": 0.6760960537940264,
4549
+ "num_tokens": 26971076.0,
4550
+ "step": 4540
4551
+ },
4552
+ {
4553
+ "entropy": 1.6795054778456688,
4554
+ "epoch": 7.81839673329035,
4555
+ "grad_norm": 0.7720061540603638,
4556
+ "learning_rate": 1.804e-05,
4557
+ "loss": 1.70491943359375,
4558
+ "mean_token_accuracy": 0.6768644891679287,
4559
+ "num_tokens": 27031120.0,
4560
+ "step": 4550
4561
+ },
4562
+ {
4563
+ "entropy": 1.775759120285511,
4564
+ "epoch": 7.835589941972921,
4565
+ "grad_norm": 0.8407703638076782,
4566
+ "learning_rate": 1.764e-05,
4567
+ "loss": 1.8208852767944337,
4568
+ "mean_token_accuracy": 0.6638765886425972,
4569
+ "num_tokens": 27089926.0,
4570
+ "step": 4560
4571
+ },
4572
+ {
4573
+ "entropy": 1.7749223679304122,
4574
+ "epoch": 7.852783150655491,
4575
+ "grad_norm": 0.8033788204193115,
4576
+ "learning_rate": 1.724e-05,
4577
+ "loss": 1.8128280639648438,
4578
+ "mean_token_accuracy": 0.6697524327784776,
4579
+ "num_tokens": 27155776.0,
4580
+ "step": 4570
4581
+ },
4582
+ {
4583
+ "entropy": 1.7019891321659089,
4584
+ "epoch": 7.869976359338062,
4585
+ "grad_norm": 0.8756063580513,
4586
+ "learning_rate": 1.684e-05,
4587
+ "loss": 1.752833366394043,
4588
+ "mean_token_accuracy": 0.6720911644399166,
4589
+ "num_tokens": 27213676.0,
4590
+ "step": 4580
4591
+ },
4592
+ {
4593
+ "entropy": 1.7089907452464104,
4594
+ "epoch": 7.8871695680206315,
4595
+ "grad_norm": 0.8547044396400452,
4596
+ "learning_rate": 1.644e-05,
4597
+ "loss": 1.7329090118408204,
4598
+ "mean_token_accuracy": 0.6730512753129005,
4599
+ "num_tokens": 27273812.0,
4600
+ "step": 4590
4601
+ },
4602
+ {
4603
+ "entropy": 1.8000069722533225,
4604
+ "epoch": 7.904362776703202,
4605
+ "grad_norm": 0.8191949725151062,
4606
+ "learning_rate": 1.604e-05,
4607
+ "loss": 1.8508378982543945,
4608
+ "mean_token_accuracy": 0.6602330446243286,
4609
+ "num_tokens": 27334482.0,
4610
+ "step": 4600
4611
+ },
4612
+ {
4613
+ "entropy": 1.6531485810875892,
4614
+ "epoch": 7.921555985385773,
4615
+ "grad_norm": 0.7952063679695129,
4616
+ "learning_rate": 1.5640000000000003e-05,
4617
+ "loss": 1.6732818603515625,
4618
+ "mean_token_accuracy": 0.6840143203735352,
4619
+ "num_tokens": 27390777.0,
4620
+ "step": 4610
4621
+ },
4622
+ {
4623
+ "entropy": 1.7451679170131684,
4624
+ "epoch": 7.938749194068343,
4625
+ "grad_norm": 0.7736355066299438,
4626
+ "learning_rate": 1.5240000000000001e-05,
4627
+ "loss": 1.836105728149414,
4628
+ "mean_token_accuracy": 0.6631482250988483,
4629
+ "num_tokens": 27452458.0,
4630
+ "step": 4620
4631
+ },
4632
+ {
4633
+ "entropy": 1.6219932287931442,
4634
+ "epoch": 7.955942402750914,
4635
+ "grad_norm": 0.7429597973823547,
4636
+ "learning_rate": 1.4840000000000002e-05,
4637
+ "loss": 1.6252763748168946,
4638
+ "mean_token_accuracy": 0.6922797068953515,
4639
+ "num_tokens": 27510793.0,
4640
+ "step": 4630
4641
+ },
4642
+ {
4643
+ "entropy": 1.7097622737288476,
4644
+ "epoch": 7.9731356114334835,
4645
+ "grad_norm": 0.7546749114990234,
4646
+ "learning_rate": 1.444e-05,
4647
+ "loss": 1.7529830932617188,
4648
+ "mean_token_accuracy": 0.6756818048655987,
4649
+ "num_tokens": 27570434.0,
4650
+ "step": 4640
4651
+ },
4652
+ {
4653
+ "entropy": 1.7681476891040802,
4654
+ "epoch": 7.990328820116054,
4655
+ "grad_norm": 0.8919919729232788,
4656
+ "learning_rate": 1.4040000000000001e-05,
4657
+ "loss": 1.8469413757324218,
4658
+ "mean_token_accuracy": 0.6651480123400688,
4659
+ "num_tokens": 27632017.0,
4660
+ "step": 4650
4661
+ },
4662
+ {
4663
+ "entropy": 1.7464849283168842,
4664
+ "epoch": 8.006877283473028,
4665
+ "grad_norm": 0.8629288077354431,
4666
+ "learning_rate": 1.364e-05,
4667
+ "loss": 1.7770162582397462,
4668
+ "mean_token_accuracy": 0.6717489861048661,
4669
+ "num_tokens": 27687721.0,
4670
+ "step": 4660
4671
+ },
4672
+ {
4673
+ "entropy": 1.733792708069086,
4674
+ "epoch": 8.024070492155598,
4675
+ "grad_norm": 0.8012450337409973,
4676
+ "learning_rate": 1.324e-05,
4677
+ "loss": 1.7535259246826171,
4678
+ "mean_token_accuracy": 0.6781957261264324,
4679
+ "num_tokens": 27748609.0,
4680
+ "step": 4670
4681
+ },
4682
+ {
4683
+ "entropy": 1.673891542851925,
4684
+ "epoch": 8.041263700838169,
4685
+ "grad_norm": 0.8763530850410461,
4686
+ "learning_rate": 1.2839999999999999e-05,
4687
+ "loss": 1.7353546142578125,
4688
+ "mean_token_accuracy": 0.6773874297738075,
4689
+ "num_tokens": 27805200.0,
4690
+ "step": 4680
4691
+ },
4692
+ {
4693
+ "entropy": 1.6245143353939056,
4694
+ "epoch": 8.05845690952074,
4695
+ "grad_norm": 0.7880796194076538,
4696
+ "learning_rate": 1.244e-05,
4697
+ "loss": 1.6489152908325195,
4698
+ "mean_token_accuracy": 0.6891307681798935,
4699
+ "num_tokens": 27866189.0,
4700
+ "step": 4690
4701
+ },
4702
+ {
4703
+ "entropy": 1.7772031486034394,
4704
+ "epoch": 8.07565011820331,
4705
+ "grad_norm": 0.894481360912323,
4706
+ "learning_rate": 1.204e-05,
4707
+ "loss": 1.8237220764160156,
4708
+ "mean_token_accuracy": 0.6645158022642136,
4709
+ "num_tokens": 27929040.0,
4710
+ "step": 4700
4711
+ },
4712
+ {
4713
+ "entropy": 1.6911936491727828,
4714
+ "epoch": 8.09284332688588,
4715
+ "grad_norm": 0.8212205171585083,
4716
+ "learning_rate": 1.164e-05,
4717
+ "loss": 1.718613624572754,
4718
+ "mean_token_accuracy": 0.6778515942394734,
4719
+ "num_tokens": 27989259.0,
4720
+ "step": 4710
4721
+ },
4722
+ {
4723
+ "entropy": 1.7341958984732628,
4724
+ "epoch": 8.110036535568451,
4725
+ "grad_norm": 0.8757619261741638,
4726
+ "learning_rate": 1.124e-05,
4727
+ "loss": 1.83496150970459,
4728
+ "mean_token_accuracy": 0.67105031311512,
4729
+ "num_tokens": 28051037.0,
4730
+ "step": 4720
4731
+ },
4732
+ {
4733
+ "entropy": 1.6540620133280755,
4734
+ "epoch": 8.127229744251022,
4735
+ "grad_norm": 0.6871177554130554,
4736
+ "learning_rate": 1.084e-05,
4737
+ "loss": 1.6868721008300782,
4738
+ "mean_token_accuracy": 0.6824644193053245,
4739
+ "num_tokens": 28117218.0,
4740
+ "step": 4730
4741
+ },
4742
+ {
4743
+ "entropy": 1.7760244339704514,
4744
+ "epoch": 8.144422952933592,
4745
+ "grad_norm": 0.8672593832015991,
4746
+ "learning_rate": 1.0440000000000002e-05,
4747
+ "loss": 1.8467548370361329,
4748
+ "mean_token_accuracy": 0.6605620160698891,
4749
+ "num_tokens": 28176643.0,
4750
+ "step": 4740
4751
+ },
4752
+ {
4753
+ "entropy": 1.6998422421514987,
4754
+ "epoch": 8.16161616161616,
4755
+ "grad_norm": 0.9853087663650513,
4756
+ "learning_rate": 1.004e-05,
4757
+ "loss": 1.7283611297607422,
4758
+ "mean_token_accuracy": 0.6775359824299813,
4759
+ "num_tokens": 28234550.0,
4760
+ "step": 4750
4761
+ },
4762
+ {
4763
+ "entropy": 1.7665151111781596,
4764
+ "epoch": 8.178809370298731,
4765
+ "grad_norm": 0.8272210955619812,
4766
+ "learning_rate": 9.640000000000001e-06,
4767
+ "loss": 1.8442218780517579,
4768
+ "mean_token_accuracy": 0.6675057601183653,
4769
+ "num_tokens": 28292004.0,
4770
+ "step": 4760
4771
+ },
4772
+ {
4773
+ "entropy": 1.7351939789950848,
4774
+ "epoch": 8.196002578981302,
4775
+ "grad_norm": 0.8758223652839661,
4776
+ "learning_rate": 9.24e-06,
4777
+ "loss": 1.7823253631591798,
4778
+ "mean_token_accuracy": 0.6717655852437019,
4779
+ "num_tokens": 28351089.0,
4780
+ "step": 4770
4781
+ },
4782
+ {
4783
+ "entropy": 1.7320286817848682,
4784
+ "epoch": 8.213195787663873,
4785
+ "grad_norm": 0.8538162708282471,
4786
+ "learning_rate": 8.840000000000002e-06,
4787
+ "loss": 1.758108139038086,
4788
+ "mean_token_accuracy": 0.6750058546662331,
4789
+ "num_tokens": 28411108.0,
4790
+ "step": 4780
4791
+ },
4792
+ {
4793
+ "entropy": 1.7250167533755303,
4794
+ "epoch": 8.230388996346443,
4795
+ "grad_norm": 0.8055081963539124,
4796
+ "learning_rate": 8.44e-06,
4797
+ "loss": 1.7342365264892579,
4798
+ "mean_token_accuracy": 0.6727670766413212,
4799
+ "num_tokens": 28469910.0,
4800
+ "step": 4790
4801
+ },
4802
+ {
4803
+ "entropy": 1.6715928614139557,
4804
+ "epoch": 8.247582205029014,
4805
+ "grad_norm": 0.8282851576805115,
4806
+ "learning_rate": 8.040000000000001e-06,
4807
+ "loss": 1.7284685134887696,
4808
+ "mean_token_accuracy": 0.6803247310221195,
4809
+ "num_tokens": 28528732.0,
4810
+ "step": 4800
4811
+ },
4812
+ {
4813
+ "entropy": 1.7717369854450227,
4814
+ "epoch": 8.264775413711584,
4815
+ "grad_norm": 0.7199074029922485,
4816
+ "learning_rate": 7.64e-06,
4817
+ "loss": 1.8089387893676758,
4818
+ "mean_token_accuracy": 0.6684400778263807,
4819
+ "num_tokens": 28591231.0,
4820
+ "step": 4810
4821
+ },
4822
+ {
4823
+ "entropy": 1.6829568967223167,
4824
+ "epoch": 8.281968622394155,
4825
+ "grad_norm": 0.8212400674819946,
4826
+ "learning_rate": 7.240000000000001e-06,
4827
+ "loss": 1.6901424407958985,
4828
+ "mean_token_accuracy": 0.6812582932412624,
4829
+ "num_tokens": 28651538.0,
4830
+ "step": 4820
4831
+ },
4832
+ {
4833
+ "entropy": 1.7792557999491692,
4834
+ "epoch": 8.299161831076725,
4835
+ "grad_norm": 0.8251553773880005,
4836
+ "learning_rate": 6.840000000000001e-06,
4837
+ "loss": 1.8440101623535157,
4838
+ "mean_token_accuracy": 0.6635224357247352,
4839
+ "num_tokens": 28713818.0,
4840
+ "step": 4830
4841
+ },
4842
+ {
4843
+ "entropy": 1.6888219453394413,
4844
+ "epoch": 8.316355039759294,
4845
+ "grad_norm": 0.799067497253418,
4846
+ "learning_rate": 6.44e-06,
4847
+ "loss": 1.7452951431274415,
4848
+ "mean_token_accuracy": 0.6766478583216667,
4849
+ "num_tokens": 28771713.0,
4850
+ "step": 4840
4851
+ },
4852
+ {
4853
+ "entropy": 1.6663143932819366,
4854
+ "epoch": 8.333548248441865,
4855
+ "grad_norm": 0.7468796968460083,
4856
+ "learning_rate": 6.040000000000001e-06,
4857
+ "loss": 1.6975286483764649,
4858
+ "mean_token_accuracy": 0.6818139903247357,
4859
+ "num_tokens": 28833584.0,
4860
+ "step": 4850
4861
+ },
4862
+ {
4863
+ "entropy": 1.736840507388115,
4864
+ "epoch": 8.350741457124435,
4865
+ "grad_norm": 0.9168211817741394,
4866
+ "learning_rate": 5.64e-06,
4867
+ "loss": 1.8019765853881835,
4868
+ "mean_token_accuracy": 0.6729365028440952,
4869
+ "num_tokens": 28891158.0,
4870
+ "step": 4860
4871
+ },
4872
+ {
4873
+ "entropy": 1.7159839145839215,
4874
+ "epoch": 8.367934665807006,
4875
+ "grad_norm": 0.8348814249038696,
4876
+ "learning_rate": 5.240000000000001e-06,
4877
+ "loss": 1.7910118103027344,
4878
+ "mean_token_accuracy": 0.67631860896945,
4879
+ "num_tokens": 28948026.0,
4880
+ "step": 4870
4881
+ },
4882
+ {
4883
+ "entropy": 1.7169093780219555,
4884
+ "epoch": 8.385127874489577,
4885
+ "grad_norm": 0.8493881821632385,
4886
+ "learning_rate": 4.84e-06,
4887
+ "loss": 1.7167430877685548,
4888
+ "mean_token_accuracy": 0.6753393478691578,
4889
+ "num_tokens": 29005197.0,
4890
+ "step": 4880
4891
+ },
4892
+ {
4893
+ "entropy": 1.6801239594817161,
4894
+ "epoch": 8.402321083172147,
4895
+ "grad_norm": 0.8069011569023132,
4896
+ "learning_rate": 4.440000000000001e-06,
4897
+ "loss": 1.6674100875854492,
4898
+ "mean_token_accuracy": 0.681441531330347,
4899
+ "num_tokens": 29062454.0,
4900
+ "step": 4890
4901
+ },
4902
+ {
4903
+ "entropy": 1.7267012923955918,
4904
+ "epoch": 8.419514291854718,
4905
+ "grad_norm": 0.8063756823539734,
4906
+ "learning_rate": 4.04e-06,
4907
+ "loss": 1.7544673919677733,
4908
+ "mean_token_accuracy": 0.6745367147028446,
4909
+ "num_tokens": 29121055.0,
4910
+ "step": 4900
4911
+ },
4912
+ {
4913
+ "entropy": 1.6062462359666825,
4914
+ "epoch": 8.436707500537288,
4915
+ "grad_norm": 0.8285024762153625,
4916
+ "learning_rate": 3.6400000000000003e-06,
4917
+ "loss": 1.6273128509521484,
4918
+ "mean_token_accuracy": 0.690464211255312,
4919
+ "num_tokens": 29176963.0,
4920
+ "step": 4910
4921
+ },
4922
+ {
4923
+ "entropy": 1.7958560451865195,
4924
+ "epoch": 8.453900709219859,
4925
+ "grad_norm": 0.8202657103538513,
4926
+ "learning_rate": 3.24e-06,
4927
+ "loss": 1.8311897277832032,
4928
+ "mean_token_accuracy": 0.661663169786334,
4929
+ "num_tokens": 29235880.0,
4930
+ "step": 4920
4931
+ },
4932
+ {
4933
+ "entropy": 1.665907260030508,
4934
+ "epoch": 8.47109391790243,
4935
+ "grad_norm": 0.8672494292259216,
4936
+ "learning_rate": 2.8400000000000003e-06,
4937
+ "loss": 1.6878423690795898,
4938
+ "mean_token_accuracy": 0.6819184564054013,
4939
+ "num_tokens": 29295823.0,
4940
+ "step": 4930
4941
+ },
4942
+ {
4943
+ "entropy": 1.7426866918802262,
4944
+ "epoch": 8.488287126584998,
4945
+ "grad_norm": 0.8398126363754272,
4946
+ "learning_rate": 2.4400000000000004e-06,
4947
+ "loss": 1.810443115234375,
4948
+ "mean_token_accuracy": 0.6639036998152733,
4949
+ "num_tokens": 29355386.0,
4950
+ "step": 4940
4951
+ },
4952
+ {
4953
+ "entropy": 1.6938614405691623,
4954
+ "epoch": 8.505480335267569,
4955
+ "grad_norm": 0.7652584314346313,
4956
+ "learning_rate": 2.0400000000000004e-06,
4957
+ "loss": 1.7690727233886718,
4958
+ "mean_token_accuracy": 0.6737098075449467,
4959
+ "num_tokens": 29414966.0,
4960
+ "step": 4950
4961
+ },
4962
+ {
4963
+ "entropy": 1.7538506165146828,
4964
+ "epoch": 8.52267354395014,
4965
+ "grad_norm": 0.8389163017272949,
4966
+ "learning_rate": 1.6400000000000002e-06,
4967
+ "loss": 1.8067062377929688,
4968
+ "mean_token_accuracy": 0.6728679880499839,
4969
+ "num_tokens": 29472960.0,
4970
+ "step": 4960
4971
+ },
4972
+ {
4973
+ "entropy": 1.7591105610132218,
4974
+ "epoch": 8.53986675263271,
4975
+ "grad_norm": 0.8280366063117981,
4976
+ "learning_rate": 1.24e-06,
4977
+ "loss": 1.7855098724365235,
4978
+ "mean_token_accuracy": 0.6670263484120369,
4979
+ "num_tokens": 29531300.0,
4980
+ "step": 4970
4981
+ },
4982
+ {
4983
+ "entropy": 1.6825189530849456,
4984
+ "epoch": 8.55705996131528,
4985
+ "grad_norm": 0.8177328109741211,
4986
+ "learning_rate": 8.4e-07,
4987
+ "loss": 1.731926727294922,
4988
+ "mean_token_accuracy": 0.6818420931696891,
4989
+ "num_tokens": 29591290.0,
4990
+ "step": 4980
4991
+ },
4992
+ {
4993
+ "entropy": 1.7112577512860299,
4994
+ "epoch": 8.574253169997851,
4995
+ "grad_norm": 0.8413036465644836,
4996
+ "learning_rate": 4.4e-07,
4997
+ "loss": 1.7446353912353516,
4998
+ "mean_token_accuracy": 0.6750271447002888,
4999
+ "num_tokens": 29646086.0,
5000
+ "step": 4990
5001
+ },
5002
+ {
5003
+ "entropy": 1.7419164210557938,
5004
+ "epoch": 8.591446378680422,
5005
+ "grad_norm": 0.9462088346481323,
5006
+ "learning_rate": 4e-08,
5007
+ "loss": 1.7870445251464844,
5008
+ "mean_token_accuracy": 0.666933435574174,
5009
+ "num_tokens": 29704815.0,
5010
+ "step": 5000
5011
  }
5012
  ],
5013
  "logging_steps": 10,
 
5022
  "should_evaluate": false,
5023
  "should_log": false,
5024
  "should_save": true,
5025
+ "should_training_stop": true
5026
  },
5027
  "attributes": {}
5028
  }
5029
  },
5030
+ "total_flos": 2.438188209453138e+17,
5031
  "train_batch_size": 2,
5032
  "trial_name": null,
5033
  "trial_params": null