Plofski commited on
Commit
5309a54
·
verified ·
1 Parent(s): 19583b8

Training in progress, step 5500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6e1468ef363199a8ce8dceeee806e0cd1265dabba9569f802d5e0ffdf55cf29
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcb80f83cde4a31bb60c1fd7260ffe3f7e16f618b67202dd29fd631a03093894
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:979fd7f70ce82e647328d9ca181635fd358343ae3c4356518a994deb8d2c7554
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deac3ee60db6adb45d1da1976f4f679efdf8206065175afc58ada5c695ccf6a5
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8c6451e983e45b2059a969443ca799e62ce60a9d34862e6b02e6b5034f66233
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4f36f1c6d7eb84c738a082911123d4e08f6356fc8093bb45612eb211d0cfe74
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.007455168245013,
6
  "eval_steps": 500,
7
- "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4508,6 +4508,456 @@
4508
  "mean_token_accuracy": 0.800259780883789,
4509
  "num_tokens": 5541015.0,
4510
  "step": 5000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4511
  }
4512
  ],
4513
  "logging_steps": 10,
@@ -4527,7 +4977,7 @@
4527
  "attributes": {}
4528
  }
4529
  },
4530
- "total_flos": 6697551334397952.0,
4531
  "train_batch_size": 8,
4532
  "trial_name": null,
4533
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1082006850695145,
6
  "eval_steps": 500,
7
+ "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4508
  "mean_token_accuracy": 0.800259780883789,
4509
  "num_tokens": 5541015.0,
4510
  "step": 5000
4511
+ },
4512
+ {
4513
+ "epoch": 1.0094700785815032,
4514
+ "grad_norm": 9.875,
4515
+ "learning_rate": 1.3271542749680975e-05,
4516
+ "loss": 0.9383,
4517
+ "mean_token_accuracy": 0.7706878125667572,
4518
+ "num_tokens": 5553090.0,
4519
+ "step": 5010
4520
+ },
4521
+ {
4522
+ "epoch": 1.011484988917993,
4523
+ "grad_norm": 12.0625,
4524
+ "learning_rate": 1.3258110014104373e-05,
4525
+ "loss": 0.8471,
4526
+ "mean_token_accuracy": 0.7942995607852936,
4527
+ "num_tokens": 5564220.0,
4528
+ "step": 5020
4529
+ },
4530
+ {
4531
+ "epoch": 1.0134998992544832,
4532
+ "grad_norm": 10.375,
4533
+ "learning_rate": 1.3244677278527773e-05,
4534
+ "loss": 0.8018,
4535
+ "mean_token_accuracy": 0.7939584195613861,
4536
+ "num_tokens": 5574966.0,
4537
+ "step": 5030
4538
+ },
4539
+ {
4540
+ "epoch": 1.0155148095909732,
4541
+ "grad_norm": 12.625,
4542
+ "learning_rate": 1.3231244542951174e-05,
4543
+ "loss": 0.9348,
4544
+ "mean_token_accuracy": 0.7773958921432496,
4545
+ "num_tokens": 5586140.0,
4546
+ "step": 5040
4547
+ },
4548
+ {
4549
+ "epoch": 1.0175297199274633,
4550
+ "grad_norm": 9.625,
4551
+ "learning_rate": 1.3217811807374572e-05,
4552
+ "loss": 0.8882,
4553
+ "mean_token_accuracy": 0.7792610108852387,
4554
+ "num_tokens": 5597440.0,
4555
+ "step": 5050
4556
+ },
4557
+ {
4558
+ "epoch": 1.0195446302639533,
4559
+ "grad_norm": 11.75,
4560
+ "learning_rate": 1.3204379071797973e-05,
4561
+ "loss": 0.7882,
4562
+ "mean_token_accuracy": 0.8046412229537964,
4563
+ "num_tokens": 5609321.0,
4564
+ "step": 5060
4565
+ },
4566
+ {
4567
+ "epoch": 1.0215595406004432,
4568
+ "grad_norm": 9.4375,
4569
+ "learning_rate": 1.3190946336221373e-05,
4570
+ "loss": 0.8062,
4571
+ "mean_token_accuracy": 0.7952991247177124,
4572
+ "num_tokens": 5619194.0,
4573
+ "step": 5070
4574
+ },
4575
+ {
4576
+ "epoch": 1.0235744509369333,
4577
+ "grad_norm": 12.4375,
4578
+ "learning_rate": 1.3177513600644774e-05,
4579
+ "loss": 0.921,
4580
+ "mean_token_accuracy": 0.7800089240074157,
4581
+ "num_tokens": 5631065.0,
4582
+ "step": 5080
4583
+ },
4584
+ {
4585
+ "epoch": 1.0255893612734233,
4586
+ "grad_norm": 10.875,
4587
+ "learning_rate": 1.3164080865068171e-05,
4588
+ "loss": 0.799,
4589
+ "mean_token_accuracy": 0.8071331679821014,
4590
+ "num_tokens": 5642580.0,
4591
+ "step": 5090
4592
+ },
4593
+ {
4594
+ "epoch": 1.0276042716099134,
4595
+ "grad_norm": 10.1875,
4596
+ "learning_rate": 1.3150648129491572e-05,
4597
+ "loss": 0.7776,
4598
+ "mean_token_accuracy": 0.8046740829944611,
4599
+ "num_tokens": 5651910.0,
4600
+ "step": 5100
4601
+ },
4602
+ {
4603
+ "epoch": 1.0296191819464033,
4604
+ "grad_norm": 14.0,
4605
+ "learning_rate": 1.3137215393914972e-05,
4606
+ "loss": 0.8056,
4607
+ "mean_token_accuracy": 0.8012421131134033,
4608
+ "num_tokens": 5663726.0,
4609
+ "step": 5110
4610
+ },
4611
+ {
4612
+ "epoch": 1.0316340922828935,
4613
+ "grad_norm": 11.5,
4614
+ "learning_rate": 1.3123782658338371e-05,
4615
+ "loss": 0.7681,
4616
+ "mean_token_accuracy": 0.8097535610198975,
4617
+ "num_tokens": 5675558.0,
4618
+ "step": 5120
4619
+ },
4620
+ {
4621
+ "epoch": 1.0336490026193834,
4622
+ "grad_norm": 11.1875,
4623
+ "learning_rate": 1.3110349922761772e-05,
4624
+ "loss": 0.8813,
4625
+ "mean_token_accuracy": 0.7838487148284912,
4626
+ "num_tokens": 5687969.0,
4627
+ "step": 5130
4628
+ },
4629
+ {
4630
+ "epoch": 1.0356639129558736,
4631
+ "grad_norm": 9.3125,
4632
+ "learning_rate": 1.3096917187185172e-05,
4633
+ "loss": 0.9072,
4634
+ "mean_token_accuracy": 0.7834949135780335,
4635
+ "num_tokens": 5700354.0,
4636
+ "step": 5140
4637
+ },
4638
+ {
4639
+ "epoch": 1.0376788232923635,
4640
+ "grad_norm": 14.6875,
4641
+ "learning_rate": 1.3083484451608571e-05,
4642
+ "loss": 0.903,
4643
+ "mean_token_accuracy": 0.7816505491733551,
4644
+ "num_tokens": 5711090.0,
4645
+ "step": 5150
4646
+ },
4647
+ {
4648
+ "epoch": 1.0396937336288534,
4649
+ "grad_norm": 8.9375,
4650
+ "learning_rate": 1.3070051716031971e-05,
4651
+ "loss": 0.7961,
4652
+ "mean_token_accuracy": 0.8029458582401275,
4653
+ "num_tokens": 5721667.0,
4654
+ "step": 5160
4655
+ },
4656
+ {
4657
+ "epoch": 1.0417086439653436,
4658
+ "grad_norm": 10.8125,
4659
+ "learning_rate": 1.305661898045537e-05,
4660
+ "loss": 0.8394,
4661
+ "mean_token_accuracy": 0.7979920387268067,
4662
+ "num_tokens": 5733015.0,
4663
+ "step": 5170
4664
+ },
4665
+ {
4666
+ "epoch": 1.0437235543018335,
4667
+ "grad_norm": 11.1875,
4668
+ "learning_rate": 1.304318624487877e-05,
4669
+ "loss": 0.8749,
4670
+ "mean_token_accuracy": 0.7899072051048279,
4671
+ "num_tokens": 5743473.0,
4672
+ "step": 5180
4673
+ },
4674
+ {
4675
+ "epoch": 1.0457384646383237,
4676
+ "grad_norm": 11.0625,
4677
+ "learning_rate": 1.302975350930217e-05,
4678
+ "loss": 0.8553,
4679
+ "mean_token_accuracy": 0.7900504052639008,
4680
+ "num_tokens": 5754579.0,
4681
+ "step": 5190
4682
+ },
4683
+ {
4684
+ "epoch": 1.0477533749748136,
4685
+ "grad_norm": 10.9375,
4686
+ "learning_rate": 1.301632077372557e-05,
4687
+ "loss": 0.8735,
4688
+ "mean_token_accuracy": 0.7891764640808105,
4689
+ "num_tokens": 5765340.0,
4690
+ "step": 5200
4691
+ },
4692
+ {
4693
+ "epoch": 1.0497682853113037,
4694
+ "grad_norm": 9.0,
4695
+ "learning_rate": 1.300288803814897e-05,
4696
+ "loss": 0.7709,
4697
+ "mean_token_accuracy": 0.8050879895687103,
4698
+ "num_tokens": 5775710.0,
4699
+ "step": 5210
4700
+ },
4701
+ {
4702
+ "epoch": 1.0517831956477937,
4703
+ "grad_norm": 18.25,
4704
+ "learning_rate": 1.298945530257237e-05,
4705
+ "loss": 0.7335,
4706
+ "mean_token_accuracy": 0.8071872234344483,
4707
+ "num_tokens": 5785996.0,
4708
+ "step": 5220
4709
+ },
4710
+ {
4711
+ "epoch": 1.0537981059842838,
4712
+ "grad_norm": 13.375,
4713
+ "learning_rate": 1.297602256699577e-05,
4714
+ "loss": 0.877,
4715
+ "mean_token_accuracy": 0.7817419946193696,
4716
+ "num_tokens": 5796629.0,
4717
+ "step": 5230
4718
+ },
4719
+ {
4720
+ "epoch": 1.0558130163207737,
4721
+ "grad_norm": 11.1875,
4722
+ "learning_rate": 1.296258983141917e-05,
4723
+ "loss": 0.7858,
4724
+ "mean_token_accuracy": 0.8022194325923919,
4725
+ "num_tokens": 5806790.0,
4726
+ "step": 5240
4727
+ },
4728
+ {
4729
+ "epoch": 1.0578279266572637,
4730
+ "grad_norm": 11.1875,
4731
+ "learning_rate": 1.2949157095842568e-05,
4732
+ "loss": 0.8409,
4733
+ "mean_token_accuracy": 0.7854238629341126,
4734
+ "num_tokens": 5818974.0,
4735
+ "step": 5250
4736
+ },
4737
+ {
4738
+ "epoch": 1.0598428369937538,
4739
+ "grad_norm": 13.0,
4740
+ "learning_rate": 1.2935724360265968e-05,
4741
+ "loss": 0.7023,
4742
+ "mean_token_accuracy": 0.8206122577190399,
4743
+ "num_tokens": 5828962.0,
4744
+ "step": 5260
4745
+ },
4746
+ {
4747
+ "epoch": 1.0618577473302437,
4748
+ "grad_norm": 12.75,
4749
+ "learning_rate": 1.2922291624689369e-05,
4750
+ "loss": 0.8116,
4751
+ "mean_token_accuracy": 0.7957081377506257,
4752
+ "num_tokens": 5840475.0,
4753
+ "step": 5270
4754
+ },
4755
+ {
4756
+ "epoch": 1.063872657666734,
4757
+ "grad_norm": 12.375,
4758
+ "learning_rate": 1.290885888911277e-05,
4759
+ "loss": 0.876,
4760
+ "mean_token_accuracy": 0.7848715245723724,
4761
+ "num_tokens": 5851626.0,
4762
+ "step": 5280
4763
+ },
4764
+ {
4765
+ "epoch": 1.0658875680032238,
4766
+ "grad_norm": 12.1875,
4767
+ "learning_rate": 1.2895426153536168e-05,
4768
+ "loss": 0.8648,
4769
+ "mean_token_accuracy": 0.7879779160022735,
4770
+ "num_tokens": 5861745.0,
4771
+ "step": 5290
4772
+ },
4773
+ {
4774
+ "epoch": 1.067902478339714,
4775
+ "grad_norm": 11.5625,
4776
+ "learning_rate": 1.2881993417959569e-05,
4777
+ "loss": 0.7807,
4778
+ "mean_token_accuracy": 0.8065967261791229,
4779
+ "num_tokens": 5871744.0,
4780
+ "step": 5300
4781
+ },
4782
+ {
4783
+ "epoch": 1.069917388676204,
4784
+ "grad_norm": 11.875,
4785
+ "learning_rate": 1.286856068238297e-05,
4786
+ "loss": 0.8184,
4787
+ "mean_token_accuracy": 0.7950898349285126,
4788
+ "num_tokens": 5882570.0,
4789
+ "step": 5310
4790
+ },
4791
+ {
4792
+ "epoch": 1.071932299012694,
4793
+ "grad_norm": 12.125,
4794
+ "learning_rate": 1.2855127946806366e-05,
4795
+ "loss": 0.7624,
4796
+ "mean_token_accuracy": 0.8084113836288452,
4797
+ "num_tokens": 5893477.0,
4798
+ "step": 5320
4799
+ },
4800
+ {
4801
+ "epoch": 1.073947209349184,
4802
+ "grad_norm": 12.0625,
4803
+ "learning_rate": 1.2841695211229767e-05,
4804
+ "loss": 0.8525,
4805
+ "mean_token_accuracy": 0.8004627406597138,
4806
+ "num_tokens": 5906228.0,
4807
+ "step": 5330
4808
+ },
4809
+ {
4810
+ "epoch": 1.075962119685674,
4811
+ "grad_norm": 10.1875,
4812
+ "learning_rate": 1.2828262475653167e-05,
4813
+ "loss": 0.7381,
4814
+ "mean_token_accuracy": 0.815189528465271,
4815
+ "num_tokens": 5917163.0,
4816
+ "step": 5340
4817
+ },
4818
+ {
4819
+ "epoch": 1.077977030022164,
4820
+ "grad_norm": 13.4375,
4821
+ "learning_rate": 1.2814829740076568e-05,
4822
+ "loss": 0.8192,
4823
+ "mean_token_accuracy": 0.7983390390872955,
4824
+ "num_tokens": 5927959.0,
4825
+ "step": 5350
4826
+ },
4827
+ {
4828
+ "epoch": 1.079991940358654,
4829
+ "grad_norm": 11.125,
4830
+ "learning_rate": 1.2801397004499967e-05,
4831
+ "loss": 0.8847,
4832
+ "mean_token_accuracy": 0.7825915396213532,
4833
+ "num_tokens": 5938684.0,
4834
+ "step": 5360
4835
+ },
4836
+ {
4837
+ "epoch": 1.0820068506951441,
4838
+ "grad_norm": 11.625,
4839
+ "learning_rate": 1.2787964268923367e-05,
4840
+ "loss": 0.8451,
4841
+ "mean_token_accuracy": 0.7878111064434051,
4842
+ "num_tokens": 5948765.0,
4843
+ "step": 5370
4844
+ },
4845
+ {
4846
+ "epoch": 1.084021761031634,
4847
+ "grad_norm": 13.0,
4848
+ "learning_rate": 1.2774531533346768e-05,
4849
+ "loss": 0.7971,
4850
+ "mean_token_accuracy": 0.8030431568622589,
4851
+ "num_tokens": 5960108.0,
4852
+ "step": 5380
4853
+ },
4854
+ {
4855
+ "epoch": 1.0860366713681242,
4856
+ "grad_norm": 10.625,
4857
+ "learning_rate": 1.2761098797770167e-05,
4858
+ "loss": 0.8786,
4859
+ "mean_token_accuracy": 0.7854897439479828,
4860
+ "num_tokens": 5972007.0,
4861
+ "step": 5390
4862
+ },
4863
+ {
4864
+ "epoch": 1.0880515817046141,
4865
+ "grad_norm": 10.0625,
4866
+ "learning_rate": 1.2747666062193567e-05,
4867
+ "loss": 0.8395,
4868
+ "mean_token_accuracy": 0.7956344962120057,
4869
+ "num_tokens": 5983211.0,
4870
+ "step": 5400
4871
+ },
4872
+ {
4873
+ "epoch": 1.090066492041104,
4874
+ "grad_norm": 10.5625,
4875
+ "learning_rate": 1.2734233326616968e-05,
4876
+ "loss": 0.9274,
4877
+ "mean_token_accuracy": 0.7794575989246368,
4878
+ "num_tokens": 5995219.0,
4879
+ "step": 5410
4880
+ },
4881
+ {
4882
+ "epoch": 1.0920814023775942,
4883
+ "grad_norm": 13.1875,
4884
+ "learning_rate": 1.2720800591040365e-05,
4885
+ "loss": 0.8251,
4886
+ "mean_token_accuracy": 0.802078241109848,
4887
+ "num_tokens": 6006324.0,
4888
+ "step": 5420
4889
+ },
4890
+ {
4891
+ "epoch": 1.0940963127140841,
4892
+ "grad_norm": 14.0625,
4893
+ "learning_rate": 1.2707367855463765e-05,
4894
+ "loss": 0.8402,
4895
+ "mean_token_accuracy": 0.7896000027656556,
4896
+ "num_tokens": 6017542.0,
4897
+ "step": 5430
4898
+ },
4899
+ {
4900
+ "epoch": 1.0961112230505743,
4901
+ "grad_norm": 11.8125,
4902
+ "learning_rate": 1.2693935119887166e-05,
4903
+ "loss": 0.8307,
4904
+ "mean_token_accuracy": 0.7981148719787597,
4905
+ "num_tokens": 6027523.0,
4906
+ "step": 5440
4907
+ },
4908
+ {
4909
+ "epoch": 1.0981261333870642,
4910
+ "grad_norm": 9.6875,
4911
+ "learning_rate": 1.2680502384310566e-05,
4912
+ "loss": 0.866,
4913
+ "mean_token_accuracy": 0.7834112644195557,
4914
+ "num_tokens": 6038697.0,
4915
+ "step": 5450
4916
+ },
4917
+ {
4918
+ "epoch": 1.1001410437235544,
4919
+ "grad_norm": 11.0625,
4920
+ "learning_rate": 1.2667069648733965e-05,
4921
+ "loss": 0.793,
4922
+ "mean_token_accuracy": 0.7983521819114685,
4923
+ "num_tokens": 6049813.0,
4924
+ "step": 5460
4925
+ },
4926
+ {
4927
+ "epoch": 1.1021559540600443,
4928
+ "grad_norm": 12.0625,
4929
+ "learning_rate": 1.2653636913157366e-05,
4930
+ "loss": 0.7633,
4931
+ "mean_token_accuracy": 0.811886590719223,
4932
+ "num_tokens": 6060176.0,
4933
+ "step": 5470
4934
+ },
4935
+ {
4936
+ "epoch": 1.1041708643965344,
4937
+ "grad_norm": 12.875,
4938
+ "learning_rate": 1.2640204177580766e-05,
4939
+ "loss": 0.8755,
4940
+ "mean_token_accuracy": 0.7823013424873352,
4941
+ "num_tokens": 6069957.0,
4942
+ "step": 5480
4943
+ },
4944
+ {
4945
+ "epoch": 1.1061857747330244,
4946
+ "grad_norm": 12.6875,
4947
+ "learning_rate": 1.2626771442004164e-05,
4948
+ "loss": 0.8468,
4949
+ "mean_token_accuracy": 0.7942144453525544,
4950
+ "num_tokens": 6080224.0,
4951
+ "step": 5490
4952
+ },
4953
+ {
4954
+ "epoch": 1.1082006850695145,
4955
+ "grad_norm": 10.5,
4956
+ "learning_rate": 1.2613338706427564e-05,
4957
+ "loss": 0.8926,
4958
+ "mean_token_accuracy": 0.7852272689342499,
4959
+ "num_tokens": 6091516.0,
4960
+ "step": 5500
4961
  }
4962
  ],
4963
  "logging_steps": 10,
 
4977
  "attributes": {}
4978
  }
4979
  },
4980
+ "total_flos": 7364465716629504.0,
4981
  "train_batch_size": 8,
4982
  "trial_name": null,
4983
  "trial_params": null