Plofski commited on
Commit
e0ce647
·
verified ·
1 Parent(s): 223946d

Training in progress, step 6500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9eb50dbcfebd5f63fc3cc77929d31805c3be0d18c479c86d9d2674102149d998
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da83f3c30b9473fef2b931e6b47e4814c76e805b02501c93641aed6bc786ead
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:69bc7042f1e1e7b74e152e40dbcd26c60ace8254419664002f89f720c072bde5
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8c7391e5803dc14420bae3b5326bbd52abb5236b17e67147b31348d199ebeef
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6125d2b668a070022ee702876ba7ef10eb371529c27241694b5b376ca68bdc81
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0717c8780efa444a6d80d462b725b32f107f9a3c24550aaaa04a7d27cefba76b
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2089462018940158,
6
  "eval_steps": 500,
7
- "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5408,6 +5408,456 @@
5408
  "mean_token_accuracy": 0.8181872367858887,
5409
  "num_tokens": 6657605.0,
5410
  "step": 6000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5411
  }
5412
  ],
5413
  "logging_steps": 10,
@@ -5427,7 +5877,7 @@
5427
  "attributes": {}
5428
  }
5429
  },
5430
- "total_flos": 8046973169571840.0,
5431
  "train_batch_size": 8,
5432
  "trial_name": null,
5433
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.309691718718517,
6
  "eval_steps": 500,
7
+ "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5408
  "mean_token_accuracy": 0.8181872367858887,
5409
  "num_tokens": 6657605.0,
5410
  "step": 6000
5411
+ },
5412
+ {
5413
+ "epoch": 1.2109611122305057,
5414
+ "grad_norm": 16.125,
5415
+ "learning_rate": 1.1928269192020955e-05,
5416
+ "loss": 0.8571,
5417
+ "mean_token_accuracy": 0.7833750724792481,
5418
+ "num_tokens": 6667905.0,
5419
+ "step": 6010
5420
+ },
5421
+ {
5422
+ "epoch": 1.2129760225669957,
5423
+ "grad_norm": 9.8125,
5424
+ "learning_rate": 1.1914836456444356e-05,
5425
+ "loss": 0.7898,
5426
+ "mean_token_accuracy": 0.7975350022315979,
5427
+ "num_tokens": 6678888.0,
5428
+ "step": 6020
5429
+ },
5430
+ {
5431
+ "epoch": 1.2149909329034858,
5432
+ "grad_norm": 13.5,
5433
+ "learning_rate": 1.1901403720867755e-05,
5434
+ "loss": 0.8104,
5435
+ "mean_token_accuracy": 0.7968179106712341,
5436
+ "num_tokens": 6690767.0,
5437
+ "step": 6030
5438
+ },
5439
+ {
5440
+ "epoch": 1.2170058432399757,
5441
+ "grad_norm": 12.9375,
5442
+ "learning_rate": 1.1887970985291155e-05,
5443
+ "loss": 0.8847,
5444
+ "mean_token_accuracy": 0.7826601445674897,
5445
+ "num_tokens": 6701963.0,
5446
+ "step": 6040
5447
+ },
5448
+ {
5449
+ "epoch": 1.2190207535764659,
5450
+ "grad_norm": 10.375,
5451
+ "learning_rate": 1.1874538249714556e-05,
5452
+ "loss": 0.7929,
5453
+ "mean_token_accuracy": 0.795721584558487,
5454
+ "num_tokens": 6712765.0,
5455
+ "step": 6050
5456
+ },
5457
+ {
5458
+ "epoch": 1.2210356639129558,
5459
+ "grad_norm": 9.5625,
5460
+ "learning_rate": 1.1861105514137956e-05,
5461
+ "loss": 0.8396,
5462
+ "mean_token_accuracy": 0.7961892068386078,
5463
+ "num_tokens": 6723997.0,
5464
+ "step": 6060
5465
+ },
5466
+ {
5467
+ "epoch": 1.223050574249446,
5468
+ "grad_norm": 10.4375,
5469
+ "learning_rate": 1.1847672778561354e-05,
5470
+ "loss": 0.7774,
5471
+ "mean_token_accuracy": 0.8090421617031097,
5472
+ "num_tokens": 6734959.0,
5473
+ "step": 6070
5474
+ },
5475
+ {
5476
+ "epoch": 1.225065484585936,
5477
+ "grad_norm": 13.75,
5478
+ "learning_rate": 1.1834240042984754e-05,
5479
+ "loss": 0.8442,
5480
+ "mean_token_accuracy": 0.7920287191867829,
5481
+ "num_tokens": 6745125.0,
5482
+ "step": 6080
5483
+ },
5484
+ {
5485
+ "epoch": 1.227080394922426,
5486
+ "grad_norm": 12.375,
5487
+ "learning_rate": 1.1820807307408155e-05,
5488
+ "loss": 0.7339,
5489
+ "mean_token_accuracy": 0.8152937352657318,
5490
+ "num_tokens": 6755578.0,
5491
+ "step": 6090
5492
+ },
5493
+ {
5494
+ "epoch": 1.229095305258916,
5495
+ "grad_norm": 10.5,
5496
+ "learning_rate": 1.1807374571831553e-05,
5497
+ "loss": 0.7799,
5498
+ "mean_token_accuracy": 0.8055865943431855,
5499
+ "num_tokens": 6766161.0,
5500
+ "step": 6100
5501
+ },
5502
+ {
5503
+ "epoch": 1.2311102155954061,
5504
+ "grad_norm": 12.5,
5505
+ "learning_rate": 1.1793941836254954e-05,
5506
+ "loss": 0.8072,
5507
+ "mean_token_accuracy": 0.7963293552398681,
5508
+ "num_tokens": 6776660.0,
5509
+ "step": 6110
5510
+ },
5511
+ {
5512
+ "epoch": 1.233125125931896,
5513
+ "grad_norm": 14.0,
5514
+ "learning_rate": 1.1780509100678354e-05,
5515
+ "loss": 0.882,
5516
+ "mean_token_accuracy": 0.7818022012710572,
5517
+ "num_tokens": 6787702.0,
5518
+ "step": 6120
5519
+ },
5520
+ {
5521
+ "epoch": 1.235140036268386,
5522
+ "grad_norm": 11.1875,
5523
+ "learning_rate": 1.1767076365101755e-05,
5524
+ "loss": 0.8014,
5525
+ "mean_token_accuracy": 0.8063171863555908,
5526
+ "num_tokens": 6798904.0,
5527
+ "step": 6130
5528
+ },
5529
+ {
5530
+ "epoch": 1.2371549466048761,
5531
+ "grad_norm": 13.75,
5532
+ "learning_rate": 1.1753643629525154e-05,
5533
+ "loss": 0.8453,
5534
+ "mean_token_accuracy": 0.7982756316661834,
5535
+ "num_tokens": 6808990.0,
5536
+ "step": 6140
5537
+ },
5538
+ {
5539
+ "epoch": 1.239169856941366,
5540
+ "grad_norm": 10.8125,
5541
+ "learning_rate": 1.1740210893948554e-05,
5542
+ "loss": 0.7952,
5543
+ "mean_token_accuracy": 0.8004013955593109,
5544
+ "num_tokens": 6818726.0,
5545
+ "step": 6150
5546
+ },
5547
+ {
5548
+ "epoch": 1.2411847672778562,
5549
+ "grad_norm": 9.3125,
5550
+ "learning_rate": 1.1726778158371955e-05,
5551
+ "loss": 0.8466,
5552
+ "mean_token_accuracy": 0.794275438785553,
5553
+ "num_tokens": 6830189.0,
5554
+ "step": 6160
5555
+ },
5556
+ {
5557
+ "epoch": 1.2431996776143461,
5558
+ "grad_norm": 11.8125,
5559
+ "learning_rate": 1.1713345422795352e-05,
5560
+ "loss": 0.809,
5561
+ "mean_token_accuracy": 0.7935117900371551,
5562
+ "num_tokens": 6840564.0,
5563
+ "step": 6170
5564
+ },
5565
+ {
5566
+ "epoch": 1.2452145879508363,
5567
+ "grad_norm": 10.1875,
5568
+ "learning_rate": 1.1699912687218753e-05,
5569
+ "loss": 0.8684,
5570
+ "mean_token_accuracy": 0.7853965878486633,
5571
+ "num_tokens": 6851057.0,
5572
+ "step": 6180
5573
+ },
5574
+ {
5575
+ "epoch": 1.2472294982873262,
5576
+ "grad_norm": 12.3125,
5577
+ "learning_rate": 1.1686479951642153e-05,
5578
+ "loss": 0.7914,
5579
+ "mean_token_accuracy": 0.7986515760421753,
5580
+ "num_tokens": 6863098.0,
5581
+ "step": 6190
5582
+ },
5583
+ {
5584
+ "epoch": 1.2492444086238161,
5585
+ "grad_norm": 11.1875,
5586
+ "learning_rate": 1.1673047216065552e-05,
5587
+ "loss": 0.7723,
5588
+ "mean_token_accuracy": 0.8050061583518981,
5589
+ "num_tokens": 6873361.0,
5590
+ "step": 6200
5591
+ },
5592
+ {
5593
+ "epoch": 1.2512593189603063,
5594
+ "grad_norm": 10.6875,
5595
+ "learning_rate": 1.1659614480488952e-05,
5596
+ "loss": 0.7892,
5597
+ "mean_token_accuracy": 0.8081447362899781,
5598
+ "num_tokens": 6885561.0,
5599
+ "step": 6210
5600
+ },
5601
+ {
5602
+ "epoch": 1.2532742292967962,
5603
+ "grad_norm": 11.5625,
5604
+ "learning_rate": 1.1646181744912353e-05,
5605
+ "loss": 0.8335,
5606
+ "mean_token_accuracy": 0.7930399179458618,
5607
+ "num_tokens": 6896678.0,
5608
+ "step": 6220
5609
+ },
5610
+ {
5611
+ "epoch": 1.2552891396332864,
5612
+ "grad_norm": 12.5,
5613
+ "learning_rate": 1.1632749009335754e-05,
5614
+ "loss": 0.8161,
5615
+ "mean_token_accuracy": 0.8012160181999206,
5616
+ "num_tokens": 6906436.0,
5617
+ "step": 6230
5618
+ },
5619
+ {
5620
+ "epoch": 1.2573040499697763,
5621
+ "grad_norm": 14.5625,
5622
+ "learning_rate": 1.161931627375915e-05,
5623
+ "loss": 0.8408,
5624
+ "mean_token_accuracy": 0.7945603370666504,
5625
+ "num_tokens": 6916620.0,
5626
+ "step": 6240
5627
+ },
5628
+ {
5629
+ "epoch": 1.2593189603062664,
5630
+ "grad_norm": 16.25,
5631
+ "learning_rate": 1.1605883538182551e-05,
5632
+ "loss": 0.8039,
5633
+ "mean_token_accuracy": 0.801008677482605,
5634
+ "num_tokens": 6928094.0,
5635
+ "step": 6250
5636
+ },
5637
+ {
5638
+ "epoch": 1.2613338706427564,
5639
+ "grad_norm": 11.0625,
5640
+ "learning_rate": 1.1592450802605952e-05,
5641
+ "loss": 0.7906,
5642
+ "mean_token_accuracy": 0.8035953044891357,
5643
+ "num_tokens": 6937852.0,
5644
+ "step": 6260
5645
+ },
5646
+ {
5647
+ "epoch": 1.2633487809792463,
5648
+ "grad_norm": 13.0,
5649
+ "learning_rate": 1.157901806702935e-05,
5650
+ "loss": 0.8865,
5651
+ "mean_token_accuracy": 0.7758103013038635,
5652
+ "num_tokens": 6948566.0,
5653
+ "step": 6270
5654
+ },
5655
+ {
5656
+ "epoch": 1.2653636913157364,
5657
+ "grad_norm": 14.125,
5658
+ "learning_rate": 1.1565585331452751e-05,
5659
+ "loss": 0.9311,
5660
+ "mean_token_accuracy": 0.7705212533473969,
5661
+ "num_tokens": 6959264.0,
5662
+ "step": 6280
5663
+ },
5664
+ {
5665
+ "epoch": 1.2673786016522266,
5666
+ "grad_norm": 11.3125,
5667
+ "learning_rate": 1.1552152595876152e-05,
5668
+ "loss": 0.8076,
5669
+ "mean_token_accuracy": 0.7959451377391815,
5670
+ "num_tokens": 6970706.0,
5671
+ "step": 6290
5672
+ },
5673
+ {
5674
+ "epoch": 1.2693935119887165,
5675
+ "grad_norm": 12.9375,
5676
+ "learning_rate": 1.1538719860299552e-05,
5677
+ "loss": 0.8435,
5678
+ "mean_token_accuracy": 0.7901066780090332,
5679
+ "num_tokens": 6982005.0,
5680
+ "step": 6300
5681
+ },
5682
+ {
5683
+ "epoch": 1.2714084223252065,
5684
+ "grad_norm": 15.875,
5685
+ "learning_rate": 1.1525287124722951e-05,
5686
+ "loss": 0.7781,
5687
+ "mean_token_accuracy": 0.8100695073604584,
5688
+ "num_tokens": 6993228.0,
5689
+ "step": 6310
5690
+ },
5691
+ {
5692
+ "epoch": 1.2734233326616966,
5693
+ "grad_norm": 11.8125,
5694
+ "learning_rate": 1.1511854389146352e-05,
5695
+ "loss": 0.7771,
5696
+ "mean_token_accuracy": 0.7994659662246704,
5697
+ "num_tokens": 7003855.0,
5698
+ "step": 6320
5699
+ },
5700
+ {
5701
+ "epoch": 1.2754382429981865,
5702
+ "grad_norm": 11.5625,
5703
+ "learning_rate": 1.1498421653569752e-05,
5704
+ "loss": 0.8883,
5705
+ "mean_token_accuracy": 0.7779460906982422,
5706
+ "num_tokens": 7016389.0,
5707
+ "step": 6330
5708
+ },
5709
+ {
5710
+ "epoch": 1.2774531533346767,
5711
+ "grad_norm": 10.6875,
5712
+ "learning_rate": 1.148498891799315e-05,
5713
+ "loss": 0.7674,
5714
+ "mean_token_accuracy": 0.8048594057559967,
5715
+ "num_tokens": 7027229.0,
5716
+ "step": 6340
5717
+ },
5718
+ {
5719
+ "epoch": 1.2794680636711666,
5720
+ "grad_norm": 13.0,
5721
+ "learning_rate": 1.147155618241655e-05,
5722
+ "loss": 0.8556,
5723
+ "mean_token_accuracy": 0.7895182788372039,
5724
+ "num_tokens": 7038636.0,
5725
+ "step": 6350
5726
+ },
5727
+ {
5728
+ "epoch": 1.2814829740076568,
5729
+ "grad_norm": 10.75,
5730
+ "learning_rate": 1.145812344683995e-05,
5731
+ "loss": 0.8673,
5732
+ "mean_token_accuracy": 0.7855879724025726,
5733
+ "num_tokens": 7049602.0,
5734
+ "step": 6360
5735
+ },
5736
+ {
5737
+ "epoch": 1.2834978843441467,
5738
+ "grad_norm": 11.0625,
5739
+ "learning_rate": 1.1444690711263349e-05,
5740
+ "loss": 0.8281,
5741
+ "mean_token_accuracy": 0.798017168045044,
5742
+ "num_tokens": 7060783.0,
5743
+ "step": 6370
5744
+ },
5745
+ {
5746
+ "epoch": 1.2855127946806366,
5747
+ "grad_norm": 10.5,
5748
+ "learning_rate": 1.143125797568675e-05,
5749
+ "loss": 0.781,
5750
+ "mean_token_accuracy": 0.8092272758483887,
5751
+ "num_tokens": 7072749.0,
5752
+ "step": 6380
5753
+ },
5754
+ {
5755
+ "epoch": 1.2875277050171268,
5756
+ "grad_norm": 9.75,
5757
+ "learning_rate": 1.141782524011015e-05,
5758
+ "loss": 0.8664,
5759
+ "mean_token_accuracy": 0.7885208010673523,
5760
+ "num_tokens": 7085190.0,
5761
+ "step": 6390
5762
+ },
5763
+ {
5764
+ "epoch": 1.2895426153536167,
5765
+ "grad_norm": 10.625,
5766
+ "learning_rate": 1.140439250453355e-05,
5767
+ "loss": 0.7861,
5768
+ "mean_token_accuracy": 0.80192711353302,
5769
+ "num_tokens": 7095259.0,
5770
+ "step": 6400
5771
+ },
5772
+ {
5773
+ "epoch": 1.2915575256901068,
5774
+ "grad_norm": 10.5625,
5775
+ "learning_rate": 1.1390959768956948e-05,
5776
+ "loss": 0.7456,
5777
+ "mean_token_accuracy": 0.8084029912948608,
5778
+ "num_tokens": 7104840.0,
5779
+ "step": 6410
5780
+ },
5781
+ {
5782
+ "epoch": 1.2935724360265968,
5783
+ "grad_norm": 9.25,
5784
+ "learning_rate": 1.1377527033380348e-05,
5785
+ "loss": 0.9101,
5786
+ "mean_token_accuracy": 0.7775183081626892,
5787
+ "num_tokens": 7116704.0,
5788
+ "step": 6420
5789
+ },
5790
+ {
5791
+ "epoch": 1.295587346363087,
5792
+ "grad_norm": 10.5625,
5793
+ "learning_rate": 1.1364094297803749e-05,
5794
+ "loss": 0.8154,
5795
+ "mean_token_accuracy": 0.7952620327472687,
5796
+ "num_tokens": 7128360.0,
5797
+ "step": 6430
5798
+ },
5799
+ {
5800
+ "epoch": 1.2976022566995769,
5801
+ "grad_norm": 10.0625,
5802
+ "learning_rate": 1.1350661562227148e-05,
5803
+ "loss": 0.7922,
5804
+ "mean_token_accuracy": 0.8016098260879516,
5805
+ "num_tokens": 7139177.0,
5806
+ "step": 6440
5807
+ },
5808
+ {
5809
+ "epoch": 1.2996171670360668,
5810
+ "grad_norm": 13.0625,
5811
+ "learning_rate": 1.1337228826650548e-05,
5812
+ "loss": 0.9353,
5813
+ "mean_token_accuracy": 0.7753040254116058,
5814
+ "num_tokens": 7149827.0,
5815
+ "step": 6450
5816
+ },
5817
+ {
5818
+ "epoch": 1.301632077372557,
5819
+ "grad_norm": 9.4375,
5820
+ "learning_rate": 1.1323796091073949e-05,
5821
+ "loss": 0.8437,
5822
+ "mean_token_accuracy": 0.7929971814155579,
5823
+ "num_tokens": 7162136.0,
5824
+ "step": 6460
5825
+ },
5826
+ {
5827
+ "epoch": 1.303646987709047,
5828
+ "grad_norm": 10.875,
5829
+ "learning_rate": 1.1310363355497348e-05,
5830
+ "loss": 0.9565,
5831
+ "mean_token_accuracy": 0.7702773094177247,
5832
+ "num_tokens": 7173369.0,
5833
+ "step": 6470
5834
+ },
5835
+ {
5836
+ "epoch": 1.305661898045537,
5837
+ "grad_norm": 10.75,
5838
+ "learning_rate": 1.1296930619920748e-05,
5839
+ "loss": 0.8265,
5840
+ "mean_token_accuracy": 0.7950271546840668,
5841
+ "num_tokens": 7184403.0,
5842
+ "step": 6480
5843
+ },
5844
+ {
5845
+ "epoch": 1.307676808382027,
5846
+ "grad_norm": 14.9375,
5847
+ "learning_rate": 1.1283497884344149e-05,
5848
+ "loss": 0.7715,
5849
+ "mean_token_accuracy": 0.8035805761814118,
5850
+ "num_tokens": 7195550.0,
5851
+ "step": 6490
5852
+ },
5853
+ {
5854
+ "epoch": 1.309691718718517,
5855
+ "grad_norm": 13.8125,
5856
+ "learning_rate": 1.1270065148767547e-05,
5857
+ "loss": 0.8843,
5858
+ "mean_token_accuracy": 0.7886571526527405,
5859
+ "num_tokens": 7207013.0,
5860
+ "step": 6500
5861
  }
5862
  ],
5863
  "logging_steps": 10,
 
5877
  "attributes": {}
5878
  }
5879
  },
5880
+ "total_flos": 8718380552103936.0,
5881
  "train_batch_size": 8,
5882
  "trial_name": null,
5883
  "trial_params": null