gr15_close_pot_BASE / trainer_state.json
binhng's picture
Upload folder using huggingface_hub
ad48cd8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 24.928092042186,
"eval_steps": 500,
"global_step": 26000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009587727708533078,
"grad_norm": 16.952646255493164,
"learning_rate": 6.923076923076923e-07,
"loss": 1.0981,
"step": 10
},
{
"epoch": 0.019175455417066157,
"grad_norm": 10.460495948791504,
"learning_rate": 1.4615384615384616e-06,
"loss": 1.0812,
"step": 20
},
{
"epoch": 0.028763183125599234,
"grad_norm": 5.855829238891602,
"learning_rate": 2.2307692307692307e-06,
"loss": 0.8546,
"step": 30
},
{
"epoch": 0.038350910834132314,
"grad_norm": 3.5648763179779053,
"learning_rate": 3e-06,
"loss": 0.4892,
"step": 40
},
{
"epoch": 0.04793863854266539,
"grad_norm": 1.662581205368042,
"learning_rate": 3.7692307692307694e-06,
"loss": 0.3639,
"step": 50
},
{
"epoch": 0.05752636625119847,
"grad_norm": 1.4345495700836182,
"learning_rate": 4.538461538461539e-06,
"loss": 0.2922,
"step": 60
},
{
"epoch": 0.06711409395973154,
"grad_norm": 0.9792284369468689,
"learning_rate": 5.307692307692308e-06,
"loss": 0.2074,
"step": 70
},
{
"epoch": 0.07670182166826463,
"grad_norm": 0.9946898818016052,
"learning_rate": 6.0769230769230775e-06,
"loss": 0.2146,
"step": 80
},
{
"epoch": 0.0862895493767977,
"grad_norm": 1.3608415126800537,
"learning_rate": 6.846153846153847e-06,
"loss": 0.1745,
"step": 90
},
{
"epoch": 0.09587727708533078,
"grad_norm": 1.0509544610977173,
"learning_rate": 7.615384615384616e-06,
"loss": 0.1748,
"step": 100
},
{
"epoch": 0.10546500479386385,
"grad_norm": 0.9403872489929199,
"learning_rate": 8.384615384615385e-06,
"loss": 0.1564,
"step": 110
},
{
"epoch": 0.11505273250239693,
"grad_norm": 0.9270913004875183,
"learning_rate": 9.153846153846155e-06,
"loss": 0.1696,
"step": 120
},
{
"epoch": 0.12464046021093,
"grad_norm": 1.0389190912246704,
"learning_rate": 9.923076923076923e-06,
"loss": 0.1392,
"step": 130
},
{
"epoch": 0.1342281879194631,
"grad_norm": 0.9624547958374023,
"learning_rate": 1.0692307692307694e-05,
"loss": 0.1311,
"step": 140
},
{
"epoch": 0.14381591562799617,
"grad_norm": 1.0129961967468262,
"learning_rate": 1.1461538461538462e-05,
"loss": 0.1212,
"step": 150
},
{
"epoch": 0.15340364333652926,
"grad_norm": 1.2572994232177734,
"learning_rate": 1.2230769230769232e-05,
"loss": 0.1267,
"step": 160
},
{
"epoch": 0.1629913710450623,
"grad_norm": 1.06370210647583,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.1241,
"step": 170
},
{
"epoch": 0.1725790987535954,
"grad_norm": 1.2056634426116943,
"learning_rate": 1.3769230769230771e-05,
"loss": 0.1207,
"step": 180
},
{
"epoch": 0.18216682646212848,
"grad_norm": 1.5257799625396729,
"learning_rate": 1.453846153846154e-05,
"loss": 0.1078,
"step": 190
},
{
"epoch": 0.19175455417066156,
"grad_norm": 1.0377438068389893,
"learning_rate": 1.5307692307692308e-05,
"loss": 0.1191,
"step": 200
},
{
"epoch": 0.20134228187919462,
"grad_norm": 1.1318110227584839,
"learning_rate": 1.607692307692308e-05,
"loss": 0.1211,
"step": 210
},
{
"epoch": 0.2109300095877277,
"grad_norm": 0.8394863605499268,
"learning_rate": 1.684615384615385e-05,
"loss": 0.1103,
"step": 220
},
{
"epoch": 0.22051773729626079,
"grad_norm": 0.6863590478897095,
"learning_rate": 1.7615384615384615e-05,
"loss": 0.101,
"step": 230
},
{
"epoch": 0.23010546500479387,
"grad_norm": 0.9169079661369324,
"learning_rate": 1.8384615384615386e-05,
"loss": 0.1034,
"step": 240
},
{
"epoch": 0.23969319271332695,
"grad_norm": 1.088216781616211,
"learning_rate": 1.9153846153846156e-05,
"loss": 0.0964,
"step": 250
},
{
"epoch": 0.24928092042186,
"grad_norm": 0.8121523261070251,
"learning_rate": 1.9923076923076926e-05,
"loss": 0.0959,
"step": 260
},
{
"epoch": 0.2588686481303931,
"grad_norm": 1.408576250076294,
"learning_rate": 2.0692307692307693e-05,
"loss": 0.1017,
"step": 270
},
{
"epoch": 0.2684563758389262,
"grad_norm": 1.0147638320922852,
"learning_rate": 2.1461538461538463e-05,
"loss": 0.0935,
"step": 280
},
{
"epoch": 0.27804410354745923,
"grad_norm": 1.0343986749649048,
"learning_rate": 2.2230769230769233e-05,
"loss": 0.1039,
"step": 290
},
{
"epoch": 0.28763183125599234,
"grad_norm": 1.32474684715271,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.0926,
"step": 300
},
{
"epoch": 0.2972195589645254,
"grad_norm": 0.9797191619873047,
"learning_rate": 2.376923076923077e-05,
"loss": 0.0967,
"step": 310
},
{
"epoch": 0.3068072866730585,
"grad_norm": 0.6505740880966187,
"learning_rate": 2.453846153846154e-05,
"loss": 0.0792,
"step": 320
},
{
"epoch": 0.31639501438159157,
"grad_norm": 0.9509547352790833,
"learning_rate": 2.530769230769231e-05,
"loss": 0.0827,
"step": 330
},
{
"epoch": 0.3259827420901246,
"grad_norm": 1.0206745862960815,
"learning_rate": 2.6076923076923077e-05,
"loss": 0.0771,
"step": 340
},
{
"epoch": 0.33557046979865773,
"grad_norm": 1.0349384546279907,
"learning_rate": 2.6846153846153848e-05,
"loss": 0.0799,
"step": 350
},
{
"epoch": 0.3451581975071908,
"grad_norm": 0.9108182191848755,
"learning_rate": 2.7615384615384614e-05,
"loss": 0.0873,
"step": 360
},
{
"epoch": 0.3547459252157239,
"grad_norm": 0.5712908506393433,
"learning_rate": 2.8384615384615388e-05,
"loss": 0.0831,
"step": 370
},
{
"epoch": 0.36433365292425696,
"grad_norm": 0.9796934127807617,
"learning_rate": 2.9153846153846155e-05,
"loss": 0.0719,
"step": 380
},
{
"epoch": 0.37392138063279,
"grad_norm": 1.1480381488800049,
"learning_rate": 2.9923076923076925e-05,
"loss": 0.0828,
"step": 390
},
{
"epoch": 0.3835091083413231,
"grad_norm": 0.5862910151481628,
"learning_rate": 3.069230769230769e-05,
"loss": 0.0709,
"step": 400
},
{
"epoch": 0.3930968360498562,
"grad_norm": 0.7163400650024414,
"learning_rate": 3.146153846153846e-05,
"loss": 0.0721,
"step": 410
},
{
"epoch": 0.40268456375838924,
"grad_norm": 0.7817345261573792,
"learning_rate": 3.223076923076923e-05,
"loss": 0.0717,
"step": 420
},
{
"epoch": 0.41227229146692235,
"grad_norm": 0.6121333837509155,
"learning_rate": 3.3e-05,
"loss": 0.0665,
"step": 430
},
{
"epoch": 0.4218600191754554,
"grad_norm": 0.774795413017273,
"learning_rate": 3.376923076923077e-05,
"loss": 0.0691,
"step": 440
},
{
"epoch": 0.4314477468839885,
"grad_norm": 0.898847222328186,
"learning_rate": 3.453846153846154e-05,
"loss": 0.0741,
"step": 450
},
{
"epoch": 0.44103547459252157,
"grad_norm": 0.7293726801872253,
"learning_rate": 3.5307692307692306e-05,
"loss": 0.0645,
"step": 460
},
{
"epoch": 0.4506232023010546,
"grad_norm": 0.5674548149108887,
"learning_rate": 3.607692307692308e-05,
"loss": 0.0629,
"step": 470
},
{
"epoch": 0.46021093000958774,
"grad_norm": 0.7961140275001526,
"learning_rate": 3.684615384615385e-05,
"loss": 0.0628,
"step": 480
},
{
"epoch": 0.4697986577181208,
"grad_norm": 0.6253398656845093,
"learning_rate": 3.761538461538462e-05,
"loss": 0.0645,
"step": 490
},
{
"epoch": 0.4793863854266539,
"grad_norm": 0.9273212552070618,
"learning_rate": 3.838461538461539e-05,
"loss": 0.062,
"step": 500
},
{
"epoch": 0.48897411313518696,
"grad_norm": 0.6789622902870178,
"learning_rate": 3.915384615384616e-05,
"loss": 0.0654,
"step": 510
},
{
"epoch": 0.49856184084372,
"grad_norm": 0.8248144388198853,
"learning_rate": 3.992307692307692e-05,
"loss": 0.0665,
"step": 520
},
{
"epoch": 0.5081495685522531,
"grad_norm": 0.8695188164710999,
"learning_rate": 4.06923076923077e-05,
"loss": 0.0717,
"step": 530
},
{
"epoch": 0.5177372962607862,
"grad_norm": 0.6595909595489502,
"learning_rate": 4.146153846153846e-05,
"loss": 0.0628,
"step": 540
},
{
"epoch": 0.5273250239693192,
"grad_norm": 0.7226746678352356,
"learning_rate": 4.223076923076924e-05,
"loss": 0.0657,
"step": 550
},
{
"epoch": 0.5369127516778524,
"grad_norm": 0.6370866298675537,
"learning_rate": 4.3e-05,
"loss": 0.0581,
"step": 560
},
{
"epoch": 0.5465004793863855,
"grad_norm": 0.47755175828933716,
"learning_rate": 4.376923076923077e-05,
"loss": 0.052,
"step": 570
},
{
"epoch": 0.5560882070949185,
"grad_norm": 0.7424858808517456,
"learning_rate": 4.453846153846154e-05,
"loss": 0.0606,
"step": 580
},
{
"epoch": 0.5656759348034516,
"grad_norm": 0.4627436399459839,
"learning_rate": 4.530769230769231e-05,
"loss": 0.0618,
"step": 590
},
{
"epoch": 0.5752636625119847,
"grad_norm": 0.5372833609580994,
"learning_rate": 4.6076923076923076e-05,
"loss": 0.0616,
"step": 600
},
{
"epoch": 0.5848513902205177,
"grad_norm": 0.8923951387405396,
"learning_rate": 4.684615384615385e-05,
"loss": 0.0659,
"step": 610
},
{
"epoch": 0.5944391179290508,
"grad_norm": 0.9428364038467407,
"learning_rate": 4.7615384615384616e-05,
"loss": 0.0707,
"step": 620
},
{
"epoch": 0.6040268456375839,
"grad_norm": 0.7362667322158813,
"learning_rate": 4.8384615384615386e-05,
"loss": 0.062,
"step": 630
},
{
"epoch": 0.613614573346117,
"grad_norm": 0.7807226181030273,
"learning_rate": 4.9153846153846157e-05,
"loss": 0.0662,
"step": 640
},
{
"epoch": 0.62320230105465,
"grad_norm": 0.5898621678352356,
"learning_rate": 4.992307692307693e-05,
"loss": 0.0594,
"step": 650
},
{
"epoch": 0.6327900287631831,
"grad_norm": 0.4694168269634247,
"learning_rate": 5.06923076923077e-05,
"loss": 0.0572,
"step": 660
},
{
"epoch": 0.6423777564717162,
"grad_norm": 0.6720401048660278,
"learning_rate": 5.146153846153846e-05,
"loss": 0.0697,
"step": 670
},
{
"epoch": 0.6519654841802492,
"grad_norm": 0.5371865034103394,
"learning_rate": 5.223076923076924e-05,
"loss": 0.059,
"step": 680
},
{
"epoch": 0.6615532118887824,
"grad_norm": 0.6751993894577026,
"learning_rate": 5.300000000000001e-05,
"loss": 0.0566,
"step": 690
},
{
"epoch": 0.6711409395973155,
"grad_norm": 0.7496346831321716,
"learning_rate": 5.376923076923077e-05,
"loss": 0.0592,
"step": 700
},
{
"epoch": 0.6807286673058485,
"grad_norm": 0.7620933055877686,
"learning_rate": 5.453846153846154e-05,
"loss": 0.0645,
"step": 710
},
{
"epoch": 0.6903163950143816,
"grad_norm": 0.9095701575279236,
"learning_rate": 5.5307692307692305e-05,
"loss": 0.0568,
"step": 720
},
{
"epoch": 0.6999041227229147,
"grad_norm": 0.7606950998306274,
"learning_rate": 5.607692307692308e-05,
"loss": 0.0624,
"step": 730
},
{
"epoch": 0.7094918504314478,
"grad_norm": 1.0387766361236572,
"learning_rate": 5.684615384615385e-05,
"loss": 0.0584,
"step": 740
},
{
"epoch": 0.7190795781399808,
"grad_norm": 0.7113978862762451,
"learning_rate": 5.7615384615384615e-05,
"loss": 0.0652,
"step": 750
},
{
"epoch": 0.7286673058485139,
"grad_norm": 0.604448139667511,
"learning_rate": 5.838461538461538e-05,
"loss": 0.0654,
"step": 760
},
{
"epoch": 0.738255033557047,
"grad_norm": 0.8723410367965698,
"learning_rate": 5.915384615384616e-05,
"loss": 0.0531,
"step": 770
},
{
"epoch": 0.74784276126558,
"grad_norm": 0.5730307102203369,
"learning_rate": 5.9923076923076926e-05,
"loss": 0.0559,
"step": 780
},
{
"epoch": 0.7574304889741131,
"grad_norm": 0.7451117634773254,
"learning_rate": 6.0692307692307696e-05,
"loss": 0.0643,
"step": 790
},
{
"epoch": 0.7670182166826462,
"grad_norm": 0.3902491331100464,
"learning_rate": 6.146153846153846e-05,
"loss": 0.0611,
"step": 800
},
{
"epoch": 0.7766059443911792,
"grad_norm": 0.6148221492767334,
"learning_rate": 6.223076923076924e-05,
"loss": 0.0549,
"step": 810
},
{
"epoch": 0.7861936720997124,
"grad_norm": 0.5791975259780884,
"learning_rate": 6.3e-05,
"loss": 0.0589,
"step": 820
},
{
"epoch": 0.7957813998082455,
"grad_norm": 0.5318537950515747,
"learning_rate": 6.376923076923077e-05,
"loss": 0.0618,
"step": 830
},
{
"epoch": 0.8053691275167785,
"grad_norm": 0.8901371359825134,
"learning_rate": 6.453846153846154e-05,
"loss": 0.0563,
"step": 840
},
{
"epoch": 0.8149568552253116,
"grad_norm": 0.8964536786079407,
"learning_rate": 6.530769230769231e-05,
"loss": 0.0656,
"step": 850
},
{
"epoch": 0.8245445829338447,
"grad_norm": 0.5159094929695129,
"learning_rate": 6.607692307692308e-05,
"loss": 0.0582,
"step": 860
},
{
"epoch": 0.8341323106423778,
"grad_norm": 0.6684253811836243,
"learning_rate": 6.684615384615385e-05,
"loss": 0.0569,
"step": 870
},
{
"epoch": 0.8437200383509108,
"grad_norm": 0.5698950290679932,
"learning_rate": 6.761538461538461e-05,
"loss": 0.0549,
"step": 880
},
{
"epoch": 0.8533077660594439,
"grad_norm": 0.44796323776245117,
"learning_rate": 6.838461538461539e-05,
"loss": 0.0557,
"step": 890
},
{
"epoch": 0.862895493767977,
"grad_norm": 0.7032187581062317,
"learning_rate": 6.915384615384616e-05,
"loss": 0.069,
"step": 900
},
{
"epoch": 0.87248322147651,
"grad_norm": 0.538271963596344,
"learning_rate": 6.992307692307692e-05,
"loss": 0.0568,
"step": 910
},
{
"epoch": 0.8820709491850431,
"grad_norm": 0.46786853671073914,
"learning_rate": 7.069230769230769e-05,
"loss": 0.0623,
"step": 920
},
{
"epoch": 0.8916586768935763,
"grad_norm": 0.6529656052589417,
"learning_rate": 7.146153846153847e-05,
"loss": 0.064,
"step": 930
},
{
"epoch": 0.9012464046021093,
"grad_norm": 0.9618151187896729,
"learning_rate": 7.223076923076923e-05,
"loss": 0.0557,
"step": 940
},
{
"epoch": 0.9108341323106424,
"grad_norm": 0.5643552541732788,
"learning_rate": 7.3e-05,
"loss": 0.0651,
"step": 950
},
{
"epoch": 0.9204218600191755,
"grad_norm": 0.7007706761360168,
"learning_rate": 7.376923076923077e-05,
"loss": 0.0514,
"step": 960
},
{
"epoch": 0.9300095877277086,
"grad_norm": 0.4530331492424011,
"learning_rate": 7.453846153846154e-05,
"loss": 0.0563,
"step": 970
},
{
"epoch": 0.9395973154362416,
"grad_norm": 0.6113521456718445,
"learning_rate": 7.530769230769231e-05,
"loss": 0.0606,
"step": 980
},
{
"epoch": 0.9491850431447747,
"grad_norm": 0.5007736682891846,
"learning_rate": 7.607692307692308e-05,
"loss": 0.0561,
"step": 990
},
{
"epoch": 0.9587727708533078,
"grad_norm": 0.49903005361557007,
"learning_rate": 7.684615384615385e-05,
"loss": 0.0578,
"step": 1000
},
{
"epoch": 0.9683604985618408,
"grad_norm": 0.629622220993042,
"learning_rate": 7.761538461538462e-05,
"loss": 0.0572,
"step": 1010
},
{
"epoch": 0.9779482262703739,
"grad_norm": 0.5830038785934448,
"learning_rate": 7.838461538461539e-05,
"loss": 0.0586,
"step": 1020
},
{
"epoch": 0.987535953978907,
"grad_norm": 0.502075731754303,
"learning_rate": 7.915384615384616e-05,
"loss": 0.052,
"step": 1030
},
{
"epoch": 0.99712368168744,
"grad_norm": 0.6076005101203918,
"learning_rate": 7.992307692307692e-05,
"loss": 0.0536,
"step": 1040
},
{
"epoch": 1.0067114093959733,
"grad_norm": 0.6297442317008972,
"learning_rate": 8.06923076923077e-05,
"loss": 0.0565,
"step": 1050
},
{
"epoch": 1.0162991371045063,
"grad_norm": 0.6776733994483948,
"learning_rate": 8.146153846153847e-05,
"loss": 0.0556,
"step": 1060
},
{
"epoch": 1.0258868648130393,
"grad_norm": 0.5691619515419006,
"learning_rate": 8.223076923076923e-05,
"loss": 0.0528,
"step": 1070
},
{
"epoch": 1.0354745925215725,
"grad_norm": 0.7027555108070374,
"learning_rate": 8.3e-05,
"loss": 0.0614,
"step": 1080
},
{
"epoch": 1.0450623202301055,
"grad_norm": 0.7508878111839294,
"learning_rate": 8.376923076923078e-05,
"loss": 0.0496,
"step": 1090
},
{
"epoch": 1.0546500479386385,
"grad_norm": 0.6663224101066589,
"learning_rate": 8.453846153846154e-05,
"loss": 0.0507,
"step": 1100
},
{
"epoch": 1.0642377756471717,
"grad_norm": 0.5372412204742432,
"learning_rate": 8.530769230769231e-05,
"loss": 0.0547,
"step": 1110
},
{
"epoch": 1.0738255033557047,
"grad_norm": 0.6460400223731995,
"learning_rate": 8.607692307692308e-05,
"loss": 0.0598,
"step": 1120
},
{
"epoch": 1.0834132310642377,
"grad_norm": 0.5155197381973267,
"learning_rate": 8.684615384615385e-05,
"loss": 0.0601,
"step": 1130
},
{
"epoch": 1.093000958772771,
"grad_norm": 0.42931079864501953,
"learning_rate": 8.761538461538462e-05,
"loss": 0.0602,
"step": 1140
},
{
"epoch": 1.102588686481304,
"grad_norm": 0.5317569971084595,
"learning_rate": 8.838461538461539e-05,
"loss": 0.0577,
"step": 1150
},
{
"epoch": 1.112176414189837,
"grad_norm": 0.6564596891403198,
"learning_rate": 8.915384615384616e-05,
"loss": 0.0596,
"step": 1160
},
{
"epoch": 1.1217641418983701,
"grad_norm": 0.43666043877601624,
"learning_rate": 8.992307692307693e-05,
"loss": 0.0549,
"step": 1170
},
{
"epoch": 1.1313518696069031,
"grad_norm": 0.6105823516845703,
"learning_rate": 9.06923076923077e-05,
"loss": 0.0641,
"step": 1180
},
{
"epoch": 1.1409395973154361,
"grad_norm": 0.5657874345779419,
"learning_rate": 9.146153846153847e-05,
"loss": 0.0591,
"step": 1190
},
{
"epoch": 1.1505273250239694,
"grad_norm": 0.5609491467475891,
"learning_rate": 9.223076923076923e-05,
"loss": 0.0622,
"step": 1200
},
{
"epoch": 1.1601150527325024,
"grad_norm": 0.6493374705314636,
"learning_rate": 9.300000000000001e-05,
"loss": 0.0589,
"step": 1210
},
{
"epoch": 1.1697027804410354,
"grad_norm": 0.7406426072120667,
"learning_rate": 9.376923076923078e-05,
"loss": 0.0579,
"step": 1220
},
{
"epoch": 1.1792905081495686,
"grad_norm": 0.6438266634941101,
"learning_rate": 9.453846153846154e-05,
"loss": 0.058,
"step": 1230
},
{
"epoch": 1.1888782358581016,
"grad_norm": 0.49737435579299927,
"learning_rate": 9.530769230769231e-05,
"loss": 0.0599,
"step": 1240
},
{
"epoch": 1.1984659635666346,
"grad_norm": 0.5221342444419861,
"learning_rate": 9.607692307692309e-05,
"loss": 0.0613,
"step": 1250
},
{
"epoch": 1.2080536912751678,
"grad_norm": 0.5636175870895386,
"learning_rate": 9.684615384615385e-05,
"loss": 0.054,
"step": 1260
},
{
"epoch": 1.2176414189837008,
"grad_norm": 0.6858579516410828,
"learning_rate": 9.761538461538462e-05,
"loss": 0.0633,
"step": 1270
},
{
"epoch": 1.2272291466922338,
"grad_norm": 0.5884243845939636,
"learning_rate": 9.838461538461539e-05,
"loss": 0.0576,
"step": 1280
},
{
"epoch": 1.236816874400767,
"grad_norm": 0.753278374671936,
"learning_rate": 9.915384615384616e-05,
"loss": 0.0624,
"step": 1290
},
{
"epoch": 1.2464046021093,
"grad_norm": 0.5968719720840454,
"learning_rate": 9.992307692307693e-05,
"loss": 0.0615,
"step": 1300
},
{
"epoch": 1.255992329817833,
"grad_norm": 0.4386919140815735,
"learning_rate": 9.99999672409862e-05,
"loss": 0.0612,
"step": 1310
},
{
"epoch": 1.2655800575263663,
"grad_norm": 0.7106592655181885,
"learning_rate": 9.999985400000595e-05,
"loss": 0.0582,
"step": 1320
},
{
"epoch": 1.2751677852348993,
"grad_norm": 0.642195463180542,
"learning_rate": 9.999965987281012e-05,
"loss": 0.0539,
"step": 1330
},
{
"epoch": 1.2847555129434325,
"grad_norm": 0.8102270364761353,
"learning_rate": 9.999938485971279e-05,
"loss": 0.0571,
"step": 1340
},
{
"epoch": 1.2943432406519655,
"grad_norm": 0.5724937319755554,
"learning_rate": 9.999902896115882e-05,
"loss": 0.059,
"step": 1350
},
{
"epoch": 1.3039309683604985,
"grad_norm": 0.5850300788879395,
"learning_rate": 9.999859217772396e-05,
"loss": 0.0546,
"step": 1360
},
{
"epoch": 1.3135186960690317,
"grad_norm": 0.5836851000785828,
"learning_rate": 9.999807451011483e-05,
"loss": 0.0574,
"step": 1370
},
{
"epoch": 1.3231064237775647,
"grad_norm": 0.4875651001930237,
"learning_rate": 9.999747595916886e-05,
"loss": 0.0584,
"step": 1380
},
{
"epoch": 1.332694151486098,
"grad_norm": 0.6385061144828796,
"learning_rate": 9.999679652585436e-05,
"loss": 0.0551,
"step": 1390
},
{
"epoch": 1.342281879194631,
"grad_norm": 0.6868314743041992,
"learning_rate": 9.999603621127043e-05,
"loss": 0.0644,
"step": 1400
},
{
"epoch": 1.351869606903164,
"grad_norm": 0.879398763179779,
"learning_rate": 9.99951950166471e-05,
"loss": 0.0556,
"step": 1410
},
{
"epoch": 1.3614573346116972,
"grad_norm": 0.5804061889648438,
"learning_rate": 9.999427294334516e-05,
"loss": 0.066,
"step": 1420
},
{
"epoch": 1.3710450623202302,
"grad_norm": 0.8581869602203369,
"learning_rate": 9.999326999285628e-05,
"loss": 0.0604,
"step": 1430
},
{
"epoch": 1.3806327900287632,
"grad_norm": 0.5264695882797241,
"learning_rate": 9.999218616680299e-05,
"loss": 0.0616,
"step": 1440
},
{
"epoch": 1.3902205177372964,
"grad_norm": 0.9933851957321167,
"learning_rate": 9.999102146693859e-05,
"loss": 0.0593,
"step": 1450
},
{
"epoch": 1.3998082454458294,
"grad_norm": 0.4718506932258606,
"learning_rate": 9.998977589514729e-05,
"loss": 0.056,
"step": 1460
},
{
"epoch": 1.4093959731543624,
"grad_norm": 0.46576133370399475,
"learning_rate": 9.998844945344405e-05,
"loss": 0.0547,
"step": 1470
},
{
"epoch": 1.4189837008628956,
"grad_norm": 0.8062454462051392,
"learning_rate": 9.99870421439747e-05,
"loss": 0.0624,
"step": 1480
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.6128931641578674,
"learning_rate": 9.99855539690159e-05,
"loss": 0.054,
"step": 1490
},
{
"epoch": 1.4381591562799616,
"grad_norm": 0.781894326210022,
"learning_rate": 9.998398493097511e-05,
"loss": 0.0593,
"step": 1500
},
{
"epoch": 1.4477468839884948,
"grad_norm": 0.4165836572647095,
"learning_rate": 9.998233503239059e-05,
"loss": 0.0467,
"step": 1510
},
{
"epoch": 1.4573346116970278,
"grad_norm": 0.2851610779762268,
"learning_rate": 9.998060427593146e-05,
"loss": 0.0544,
"step": 1520
},
{
"epoch": 1.4669223394055608,
"grad_norm": 0.578106164932251,
"learning_rate": 9.997879266439758e-05,
"loss": 0.0518,
"step": 1530
},
{
"epoch": 1.476510067114094,
"grad_norm": 0.5424726009368896,
"learning_rate": 9.997690020071968e-05,
"loss": 0.0589,
"step": 1540
},
{
"epoch": 1.486097794822627,
"grad_norm": 0.3104839622974396,
"learning_rate": 9.997492688795924e-05,
"loss": 0.0675,
"step": 1550
},
{
"epoch": 1.49568552253116,
"grad_norm": 0.3031683564186096,
"learning_rate": 9.997287272930854e-05,
"loss": 0.0568,
"step": 1560
},
{
"epoch": 1.5052732502396933,
"grad_norm": 0.3921635150909424,
"learning_rate": 9.997073772809065e-05,
"loss": 0.0475,
"step": 1570
},
{
"epoch": 1.5148609779482263,
"grad_norm": 0.32904309034347534,
"learning_rate": 9.996852188775942e-05,
"loss": 0.0475,
"step": 1580
},
{
"epoch": 1.5244487056567593,
"grad_norm": 0.5768727660179138,
"learning_rate": 9.996622521189952e-05,
"loss": 0.0471,
"step": 1590
},
{
"epoch": 1.5340364333652925,
"grad_norm": 0.5629034042358398,
"learning_rate": 9.996384770422629e-05,
"loss": 0.0498,
"step": 1600
},
{
"epoch": 1.5436241610738255,
"grad_norm": 0.5300479531288147,
"learning_rate": 9.996138936858593e-05,
"loss": 0.0498,
"step": 1610
},
{
"epoch": 1.5532118887823585,
"grad_norm": 0.5257939696311951,
"learning_rate": 9.995885020895536e-05,
"loss": 0.0512,
"step": 1620
},
{
"epoch": 1.5627996164908917,
"grad_norm": 0.5948016047477722,
"learning_rate": 9.995623022944223e-05,
"loss": 0.0512,
"step": 1630
},
{
"epoch": 1.5723873441994247,
"grad_norm": 0.5377443432807922,
"learning_rate": 9.995352943428497e-05,
"loss": 0.0563,
"step": 1640
},
{
"epoch": 1.5819750719079577,
"grad_norm": 0.658616840839386,
"learning_rate": 9.995074782785275e-05,
"loss": 0.0591,
"step": 1650
},
{
"epoch": 1.591562799616491,
"grad_norm": 0.5048621296882629,
"learning_rate": 9.994788541464543e-05,
"loss": 0.0461,
"step": 1660
},
{
"epoch": 1.601150527325024,
"grad_norm": 0.30649903416633606,
"learning_rate": 9.994494219929365e-05,
"loss": 0.0472,
"step": 1670
},
{
"epoch": 1.610738255033557,
"grad_norm": 0.5432398319244385,
"learning_rate": 9.99419181865587e-05,
"loss": 0.0516,
"step": 1680
},
{
"epoch": 1.6203259827420902,
"grad_norm": 0.458732545375824,
"learning_rate": 9.993881338133261e-05,
"loss": 0.0471,
"step": 1690
},
{
"epoch": 1.6299137104506232,
"grad_norm": 0.4103093445301056,
"learning_rate": 9.993562778863817e-05,
"loss": 0.0533,
"step": 1700
},
{
"epoch": 1.6395014381591562,
"grad_norm": 0.7458987832069397,
"learning_rate": 9.993236141362874e-05,
"loss": 0.0533,
"step": 1710
},
{
"epoch": 1.6490891658676894,
"grad_norm": 0.4409146010875702,
"learning_rate": 9.992901426158848e-05,
"loss": 0.0574,
"step": 1720
},
{
"epoch": 1.6586768935762224,
"grad_norm": 0.43476009368896484,
"learning_rate": 9.992558633793212e-05,
"loss": 0.0551,
"step": 1730
},
{
"epoch": 1.6682646212847554,
"grad_norm": 0.5552487373352051,
"learning_rate": 9.992207764820516e-05,
"loss": 0.0544,
"step": 1740
},
{
"epoch": 1.6778523489932886,
"grad_norm": 0.3948347270488739,
"learning_rate": 9.99184881980837e-05,
"loss": 0.0549,
"step": 1750
},
{
"epoch": 1.6874400767018218,
"grad_norm": 0.36312541365623474,
"learning_rate": 9.991481799337448e-05,
"loss": 0.0628,
"step": 1760
},
{
"epoch": 1.6970278044103546,
"grad_norm": 0.48039504885673523,
"learning_rate": 9.991106704001491e-05,
"loss": 0.0518,
"step": 1770
},
{
"epoch": 1.7066155321188878,
"grad_norm": 0.43102404475212097,
"learning_rate": 9.990723534407302e-05,
"loss": 0.0531,
"step": 1780
},
{
"epoch": 1.716203259827421,
"grad_norm": 0.635412335395813,
"learning_rate": 9.990332291174747e-05,
"loss": 0.0623,
"step": 1790
},
{
"epoch": 1.7257909875359538,
"grad_norm": 0.41768330335617065,
"learning_rate": 9.989932974936746e-05,
"loss": 0.0489,
"step": 1800
},
{
"epoch": 1.735378715244487,
"grad_norm": 0.4321722984313965,
"learning_rate": 9.98952558633929e-05,
"loss": 0.0578,
"step": 1810
},
{
"epoch": 1.7449664429530203,
"grad_norm": 0.5160396099090576,
"learning_rate": 9.98911012604142e-05,
"loss": 0.0538,
"step": 1820
},
{
"epoch": 1.754554170661553,
"grad_norm": 0.5091599822044373,
"learning_rate": 9.98868659471524e-05,
"loss": 0.062,
"step": 1830
},
{
"epoch": 1.7641418983700863,
"grad_norm": 0.289798378944397,
"learning_rate": 9.988254993045908e-05,
"loss": 0.0561,
"step": 1840
},
{
"epoch": 1.7737296260786195,
"grad_norm": 0.6626523733139038,
"learning_rate": 9.98781532173164e-05,
"loss": 0.0584,
"step": 1850
},
{
"epoch": 1.7833173537871523,
"grad_norm": 0.4821811020374298,
"learning_rate": 9.987367581483705e-05,
"loss": 0.0597,
"step": 1860
},
{
"epoch": 1.7929050814956855,
"grad_norm": 0.45109039545059204,
"learning_rate": 9.986911773026422e-05,
"loss": 0.0618,
"step": 1870
},
{
"epoch": 1.8024928092042187,
"grad_norm": 0.5203428864479065,
"learning_rate": 9.98644789709717e-05,
"loss": 0.054,
"step": 1880
},
{
"epoch": 1.8120805369127517,
"grad_norm": 0.3689659833908081,
"learning_rate": 9.985975954446372e-05,
"loss": 0.0506,
"step": 1890
},
{
"epoch": 1.8216682646212847,
"grad_norm": 0.5378998517990112,
"learning_rate": 9.985495945837504e-05,
"loss": 0.0527,
"step": 1900
},
{
"epoch": 1.831255992329818,
"grad_norm": 0.36838144063949585,
"learning_rate": 9.985007872047088e-05,
"loss": 0.0484,
"step": 1910
},
{
"epoch": 1.840843720038351,
"grad_norm": 0.3217353820800781,
"learning_rate": 9.984511733864698e-05,
"loss": 0.0495,
"step": 1920
},
{
"epoch": 1.850431447746884,
"grad_norm": 0.5914832353591919,
"learning_rate": 9.984007532092951e-05,
"loss": 0.0562,
"step": 1930
},
{
"epoch": 1.8600191754554172,
"grad_norm": 0.44079649448394775,
"learning_rate": 9.983495267547508e-05,
"loss": 0.0515,
"step": 1940
},
{
"epoch": 1.8696069031639502,
"grad_norm": 0.5204843878746033,
"learning_rate": 9.982974941057073e-05,
"loss": 0.0547,
"step": 1950
},
{
"epoch": 1.8791946308724832,
"grad_norm": 0.505711555480957,
"learning_rate": 9.982446553463397e-05,
"loss": 0.0445,
"step": 1960
},
{
"epoch": 1.8887823585810164,
"grad_norm": 0.3592546582221985,
"learning_rate": 9.981910105621262e-05,
"loss": 0.0586,
"step": 1970
},
{
"epoch": 1.8983700862895494,
"grad_norm": 0.3347618281841278,
"learning_rate": 9.9813655983985e-05,
"loss": 0.0616,
"step": 1980
},
{
"epoch": 1.9079578139980824,
"grad_norm": 0.6229729056358337,
"learning_rate": 9.980813032675974e-05,
"loss": 0.0486,
"step": 1990
},
{
"epoch": 1.9175455417066156,
"grad_norm": 0.4660274386405945,
"learning_rate": 9.980252409347588e-05,
"loss": 0.0481,
"step": 2000
},
{
"epoch": 1.9271332694151486,
"grad_norm": 0.2956122159957886,
"learning_rate": 9.979683729320275e-05,
"loss": 0.0511,
"step": 2010
},
{
"epoch": 1.9367209971236816,
"grad_norm": 0.45697900652885437,
"learning_rate": 9.97910699351401e-05,
"loss": 0.0519,
"step": 2020
},
{
"epoch": 1.9463087248322148,
"grad_norm": 0.5107268691062927,
"learning_rate": 9.97852220286179e-05,
"loss": 0.0563,
"step": 2030
},
{
"epoch": 1.9558964525407478,
"grad_norm": 0.3761272728443146,
"learning_rate": 9.97792935830965e-05,
"loss": 0.0532,
"step": 2040
},
{
"epoch": 1.9654841802492808,
"grad_norm": 0.4759978950023651,
"learning_rate": 9.977328460816654e-05,
"loss": 0.0588,
"step": 2050
},
{
"epoch": 1.975071907957814,
"grad_norm": 0.4457103908061981,
"learning_rate": 9.976719511354889e-05,
"loss": 0.0459,
"step": 2060
},
{
"epoch": 1.984659635666347,
"grad_norm": 0.31241118907928467,
"learning_rate": 9.976102510909469e-05,
"loss": 0.0521,
"step": 2070
},
{
"epoch": 1.99424736337488,
"grad_norm": 0.5308888554573059,
"learning_rate": 9.975477460478538e-05,
"loss": 0.0514,
"step": 2080
},
{
"epoch": 2.0038350910834133,
"grad_norm": 0.35070937871932983,
"learning_rate": 9.974844361073252e-05,
"loss": 0.0524,
"step": 2090
},
{
"epoch": 2.0134228187919465,
"grad_norm": 0.47052425146102905,
"learning_rate": 9.9742032137178e-05,
"loss": 0.0476,
"step": 2100
},
{
"epoch": 2.0230105465004793,
"grad_norm": 0.6150134205818176,
"learning_rate": 9.973554019449383e-05,
"loss": 0.0412,
"step": 2110
},
{
"epoch": 2.0325982742090125,
"grad_norm": 0.5497679114341736,
"learning_rate": 9.972896779318219e-05,
"loss": 0.0592,
"step": 2120
},
{
"epoch": 2.0421860019175457,
"grad_norm": 0.5127347111701965,
"learning_rate": 9.972231494387547e-05,
"loss": 0.0468,
"step": 2130
},
{
"epoch": 2.0517737296260785,
"grad_norm": 0.43948736786842346,
"learning_rate": 9.971558165733619e-05,
"loss": 0.0484,
"step": 2140
},
{
"epoch": 2.0613614573346117,
"grad_norm": 0.47324222326278687,
"learning_rate": 9.970876794445694e-05,
"loss": 0.0517,
"step": 2150
},
{
"epoch": 2.070949185043145,
"grad_norm": 0.34907156229019165,
"learning_rate": 9.970187381626048e-05,
"loss": 0.0566,
"step": 2160
},
{
"epoch": 2.0805369127516777,
"grad_norm": 0.51346355676651,
"learning_rate": 9.969489928389965e-05,
"loss": 0.0409,
"step": 2170
},
{
"epoch": 2.090124640460211,
"grad_norm": 0.34040042757987976,
"learning_rate": 9.968784435865737e-05,
"loss": 0.0462,
"step": 2180
},
{
"epoch": 2.099712368168744,
"grad_norm": 0.4003884792327881,
"learning_rate": 9.968070905194656e-05,
"loss": 0.0434,
"step": 2190
},
{
"epoch": 2.109300095877277,
"grad_norm": 0.4381425380706787,
"learning_rate": 9.967349337531023e-05,
"loss": 0.0438,
"step": 2200
},
{
"epoch": 2.11888782358581,
"grad_norm": 0.5975500345230103,
"learning_rate": 9.966619734042139e-05,
"loss": 0.0441,
"step": 2210
},
{
"epoch": 2.1284755512943434,
"grad_norm": 0.39649492502212524,
"learning_rate": 9.965882095908305e-05,
"loss": 0.0485,
"step": 2220
},
{
"epoch": 2.138063279002876,
"grad_norm": 0.5102829337120056,
"learning_rate": 9.96513642432282e-05,
"loss": 0.0462,
"step": 2230
},
{
"epoch": 2.1476510067114094,
"grad_norm": 0.5115483999252319,
"learning_rate": 9.964382720491976e-05,
"loss": 0.0539,
"step": 2240
},
{
"epoch": 2.1572387344199426,
"grad_norm": 0.4768059551715851,
"learning_rate": 9.963620985635065e-05,
"loss": 0.0521,
"step": 2250
},
{
"epoch": 2.1668264621284754,
"grad_norm": 0.4891989827156067,
"learning_rate": 9.962851220984366e-05,
"loss": 0.0486,
"step": 2260
},
{
"epoch": 2.1764141898370086,
"grad_norm": 0.5893239974975586,
"learning_rate": 9.962073427785149e-05,
"loss": 0.053,
"step": 2270
},
{
"epoch": 2.186001917545542,
"grad_norm": 0.640600323677063,
"learning_rate": 9.961287607295673e-05,
"loss": 0.0516,
"step": 2280
},
{
"epoch": 2.1955896452540746,
"grad_norm": 0.5314393639564514,
"learning_rate": 9.960493760787184e-05,
"loss": 0.0552,
"step": 2290
},
{
"epoch": 2.205177372962608,
"grad_norm": 0.4695710241794586,
"learning_rate": 9.95969188954391e-05,
"loss": 0.0488,
"step": 2300
},
{
"epoch": 2.214765100671141,
"grad_norm": 0.41498687863349915,
"learning_rate": 9.958881994863058e-05,
"loss": 0.0554,
"step": 2310
},
{
"epoch": 2.224352828379674,
"grad_norm": 0.3587738573551178,
"learning_rate": 9.958064078054823e-05,
"loss": 0.0415,
"step": 2320
},
{
"epoch": 2.233940556088207,
"grad_norm": 0.3993861973285675,
"learning_rate": 9.957238140442371e-05,
"loss": 0.0529,
"step": 2330
},
{
"epoch": 2.2435282837967403,
"grad_norm": 0.4770705997943878,
"learning_rate": 9.956404183361845e-05,
"loss": 0.0521,
"step": 2340
},
{
"epoch": 2.253116011505273,
"grad_norm": 0.5887109041213989,
"learning_rate": 9.955562208162362e-05,
"loss": 0.0632,
"step": 2350
},
{
"epoch": 2.2627037392138063,
"grad_norm": 0.6732892990112305,
"learning_rate": 9.954712216206008e-05,
"loss": 0.06,
"step": 2360
},
{
"epoch": 2.2722914669223395,
"grad_norm": 0.37186869978904724,
"learning_rate": 9.953854208867841e-05,
"loss": 0.0572,
"step": 2370
},
{
"epoch": 2.2818791946308723,
"grad_norm": 0.3546556234359741,
"learning_rate": 9.952988187535886e-05,
"loss": 0.0495,
"step": 2380
},
{
"epoch": 2.2914669223394055,
"grad_norm": 0.23416608572006226,
"learning_rate": 9.952114153611128e-05,
"loss": 0.0463,
"step": 2390
},
{
"epoch": 2.3010546500479387,
"grad_norm": 0.5339412689208984,
"learning_rate": 9.951232108507517e-05,
"loss": 0.0503,
"step": 2400
},
{
"epoch": 2.310642377756472,
"grad_norm": 0.34483078122138977,
"learning_rate": 9.950342053651967e-05,
"loss": 0.0428,
"step": 2410
},
{
"epoch": 2.3202301054650047,
"grad_norm": 0.449236124753952,
"learning_rate": 9.949443990484342e-05,
"loss": 0.0495,
"step": 2420
},
{
"epoch": 2.329817833173538,
"grad_norm": 0.40906885266304016,
"learning_rate": 9.948537920457466e-05,
"loss": 0.0442,
"step": 2430
},
{
"epoch": 2.3394055608820707,
"grad_norm": 0.3320155143737793,
"learning_rate": 9.947623845037112e-05,
"loss": 0.0469,
"step": 2440
},
{
"epoch": 2.348993288590604,
"grad_norm": 0.3933449387550354,
"learning_rate": 9.946701765702012e-05,
"loss": 0.0499,
"step": 2450
},
{
"epoch": 2.358581016299137,
"grad_norm": 0.42711353302001953,
"learning_rate": 9.945771683943836e-05,
"loss": 0.0465,
"step": 2460
},
{
"epoch": 2.3681687440076704,
"grad_norm": 0.3379175364971161,
"learning_rate": 9.944833601267207e-05,
"loss": 0.0446,
"step": 2470
},
{
"epoch": 2.377756471716203,
"grad_norm": 0.2655797302722931,
"learning_rate": 9.943887519189685e-05,
"loss": 0.0457,
"step": 2480
},
{
"epoch": 2.3873441994247364,
"grad_norm": 0.534376859664917,
"learning_rate": 9.94293343924178e-05,
"loss": 0.0386,
"step": 2490
},
{
"epoch": 2.396931927133269,
"grad_norm": 0.5116010904312134,
"learning_rate": 9.941971362966929e-05,
"loss": 0.0488,
"step": 2500
},
{
"epoch": 2.4065196548418024,
"grad_norm": 0.33155950903892517,
"learning_rate": 9.941001291921512e-05,
"loss": 0.0561,
"step": 2510
},
{
"epoch": 2.4161073825503356,
"grad_norm": 0.4785441756248474,
"learning_rate": 9.940023227674844e-05,
"loss": 0.055,
"step": 2520
},
{
"epoch": 2.425695110258869,
"grad_norm": 0.4031260907649994,
"learning_rate": 9.939037171809167e-05,
"loss": 0.0489,
"step": 2530
},
{
"epoch": 2.4352828379674016,
"grad_norm": 0.4069255590438843,
"learning_rate": 9.93804312591965e-05,
"loss": 0.0499,
"step": 2540
},
{
"epoch": 2.444870565675935,
"grad_norm": 0.4854568541049957,
"learning_rate": 9.937041091614392e-05,
"loss": 0.0508,
"step": 2550
},
{
"epoch": 2.4544582933844676,
"grad_norm": 0.42022451758384705,
"learning_rate": 9.936031070514413e-05,
"loss": 0.0533,
"step": 2560
},
{
"epoch": 2.464046021093001,
"grad_norm": 0.3417539894580841,
"learning_rate": 9.935013064253652e-05,
"loss": 0.0487,
"step": 2570
},
{
"epoch": 2.473633748801534,
"grad_norm": 0.7130690813064575,
"learning_rate": 9.933987074478969e-05,
"loss": 0.0482,
"step": 2580
},
{
"epoch": 2.4832214765100673,
"grad_norm": 0.328921914100647,
"learning_rate": 9.932953102850136e-05,
"loss": 0.0462,
"step": 2590
},
{
"epoch": 2.4928092042186,
"grad_norm": 0.27391597628593445,
"learning_rate": 9.931911151039838e-05,
"loss": 0.0543,
"step": 2600
},
{
"epoch": 2.5023969319271333,
"grad_norm": 0.3968970775604248,
"learning_rate": 9.930861220733674e-05,
"loss": 0.0446,
"step": 2610
},
{
"epoch": 2.511984659635666,
"grad_norm": 0.31161823868751526,
"learning_rate": 9.929803313630145e-05,
"loss": 0.0542,
"step": 2620
},
{
"epoch": 2.5215723873441993,
"grad_norm": 0.49789026379585266,
"learning_rate": 9.928737431440658e-05,
"loss": 0.0496,
"step": 2630
},
{
"epoch": 2.5311601150527325,
"grad_norm": 0.3426557779312134,
"learning_rate": 9.927663575889521e-05,
"loss": 0.0451,
"step": 2640
},
{
"epoch": 2.5407478427612658,
"grad_norm": 0.35124093294143677,
"learning_rate": 9.926581748713942e-05,
"loss": 0.0469,
"step": 2650
},
{
"epoch": 2.5503355704697985,
"grad_norm": 0.5212651491165161,
"learning_rate": 9.925491951664023e-05,
"loss": 0.0574,
"step": 2660
},
{
"epoch": 2.5599232981783318,
"grad_norm": 0.5474659204483032,
"learning_rate": 9.92439418650276e-05,
"loss": 0.0592,
"step": 2670
},
{
"epoch": 2.569511025886865,
"grad_norm": 0.36428266763687134,
"learning_rate": 9.923288455006045e-05,
"loss": 0.0534,
"step": 2680
},
{
"epoch": 2.5790987535953978,
"grad_norm": 0.3940581977367401,
"learning_rate": 9.922174758962645e-05,
"loss": 0.0493,
"step": 2690
},
{
"epoch": 2.588686481303931,
"grad_norm": 0.32265448570251465,
"learning_rate": 9.921053100174223e-05,
"loss": 0.0465,
"step": 2700
},
{
"epoch": 2.598274209012464,
"grad_norm": 0.35290199518203735,
"learning_rate": 9.919923480455317e-05,
"loss": 0.048,
"step": 2710
},
{
"epoch": 2.607861936720997,
"grad_norm": 0.4928702712059021,
"learning_rate": 9.918785901633345e-05,
"loss": 0.0463,
"step": 2720
},
{
"epoch": 2.61744966442953,
"grad_norm": 0.39868831634521484,
"learning_rate": 9.917640365548604e-05,
"loss": 0.0478,
"step": 2730
},
{
"epoch": 2.6270373921380634,
"grad_norm": 0.48915326595306396,
"learning_rate": 9.916486874054259e-05,
"loss": 0.0452,
"step": 2740
},
{
"epoch": 2.636625119846596,
"grad_norm": 0.3415433466434479,
"learning_rate": 9.915325429016345e-05,
"loss": 0.0399,
"step": 2750
},
{
"epoch": 2.6462128475551294,
"grad_norm": 0.4320572316646576,
"learning_rate": 9.914156032313768e-05,
"loss": 0.052,
"step": 2760
},
{
"epoch": 2.6558005752636626,
"grad_norm": 0.5043158531188965,
"learning_rate": 9.912978685838294e-05,
"loss": 0.05,
"step": 2770
},
{
"epoch": 2.665388302972196,
"grad_norm": 0.3065243363380432,
"learning_rate": 9.911793391494552e-05,
"loss": 0.0449,
"step": 2780
},
{
"epoch": 2.6749760306807286,
"grad_norm": 0.42839324474334717,
"learning_rate": 9.910600151200025e-05,
"loss": 0.0506,
"step": 2790
},
{
"epoch": 2.684563758389262,
"grad_norm": 0.32670149207115173,
"learning_rate": 9.909398966885053e-05,
"loss": 0.0482,
"step": 2800
},
{
"epoch": 2.6941514860977946,
"grad_norm": 0.49310222268104553,
"learning_rate": 9.908189840492827e-05,
"loss": 0.0457,
"step": 2810
},
{
"epoch": 2.703739213806328,
"grad_norm": 0.43462368845939636,
"learning_rate": 9.906972773979388e-05,
"loss": 0.0494,
"step": 2820
},
{
"epoch": 2.713326941514861,
"grad_norm": 0.3611735701560974,
"learning_rate": 9.905747769313616e-05,
"loss": 0.0472,
"step": 2830
},
{
"epoch": 2.7229146692233943,
"grad_norm": 0.3046175539493561,
"learning_rate": 9.90451482847724e-05,
"loss": 0.046,
"step": 2840
},
{
"epoch": 2.732502396931927,
"grad_norm": 0.5815914869308472,
"learning_rate": 9.903273953464821e-05,
"loss": 0.0505,
"step": 2850
},
{
"epoch": 2.7420901246404603,
"grad_norm": 0.4920728802680969,
"learning_rate": 9.902025146283761e-05,
"loss": 0.0475,
"step": 2860
},
{
"epoch": 2.751677852348993,
"grad_norm": 0.3602769374847412,
"learning_rate": 9.90076840895429e-05,
"loss": 0.0425,
"step": 2870
},
{
"epoch": 2.7612655800575263,
"grad_norm": 0.580506443977356,
"learning_rate": 9.899503743509471e-05,
"loss": 0.0493,
"step": 2880
},
{
"epoch": 2.7708533077660595,
"grad_norm": 0.4402373135089874,
"learning_rate": 9.898231151995187e-05,
"loss": 0.0468,
"step": 2890
},
{
"epoch": 2.7804410354745928,
"grad_norm": 0.5210007429122925,
"learning_rate": 9.896950636470147e-05,
"loss": 0.0461,
"step": 2900
},
{
"epoch": 2.7900287631831255,
"grad_norm": 0.4113840162754059,
"learning_rate": 9.89566219900588e-05,
"loss": 0.0561,
"step": 2910
},
{
"epoch": 2.7996164908916588,
"grad_norm": 0.4887576699256897,
"learning_rate": 9.894365841686726e-05,
"loss": 0.0484,
"step": 2920
},
{
"epoch": 2.8092042186001915,
"grad_norm": 0.3261569142341614,
"learning_rate": 9.893061566609843e-05,
"loss": 0.0457,
"step": 2930
},
{
"epoch": 2.8187919463087248,
"grad_norm": 0.3729310631752014,
"learning_rate": 9.891749375885191e-05,
"loss": 0.0459,
"step": 2940
},
{
"epoch": 2.828379674017258,
"grad_norm": 0.4186583459377289,
"learning_rate": 9.890429271635541e-05,
"loss": 0.0448,
"step": 2950
},
{
"epoch": 2.837967401725791,
"grad_norm": 0.4808233380317688,
"learning_rate": 9.889101255996466e-05,
"loss": 0.0513,
"step": 2960
},
{
"epoch": 2.847555129434324,
"grad_norm": 0.24302266538143158,
"learning_rate": 9.887765331116331e-05,
"loss": 0.0439,
"step": 2970
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.28988000750541687,
"learning_rate": 9.886421499156305e-05,
"loss": 0.0448,
"step": 2980
},
{
"epoch": 2.86673058485139,
"grad_norm": 0.408470094203949,
"learning_rate": 9.88506976229034e-05,
"loss": 0.0457,
"step": 2990
},
{
"epoch": 2.876318312559923,
"grad_norm": 0.279012531042099,
"learning_rate": 9.883710122705184e-05,
"loss": 0.0521,
"step": 3000
},
{
"epoch": 2.8859060402684564,
"grad_norm": 0.3104060888290405,
"learning_rate": 9.882342582600361e-05,
"loss": 0.0479,
"step": 3010
},
{
"epoch": 2.8954937679769897,
"grad_norm": 0.36359190940856934,
"learning_rate": 9.880967144188184e-05,
"loss": 0.0545,
"step": 3020
},
{
"epoch": 2.9050814956855224,
"grad_norm": 0.3486534059047699,
"learning_rate": 9.879583809693738e-05,
"loss": 0.0469,
"step": 3030
},
{
"epoch": 2.9146692233940557,
"grad_norm": 0.35138458013534546,
"learning_rate": 9.878192581354883e-05,
"loss": 0.0486,
"step": 3040
},
{
"epoch": 2.9242569511025884,
"grad_norm": 0.3004566431045532,
"learning_rate": 9.87679346142225e-05,
"loss": 0.048,
"step": 3050
},
{
"epoch": 2.9338446788111217,
"grad_norm": 0.4111393988132477,
"learning_rate": 9.875386452159237e-05,
"loss": 0.0526,
"step": 3060
},
{
"epoch": 2.943432406519655,
"grad_norm": 0.5051720142364502,
"learning_rate": 9.873971555842e-05,
"loss": 0.0521,
"step": 3070
},
{
"epoch": 2.953020134228188,
"grad_norm": 0.3800508677959442,
"learning_rate": 9.872548774759465e-05,
"loss": 0.0509,
"step": 3080
},
{
"epoch": 2.962607861936721,
"grad_norm": 0.3587114214897156,
"learning_rate": 9.871118111213299e-05,
"loss": 0.0463,
"step": 3090
},
{
"epoch": 2.972195589645254,
"grad_norm": 0.3234626352787018,
"learning_rate": 9.869679567517931e-05,
"loss": 0.0421,
"step": 3100
},
{
"epoch": 2.981783317353787,
"grad_norm": 0.37090590596199036,
"learning_rate": 9.868233146000535e-05,
"loss": 0.0497,
"step": 3110
},
{
"epoch": 2.99137104506232,
"grad_norm": 0.274735689163208,
"learning_rate": 9.86677884900103e-05,
"loss": 0.0492,
"step": 3120
},
{
"epoch": 3.0009587727708533,
"grad_norm": 0.35899800062179565,
"learning_rate": 9.865316678872073e-05,
"loss": 0.0436,
"step": 3130
},
{
"epoch": 3.0105465004793865,
"grad_norm": 0.3196784555912018,
"learning_rate": 9.863846637979057e-05,
"loss": 0.0472,
"step": 3140
},
{
"epoch": 3.0201342281879193,
"grad_norm": 0.3077564537525177,
"learning_rate": 9.862368728700115e-05,
"loss": 0.0527,
"step": 3150
},
{
"epoch": 3.0297219558964525,
"grad_norm": 0.40086090564727783,
"learning_rate": 9.860882953426099e-05,
"loss": 0.0507,
"step": 3160
},
{
"epoch": 3.0393096836049858,
"grad_norm": 0.6561570763587952,
"learning_rate": 9.859389314560595e-05,
"loss": 0.0545,
"step": 3170
},
{
"epoch": 3.0488974113135185,
"grad_norm": 0.34068262577056885,
"learning_rate": 9.857887814519902e-05,
"loss": 0.0458,
"step": 3180
},
{
"epoch": 3.0584851390220518,
"grad_norm": 0.31878864765167236,
"learning_rate": 9.856378455733042e-05,
"loss": 0.0399,
"step": 3190
},
{
"epoch": 3.068072866730585,
"grad_norm": 0.41648054122924805,
"learning_rate": 9.854861240641748e-05,
"loss": 0.0452,
"step": 3200
},
{
"epoch": 3.0776605944391178,
"grad_norm": 0.35710304975509644,
"learning_rate": 9.853336171700464e-05,
"loss": 0.0509,
"step": 3210
},
{
"epoch": 3.087248322147651,
"grad_norm": 0.3782924711704254,
"learning_rate": 9.851803251376336e-05,
"loss": 0.0445,
"step": 3220
},
{
"epoch": 3.096836049856184,
"grad_norm": 0.49359890818595886,
"learning_rate": 9.85026248214922e-05,
"loss": 0.0515,
"step": 3230
},
{
"epoch": 3.106423777564717,
"grad_norm": 0.4491162598133087,
"learning_rate": 9.848713866511655e-05,
"loss": 0.0444,
"step": 3240
},
{
"epoch": 3.11601150527325,
"grad_norm": 0.3755772113800049,
"learning_rate": 9.847157406968885e-05,
"loss": 0.0417,
"step": 3250
},
{
"epoch": 3.1255992329817834,
"grad_norm": 0.32591307163238525,
"learning_rate": 9.84559310603884e-05,
"loss": 0.0571,
"step": 3260
},
{
"epoch": 3.135186960690316,
"grad_norm": 0.4025377333164215,
"learning_rate": 9.844020966252137e-05,
"loss": 0.0479,
"step": 3270
},
{
"epoch": 3.1447746883988494,
"grad_norm": 0.3106444478034973,
"learning_rate": 9.842440990152068e-05,
"loss": 0.0472,
"step": 3280
},
{
"epoch": 3.1543624161073827,
"grad_norm": 0.3832003176212311,
"learning_rate": 9.840853180294608e-05,
"loss": 0.0566,
"step": 3290
},
{
"epoch": 3.1639501438159154,
"grad_norm": 0.2815271019935608,
"learning_rate": 9.839257539248403e-05,
"loss": 0.0396,
"step": 3300
},
{
"epoch": 3.1735378715244487,
"grad_norm": 0.38503342866897583,
"learning_rate": 9.83765406959477e-05,
"loss": 0.048,
"step": 3310
},
{
"epoch": 3.183125599232982,
"grad_norm": 0.31450656056404114,
"learning_rate": 9.836042773927685e-05,
"loss": 0.0383,
"step": 3320
},
{
"epoch": 3.1927133269415147,
"grad_norm": 0.39521682262420654,
"learning_rate": 9.834423654853791e-05,
"loss": 0.0449,
"step": 3330
},
{
"epoch": 3.202301054650048,
"grad_norm": 0.4725668728351593,
"learning_rate": 9.832796714992381e-05,
"loss": 0.0436,
"step": 3340
},
{
"epoch": 3.211888782358581,
"grad_norm": 0.43373286724090576,
"learning_rate": 9.831161956975405e-05,
"loss": 0.0502,
"step": 3350
},
{
"epoch": 3.221476510067114,
"grad_norm": 0.30628758668899536,
"learning_rate": 9.829519383447456e-05,
"loss": 0.0454,
"step": 3360
},
{
"epoch": 3.231064237775647,
"grad_norm": 0.6050196290016174,
"learning_rate": 9.827868997065777e-05,
"loss": 0.0528,
"step": 3370
},
{
"epoch": 3.2406519654841803,
"grad_norm": 0.36287015676498413,
"learning_rate": 9.826210800500242e-05,
"loss": 0.0529,
"step": 3380
},
{
"epoch": 3.2502396931927136,
"grad_norm": 0.41856274008750916,
"learning_rate": 9.824544796433366e-05,
"loss": 0.0489,
"step": 3390
},
{
"epoch": 3.2598274209012463,
"grad_norm": 0.35269007086753845,
"learning_rate": 9.82287098756029e-05,
"loss": 0.049,
"step": 3400
},
{
"epoch": 3.2694151486097796,
"grad_norm": 0.35962244868278503,
"learning_rate": 9.821189376588786e-05,
"loss": 0.0471,
"step": 3410
},
{
"epoch": 3.2790028763183123,
"grad_norm": 0.5149263739585876,
"learning_rate": 9.819499966239243e-05,
"loss": 0.0533,
"step": 3420
},
{
"epoch": 3.2885906040268456,
"grad_norm": 0.3651978075504303,
"learning_rate": 9.81780275924467e-05,
"loss": 0.0428,
"step": 3430
},
{
"epoch": 3.2981783317353788,
"grad_norm": 0.377916157245636,
"learning_rate": 9.816097758350688e-05,
"loss": 0.0527,
"step": 3440
},
{
"epoch": 3.307766059443912,
"grad_norm": 0.39240193367004395,
"learning_rate": 9.814384966315526e-05,
"loss": 0.0498,
"step": 3450
},
{
"epoch": 3.3173537871524448,
"grad_norm": 0.4727850556373596,
"learning_rate": 9.812664385910018e-05,
"loss": 0.0519,
"step": 3460
},
{
"epoch": 3.326941514860978,
"grad_norm": 0.3471921980381012,
"learning_rate": 9.810936019917595e-05,
"loss": 0.043,
"step": 3470
},
{
"epoch": 3.336529242569511,
"grad_norm": 0.3818338215351105,
"learning_rate": 9.809199871134287e-05,
"loss": 0.0427,
"step": 3480
},
{
"epoch": 3.346116970278044,
"grad_norm": 0.34183284640312195,
"learning_rate": 9.807455942368711e-05,
"loss": 0.0414,
"step": 3490
},
{
"epoch": 3.3557046979865772,
"grad_norm": 0.3725120425224304,
"learning_rate": 9.805704236442073e-05,
"loss": 0.0493,
"step": 3500
},
{
"epoch": 3.3652924256951104,
"grad_norm": 0.4457106590270996,
"learning_rate": 9.803944756188157e-05,
"loss": 0.0423,
"step": 3510
},
{
"epoch": 3.3748801534036432,
"grad_norm": 0.3035670220851898,
"learning_rate": 9.802177504453326e-05,
"loss": 0.0431,
"step": 3520
},
{
"epoch": 3.3844678811121764,
"grad_norm": 0.36193615198135376,
"learning_rate": 9.800402484096513e-05,
"loss": 0.0461,
"step": 3530
},
{
"epoch": 3.3940556088207097,
"grad_norm": 0.39786848425865173,
"learning_rate": 9.798619697989222e-05,
"loss": 0.0558,
"step": 3540
},
{
"epoch": 3.4036433365292424,
"grad_norm": 0.3743523061275482,
"learning_rate": 9.796829149015517e-05,
"loss": 0.0439,
"step": 3550
},
{
"epoch": 3.4132310642377757,
"grad_norm": 0.36101034283638,
"learning_rate": 9.79503084007202e-05,
"loss": 0.0483,
"step": 3560
},
{
"epoch": 3.422818791946309,
"grad_norm": 0.3148845434188843,
"learning_rate": 9.79322477406791e-05,
"loss": 0.043,
"step": 3570
},
{
"epoch": 3.4324065196548417,
"grad_norm": 0.45851582288742065,
"learning_rate": 9.79141095392491e-05,
"loss": 0.0492,
"step": 3580
},
{
"epoch": 3.441994247363375,
"grad_norm": 0.6849660277366638,
"learning_rate": 9.789589382577291e-05,
"loss": 0.0434,
"step": 3590
},
{
"epoch": 3.451581975071908,
"grad_norm": 0.5036081671714783,
"learning_rate": 9.787760062971861e-05,
"loss": 0.0525,
"step": 3600
},
{
"epoch": 3.461169702780441,
"grad_norm": 0.46620407700538635,
"learning_rate": 9.785922998067963e-05,
"loss": 0.0491,
"step": 3610
},
{
"epoch": 3.470757430488974,
"grad_norm": 0.5096569657325745,
"learning_rate": 9.784078190837472e-05,
"loss": 0.0514,
"step": 3620
},
{
"epoch": 3.4803451581975073,
"grad_norm": 0.2947571873664856,
"learning_rate": 9.782225644264784e-05,
"loss": 0.0457,
"step": 3630
},
{
"epoch": 3.48993288590604,
"grad_norm": 0.4548271894454956,
"learning_rate": 9.780365361346821e-05,
"loss": 0.0448,
"step": 3640
},
{
"epoch": 3.4995206136145733,
"grad_norm": 0.5976017713546753,
"learning_rate": 9.778497345093013e-05,
"loss": 0.0495,
"step": 3650
},
{
"epoch": 3.5091083413231066,
"grad_norm": 0.3194081783294678,
"learning_rate": 9.776621598525305e-05,
"loss": 0.0473,
"step": 3660
},
{
"epoch": 3.5186960690316393,
"grad_norm": 0.2841929793357849,
"learning_rate": 9.774738124678148e-05,
"loss": 0.0429,
"step": 3670
},
{
"epoch": 3.5282837967401726,
"grad_norm": 0.2357761263847351,
"learning_rate": 9.772846926598491e-05,
"loss": 0.0494,
"step": 3680
},
{
"epoch": 3.537871524448706,
"grad_norm": 0.893323540687561,
"learning_rate": 9.770948007345779e-05,
"loss": 0.0497,
"step": 3690
},
{
"epoch": 3.547459252157239,
"grad_norm": 0.23153287172317505,
"learning_rate": 9.769041369991953e-05,
"loss": 0.0457,
"step": 3700
},
{
"epoch": 3.557046979865772,
"grad_norm": 0.36270731687545776,
"learning_rate": 9.767127017621431e-05,
"loss": 0.0535,
"step": 3710
},
{
"epoch": 3.566634707574305,
"grad_norm": 0.39080706238746643,
"learning_rate": 9.76520495333112e-05,
"loss": 0.0462,
"step": 3720
},
{
"epoch": 3.576222435282838,
"grad_norm": 0.5226278901100159,
"learning_rate": 9.763275180230395e-05,
"loss": 0.0486,
"step": 3730
},
{
"epoch": 3.585810162991371,
"grad_norm": 0.2358178198337555,
"learning_rate": 9.761337701441111e-05,
"loss": 0.0452,
"step": 3740
},
{
"epoch": 3.5953978906999042,
"grad_norm": 0.47069254517555237,
"learning_rate": 9.759392520097581e-05,
"loss": 0.049,
"step": 3750
},
{
"epoch": 3.6049856184084375,
"grad_norm": 0.327800452709198,
"learning_rate": 9.75743963934658e-05,
"loss": 0.0411,
"step": 3760
},
{
"epoch": 3.6145733461169702,
"grad_norm": 0.4534970819950104,
"learning_rate": 9.755479062347344e-05,
"loss": 0.0472,
"step": 3770
},
{
"epoch": 3.6241610738255035,
"grad_norm": 0.2962513566017151,
"learning_rate": 9.753510792271549e-05,
"loss": 0.0523,
"step": 3780
},
{
"epoch": 3.6337488015340362,
"grad_norm": 0.46883541345596313,
"learning_rate": 9.75153483230333e-05,
"loss": 0.0468,
"step": 3790
},
{
"epoch": 3.6433365292425695,
"grad_norm": 0.2845245599746704,
"learning_rate": 9.749551185639249e-05,
"loss": 0.0438,
"step": 3800
},
{
"epoch": 3.6529242569511027,
"grad_norm": 0.2763413190841675,
"learning_rate": 9.747559855488313e-05,
"loss": 0.0472,
"step": 3810
},
{
"epoch": 3.662511984659636,
"grad_norm": 0.27591028809547424,
"learning_rate": 9.74556084507195e-05,
"loss": 0.0457,
"step": 3820
},
{
"epoch": 3.6720997123681687,
"grad_norm": 0.36455026268959045,
"learning_rate": 9.743554157624023e-05,
"loss": 0.0453,
"step": 3830
},
{
"epoch": 3.681687440076702,
"grad_norm": 0.4757814407348633,
"learning_rate": 9.741539796390804e-05,
"loss": 0.0496,
"step": 3840
},
{
"epoch": 3.6912751677852347,
"grad_norm": 0.3472752869129181,
"learning_rate": 9.739517764630984e-05,
"loss": 0.0438,
"step": 3850
},
{
"epoch": 3.700862895493768,
"grad_norm": 0.39700034260749817,
"learning_rate": 9.737488065615665e-05,
"loss": 0.045,
"step": 3860
},
{
"epoch": 3.710450623202301,
"grad_norm": 0.2766479551792145,
"learning_rate": 9.735450702628348e-05,
"loss": 0.0361,
"step": 3870
},
{
"epoch": 3.7200383509108343,
"grad_norm": 0.3525460660457611,
"learning_rate": 9.733405678964935e-05,
"loss": 0.044,
"step": 3880
},
{
"epoch": 3.729626078619367,
"grad_norm": 0.35298100113868713,
"learning_rate": 9.731352997933718e-05,
"loss": 0.0392,
"step": 3890
},
{
"epoch": 3.7392138063279003,
"grad_norm": 0.32511138916015625,
"learning_rate": 9.729292662855383e-05,
"loss": 0.0463,
"step": 3900
},
{
"epoch": 3.748801534036433,
"grad_norm": 0.33208218216896057,
"learning_rate": 9.727224677062992e-05,
"loss": 0.0479,
"step": 3910
},
{
"epoch": 3.7583892617449663,
"grad_norm": 0.43648335337638855,
"learning_rate": 9.725149043901985e-05,
"loss": 0.0459,
"step": 3920
},
{
"epoch": 3.7679769894534996,
"grad_norm": 0.3617904782295227,
"learning_rate": 9.723065766730172e-05,
"loss": 0.0545,
"step": 3930
},
{
"epoch": 3.777564717162033,
"grad_norm": 0.34762272238731384,
"learning_rate": 9.720974848917735e-05,
"loss": 0.0433,
"step": 3940
},
{
"epoch": 3.7871524448705656,
"grad_norm": 0.3334721028804779,
"learning_rate": 9.71887629384721e-05,
"loss": 0.0445,
"step": 3950
},
{
"epoch": 3.796740172579099,
"grad_norm": 0.4064335823059082,
"learning_rate": 9.716770104913492e-05,
"loss": 0.0436,
"step": 3960
},
{
"epoch": 3.8063279002876316,
"grad_norm": 0.4279939532279968,
"learning_rate": 9.714656285523821e-05,
"loss": 0.0534,
"step": 3970
},
{
"epoch": 3.815915627996165,
"grad_norm": 0.28922349214553833,
"learning_rate": 9.71253483909779e-05,
"loss": 0.0488,
"step": 3980
},
{
"epoch": 3.825503355704698,
"grad_norm": 0.701637327671051,
"learning_rate": 9.710405769067317e-05,
"loss": 0.0465,
"step": 3990
},
{
"epoch": 3.8350910834132312,
"grad_norm": 0.3132900595664978,
"learning_rate": 9.708269078876666e-05,
"loss": 0.046,
"step": 4000
},
{
"epoch": 3.844678811121764,
"grad_norm": 0.2793468236923218,
"learning_rate": 9.706124771982421e-05,
"loss": 0.0382,
"step": 4010
},
{
"epoch": 3.8542665388302972,
"grad_norm": 0.32509496808052063,
"learning_rate": 9.703972851853488e-05,
"loss": 0.0419,
"step": 4020
},
{
"epoch": 3.8638542665388305,
"grad_norm": 0.5768635869026184,
"learning_rate": 9.701813321971091e-05,
"loss": 0.0513,
"step": 4030
},
{
"epoch": 3.8734419942473632,
"grad_norm": 0.37095797061920166,
"learning_rate": 9.699646185828768e-05,
"loss": 0.0493,
"step": 4040
},
{
"epoch": 3.8830297219558965,
"grad_norm": 0.4116993844509125,
"learning_rate": 9.697471446932353e-05,
"loss": 0.0481,
"step": 4050
},
{
"epoch": 3.8926174496644297,
"grad_norm": 0.4631316363811493,
"learning_rate": 9.695289108799989e-05,
"loss": 0.057,
"step": 4060
},
{
"epoch": 3.9022051773729625,
"grad_norm": 0.5926663279533386,
"learning_rate": 9.693099174962103e-05,
"loss": 0.0541,
"step": 4070
},
{
"epoch": 3.9117929050814957,
"grad_norm": 0.4884685277938843,
"learning_rate": 9.690901648961418e-05,
"loss": 0.0444,
"step": 4080
},
{
"epoch": 3.921380632790029,
"grad_norm": 0.5205138921737671,
"learning_rate": 9.688696534352935e-05,
"loss": 0.0469,
"step": 4090
},
{
"epoch": 3.9309683604985617,
"grad_norm": 0.3476182222366333,
"learning_rate": 9.68648383470393e-05,
"loss": 0.0506,
"step": 4100
},
{
"epoch": 3.940556088207095,
"grad_norm": 0.310553640127182,
"learning_rate": 9.684263553593953e-05,
"loss": 0.0454,
"step": 4110
},
{
"epoch": 3.950143815915628,
"grad_norm": 0.2743299603462219,
"learning_rate": 9.682035694614817e-05,
"loss": 0.0517,
"step": 4120
},
{
"epoch": 3.959731543624161,
"grad_norm": 0.33413469791412354,
"learning_rate": 9.679800261370594e-05,
"loss": 0.0428,
"step": 4130
},
{
"epoch": 3.969319271332694,
"grad_norm": 0.4639144837856293,
"learning_rate": 9.677557257477609e-05,
"loss": 0.0444,
"step": 4140
},
{
"epoch": 3.9789069990412274,
"grad_norm": 0.33329275250434875,
"learning_rate": 9.675306686564437e-05,
"loss": 0.0472,
"step": 4150
},
{
"epoch": 3.98849472674976,
"grad_norm": 0.4461182653903961,
"learning_rate": 9.673048552271889e-05,
"loss": 0.0375,
"step": 4160
},
{
"epoch": 3.9980824544582934,
"grad_norm": 0.26508504152297974,
"learning_rate": 9.670782858253015e-05,
"loss": 0.0468,
"step": 4170
},
{
"epoch": 4.007670182166827,
"grad_norm": 0.4112192690372467,
"learning_rate": 9.668509608173094e-05,
"loss": 0.0419,
"step": 4180
},
{
"epoch": 4.01725790987536,
"grad_norm": 0.3724784255027771,
"learning_rate": 9.66622880570963e-05,
"loss": 0.0526,
"step": 4190
},
{
"epoch": 4.026845637583893,
"grad_norm": 0.43858179450035095,
"learning_rate": 9.663940454552342e-05,
"loss": 0.0481,
"step": 4200
},
{
"epoch": 4.036433365292425,
"grad_norm": 0.27318644523620605,
"learning_rate": 9.661644558403162e-05,
"loss": 0.0372,
"step": 4210
},
{
"epoch": 4.046021093000959,
"grad_norm": 0.36369022727012634,
"learning_rate": 9.659341120976229e-05,
"loss": 0.0421,
"step": 4220
},
{
"epoch": 4.055608820709492,
"grad_norm": 0.3167804479598999,
"learning_rate": 9.657030145997878e-05,
"loss": 0.0437,
"step": 4230
},
{
"epoch": 4.065196548418025,
"grad_norm": 0.37195485830307007,
"learning_rate": 9.654711637206644e-05,
"loss": 0.0391,
"step": 4240
},
{
"epoch": 4.074784276126558,
"grad_norm": 0.26798343658447266,
"learning_rate": 9.652385598353244e-05,
"loss": 0.0424,
"step": 4250
},
{
"epoch": 4.0843720038350915,
"grad_norm": 0.38160890340805054,
"learning_rate": 9.650052033200578e-05,
"loss": 0.0473,
"step": 4260
},
{
"epoch": 4.093959731543624,
"grad_norm": 0.3133178651332855,
"learning_rate": 9.647710945523725e-05,
"loss": 0.0446,
"step": 4270
},
{
"epoch": 4.103547459252157,
"grad_norm": 0.314330130815506,
"learning_rate": 9.645362339109927e-05,
"loss": 0.0402,
"step": 4280
},
{
"epoch": 4.11313518696069,
"grad_norm": 0.6541547775268555,
"learning_rate": 9.643006217758594e-05,
"loss": 0.0417,
"step": 4290
},
{
"epoch": 4.1227229146692235,
"grad_norm": 0.3850661814212799,
"learning_rate": 9.640642585281292e-05,
"loss": 0.0483,
"step": 4300
},
{
"epoch": 4.132310642377757,
"grad_norm": 0.4512180984020233,
"learning_rate": 9.638271445501739e-05,
"loss": 0.0382,
"step": 4310
},
{
"epoch": 4.14189837008629,
"grad_norm": 0.277205228805542,
"learning_rate": 9.635892802255794e-05,
"loss": 0.0456,
"step": 4320
},
{
"epoch": 4.151486097794822,
"grad_norm": 0.39535996317863464,
"learning_rate": 9.63350665939146e-05,
"loss": 0.0402,
"step": 4330
},
{
"epoch": 4.1610738255033555,
"grad_norm": 0.3694916069507599,
"learning_rate": 9.63111302076887e-05,
"loss": 0.0414,
"step": 4340
},
{
"epoch": 4.170661553211889,
"grad_norm": 0.4235345423221588,
"learning_rate": 9.628711890260279e-05,
"loss": 0.0475,
"step": 4350
},
{
"epoch": 4.180249280920422,
"grad_norm": 0.46871331334114075,
"learning_rate": 9.626303271750069e-05,
"loss": 0.044,
"step": 4360
},
{
"epoch": 4.189837008628955,
"grad_norm": 0.33372747898101807,
"learning_rate": 9.623887169134731e-05,
"loss": 0.0479,
"step": 4370
},
{
"epoch": 4.199424736337488,
"grad_norm": 0.29731622338294983,
"learning_rate": 9.621463586322863e-05,
"loss": 0.0477,
"step": 4380
},
{
"epoch": 4.209012464046021,
"grad_norm": 0.3897842466831207,
"learning_rate": 9.619032527235168e-05,
"loss": 0.0449,
"step": 4390
},
{
"epoch": 4.218600191754554,
"grad_norm": 0.375775009393692,
"learning_rate": 9.616593995804437e-05,
"loss": 0.0527,
"step": 4400
},
{
"epoch": 4.228187919463087,
"grad_norm": 0.44136837124824524,
"learning_rate": 9.614147995975557e-05,
"loss": 0.0465,
"step": 4410
},
{
"epoch": 4.23777564717162,
"grad_norm": 0.49286016821861267,
"learning_rate": 9.611694531705493e-05,
"loss": 0.0478,
"step": 4420
},
{
"epoch": 4.247363374880154,
"grad_norm": 0.36331725120544434,
"learning_rate": 9.609233606963282e-05,
"loss": 0.0453,
"step": 4430
},
{
"epoch": 4.256951102588687,
"grad_norm": 0.4064538776874542,
"learning_rate": 9.606765225730035e-05,
"loss": 0.0512,
"step": 4440
},
{
"epoch": 4.26653883029722,
"grad_norm": 0.3883167505264282,
"learning_rate": 9.604289391998925e-05,
"loss": 0.0416,
"step": 4450
},
{
"epoch": 4.276126558005752,
"grad_norm": 0.364762544631958,
"learning_rate": 9.601806109775179e-05,
"loss": 0.0483,
"step": 4460
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.3354509472846985,
"learning_rate": 9.599315383076075e-05,
"loss": 0.0475,
"step": 4470
},
{
"epoch": 4.295302013422819,
"grad_norm": 0.3480011224746704,
"learning_rate": 9.596817215930934e-05,
"loss": 0.0441,
"step": 4480
},
{
"epoch": 4.304889741131352,
"grad_norm": 0.37383604049682617,
"learning_rate": 9.594311612381114e-05,
"loss": 0.0566,
"step": 4490
},
{
"epoch": 4.314477468839885,
"grad_norm": 0.5128716826438904,
"learning_rate": 9.591798576480001e-05,
"loss": 0.0452,
"step": 4500
},
{
"epoch": 4.324065196548418,
"grad_norm": 0.2675018310546875,
"learning_rate": 9.589278112293007e-05,
"loss": 0.0453,
"step": 4510
},
{
"epoch": 4.333652924256951,
"grad_norm": 0.39544346928596497,
"learning_rate": 9.586750223897562e-05,
"loss": 0.0479,
"step": 4520
},
{
"epoch": 4.343240651965484,
"grad_norm": 0.7438755631446838,
"learning_rate": 9.584214915383103e-05,
"loss": 0.0432,
"step": 4530
},
{
"epoch": 4.352828379674017,
"grad_norm": 0.29193535447120667,
"learning_rate": 9.58167219085107e-05,
"loss": 0.0467,
"step": 4540
},
{
"epoch": 4.3624161073825505,
"grad_norm": 0.34703853726387024,
"learning_rate": 9.579122054414907e-05,
"loss": 0.0435,
"step": 4550
},
{
"epoch": 4.372003835091084,
"grad_norm": 0.408741295337677,
"learning_rate": 9.576564510200038e-05,
"loss": 0.0433,
"step": 4560
},
{
"epoch": 4.381591562799617,
"grad_norm": 0.4278319478034973,
"learning_rate": 9.573999562343882e-05,
"loss": 0.047,
"step": 4570
},
{
"epoch": 4.391179290508149,
"grad_norm": 0.2754301428794861,
"learning_rate": 9.571427214995826e-05,
"loss": 0.0423,
"step": 4580
},
{
"epoch": 4.4007670182166825,
"grad_norm": 0.35973575711250305,
"learning_rate": 9.568847472317232e-05,
"loss": 0.0404,
"step": 4590
},
{
"epoch": 4.410354745925216,
"grad_norm": 0.31195884943008423,
"learning_rate": 9.566260338481425e-05,
"loss": 0.0476,
"step": 4600
},
{
"epoch": 4.419942473633749,
"grad_norm": 0.32485419511795044,
"learning_rate": 9.563665817673688e-05,
"loss": 0.0434,
"step": 4610
},
{
"epoch": 4.429530201342282,
"grad_norm": 0.6262250542640686,
"learning_rate": 9.56106391409125e-05,
"loss": 0.0523,
"step": 4620
},
{
"epoch": 4.439117929050815,
"grad_norm": 0.41280031204223633,
"learning_rate": 9.558454631943286e-05,
"loss": 0.0446,
"step": 4630
},
{
"epoch": 4.448705656759348,
"grad_norm": 0.32726627588272095,
"learning_rate": 9.55583797545091e-05,
"loss": 0.0424,
"step": 4640
},
{
"epoch": 4.458293384467881,
"grad_norm": 0.49836626648902893,
"learning_rate": 9.55321394884716e-05,
"loss": 0.0449,
"step": 4650
},
{
"epoch": 4.467881112176414,
"grad_norm": 0.24192816019058228,
"learning_rate": 9.550582556377003e-05,
"loss": 0.0533,
"step": 4660
},
{
"epoch": 4.477468839884947,
"grad_norm": 0.455990731716156,
"learning_rate": 9.547943802297317e-05,
"loss": 0.0473,
"step": 4670
},
{
"epoch": 4.487056567593481,
"grad_norm": 0.37101179361343384,
"learning_rate": 9.545297690876893e-05,
"loss": 0.0362,
"step": 4680
},
{
"epoch": 4.496644295302014,
"grad_norm": 0.5495269894599915,
"learning_rate": 9.54264422639642e-05,
"loss": 0.0498,
"step": 4690
},
{
"epoch": 4.506232023010546,
"grad_norm": 0.2883033752441406,
"learning_rate": 9.539983413148486e-05,
"loss": 0.0487,
"step": 4700
},
{
"epoch": 4.515819750719079,
"grad_norm": 0.36912739276885986,
"learning_rate": 9.537315255437565e-05,
"loss": 0.0388,
"step": 4710
},
{
"epoch": 4.525407478427613,
"grad_norm": 0.4408855438232422,
"learning_rate": 9.534639757580013e-05,
"loss": 0.0439,
"step": 4720
},
{
"epoch": 4.534995206136146,
"grad_norm": 0.4027664363384247,
"learning_rate": 9.531956923904062e-05,
"loss": 0.0443,
"step": 4730
},
{
"epoch": 4.544582933844679,
"grad_norm": 0.41703498363494873,
"learning_rate": 9.52926675874981e-05,
"loss": 0.0405,
"step": 4740
},
{
"epoch": 4.554170661553212,
"grad_norm": 0.5367491245269775,
"learning_rate": 9.526569266469213e-05,
"loss": 0.0518,
"step": 4750
},
{
"epoch": 4.563758389261745,
"grad_norm": 0.5591031312942505,
"learning_rate": 9.523864451426086e-05,
"loss": 0.0436,
"step": 4760
},
{
"epoch": 4.573346116970278,
"grad_norm": 0.6005666851997375,
"learning_rate": 9.521152317996083e-05,
"loss": 0.0408,
"step": 4770
},
{
"epoch": 4.582933844678811,
"grad_norm": 0.36557164788246155,
"learning_rate": 9.518432870566703e-05,
"loss": 0.0441,
"step": 4780
},
{
"epoch": 4.592521572387344,
"grad_norm": 0.3382973372936249,
"learning_rate": 9.515706113537275e-05,
"loss": 0.0448,
"step": 4790
},
{
"epoch": 4.6021093000958775,
"grad_norm": 0.36412662267684937,
"learning_rate": 9.512972051318952e-05,
"loss": 0.0447,
"step": 4800
},
{
"epoch": 4.611697027804411,
"grad_norm": 0.28740257024765015,
"learning_rate": 9.510230688334709e-05,
"loss": 0.0489,
"step": 4810
},
{
"epoch": 4.621284755512944,
"grad_norm": 0.37100985646247864,
"learning_rate": 9.507482029019324e-05,
"loss": 0.0417,
"step": 4820
},
{
"epoch": 4.630872483221476,
"grad_norm": 0.4594654142856598,
"learning_rate": 9.504726077819387e-05,
"loss": 0.0426,
"step": 4830
},
{
"epoch": 4.6404602109300095,
"grad_norm": 0.37358155846595764,
"learning_rate": 9.501962839193277e-05,
"loss": 0.0491,
"step": 4840
},
{
"epoch": 4.650047938638543,
"grad_norm": 0.31801337003707886,
"learning_rate": 9.499192317611167e-05,
"loss": 0.0444,
"step": 4850
},
{
"epoch": 4.659635666347076,
"grad_norm": 0.4786074161529541,
"learning_rate": 9.496414517555012e-05,
"loss": 0.0426,
"step": 4860
},
{
"epoch": 4.669223394055609,
"grad_norm": 0.23249605298042297,
"learning_rate": 9.493629443518537e-05,
"loss": 0.0495,
"step": 4870
},
{
"epoch": 4.6788111217641415,
"grad_norm": 0.4823112487792969,
"learning_rate": 9.490837100007237e-05,
"loss": 0.0505,
"step": 4880
},
{
"epoch": 4.688398849472675,
"grad_norm": 0.38305050134658813,
"learning_rate": 9.488037491538369e-05,
"loss": 0.0441,
"step": 4890
},
{
"epoch": 4.697986577181208,
"grad_norm": 0.4120224714279175,
"learning_rate": 9.485230622640939e-05,
"loss": 0.0464,
"step": 4900
},
{
"epoch": 4.707574304889741,
"grad_norm": 0.26718661189079285,
"learning_rate": 9.482416497855705e-05,
"loss": 0.0442,
"step": 4910
},
{
"epoch": 4.717162032598274,
"grad_norm": 0.35078132152557373,
"learning_rate": 9.47959512173515e-05,
"loss": 0.0402,
"step": 4920
},
{
"epoch": 4.726749760306808,
"grad_norm": 0.2796134948730469,
"learning_rate": 9.476766498843504e-05,
"loss": 0.0444,
"step": 4930
},
{
"epoch": 4.736337488015341,
"grad_norm": 0.37330299615859985,
"learning_rate": 9.473930633756706e-05,
"loss": 0.0433,
"step": 4940
},
{
"epoch": 4.745925215723873,
"grad_norm": 0.47067347168922424,
"learning_rate": 9.471087531062424e-05,
"loss": 0.0479,
"step": 4950
},
{
"epoch": 4.755512943432406,
"grad_norm": 0.3017641603946686,
"learning_rate": 9.468237195360023e-05,
"loss": 0.0427,
"step": 4960
},
{
"epoch": 4.76510067114094,
"grad_norm": 0.41320186853408813,
"learning_rate": 9.465379631260574e-05,
"loss": 0.052,
"step": 4970
},
{
"epoch": 4.774688398849473,
"grad_norm": 0.4640481173992157,
"learning_rate": 9.462514843386845e-05,
"loss": 0.0463,
"step": 4980
},
{
"epoch": 4.784276126558006,
"grad_norm": 0.2581227123737335,
"learning_rate": 9.459642836373282e-05,
"loss": 0.0371,
"step": 4990
},
{
"epoch": 4.793863854266538,
"grad_norm": 0.3752846121788025,
"learning_rate": 9.456763614866016e-05,
"loss": 0.0437,
"step": 5000
},
{
"epoch": 4.803451581975072,
"grad_norm": 0.27923133969306946,
"learning_rate": 9.453877183522848e-05,
"loss": 0.0442,
"step": 5010
},
{
"epoch": 4.813039309683605,
"grad_norm": 0.31683099269866943,
"learning_rate": 9.450983547013242e-05,
"loss": 0.0396,
"step": 5020
},
{
"epoch": 4.822627037392138,
"grad_norm": 0.5572241544723511,
"learning_rate": 9.448082710018317e-05,
"loss": 0.0464,
"step": 5030
},
{
"epoch": 4.832214765100671,
"grad_norm": 0.4878758490085602,
"learning_rate": 9.44517467723084e-05,
"loss": 0.0462,
"step": 5040
},
{
"epoch": 4.8418024928092045,
"grad_norm": 0.2646120488643646,
"learning_rate": 9.442259453355222e-05,
"loss": 0.0434,
"step": 5050
},
{
"epoch": 4.851390220517738,
"grad_norm": 0.23440934717655182,
"learning_rate": 9.439337043107506e-05,
"loss": 0.0454,
"step": 5060
},
{
"epoch": 4.86097794822627,
"grad_norm": 0.2339864820241928,
"learning_rate": 9.436407451215356e-05,
"loss": 0.0388,
"step": 5070
},
{
"epoch": 4.870565675934803,
"grad_norm": 0.3039968013763428,
"learning_rate": 9.433470682418061e-05,
"loss": 0.0466,
"step": 5080
},
{
"epoch": 4.8801534036433365,
"grad_norm": 0.29253584146499634,
"learning_rate": 9.430526741466519e-05,
"loss": 0.0421,
"step": 5090
},
{
"epoch": 4.88974113135187,
"grad_norm": 0.1914910078048706,
"learning_rate": 9.427575633123224e-05,
"loss": 0.0476,
"step": 5100
},
{
"epoch": 4.899328859060403,
"grad_norm": 0.2769542932510376,
"learning_rate": 9.424617362162271e-05,
"loss": 0.0498,
"step": 5110
},
{
"epoch": 4.908916586768935,
"grad_norm": 0.4235975444316864,
"learning_rate": 9.421651933369345e-05,
"loss": 0.0479,
"step": 5120
},
{
"epoch": 4.9185043144774685,
"grad_norm": 0.2977217733860016,
"learning_rate": 9.4186793515417e-05,
"loss": 0.0367,
"step": 5130
},
{
"epoch": 4.928092042186002,
"grad_norm": 0.4000433087348938,
"learning_rate": 9.415699621488172e-05,
"loss": 0.0452,
"step": 5140
},
{
"epoch": 4.937679769894535,
"grad_norm": 0.3901826739311218,
"learning_rate": 9.412712748029157e-05,
"loss": 0.0431,
"step": 5150
},
{
"epoch": 4.947267497603068,
"grad_norm": 0.45422032475471497,
"learning_rate": 9.409718735996605e-05,
"loss": 0.0419,
"step": 5160
},
{
"epoch": 4.956855225311601,
"grad_norm": 0.29559481143951416,
"learning_rate": 9.406717590234016e-05,
"loss": 0.0404,
"step": 5170
},
{
"epoch": 4.966442953020135,
"grad_norm": 0.39736929535865784,
"learning_rate": 9.403709315596431e-05,
"loss": 0.0409,
"step": 5180
},
{
"epoch": 4.976030680728667,
"grad_norm": 0.37043488025665283,
"learning_rate": 9.400693916950427e-05,
"loss": 0.0494,
"step": 5190
},
{
"epoch": 4.9856184084372,
"grad_norm": 0.35436293482780457,
"learning_rate": 9.397671399174096e-05,
"loss": 0.0505,
"step": 5200
},
{
"epoch": 4.995206136145733,
"grad_norm": 0.24993938207626343,
"learning_rate": 9.394641767157056e-05,
"loss": 0.0491,
"step": 5210
},
{
"epoch": 5.004793863854267,
"grad_norm": 0.3652108609676361,
"learning_rate": 9.391605025800431e-05,
"loss": 0.0474,
"step": 5220
},
{
"epoch": 5.0143815915628,
"grad_norm": 0.3362497389316559,
"learning_rate": 9.388561180016844e-05,
"loss": 0.0481,
"step": 5230
},
{
"epoch": 5.023969319271333,
"grad_norm": 0.25596174597740173,
"learning_rate": 9.385510234730415e-05,
"loss": 0.0475,
"step": 5240
},
{
"epoch": 5.033557046979865,
"grad_norm": 0.4541703760623932,
"learning_rate": 9.382452194876743e-05,
"loss": 0.0448,
"step": 5250
},
{
"epoch": 5.043144774688399,
"grad_norm": 0.5844725966453552,
"learning_rate": 9.379387065402911e-05,
"loss": 0.0531,
"step": 5260
},
{
"epoch": 5.052732502396932,
"grad_norm": 0.5136455297470093,
"learning_rate": 9.376314851267468e-05,
"loss": 0.0478,
"step": 5270
},
{
"epoch": 5.062320230105465,
"grad_norm": 0.36073240637779236,
"learning_rate": 9.373235557440423e-05,
"loss": 0.0413,
"step": 5280
},
{
"epoch": 5.071907957813998,
"grad_norm": 0.3564154803752899,
"learning_rate": 9.370149188903238e-05,
"loss": 0.0474,
"step": 5290
},
{
"epoch": 5.0814956855225315,
"grad_norm": 0.269563764333725,
"learning_rate": 9.367055750648823e-05,
"loss": 0.05,
"step": 5300
},
{
"epoch": 5.091083413231064,
"grad_norm": 0.32311663031578064,
"learning_rate": 9.363955247681522e-05,
"loss": 0.0443,
"step": 5310
},
{
"epoch": 5.100671140939597,
"grad_norm": 0.2627108097076416,
"learning_rate": 9.360847685017109e-05,
"loss": 0.0442,
"step": 5320
},
{
"epoch": 5.11025886864813,
"grad_norm": 0.34790635108947754,
"learning_rate": 9.357733067682777e-05,
"loss": 0.0441,
"step": 5330
},
{
"epoch": 5.1198465963566635,
"grad_norm": 0.22408638894557953,
"learning_rate": 9.354611400717135e-05,
"loss": 0.0415,
"step": 5340
},
{
"epoch": 5.129434324065197,
"grad_norm": 0.3347373306751251,
"learning_rate": 9.351482689170193e-05,
"loss": 0.0427,
"step": 5350
},
{
"epoch": 5.13902205177373,
"grad_norm": 0.30321311950683594,
"learning_rate": 9.348346938103359e-05,
"loss": 0.0434,
"step": 5360
},
{
"epoch": 5.148609779482262,
"grad_norm": 0.2402300387620926,
"learning_rate": 9.345204152589428e-05,
"loss": 0.0475,
"step": 5370
},
{
"epoch": 5.1581975071907955,
"grad_norm": 0.5249261856079102,
"learning_rate": 9.342054337712576e-05,
"loss": 0.0486,
"step": 5380
},
{
"epoch": 5.167785234899329,
"grad_norm": 0.3607705533504486,
"learning_rate": 9.338897498568349e-05,
"loss": 0.0417,
"step": 5390
},
{
"epoch": 5.177372962607862,
"grad_norm": 0.38747304677963257,
"learning_rate": 9.33573364026366e-05,
"loss": 0.0477,
"step": 5400
},
{
"epoch": 5.186960690316395,
"grad_norm": 0.36637309193611145,
"learning_rate": 9.332562767916771e-05,
"loss": 0.044,
"step": 5410
},
{
"epoch": 5.196548418024928,
"grad_norm": 0.31087052822113037,
"learning_rate": 9.329384886657296e-05,
"loss": 0.0373,
"step": 5420
},
{
"epoch": 5.206136145733462,
"grad_norm": 0.3998284935951233,
"learning_rate": 9.326200001626184e-05,
"loss": 0.036,
"step": 5430
},
{
"epoch": 5.215723873441994,
"grad_norm": 0.3035097122192383,
"learning_rate": 9.323008117975718e-05,
"loss": 0.0429,
"step": 5440
},
{
"epoch": 5.225311601150527,
"grad_norm": 0.3162848949432373,
"learning_rate": 9.319809240869502e-05,
"loss": 0.0524,
"step": 5450
},
{
"epoch": 5.23489932885906,
"grad_norm": 0.3142375946044922,
"learning_rate": 9.316603375482449e-05,
"loss": 0.0479,
"step": 5460
},
{
"epoch": 5.244487056567594,
"grad_norm": 0.3951794505119324,
"learning_rate": 9.313390527000783e-05,
"loss": 0.044,
"step": 5470
},
{
"epoch": 5.254074784276127,
"grad_norm": 0.26764142513275146,
"learning_rate": 9.310170700622021e-05,
"loss": 0.0409,
"step": 5480
},
{
"epoch": 5.263662511984659,
"grad_norm": 0.3293421268463135,
"learning_rate": 9.306943901554972e-05,
"loss": 0.0413,
"step": 5490
},
{
"epoch": 5.273250239693192,
"grad_norm": 0.39588311314582825,
"learning_rate": 9.30371013501972e-05,
"loss": 0.0475,
"step": 5500
},
{
"epoch": 5.282837967401726,
"grad_norm": 0.330180287361145,
"learning_rate": 9.300469406247621e-05,
"loss": 0.0409,
"step": 5510
},
{
"epoch": 5.292425695110259,
"grad_norm": 0.37915417551994324,
"learning_rate": 9.297221720481302e-05,
"loss": 0.0413,
"step": 5520
},
{
"epoch": 5.302013422818792,
"grad_norm": 0.3579411208629608,
"learning_rate": 9.293967082974632e-05,
"loss": 0.0476,
"step": 5530
},
{
"epoch": 5.311601150527325,
"grad_norm": 0.30744969844818115,
"learning_rate": 9.29070549899274e-05,
"loss": 0.0454,
"step": 5540
},
{
"epoch": 5.3211888782358585,
"grad_norm": 0.31515830755233765,
"learning_rate": 9.287436973811978e-05,
"loss": 0.0343,
"step": 5550
},
{
"epoch": 5.330776605944391,
"grad_norm": 0.26603803038597107,
"learning_rate": 9.284161512719938e-05,
"loss": 0.0402,
"step": 5560
},
{
"epoch": 5.340364333652924,
"grad_norm": 0.39798933267593384,
"learning_rate": 9.280879121015428e-05,
"loss": 0.0339,
"step": 5570
},
{
"epoch": 5.349952061361457,
"grad_norm": 0.35744068026542664,
"learning_rate": 9.277589804008467e-05,
"loss": 0.0442,
"step": 5580
},
{
"epoch": 5.3595397890699905,
"grad_norm": 0.426455557346344,
"learning_rate": 9.27429356702028e-05,
"loss": 0.0515,
"step": 5590
},
{
"epoch": 5.369127516778524,
"grad_norm": 0.5735211372375488,
"learning_rate": 9.270990415383285e-05,
"loss": 0.0447,
"step": 5600
},
{
"epoch": 5.378715244487057,
"grad_norm": 0.40888845920562744,
"learning_rate": 9.267680354441087e-05,
"loss": 0.0523,
"step": 5610
},
{
"epoch": 5.388302972195589,
"grad_norm": 0.5509734153747559,
"learning_rate": 9.264363389548465e-05,
"loss": 0.0475,
"step": 5620
},
{
"epoch": 5.3978906999041225,
"grad_norm": 0.4060477912425995,
"learning_rate": 9.261039526071374e-05,
"loss": 0.0456,
"step": 5630
},
{
"epoch": 5.407478427612656,
"grad_norm": 0.3927951157093048,
"learning_rate": 9.257708769386919e-05,
"loss": 0.0588,
"step": 5640
},
{
"epoch": 5.417066155321189,
"grad_norm": 0.2928200662136078,
"learning_rate": 9.254371124883366e-05,
"loss": 0.0412,
"step": 5650
},
{
"epoch": 5.426653883029722,
"grad_norm": 0.37971609830856323,
"learning_rate": 9.251026597960117e-05,
"loss": 0.0473,
"step": 5660
},
{
"epoch": 5.436241610738255,
"grad_norm": 0.3287939131259918,
"learning_rate": 9.247675194027712e-05,
"loss": 0.055,
"step": 5670
},
{
"epoch": 5.445829338446788,
"grad_norm": 0.2745339870452881,
"learning_rate": 9.244316918507813e-05,
"loss": 0.044,
"step": 5680
},
{
"epoch": 5.455417066155321,
"grad_norm": 0.2364960014820099,
"learning_rate": 9.240951776833202e-05,
"loss": 0.045,
"step": 5690
},
{
"epoch": 5.465004793863854,
"grad_norm": 0.5563991069793701,
"learning_rate": 9.237579774447765e-05,
"loss": 0.042,
"step": 5700
},
{
"epoch": 5.474592521572387,
"grad_norm": 0.3112446367740631,
"learning_rate": 9.234200916806486e-05,
"loss": 0.0488,
"step": 5710
},
{
"epoch": 5.484180249280921,
"grad_norm": 0.32364800572395325,
"learning_rate": 9.230815209375446e-05,
"loss": 0.039,
"step": 5720
},
{
"epoch": 5.493767976989454,
"grad_norm": 0.35172006487846375,
"learning_rate": 9.227422657631796e-05,
"loss": 0.0443,
"step": 5730
},
{
"epoch": 5.503355704697986,
"grad_norm": 0.3294823169708252,
"learning_rate": 9.22402326706377e-05,
"loss": 0.0466,
"step": 5740
},
{
"epoch": 5.512943432406519,
"grad_norm": 0.32464146614074707,
"learning_rate": 9.220617043170661e-05,
"loss": 0.0456,
"step": 5750
},
{
"epoch": 5.522531160115053,
"grad_norm": 0.36492130160331726,
"learning_rate": 9.217203991462815e-05,
"loss": 0.041,
"step": 5760
},
{
"epoch": 5.532118887823586,
"grad_norm": 0.35331547260284424,
"learning_rate": 9.213784117461624e-05,
"loss": 0.0377,
"step": 5770
},
{
"epoch": 5.541706615532119,
"grad_norm": 0.2622346580028534,
"learning_rate": 9.210357426699519e-05,
"loss": 0.0422,
"step": 5780
},
{
"epoch": 5.551294343240652,
"grad_norm": 0.4553088843822479,
"learning_rate": 9.206923924719955e-05,
"loss": 0.0467,
"step": 5790
},
{
"epoch": 5.5608820709491855,
"grad_norm": 0.38138529658317566,
"learning_rate": 9.203483617077411e-05,
"loss": 0.0397,
"step": 5800
},
{
"epoch": 5.570469798657718,
"grad_norm": 0.4665132761001587,
"learning_rate": 9.200036509337369e-05,
"loss": 0.0518,
"step": 5810
},
{
"epoch": 5.580057526366251,
"grad_norm": 0.27688702940940857,
"learning_rate": 9.196582607076319e-05,
"loss": 0.0469,
"step": 5820
},
{
"epoch": 5.589645254074784,
"grad_norm": 0.3505072295665741,
"learning_rate": 9.193121915881737e-05,
"loss": 0.0526,
"step": 5830
},
{
"epoch": 5.5992329817833175,
"grad_norm": 0.23712855577468872,
"learning_rate": 9.189654441352082e-05,
"loss": 0.0431,
"step": 5840
},
{
"epoch": 5.608820709491851,
"grad_norm": 0.41854333877563477,
"learning_rate": 9.186180189096791e-05,
"loss": 0.0505,
"step": 5850
},
{
"epoch": 5.618408437200383,
"grad_norm": 0.2426682859659195,
"learning_rate": 9.182699164736264e-05,
"loss": 0.0444,
"step": 5860
},
{
"epoch": 5.627996164908916,
"grad_norm": 0.6301522850990295,
"learning_rate": 9.17921137390185e-05,
"loss": 0.0495,
"step": 5870
},
{
"epoch": 5.6375838926174495,
"grad_norm": 0.3933928608894348,
"learning_rate": 9.175716822235854e-05,
"loss": 0.0524,
"step": 5880
},
{
"epoch": 5.647171620325983,
"grad_norm": 0.4735229015350342,
"learning_rate": 9.17221551539151e-05,
"loss": 0.0416,
"step": 5890
},
{
"epoch": 5.656759348034516,
"grad_norm": 0.23624800145626068,
"learning_rate": 9.168707459032988e-05,
"loss": 0.0436,
"step": 5900
},
{
"epoch": 5.666347075743049,
"grad_norm": 0.35046079754829407,
"learning_rate": 9.165192658835369e-05,
"loss": 0.043,
"step": 5910
},
{
"epoch": 5.675934803451582,
"grad_norm": 0.43765562772750854,
"learning_rate": 9.161671120484649e-05,
"loss": 0.0449,
"step": 5920
},
{
"epoch": 5.685522531160115,
"grad_norm": 0.2839658856391907,
"learning_rate": 9.158142849677723e-05,
"loss": 0.0429,
"step": 5930
},
{
"epoch": 5.695110258868648,
"grad_norm": 0.34485873579978943,
"learning_rate": 9.154607852122376e-05,
"loss": 0.0444,
"step": 5940
},
{
"epoch": 5.704697986577181,
"grad_norm": 1.9406903982162476,
"learning_rate": 9.15106613353728e-05,
"loss": 0.0491,
"step": 5950
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.25078949332237244,
"learning_rate": 9.14751769965197e-05,
"loss": 0.0455,
"step": 5960
},
{
"epoch": 5.723873441994248,
"grad_norm": 0.342736154794693,
"learning_rate": 9.143962556206853e-05,
"loss": 0.0418,
"step": 5970
},
{
"epoch": 5.73346116970278,
"grad_norm": 0.36492887139320374,
"learning_rate": 9.140400708953189e-05,
"loss": 0.0457,
"step": 5980
},
{
"epoch": 5.743048897411313,
"grad_norm": 0.3328196108341217,
"learning_rate": 9.136832163653083e-05,
"loss": 0.0434,
"step": 5990
},
{
"epoch": 5.752636625119846,
"grad_norm": 0.31458353996276855,
"learning_rate": 9.13325692607947e-05,
"loss": 0.0426,
"step": 6000
},
{
"epoch": 5.76222435282838,
"grad_norm": 0.2388927936553955,
"learning_rate": 9.129675002016119e-05,
"loss": 0.0412,
"step": 6010
},
{
"epoch": 5.771812080536913,
"grad_norm": 0.29163026809692383,
"learning_rate": 9.126086397257612e-05,
"loss": 0.0464,
"step": 6020
},
{
"epoch": 5.781399808245446,
"grad_norm": 0.3460707664489746,
"learning_rate": 9.122491117609336e-05,
"loss": 0.0417,
"step": 6030
},
{
"epoch": 5.790987535953979,
"grad_norm": 0.17329041659832,
"learning_rate": 9.118889168887483e-05,
"loss": 0.0394,
"step": 6040
},
{
"epoch": 5.800575263662512,
"grad_norm": 0.2987213730812073,
"learning_rate": 9.11528055691903e-05,
"loss": 0.0399,
"step": 6050
},
{
"epoch": 5.810162991371045,
"grad_norm": 0.4310978353023529,
"learning_rate": 9.111665287541733e-05,
"loss": 0.0387,
"step": 6060
},
{
"epoch": 5.819750719079578,
"grad_norm": 0.3461402952671051,
"learning_rate": 9.108043366604115e-05,
"loss": 0.0388,
"step": 6070
},
{
"epoch": 5.829338446788111,
"grad_norm": 0.3460417091846466,
"learning_rate": 9.104414799965468e-05,
"loss": 0.0458,
"step": 6080
},
{
"epoch": 5.8389261744966445,
"grad_norm": 0.26389792561531067,
"learning_rate": 9.100779593495825e-05,
"loss": 0.0416,
"step": 6090
},
{
"epoch": 5.848513902205178,
"grad_norm": 0.39147645235061646,
"learning_rate": 9.097137753075966e-05,
"loss": 0.0392,
"step": 6100
},
{
"epoch": 5.85810162991371,
"grad_norm": 0.4331185221672058,
"learning_rate": 9.093489284597404e-05,
"loss": 0.0388,
"step": 6110
},
{
"epoch": 5.867689357622243,
"grad_norm": 0.1933136284351349,
"learning_rate": 9.089834193962372e-05,
"loss": 0.0408,
"step": 6120
},
{
"epoch": 5.8772770853307765,
"grad_norm": 0.29839614033699036,
"learning_rate": 9.086172487083815e-05,
"loss": 0.0374,
"step": 6130
},
{
"epoch": 5.88686481303931,
"grad_norm": 0.23067611455917358,
"learning_rate": 9.082504169885381e-05,
"loss": 0.044,
"step": 6140
},
{
"epoch": 5.896452540747843,
"grad_norm": 0.2817287743091583,
"learning_rate": 9.078829248301417e-05,
"loss": 0.036,
"step": 6150
},
{
"epoch": 5.906040268456376,
"grad_norm": 0.2695087790489197,
"learning_rate": 9.07514772827695e-05,
"loss": 0.0371,
"step": 6160
},
{
"epoch": 5.9156279961649085,
"grad_norm": 0.5207583904266357,
"learning_rate": 9.071459615767679e-05,
"loss": 0.0406,
"step": 6170
},
{
"epoch": 5.925215723873442,
"grad_norm": 0.3283056914806366,
"learning_rate": 9.067764916739971e-05,
"loss": 0.0421,
"step": 6180
},
{
"epoch": 5.934803451581975,
"grad_norm": 0.4326401352882385,
"learning_rate": 9.06406363717085e-05,
"loss": 0.0397,
"step": 6190
},
{
"epoch": 5.944391179290508,
"grad_norm": 0.3044590651988983,
"learning_rate": 9.060355783047982e-05,
"loss": 0.045,
"step": 6200
},
{
"epoch": 5.953978906999041,
"grad_norm": 0.2913448214530945,
"learning_rate": 9.056641360369672e-05,
"loss": 0.0364,
"step": 6210
},
{
"epoch": 5.963566634707575,
"grad_norm": 0.4203062057495117,
"learning_rate": 9.052920375144847e-05,
"loss": 0.0544,
"step": 6220
},
{
"epoch": 5.973154362416107,
"grad_norm": 0.34060561656951904,
"learning_rate": 9.049192833393055e-05,
"loss": 0.0428,
"step": 6230
},
{
"epoch": 5.98274209012464,
"grad_norm": 0.28594672679901123,
"learning_rate": 9.045458741144446e-05,
"loss": 0.0497,
"step": 6240
},
{
"epoch": 5.992329817833173,
"grad_norm": 0.4794290065765381,
"learning_rate": 9.041718104439772e-05,
"loss": 0.0462,
"step": 6250
},
{
"epoch": 6.001917545541707,
"grad_norm": 0.47997909784317017,
"learning_rate": 9.037970929330368e-05,
"loss": 0.0489,
"step": 6260
},
{
"epoch": 6.01150527325024,
"grad_norm": 0.37695473432540894,
"learning_rate": 9.03421722187815e-05,
"loss": 0.0409,
"step": 6270
},
{
"epoch": 6.021093000958773,
"grad_norm": 0.4723213315010071,
"learning_rate": 9.030456988155596e-05,
"loss": 0.0458,
"step": 6280
},
{
"epoch": 6.030680728667305,
"grad_norm": 0.4923066794872284,
"learning_rate": 9.026690234245749e-05,
"loss": 0.048,
"step": 6290
},
{
"epoch": 6.040268456375839,
"grad_norm": 0.3513863980770111,
"learning_rate": 9.022916966242192e-05,
"loss": 0.0414,
"step": 6300
},
{
"epoch": 6.049856184084372,
"grad_norm": 0.34284889698028564,
"learning_rate": 9.019137190249055e-05,
"loss": 0.0435,
"step": 6310
},
{
"epoch": 6.059443911792905,
"grad_norm": 0.33619949221611023,
"learning_rate": 9.015350912380989e-05,
"loss": 0.0428,
"step": 6320
},
{
"epoch": 6.069031639501438,
"grad_norm": 0.5763192176818848,
"learning_rate": 9.011558138763165e-05,
"loss": 0.0386,
"step": 6330
},
{
"epoch": 6.0786193672099715,
"grad_norm": 0.5095228552818298,
"learning_rate": 9.007758875531264e-05,
"loss": 0.041,
"step": 6340
},
{
"epoch": 6.088207094918504,
"grad_norm": 0.3965105414390564,
"learning_rate": 9.003953128831464e-05,
"loss": 0.0409,
"step": 6350
},
{
"epoch": 6.097794822627037,
"grad_norm": 0.3434533178806305,
"learning_rate": 9.000140904820432e-05,
"loss": 0.0393,
"step": 6360
},
{
"epoch": 6.10738255033557,
"grad_norm": 0.2840021252632141,
"learning_rate": 8.996322209665313e-05,
"loss": 0.0476,
"step": 6370
},
{
"epoch": 6.1169702780441035,
"grad_norm": 0.3020944893360138,
"learning_rate": 8.992497049543722e-05,
"loss": 0.042,
"step": 6380
},
{
"epoch": 6.126558005752637,
"grad_norm": 0.36320140957832336,
"learning_rate": 8.988665430643732e-05,
"loss": 0.0376,
"step": 6390
},
{
"epoch": 6.13614573346117,
"grad_norm": 0.34080708026885986,
"learning_rate": 8.984827359163863e-05,
"loss": 0.0428,
"step": 6400
},
{
"epoch": 6.145733461169703,
"grad_norm": 0.28345417976379395,
"learning_rate": 8.980982841313074e-05,
"loss": 0.041,
"step": 6410
},
{
"epoch": 6.1553211888782355,
"grad_norm": 0.37377986311912537,
"learning_rate": 8.977131883310757e-05,
"loss": 0.0429,
"step": 6420
},
{
"epoch": 6.164908916586769,
"grad_norm": 0.30983594059944153,
"learning_rate": 8.973274491386712e-05,
"loss": 0.0406,
"step": 6430
},
{
"epoch": 6.174496644295302,
"grad_norm": 0.32864126563072205,
"learning_rate": 8.96941067178116e-05,
"loss": 0.0414,
"step": 6440
},
{
"epoch": 6.184084372003835,
"grad_norm": 0.28770530223846436,
"learning_rate": 8.965540430744712e-05,
"loss": 0.0371,
"step": 6450
},
{
"epoch": 6.193672099712368,
"grad_norm": 0.36449259519577026,
"learning_rate": 8.961663774538367e-05,
"loss": 0.0337,
"step": 6460
},
{
"epoch": 6.203259827420902,
"grad_norm": 0.3418562412261963,
"learning_rate": 8.957780709433509e-05,
"loss": 0.0417,
"step": 6470
},
{
"epoch": 6.212847555129434,
"grad_norm": 0.2991498112678528,
"learning_rate": 8.95389124171188e-05,
"loss": 0.0402,
"step": 6480
},
{
"epoch": 6.222435282837967,
"grad_norm": 1.1912015676498413,
"learning_rate": 8.94999537766559e-05,
"loss": 0.045,
"step": 6490
},
{
"epoch": 6.2320230105465,
"grad_norm": 0.6341769695281982,
"learning_rate": 8.946093123597088e-05,
"loss": 0.0531,
"step": 6500
},
{
"epoch": 6.241610738255034,
"grad_norm": 0.34102702140808105,
"learning_rate": 8.942184485819162e-05,
"loss": 0.0432,
"step": 6510
},
{
"epoch": 6.251198465963567,
"grad_norm": 0.4004610776901245,
"learning_rate": 8.938269470654936e-05,
"loss": 0.0476,
"step": 6520
},
{
"epoch": 6.2607861936721,
"grad_norm": 0.4373878240585327,
"learning_rate": 8.934348084437835e-05,
"loss": 0.0428,
"step": 6530
},
{
"epoch": 6.270373921380632,
"grad_norm": 0.36895817518234253,
"learning_rate": 8.930420333511606e-05,
"loss": 0.0503,
"step": 6540
},
{
"epoch": 6.279961649089166,
"grad_norm": 0.4267611503601074,
"learning_rate": 8.926486224230282e-05,
"loss": 0.0429,
"step": 6550
},
{
"epoch": 6.289549376797699,
"grad_norm": 0.4211304485797882,
"learning_rate": 8.922545762958188e-05,
"loss": 0.0428,
"step": 6560
},
{
"epoch": 6.299137104506232,
"grad_norm": 0.41338953375816345,
"learning_rate": 8.918598956069919e-05,
"loss": 0.047,
"step": 6570
},
{
"epoch": 6.308724832214765,
"grad_norm": 0.35418424010276794,
"learning_rate": 8.914645809950344e-05,
"loss": 0.0508,
"step": 6580
},
{
"epoch": 6.3183125599232985,
"grad_norm": 0.5311810374259949,
"learning_rate": 8.91068633099458e-05,
"loss": 0.0469,
"step": 6590
},
{
"epoch": 6.327900287631831,
"grad_norm": 0.2737090587615967,
"learning_rate": 8.90672052560799e-05,
"loss": 0.0438,
"step": 6600
},
{
"epoch": 6.337488015340364,
"grad_norm": 0.2861912250518799,
"learning_rate": 8.902748400206174e-05,
"loss": 0.0498,
"step": 6610
},
{
"epoch": 6.347075743048897,
"grad_norm": 0.37531477212905884,
"learning_rate": 8.898769961214952e-05,
"loss": 0.0456,
"step": 6620
},
{
"epoch": 6.3566634707574305,
"grad_norm": 0.34681612253189087,
"learning_rate": 8.894785215070365e-05,
"loss": 0.0418,
"step": 6630
},
{
"epoch": 6.366251198465964,
"grad_norm": 0.28546613454818726,
"learning_rate": 8.890794168218649e-05,
"loss": 0.0428,
"step": 6640
},
{
"epoch": 6.375838926174497,
"grad_norm": 0.35588616132736206,
"learning_rate": 8.886796827116237e-05,
"loss": 0.0433,
"step": 6650
},
{
"epoch": 6.385426653883029,
"grad_norm": 0.362427294254303,
"learning_rate": 8.882793198229744e-05,
"loss": 0.0421,
"step": 6660
},
{
"epoch": 6.3950143815915625,
"grad_norm": 0.4580886960029602,
"learning_rate": 8.878783288035957e-05,
"loss": 0.034,
"step": 6670
},
{
"epoch": 6.404602109300096,
"grad_norm": 0.37446141242980957,
"learning_rate": 8.874767103021824e-05,
"loss": 0.0485,
"step": 6680
},
{
"epoch": 6.414189837008629,
"grad_norm": 0.2968175411224365,
"learning_rate": 8.870744649684444e-05,
"loss": 0.0354,
"step": 6690
},
{
"epoch": 6.423777564717162,
"grad_norm": 0.3441408574581146,
"learning_rate": 8.866715934531057e-05,
"loss": 0.0427,
"step": 6700
},
{
"epoch": 6.433365292425695,
"grad_norm": 0.3193801939487457,
"learning_rate": 8.862680964079031e-05,
"loss": 0.0377,
"step": 6710
},
{
"epoch": 6.442953020134228,
"grad_norm": 0.3228664696216583,
"learning_rate": 8.858639744855857e-05,
"loss": 0.0451,
"step": 6720
},
{
"epoch": 6.452540747842761,
"grad_norm": 0.4861704111099243,
"learning_rate": 8.85459228339913e-05,
"loss": 0.0426,
"step": 6730
},
{
"epoch": 6.462128475551294,
"grad_norm": 0.281361848115921,
"learning_rate": 8.85053858625655e-05,
"loss": 0.0411,
"step": 6740
},
{
"epoch": 6.471716203259827,
"grad_norm": 0.39643704891204834,
"learning_rate": 8.846478659985895e-05,
"loss": 0.0376,
"step": 6750
},
{
"epoch": 6.481303930968361,
"grad_norm": 0.269710510969162,
"learning_rate": 8.84241251115503e-05,
"loss": 0.0393,
"step": 6760
},
{
"epoch": 6.490891658676894,
"grad_norm": 0.31520572304725647,
"learning_rate": 8.838340146341881e-05,
"loss": 0.0486,
"step": 6770
},
{
"epoch": 6.500479386385427,
"grad_norm": 0.3355605900287628,
"learning_rate": 8.83426157213443e-05,
"loss": 0.045,
"step": 6780
},
{
"epoch": 6.510067114093959,
"grad_norm": 0.2806301712989807,
"learning_rate": 8.830176795130707e-05,
"loss": 0.0447,
"step": 6790
},
{
"epoch": 6.519654841802493,
"grad_norm": 0.27659860253334045,
"learning_rate": 8.82608582193877e-05,
"loss": 0.0426,
"step": 6800
},
{
"epoch": 6.529242569511026,
"grad_norm": 0.2935637831687927,
"learning_rate": 8.82198865917671e-05,
"loss": 0.0454,
"step": 6810
},
{
"epoch": 6.538830297219559,
"grad_norm": 0.3571741580963135,
"learning_rate": 8.817885313472623e-05,
"loss": 0.0454,
"step": 6820
},
{
"epoch": 6.548418024928092,
"grad_norm": 0.3467845022678375,
"learning_rate": 8.813775791464611e-05,
"loss": 0.043,
"step": 6830
},
{
"epoch": 6.558005752636625,
"grad_norm": 0.4052905738353729,
"learning_rate": 8.80966009980077e-05,
"loss": 0.0449,
"step": 6840
},
{
"epoch": 6.567593480345158,
"grad_norm": 0.3361055254936218,
"learning_rate": 8.805538245139169e-05,
"loss": 0.0464,
"step": 6850
},
{
"epoch": 6.577181208053691,
"grad_norm": 0.29235902428627014,
"learning_rate": 8.801410234147855e-05,
"loss": 0.0432,
"step": 6860
},
{
"epoch": 6.586768935762224,
"grad_norm": 0.4435720443725586,
"learning_rate": 8.797276073504832e-05,
"loss": 0.0483,
"step": 6870
},
{
"epoch": 6.5963566634707576,
"grad_norm": 0.36006295680999756,
"learning_rate": 8.793135769898048e-05,
"loss": 0.0389,
"step": 6880
},
{
"epoch": 6.605944391179291,
"grad_norm": 0.30433642864227295,
"learning_rate": 8.788989330025397e-05,
"loss": 0.0455,
"step": 6890
},
{
"epoch": 6.615532118887824,
"grad_norm": 0.2952471673488617,
"learning_rate": 8.784836760594692e-05,
"loss": 0.0373,
"step": 6900
},
{
"epoch": 6.625119846596356,
"grad_norm": 0.42555341124534607,
"learning_rate": 8.780678068323666e-05,
"loss": 0.041,
"step": 6910
},
{
"epoch": 6.6347075743048896,
"grad_norm": 0.3166603744029999,
"learning_rate": 8.776513259939957e-05,
"loss": 0.0441,
"step": 6920
},
{
"epoch": 6.644295302013423,
"grad_norm": 0.5082001686096191,
"learning_rate": 8.772342342181095e-05,
"loss": 0.0501,
"step": 6930
},
{
"epoch": 6.653883029721956,
"grad_norm": 0.32811877131462097,
"learning_rate": 8.768165321794496e-05,
"loss": 0.0449,
"step": 6940
},
{
"epoch": 6.663470757430489,
"grad_norm": 0.39213889837265015,
"learning_rate": 8.763982205537446e-05,
"loss": 0.0497,
"step": 6950
},
{
"epoch": 6.673058485139022,
"grad_norm": 0.33301976323127747,
"learning_rate": 8.759793000177094e-05,
"loss": 0.0466,
"step": 6960
},
{
"epoch": 6.682646212847555,
"grad_norm": 0.33493635058403015,
"learning_rate": 8.755597712490442e-05,
"loss": 0.0485,
"step": 6970
},
{
"epoch": 6.692233940556088,
"grad_norm": 0.43134915828704834,
"learning_rate": 8.751396349264324e-05,
"loss": 0.051,
"step": 6980
},
{
"epoch": 6.701821668264621,
"grad_norm": 0.3931342363357544,
"learning_rate": 8.747188917295409e-05,
"loss": 0.0436,
"step": 6990
},
{
"epoch": 6.7114093959731544,
"grad_norm": 0.3660528063774109,
"learning_rate": 8.742975423390183e-05,
"loss": 0.0393,
"step": 7000
},
{
"epoch": 6.720997123681688,
"grad_norm": 0.33165839314460754,
"learning_rate": 8.738755874364937e-05,
"loss": 0.0366,
"step": 7010
},
{
"epoch": 6.730584851390221,
"grad_norm": 0.3469119966030121,
"learning_rate": 8.734530277045759e-05,
"loss": 0.0378,
"step": 7020
},
{
"epoch": 6.740172579098753,
"grad_norm": 0.27698802947998047,
"learning_rate": 8.730298638268516e-05,
"loss": 0.0362,
"step": 7030
},
{
"epoch": 6.7497603068072864,
"grad_norm": 0.4078359603881836,
"learning_rate": 8.726060964878858e-05,
"loss": 0.046,
"step": 7040
},
{
"epoch": 6.75934803451582,
"grad_norm": 0.34536081552505493,
"learning_rate": 8.721817263732191e-05,
"loss": 0.0537,
"step": 7050
},
{
"epoch": 6.768935762224353,
"grad_norm": 0.7122533917427063,
"learning_rate": 8.717567541693673e-05,
"loss": 0.0466,
"step": 7060
},
{
"epoch": 6.778523489932886,
"grad_norm": 0.24024972319602966,
"learning_rate": 8.7133118056382e-05,
"loss": 0.0492,
"step": 7070
},
{
"epoch": 6.788111217641419,
"grad_norm": 0.41367456316947937,
"learning_rate": 8.709050062450403e-05,
"loss": 0.0424,
"step": 7080
},
{
"epoch": 6.797698945349952,
"grad_norm": 0.35695597529411316,
"learning_rate": 8.70478231902463e-05,
"loss": 0.0425,
"step": 7090
},
{
"epoch": 6.807286673058485,
"grad_norm": 0.38064390420913696,
"learning_rate": 8.700508582264928e-05,
"loss": 0.0488,
"step": 7100
},
{
"epoch": 6.816874400767018,
"grad_norm": 0.3264651894569397,
"learning_rate": 8.696228859085049e-05,
"loss": 0.0429,
"step": 7110
},
{
"epoch": 6.826462128475551,
"grad_norm": 0.36960527300834656,
"learning_rate": 8.691943156408425e-05,
"loss": 0.0465,
"step": 7120
},
{
"epoch": 6.836049856184085,
"grad_norm": 0.34985673427581787,
"learning_rate": 8.687651481168158e-05,
"loss": 0.0389,
"step": 7130
},
{
"epoch": 6.845637583892618,
"grad_norm": 0.308672696352005,
"learning_rate": 8.68335384030702e-05,
"loss": 0.0426,
"step": 7140
},
{
"epoch": 6.855225311601151,
"grad_norm": 0.3914170563220978,
"learning_rate": 8.679050240777427e-05,
"loss": 0.0421,
"step": 7150
},
{
"epoch": 6.864813039309683,
"grad_norm": 0.2807207703590393,
"learning_rate": 8.674740689541439e-05,
"loss": 0.0484,
"step": 7160
},
{
"epoch": 6.874400767018217,
"grad_norm": 0.31063312292099,
"learning_rate": 8.670425193570739e-05,
"loss": 0.0413,
"step": 7170
},
{
"epoch": 6.88398849472675,
"grad_norm": 0.3080969452857971,
"learning_rate": 8.666103759846634e-05,
"loss": 0.0438,
"step": 7180
},
{
"epoch": 6.893576222435283,
"grad_norm": 0.27219802141189575,
"learning_rate": 8.661776395360029e-05,
"loss": 0.045,
"step": 7190
},
{
"epoch": 6.903163950143816,
"grad_norm": 0.44108715653419495,
"learning_rate": 8.65744310711143e-05,
"loss": 0.0431,
"step": 7200
},
{
"epoch": 6.912751677852349,
"grad_norm": 0.34575361013412476,
"learning_rate": 8.653103902110922e-05,
"loss": 0.0419,
"step": 7210
},
{
"epoch": 6.922339405560882,
"grad_norm": 0.29534199833869934,
"learning_rate": 8.648758787378164e-05,
"loss": 0.0392,
"step": 7220
},
{
"epoch": 6.931927133269415,
"grad_norm": 0.3387232720851898,
"learning_rate": 8.644407769942373e-05,
"loss": 0.0354,
"step": 7230
},
{
"epoch": 6.941514860977948,
"grad_norm": 0.27211427688598633,
"learning_rate": 8.640050856842317e-05,
"loss": 0.0401,
"step": 7240
},
{
"epoch": 6.9511025886864815,
"grad_norm": 0.27033731341362,
"learning_rate": 8.635688055126299e-05,
"loss": 0.0389,
"step": 7250
},
{
"epoch": 6.960690316395015,
"grad_norm": 0.3898187279701233,
"learning_rate": 8.631319371852151e-05,
"loss": 0.0393,
"step": 7260
},
{
"epoch": 6.970278044103548,
"grad_norm": 0.2771322727203369,
"learning_rate": 8.626944814087221e-05,
"loss": 0.0463,
"step": 7270
},
{
"epoch": 6.97986577181208,
"grad_norm": 0.28191322088241577,
"learning_rate": 8.622564388908357e-05,
"loss": 0.0443,
"step": 7280
},
{
"epoch": 6.9894534995206135,
"grad_norm": 0.3647807240486145,
"learning_rate": 8.618178103401897e-05,
"loss": 0.044,
"step": 7290
},
{
"epoch": 6.999041227229147,
"grad_norm": 0.2619480490684509,
"learning_rate": 8.613785964663665e-05,
"loss": 0.0422,
"step": 7300
},
{
"epoch": 7.00862895493768,
"grad_norm": 0.2431744933128357,
"learning_rate": 8.609387979798952e-05,
"loss": 0.0458,
"step": 7310
},
{
"epoch": 7.018216682646213,
"grad_norm": 0.31808608770370483,
"learning_rate": 8.604984155922506e-05,
"loss": 0.0391,
"step": 7320
},
{
"epoch": 7.027804410354746,
"grad_norm": 0.41725489497184753,
"learning_rate": 8.600574500158518e-05,
"loss": 0.0395,
"step": 7330
},
{
"epoch": 7.037392138063279,
"grad_norm": 0.23228147625923157,
"learning_rate": 8.596159019640619e-05,
"loss": 0.0415,
"step": 7340
},
{
"epoch": 7.046979865771812,
"grad_norm": 0.25770825147628784,
"learning_rate": 8.59173772151186e-05,
"loss": 0.0428,
"step": 7350
},
{
"epoch": 7.056567593480345,
"grad_norm": 0.2742254436016083,
"learning_rate": 8.587310612924699e-05,
"loss": 0.0456,
"step": 7360
},
{
"epoch": 7.066155321188878,
"grad_norm": 0.34984004497528076,
"learning_rate": 8.582877701041004e-05,
"loss": 0.0304,
"step": 7370
},
{
"epoch": 7.075743048897412,
"grad_norm": 0.34064123034477234,
"learning_rate": 8.578438993032021e-05,
"loss": 0.038,
"step": 7380
},
{
"epoch": 7.085330776605945,
"grad_norm": 0.3359072506427765,
"learning_rate": 8.57399449607838e-05,
"loss": 0.0463,
"step": 7390
},
{
"epoch": 7.094918504314477,
"grad_norm": 0.330243855714798,
"learning_rate": 8.569544217370072e-05,
"loss": 0.0469,
"step": 7400
},
{
"epoch": 7.10450623202301,
"grad_norm": 0.23439550399780273,
"learning_rate": 8.565088164106439e-05,
"loss": 0.0388,
"step": 7410
},
{
"epoch": 7.114093959731544,
"grad_norm": 0.45976459980010986,
"learning_rate": 8.56062634349617e-05,
"loss": 0.0454,
"step": 7420
},
{
"epoch": 7.123681687440077,
"grad_norm": 0.310160368680954,
"learning_rate": 8.556158762757282e-05,
"loss": 0.0401,
"step": 7430
},
{
"epoch": 7.13326941514861,
"grad_norm": 0.4018678665161133,
"learning_rate": 8.551685429117111e-05,
"loss": 0.0512,
"step": 7440
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.3131730556488037,
"learning_rate": 8.547206349812298e-05,
"loss": 0.0421,
"step": 7450
},
{
"epoch": 7.152444870565676,
"grad_norm": 0.30326828360557556,
"learning_rate": 8.542721532088778e-05,
"loss": 0.0461,
"step": 7460
},
{
"epoch": 7.162032598274209,
"grad_norm": 0.3814712166786194,
"learning_rate": 8.538230983201771e-05,
"loss": 0.0446,
"step": 7470
},
{
"epoch": 7.171620325982742,
"grad_norm": 0.33048462867736816,
"learning_rate": 8.533734710415771e-05,
"loss": 0.048,
"step": 7480
},
{
"epoch": 7.181208053691275,
"grad_norm": 0.2931906580924988,
"learning_rate": 8.529232721004527e-05,
"loss": 0.0405,
"step": 7490
},
{
"epoch": 7.1907957813998085,
"grad_norm": 0.3595677614212036,
"learning_rate": 8.524725022251039e-05,
"loss": 0.0404,
"step": 7500
},
{
"epoch": 7.200383509108342,
"grad_norm": 0.37149250507354736,
"learning_rate": 8.520211621447541e-05,
"loss": 0.0382,
"step": 7510
},
{
"epoch": 7.209971236816874,
"grad_norm": 0.2645772099494934,
"learning_rate": 8.515692525895494e-05,
"loss": 0.0438,
"step": 7520
},
{
"epoch": 7.219558964525407,
"grad_norm": 0.3602275848388672,
"learning_rate": 8.511167742905569e-05,
"loss": 0.0421,
"step": 7530
},
{
"epoch": 7.2291466922339405,
"grad_norm": 0.27108579874038696,
"learning_rate": 8.506637279797638e-05,
"loss": 0.0406,
"step": 7540
},
{
"epoch": 7.238734419942474,
"grad_norm": 0.329333633184433,
"learning_rate": 8.502101143900764e-05,
"loss": 0.0357,
"step": 7550
},
{
"epoch": 7.248322147651007,
"grad_norm": 0.2549634873867035,
"learning_rate": 8.497559342553185e-05,
"loss": 0.0354,
"step": 7560
},
{
"epoch": 7.25790987535954,
"grad_norm": 0.3205493092536926,
"learning_rate": 8.493011883102307e-05,
"loss": 0.0373,
"step": 7570
},
{
"epoch": 7.2674976030680725,
"grad_norm": 0.2169693112373352,
"learning_rate": 8.488458772904684e-05,
"loss": 0.0394,
"step": 7580
},
{
"epoch": 7.277085330776606,
"grad_norm": 0.37165510654449463,
"learning_rate": 8.483900019326017e-05,
"loss": 0.0381,
"step": 7590
},
{
"epoch": 7.286673058485139,
"grad_norm": 0.26651856303215027,
"learning_rate": 8.479335629741133e-05,
"loss": 0.0422,
"step": 7600
},
{
"epoch": 7.296260786193672,
"grad_norm": 0.32148563861846924,
"learning_rate": 8.474765611533977e-05,
"loss": 0.0357,
"step": 7610
},
{
"epoch": 7.305848513902205,
"grad_norm": 0.26410454511642456,
"learning_rate": 8.470189972097601e-05,
"loss": 0.0362,
"step": 7620
},
{
"epoch": 7.315436241610739,
"grad_norm": 0.43451759219169617,
"learning_rate": 8.465608718834152e-05,
"loss": 0.0394,
"step": 7630
},
{
"epoch": 7.325023969319272,
"grad_norm": 0.39956948161125183,
"learning_rate": 8.461021859154851e-05,
"loss": 0.0467,
"step": 7640
},
{
"epoch": 7.334611697027804,
"grad_norm": 0.36985108256340027,
"learning_rate": 8.45642940048e-05,
"loss": 0.0426,
"step": 7650
},
{
"epoch": 7.344199424736337,
"grad_norm": 0.27028191089630127,
"learning_rate": 8.451831350238947e-05,
"loss": 0.0404,
"step": 7660
},
{
"epoch": 7.353787152444871,
"grad_norm": 0.3216499388217926,
"learning_rate": 8.447227715870097e-05,
"loss": 0.0389,
"step": 7670
},
{
"epoch": 7.363374880153404,
"grad_norm": 0.2922750413417816,
"learning_rate": 8.442618504820878e-05,
"loss": 0.0416,
"step": 7680
},
{
"epoch": 7.372962607861937,
"grad_norm": 0.32347607612609863,
"learning_rate": 8.438003724547747e-05,
"loss": 0.0371,
"step": 7690
},
{
"epoch": 7.382550335570469,
"grad_norm": 0.37498921155929565,
"learning_rate": 8.433383382516169e-05,
"loss": 0.0388,
"step": 7700
},
{
"epoch": 7.392138063279003,
"grad_norm": 0.41235196590423584,
"learning_rate": 8.428757486200603e-05,
"loss": 0.0382,
"step": 7710
},
{
"epoch": 7.401725790987536,
"grad_norm": 0.32482102513313293,
"learning_rate": 8.424126043084499e-05,
"loss": 0.0397,
"step": 7720
},
{
"epoch": 7.411313518696069,
"grad_norm": 0.3329836130142212,
"learning_rate": 8.419489060660272e-05,
"loss": 0.0381,
"step": 7730
},
{
"epoch": 7.420901246404602,
"grad_norm": 0.28950804471969604,
"learning_rate": 8.41484654642931e-05,
"loss": 0.037,
"step": 7740
},
{
"epoch": 7.4304889741131355,
"grad_norm": 0.43603238463401794,
"learning_rate": 8.410198507901936e-05,
"loss": 0.0465,
"step": 7750
},
{
"epoch": 7.440076701821669,
"grad_norm": 0.3902181386947632,
"learning_rate": 8.405544952597422e-05,
"loss": 0.0423,
"step": 7760
},
{
"epoch": 7.449664429530201,
"grad_norm": 0.4409140348434448,
"learning_rate": 8.400885888043956e-05,
"loss": 0.0384,
"step": 7770
},
{
"epoch": 7.459252157238734,
"grad_norm": 0.33337706327438354,
"learning_rate": 8.396221321778645e-05,
"loss": 0.0407,
"step": 7780
},
{
"epoch": 7.4688398849472675,
"grad_norm": 0.29487982392311096,
"learning_rate": 8.391551261347493e-05,
"loss": 0.0407,
"step": 7790
},
{
"epoch": 7.478427612655801,
"grad_norm": 0.2853257954120636,
"learning_rate": 8.38687571430539e-05,
"loss": 0.0412,
"step": 7800
},
{
"epoch": 7.488015340364334,
"grad_norm": 0.24586626887321472,
"learning_rate": 8.382194688216105e-05,
"loss": 0.0453,
"step": 7810
},
{
"epoch": 7.497603068072867,
"grad_norm": 0.24528749287128448,
"learning_rate": 8.377508190652272e-05,
"loss": 0.0435,
"step": 7820
},
{
"epoch": 7.5071907957813995,
"grad_norm": 0.21899107098579407,
"learning_rate": 8.37281622919537e-05,
"loss": 0.0516,
"step": 7830
},
{
"epoch": 7.516778523489933,
"grad_norm": 0.5243720412254333,
"learning_rate": 8.368118811435726e-05,
"loss": 0.0373,
"step": 7840
},
{
"epoch": 7.526366251198466,
"grad_norm": 0.24362969398498535,
"learning_rate": 8.363415944972487e-05,
"loss": 0.0452,
"step": 7850
},
{
"epoch": 7.535953978906999,
"grad_norm": 0.3614483177661896,
"learning_rate": 8.358707637413615e-05,
"loss": 0.0343,
"step": 7860
},
{
"epoch": 7.545541706615532,
"grad_norm": 0.3958549201488495,
"learning_rate": 8.353993896375878e-05,
"loss": 0.0454,
"step": 7870
},
{
"epoch": 7.555129434324066,
"grad_norm": 0.3544330596923828,
"learning_rate": 8.349274729484832e-05,
"loss": 0.0434,
"step": 7880
},
{
"epoch": 7.564717162032598,
"grad_norm": 0.3171081244945526,
"learning_rate": 8.344550144374808e-05,
"loss": 0.0423,
"step": 7890
},
{
"epoch": 7.574304889741131,
"grad_norm": 0.3729722797870636,
"learning_rate": 8.339820148688907e-05,
"loss": 0.0407,
"step": 7900
},
{
"epoch": 7.583892617449664,
"grad_norm": 0.3339761197566986,
"learning_rate": 8.335084750078978e-05,
"loss": 0.0452,
"step": 7910
},
{
"epoch": 7.593480345158198,
"grad_norm": 0.20363827049732208,
"learning_rate": 8.330343956205615e-05,
"loss": 0.0387,
"step": 7920
},
{
"epoch": 7.603068072866731,
"grad_norm": 0.3942667245864868,
"learning_rate": 8.325597774738137e-05,
"loss": 0.0407,
"step": 7930
},
{
"epoch": 7.612655800575264,
"grad_norm": 0.234974667429924,
"learning_rate": 8.32084621335458e-05,
"loss": 0.0417,
"step": 7940
},
{
"epoch": 7.622243528283796,
"grad_norm": 0.4611276388168335,
"learning_rate": 8.316089279741682e-05,
"loss": 0.0455,
"step": 7950
},
{
"epoch": 7.63183125599233,
"grad_norm": 0.31897857785224915,
"learning_rate": 8.311326981594872e-05,
"loss": 0.0489,
"step": 7960
},
{
"epoch": 7.641418983700863,
"grad_norm": 0.34105560183525085,
"learning_rate": 8.306559326618259e-05,
"loss": 0.0441,
"step": 7970
},
{
"epoch": 7.651006711409396,
"grad_norm": 0.35638663172721863,
"learning_rate": 8.301786322524619e-05,
"loss": 0.0443,
"step": 7980
},
{
"epoch": 7.660594439117929,
"grad_norm": 0.4538173973560333,
"learning_rate": 8.297007977035376e-05,
"loss": 0.0414,
"step": 7990
},
{
"epoch": 7.6701821668264625,
"grad_norm": 0.37664180994033813,
"learning_rate": 8.292224297880598e-05,
"loss": 0.0453,
"step": 8000
},
{
"epoch": 7.679769894534996,
"grad_norm": 0.2357359379529953,
"learning_rate": 8.287435292798984e-05,
"loss": 0.0424,
"step": 8010
},
{
"epoch": 7.689357622243528,
"grad_norm": 0.32804933190345764,
"learning_rate": 8.282640969537848e-05,
"loss": 0.0381,
"step": 8020
},
{
"epoch": 7.698945349952061,
"grad_norm": 0.45805230736732483,
"learning_rate": 8.277841335853101e-05,
"loss": 0.0346,
"step": 8030
},
{
"epoch": 7.7085330776605945,
"grad_norm": 0.2550659775733948,
"learning_rate": 8.273036399509253e-05,
"loss": 0.044,
"step": 8040
},
{
"epoch": 7.718120805369128,
"grad_norm": 0.3587624728679657,
"learning_rate": 8.268226168279389e-05,
"loss": 0.0396,
"step": 8050
},
{
"epoch": 7.727708533077661,
"grad_norm": 0.41126248240470886,
"learning_rate": 8.263410649945159e-05,
"loss": 0.0438,
"step": 8060
},
{
"epoch": 7.737296260786193,
"grad_norm": 0.542373538017273,
"learning_rate": 8.258589852296765e-05,
"loss": 0.0473,
"step": 8070
},
{
"epoch": 7.7468839884947265,
"grad_norm": 0.44072815775871277,
"learning_rate": 8.253763783132955e-05,
"loss": 0.0413,
"step": 8080
},
{
"epoch": 7.75647171620326,
"grad_norm": 0.3905545771121979,
"learning_rate": 8.248932450261e-05,
"loss": 0.0394,
"step": 8090
},
{
"epoch": 7.766059443911793,
"grad_norm": 0.3717019855976105,
"learning_rate": 8.244095861496686e-05,
"loss": 0.0391,
"step": 8100
},
{
"epoch": 7.775647171620326,
"grad_norm": 0.28803032636642456,
"learning_rate": 8.239254024664304e-05,
"loss": 0.0398,
"step": 8110
},
{
"epoch": 7.785234899328859,
"grad_norm": 0.26609280705451965,
"learning_rate": 8.234406947596633e-05,
"loss": 0.0399,
"step": 8120
},
{
"epoch": 7.794822627037393,
"grad_norm": 0.28858163952827454,
"learning_rate": 8.229554638134933e-05,
"loss": 0.0327,
"step": 8130
},
{
"epoch": 7.804410354745925,
"grad_norm": 0.3346012830734253,
"learning_rate": 8.224697104128925e-05,
"loss": 0.0372,
"step": 8140
},
{
"epoch": 7.813998082454458,
"grad_norm": 0.3210478723049164,
"learning_rate": 8.219834353436781e-05,
"loss": 0.0424,
"step": 8150
},
{
"epoch": 7.823585810162991,
"grad_norm": 0.2401236593723297,
"learning_rate": 8.214966393925115e-05,
"loss": 0.0366,
"step": 8160
},
{
"epoch": 7.833173537871525,
"grad_norm": 0.29601314663887024,
"learning_rate": 8.210093233468968e-05,
"loss": 0.0416,
"step": 8170
},
{
"epoch": 7.842761265580058,
"grad_norm": 0.29966652393341064,
"learning_rate": 8.20521487995179e-05,
"loss": 0.0349,
"step": 8180
},
{
"epoch": 7.85234899328859,
"grad_norm": 0.3385706841945648,
"learning_rate": 8.200331341265436e-05,
"loss": 0.0421,
"step": 8190
},
{
"epoch": 7.861936720997123,
"grad_norm": 0.4073570966720581,
"learning_rate": 8.19544262531015e-05,
"loss": 0.0416,
"step": 8200
},
{
"epoch": 7.871524448705657,
"grad_norm": 0.30653032660484314,
"learning_rate": 8.19054873999455e-05,
"loss": 0.04,
"step": 8210
},
{
"epoch": 7.88111217641419,
"grad_norm": 0.24951298534870148,
"learning_rate": 8.185649693235614e-05,
"loss": 0.0397,
"step": 8220
},
{
"epoch": 7.890699904122723,
"grad_norm": 0.24890607595443726,
"learning_rate": 8.180745492958674e-05,
"loss": 0.0396,
"step": 8230
},
{
"epoch": 7.900287631831256,
"grad_norm": 0.2634108066558838,
"learning_rate": 8.175836147097396e-05,
"loss": 0.0364,
"step": 8240
},
{
"epoch": 7.9098753595397895,
"grad_norm": 0.29432108998298645,
"learning_rate": 8.170921663593773e-05,
"loss": 0.0353,
"step": 8250
},
{
"epoch": 7.919463087248322,
"grad_norm": 0.3281777799129486,
"learning_rate": 8.166002050398106e-05,
"loss": 0.0429,
"step": 8260
},
{
"epoch": 7.929050814956855,
"grad_norm": 0.24084685742855072,
"learning_rate": 8.161077315468997e-05,
"loss": 0.0454,
"step": 8270
},
{
"epoch": 7.938638542665388,
"grad_norm": 0.290452241897583,
"learning_rate": 8.156147466773332e-05,
"loss": 0.0427,
"step": 8280
},
{
"epoch": 7.9482262703739215,
"grad_norm": 0.3068200945854187,
"learning_rate": 8.15121251228627e-05,
"loss": 0.0416,
"step": 8290
},
{
"epoch": 7.957813998082455,
"grad_norm": 0.5520877242088318,
"learning_rate": 8.146272459991233e-05,
"loss": 0.0369,
"step": 8300
},
{
"epoch": 7.967401725790987,
"grad_norm": 0.268451064825058,
"learning_rate": 8.141327317879884e-05,
"loss": 0.0419,
"step": 8310
},
{
"epoch": 7.97698945349952,
"grad_norm": 0.45414549112319946,
"learning_rate": 8.136377093952123e-05,
"loss": 0.0414,
"step": 8320
},
{
"epoch": 7.9865771812080535,
"grad_norm": 0.2249930500984192,
"learning_rate": 8.131421796216072e-05,
"loss": 0.0389,
"step": 8330
},
{
"epoch": 7.996164908916587,
"grad_norm": 0.28440603613853455,
"learning_rate": 8.126461432688061e-05,
"loss": 0.038,
"step": 8340
},
{
"epoch": 8.00575263662512,
"grad_norm": 0.26801931858062744,
"learning_rate": 8.121496011392613e-05,
"loss": 0.0382,
"step": 8350
},
{
"epoch": 8.015340364333653,
"grad_norm": 0.3116857409477234,
"learning_rate": 8.116525540362434e-05,
"loss": 0.0395,
"step": 8360
},
{
"epoch": 8.024928092042186,
"grad_norm": 0.37847548723220825,
"learning_rate": 8.1115500276384e-05,
"loss": 0.0395,
"step": 8370
},
{
"epoch": 8.03451581975072,
"grad_norm": 0.3358413279056549,
"learning_rate": 8.10656948126954e-05,
"loss": 0.0443,
"step": 8380
},
{
"epoch": 8.044103547459253,
"grad_norm": 0.3593525290489197,
"learning_rate": 8.101583909313033e-05,
"loss": 0.0393,
"step": 8390
},
{
"epoch": 8.053691275167786,
"grad_norm": 0.2807999551296234,
"learning_rate": 8.09659331983418e-05,
"loss": 0.0337,
"step": 8400
},
{
"epoch": 8.063279002876317,
"grad_norm": 0.24256014823913574,
"learning_rate": 8.091597720906403e-05,
"loss": 0.0383,
"step": 8410
},
{
"epoch": 8.07286673058485,
"grad_norm": 0.4359792172908783,
"learning_rate": 8.086597120611228e-05,
"loss": 0.0389,
"step": 8420
},
{
"epoch": 8.082454458293384,
"grad_norm": 0.3423149883747101,
"learning_rate": 8.081591527038271e-05,
"loss": 0.0401,
"step": 8430
},
{
"epoch": 8.092042186001917,
"grad_norm": 0.2202298790216446,
"learning_rate": 8.076580948285227e-05,
"loss": 0.0364,
"step": 8440
},
{
"epoch": 8.10162991371045,
"grad_norm": 0.36670511960983276,
"learning_rate": 8.071565392457852e-05,
"loss": 0.0379,
"step": 8450
},
{
"epoch": 8.111217641418984,
"grad_norm": 0.22374413907527924,
"learning_rate": 8.066544867669961e-05,
"loss": 0.0363,
"step": 8460
},
{
"epoch": 8.120805369127517,
"grad_norm": 0.43999022245407104,
"learning_rate": 8.061519382043399e-05,
"loss": 0.0385,
"step": 8470
},
{
"epoch": 8.13039309683605,
"grad_norm": 0.2890577018260956,
"learning_rate": 8.056488943708041e-05,
"loss": 0.0379,
"step": 8480
},
{
"epoch": 8.139980824544583,
"grad_norm": 0.3366747200489044,
"learning_rate": 8.051453560801772e-05,
"loss": 0.0417,
"step": 8490
},
{
"epoch": 8.149568552253116,
"grad_norm": 0.2634000778198242,
"learning_rate": 8.046413241470478e-05,
"loss": 0.0351,
"step": 8500
},
{
"epoch": 8.15915627996165,
"grad_norm": 0.21788382530212402,
"learning_rate": 8.041367993868031e-05,
"loss": 0.0391,
"step": 8510
},
{
"epoch": 8.168744007670183,
"grad_norm": 0.31453433632850647,
"learning_rate": 8.036317826156275e-05,
"loss": 0.0392,
"step": 8520
},
{
"epoch": 8.178331735378714,
"grad_norm": 0.2942139506340027,
"learning_rate": 8.031262746505012e-05,
"loss": 0.0443,
"step": 8530
},
{
"epoch": 8.187919463087248,
"grad_norm": 0.24110645055770874,
"learning_rate": 8.02620276309199e-05,
"loss": 0.038,
"step": 8540
},
{
"epoch": 8.19750719079578,
"grad_norm": 0.26143452525138855,
"learning_rate": 8.021137884102891e-05,
"loss": 0.0349,
"step": 8550
},
{
"epoch": 8.207094918504314,
"grad_norm": 0.23739804327487946,
"learning_rate": 8.016068117731318e-05,
"loss": 0.0367,
"step": 8560
},
{
"epoch": 8.216682646212847,
"grad_norm": 0.31131234765052795,
"learning_rate": 8.010993472178778e-05,
"loss": 0.0383,
"step": 8570
},
{
"epoch": 8.22627037392138,
"grad_norm": 0.301734060049057,
"learning_rate": 8.005913955654675e-05,
"loss": 0.0402,
"step": 8580
},
{
"epoch": 8.235858101629914,
"grad_norm": 0.2536526620388031,
"learning_rate": 8.000829576376288e-05,
"loss": 0.0324,
"step": 8590
},
{
"epoch": 8.245445829338447,
"grad_norm": 0.398578941822052,
"learning_rate": 7.995740342568767e-05,
"loss": 0.0382,
"step": 8600
},
{
"epoch": 8.25503355704698,
"grad_norm": 0.2876124083995819,
"learning_rate": 7.990646262465112e-05,
"loss": 0.038,
"step": 8610
},
{
"epoch": 8.264621284755513,
"grad_norm": 0.30959025025367737,
"learning_rate": 7.985547344306161e-05,
"loss": 0.0464,
"step": 8620
},
{
"epoch": 8.274209012464047,
"grad_norm": 0.327210396528244,
"learning_rate": 7.980443596340589e-05,
"loss": 0.0426,
"step": 8630
},
{
"epoch": 8.28379674017258,
"grad_norm": 0.23988771438598633,
"learning_rate": 7.975335026824873e-05,
"loss": 0.043,
"step": 8640
},
{
"epoch": 8.293384467881111,
"grad_norm": 0.2276514321565628,
"learning_rate": 7.970221644023293e-05,
"loss": 0.0407,
"step": 8650
},
{
"epoch": 8.302972195589644,
"grad_norm": 0.27630215883255005,
"learning_rate": 7.965103456207919e-05,
"loss": 0.0439,
"step": 8660
},
{
"epoch": 8.312559923298178,
"grad_norm": 0.1922815442085266,
"learning_rate": 7.959980471658592e-05,
"loss": 0.0396,
"step": 8670
},
{
"epoch": 8.322147651006711,
"grad_norm": 0.303406298160553,
"learning_rate": 7.954852698662913e-05,
"loss": 0.0363,
"step": 8680
},
{
"epoch": 8.331735378715244,
"grad_norm": 0.3184201717376709,
"learning_rate": 7.94972014551623e-05,
"loss": 0.0414,
"step": 8690
},
{
"epoch": 8.341323106423777,
"grad_norm": 0.31593239307403564,
"learning_rate": 7.94458282052162e-05,
"loss": 0.0431,
"step": 8700
},
{
"epoch": 8.35091083413231,
"grad_norm": 0.2461700290441513,
"learning_rate": 7.939440731989887e-05,
"loss": 0.0447,
"step": 8710
},
{
"epoch": 8.360498561840844,
"grad_norm": 0.5149932503700256,
"learning_rate": 7.934293888239532e-05,
"loss": 0.0377,
"step": 8720
},
{
"epoch": 8.370086289549377,
"grad_norm": 0.273589164018631,
"learning_rate": 7.929142297596756e-05,
"loss": 0.0436,
"step": 8730
},
{
"epoch": 8.37967401725791,
"grad_norm": 0.37680765986442566,
"learning_rate": 7.92398596839544e-05,
"loss": 0.0351,
"step": 8740
},
{
"epoch": 8.389261744966444,
"grad_norm": 0.3258054256439209,
"learning_rate": 7.918824908977123e-05,
"loss": 0.0387,
"step": 8750
},
{
"epoch": 8.398849472674977,
"grad_norm": 0.36646002531051636,
"learning_rate": 7.913659127691002e-05,
"loss": 0.0388,
"step": 8760
},
{
"epoch": 8.40843720038351,
"grad_norm": 0.31907573342323303,
"learning_rate": 7.908488632893913e-05,
"loss": 0.043,
"step": 8770
},
{
"epoch": 8.418024928092041,
"grad_norm": 0.3218369781970978,
"learning_rate": 7.903313432950313e-05,
"loss": 0.041,
"step": 8780
},
{
"epoch": 8.427612655800575,
"grad_norm": 0.2750600576400757,
"learning_rate": 7.898133536232275e-05,
"loss": 0.0372,
"step": 8790
},
{
"epoch": 8.437200383509108,
"grad_norm": 0.3370470106601715,
"learning_rate": 7.892948951119467e-05,
"loss": 0.0381,
"step": 8800
},
{
"epoch": 8.446788111217641,
"grad_norm": 0.30544212460517883,
"learning_rate": 7.887759685999143e-05,
"loss": 0.0511,
"step": 8810
},
{
"epoch": 8.456375838926174,
"grad_norm": 0.3022957742214203,
"learning_rate": 7.88256574926613e-05,
"loss": 0.0382,
"step": 8820
},
{
"epoch": 8.465963566634708,
"grad_norm": 0.4892277121543884,
"learning_rate": 7.877367149322807e-05,
"loss": 0.0471,
"step": 8830
},
{
"epoch": 8.47555129434324,
"grad_norm": 0.2292528748512268,
"learning_rate": 7.872163894579103e-05,
"loss": 0.0374,
"step": 8840
},
{
"epoch": 8.485139022051774,
"grad_norm": 0.4441846013069153,
"learning_rate": 7.866955993452473e-05,
"loss": 0.0396,
"step": 8850
},
{
"epoch": 8.494726749760307,
"grad_norm": 0.3326236605644226,
"learning_rate": 7.86174345436789e-05,
"loss": 0.0407,
"step": 8860
},
{
"epoch": 8.50431447746884,
"grad_norm": 0.3634801506996155,
"learning_rate": 7.856526285757829e-05,
"loss": 0.0343,
"step": 8870
},
{
"epoch": 8.513902205177374,
"grad_norm": 0.3255830705165863,
"learning_rate": 7.851304496062254e-05,
"loss": 0.0391,
"step": 8880
},
{
"epoch": 8.523489932885907,
"grad_norm": 0.2465457022190094,
"learning_rate": 7.846078093728611e-05,
"loss": 0.0418,
"step": 8890
},
{
"epoch": 8.53307766059444,
"grad_norm": 0.28741371631622314,
"learning_rate": 7.840847087211799e-05,
"loss": 0.0408,
"step": 8900
},
{
"epoch": 8.542665388302972,
"grad_norm": 0.5026047825813293,
"learning_rate": 7.835611484974169e-05,
"loss": 0.0425,
"step": 8910
},
{
"epoch": 8.552253116011505,
"grad_norm": 0.29450881481170654,
"learning_rate": 7.830371295485506e-05,
"loss": 0.0386,
"step": 8920
},
{
"epoch": 8.561840843720038,
"grad_norm": 0.37559008598327637,
"learning_rate": 7.82512652722302e-05,
"loss": 0.0346,
"step": 8930
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.3274129033088684,
"learning_rate": 7.819877188671322e-05,
"loss": 0.0377,
"step": 8940
},
{
"epoch": 8.581016299137104,
"grad_norm": 1.9449902772903442,
"learning_rate": 7.81462328832242e-05,
"loss": 0.0422,
"step": 8950
},
{
"epoch": 8.590604026845638,
"grad_norm": 0.32859793305397034,
"learning_rate": 7.809364834675703e-05,
"loss": 0.0381,
"step": 8960
},
{
"epoch": 8.60019175455417,
"grad_norm": 0.41501474380493164,
"learning_rate": 7.804101836237921e-05,
"loss": 0.0413,
"step": 8970
},
{
"epoch": 8.609779482262704,
"grad_norm": 0.3548615574836731,
"learning_rate": 7.798834301523182e-05,
"loss": 0.0436,
"step": 8980
},
{
"epoch": 8.619367209971237,
"grad_norm": 0.3612217903137207,
"learning_rate": 7.793562239052928e-05,
"loss": 0.0364,
"step": 8990
},
{
"epoch": 8.62895493767977,
"grad_norm": 0.3534400761127472,
"learning_rate": 7.78828565735593e-05,
"loss": 0.0381,
"step": 9000
},
{
"epoch": 8.638542665388304,
"grad_norm": 0.34939974546432495,
"learning_rate": 7.783004564968263e-05,
"loss": 0.0405,
"step": 9010
},
{
"epoch": 8.648130393096835,
"grad_norm": 0.37234190106391907,
"learning_rate": 7.777718970433309e-05,
"loss": 0.0439,
"step": 9020
},
{
"epoch": 8.657718120805368,
"grad_norm": 0.40179571509361267,
"learning_rate": 7.772428882301724e-05,
"loss": 0.0428,
"step": 9030
},
{
"epoch": 8.667305848513902,
"grad_norm": 0.37865087389945984,
"learning_rate": 7.767134309131437e-05,
"loss": 0.0364,
"step": 9040
},
{
"epoch": 8.676893576222435,
"grad_norm": 0.32325470447540283,
"learning_rate": 7.761835259487635e-05,
"loss": 0.0387,
"step": 9050
},
{
"epoch": 8.686481303930968,
"grad_norm": 0.26749640703201294,
"learning_rate": 7.756531741942743e-05,
"loss": 0.048,
"step": 9060
},
{
"epoch": 8.696069031639501,
"grad_norm": 0.381815105676651,
"learning_rate": 7.751223765076418e-05,
"loss": 0.0337,
"step": 9070
},
{
"epoch": 8.705656759348035,
"grad_norm": 0.4329027533531189,
"learning_rate": 7.745911337475524e-05,
"loss": 0.0408,
"step": 9080
},
{
"epoch": 8.715244487056568,
"grad_norm": 0.4740753173828125,
"learning_rate": 7.740594467734131e-05,
"loss": 0.0368,
"step": 9090
},
{
"epoch": 8.724832214765101,
"grad_norm": 0.23423776030540466,
"learning_rate": 7.735273164453494e-05,
"loss": 0.0445,
"step": 9100
},
{
"epoch": 8.734419942473634,
"grad_norm": 0.35593661665916443,
"learning_rate": 7.72994743624204e-05,
"loss": 0.0415,
"step": 9110
},
{
"epoch": 8.744007670182167,
"grad_norm": 0.2637054920196533,
"learning_rate": 7.724617291715355e-05,
"loss": 0.0424,
"step": 9120
},
{
"epoch": 8.7535953978907,
"grad_norm": 0.25044816732406616,
"learning_rate": 7.719282739496167e-05,
"loss": 0.0384,
"step": 9130
},
{
"epoch": 8.763183125599234,
"grad_norm": 0.22907428443431854,
"learning_rate": 7.713943788214337e-05,
"loss": 0.0365,
"step": 9140
},
{
"epoch": 8.772770853307765,
"grad_norm": 0.4074908494949341,
"learning_rate": 7.70860044650684e-05,
"loss": 0.0481,
"step": 9150
},
{
"epoch": 8.782358581016299,
"grad_norm": 0.29292604327201843,
"learning_rate": 7.703252723017757e-05,
"loss": 0.0433,
"step": 9160
},
{
"epoch": 8.791946308724832,
"grad_norm": 0.2879285514354706,
"learning_rate": 7.697900626398255e-05,
"loss": 0.0388,
"step": 9170
},
{
"epoch": 8.801534036433365,
"grad_norm": 0.31987619400024414,
"learning_rate": 7.692544165306574e-05,
"loss": 0.0423,
"step": 9180
},
{
"epoch": 8.811121764141898,
"grad_norm": 0.3260093331336975,
"learning_rate": 7.687183348408018e-05,
"loss": 0.0342,
"step": 9190
},
{
"epoch": 8.820709491850431,
"grad_norm": 0.3373820185661316,
"learning_rate": 7.681818184374938e-05,
"loss": 0.0382,
"step": 9200
},
{
"epoch": 8.830297219558965,
"grad_norm": 0.17047972977161407,
"learning_rate": 7.676448681886715e-05,
"loss": 0.0375,
"step": 9210
},
{
"epoch": 8.839884947267498,
"grad_norm": 0.26559868454933167,
"learning_rate": 7.671074849629746e-05,
"loss": 0.0398,
"step": 9220
},
{
"epoch": 8.849472674976031,
"grad_norm": 0.30938103795051575,
"learning_rate": 7.665696696297439e-05,
"loss": 0.0437,
"step": 9230
},
{
"epoch": 8.859060402684564,
"grad_norm": 0.47756102681159973,
"learning_rate": 7.660314230590187e-05,
"loss": 0.0393,
"step": 9240
},
{
"epoch": 8.868648130393098,
"grad_norm": 0.3115938901901245,
"learning_rate": 7.654927461215362e-05,
"loss": 0.0389,
"step": 9250
},
{
"epoch": 8.87823585810163,
"grad_norm": 0.2378511130809784,
"learning_rate": 7.649536396887296e-05,
"loss": 0.0456,
"step": 9260
},
{
"epoch": 8.887823585810162,
"grad_norm": 0.27728554606437683,
"learning_rate": 7.644141046327271e-05,
"loss": 0.0445,
"step": 9270
},
{
"epoch": 8.897411313518695,
"grad_norm": 0.5434097051620483,
"learning_rate": 7.638741418263505e-05,
"loss": 0.0402,
"step": 9280
},
{
"epoch": 8.906999041227229,
"grad_norm": 0.23838652670383453,
"learning_rate": 7.633337521431127e-05,
"loss": 0.038,
"step": 9290
},
{
"epoch": 8.916586768935762,
"grad_norm": 0.2675243020057678,
"learning_rate": 7.627929364572184e-05,
"loss": 0.0409,
"step": 9300
},
{
"epoch": 8.926174496644295,
"grad_norm": 0.36112427711486816,
"learning_rate": 7.622516956435604e-05,
"loss": 0.038,
"step": 9310
},
{
"epoch": 8.935762224352828,
"grad_norm": 0.40189293026924133,
"learning_rate": 7.617100305777199e-05,
"loss": 0.0349,
"step": 9320
},
{
"epoch": 8.945349952061362,
"grad_norm": 0.32217565178871155,
"learning_rate": 7.611679421359639e-05,
"loss": 0.0414,
"step": 9330
},
{
"epoch": 8.954937679769895,
"grad_norm": 0.37468934059143066,
"learning_rate": 7.60625431195245e-05,
"loss": 0.0419,
"step": 9340
},
{
"epoch": 8.964525407478428,
"grad_norm": 0.25082099437713623,
"learning_rate": 7.600824986331989e-05,
"loss": 0.0361,
"step": 9350
},
{
"epoch": 8.974113135186961,
"grad_norm": 0.3598342835903168,
"learning_rate": 7.595391453281431e-05,
"loss": 0.034,
"step": 9360
},
{
"epoch": 8.983700862895494,
"grad_norm": 0.3254631459712982,
"learning_rate": 7.589953721590764e-05,
"loss": 0.0482,
"step": 9370
},
{
"epoch": 8.993288590604028,
"grad_norm": 0.3480936586856842,
"learning_rate": 7.584511800056759e-05,
"loss": 0.0359,
"step": 9380
},
{
"epoch": 9.002876318312559,
"grad_norm": 0.321119099855423,
"learning_rate": 7.579065697482974e-05,
"loss": 0.0397,
"step": 9390
},
{
"epoch": 9.012464046021092,
"grad_norm": 0.2790512144565582,
"learning_rate": 7.573615422679726e-05,
"loss": 0.0341,
"step": 9400
},
{
"epoch": 9.022051773729626,
"grad_norm": 0.6163461208343506,
"learning_rate": 7.568160984464083e-05,
"loss": 0.0361,
"step": 9410
},
{
"epoch": 9.031639501438159,
"grad_norm": 0.35653308033943176,
"learning_rate": 7.56270239165985e-05,
"loss": 0.0392,
"step": 9420
},
{
"epoch": 9.041227229146692,
"grad_norm": 0.2938978970050812,
"learning_rate": 7.55723965309755e-05,
"loss": 0.0326,
"step": 9430
},
{
"epoch": 9.050814956855225,
"grad_norm": 0.26529833674430847,
"learning_rate": 7.551772777614412e-05,
"loss": 0.0454,
"step": 9440
},
{
"epoch": 9.060402684563758,
"grad_norm": 0.351085364818573,
"learning_rate": 7.54630177405436e-05,
"loss": 0.0467,
"step": 9450
},
{
"epoch": 9.069990412272292,
"grad_norm": 0.23490998148918152,
"learning_rate": 7.540826651267999e-05,
"loss": 0.0405,
"step": 9460
},
{
"epoch": 9.079578139980825,
"grad_norm": 0.3685658276081085,
"learning_rate": 7.535347418112588e-05,
"loss": 0.0372,
"step": 9470
},
{
"epoch": 9.089165867689358,
"grad_norm": 0.36048129200935364,
"learning_rate": 7.529864083452046e-05,
"loss": 0.0378,
"step": 9480
},
{
"epoch": 9.098753595397891,
"grad_norm": 0.3054652810096741,
"learning_rate": 7.52437665615692e-05,
"loss": 0.0447,
"step": 9490
},
{
"epoch": 9.108341323106425,
"grad_norm": 0.2997536063194275,
"learning_rate": 7.518885145104381e-05,
"loss": 0.038,
"step": 9500
},
{
"epoch": 9.117929050814958,
"grad_norm": 0.5517327189445496,
"learning_rate": 7.513389559178209e-05,
"loss": 0.0472,
"step": 9510
},
{
"epoch": 9.12751677852349,
"grad_norm": 0.30378520488739014,
"learning_rate": 7.507889907268769e-05,
"loss": 0.0355,
"step": 9520
},
{
"epoch": 9.137104506232022,
"grad_norm": 0.46029695868492126,
"learning_rate": 7.50238619827301e-05,
"loss": 0.0358,
"step": 9530
},
{
"epoch": 9.146692233940556,
"grad_norm": 0.30406633019447327,
"learning_rate": 7.496878441094439e-05,
"loss": 0.0397,
"step": 9540
},
{
"epoch": 9.156279961649089,
"grad_norm": 0.4107452929019928,
"learning_rate": 7.491366644643118e-05,
"loss": 0.043,
"step": 9550
},
{
"epoch": 9.165867689357622,
"grad_norm": 0.6011910438537598,
"learning_rate": 7.485850817835639e-05,
"loss": 0.0459,
"step": 9560
},
{
"epoch": 9.175455417066155,
"grad_norm": 0.42862173914909363,
"learning_rate": 7.480330969595114e-05,
"loss": 0.0392,
"step": 9570
},
{
"epoch": 9.185043144774689,
"grad_norm": 0.3231380879878998,
"learning_rate": 7.474807108851163e-05,
"loss": 0.0379,
"step": 9580
},
{
"epoch": 9.194630872483222,
"grad_norm": 0.31742027401924133,
"learning_rate": 7.469279244539897e-05,
"loss": 0.0398,
"step": 9590
},
{
"epoch": 9.204218600191755,
"grad_norm": 0.34327855706214905,
"learning_rate": 7.463747385603899e-05,
"loss": 0.0365,
"step": 9600
},
{
"epoch": 9.213806327900288,
"grad_norm": 0.40726932883262634,
"learning_rate": 7.458211540992222e-05,
"loss": 0.0421,
"step": 9610
},
{
"epoch": 9.223394055608821,
"grad_norm": 0.28824758529663086,
"learning_rate": 7.452671719660359e-05,
"loss": 0.0392,
"step": 9620
},
{
"epoch": 9.232981783317355,
"grad_norm": 0.4298984408378601,
"learning_rate": 7.447127930570241e-05,
"loss": 0.0396,
"step": 9630
},
{
"epoch": 9.242569511025886,
"grad_norm": 0.3513946831226349,
"learning_rate": 7.441580182690218e-05,
"loss": 0.0344,
"step": 9640
},
{
"epoch": 9.25215723873442,
"grad_norm": 0.31864580512046814,
"learning_rate": 7.436028484995043e-05,
"loss": 0.0352,
"step": 9650
},
{
"epoch": 9.261744966442953,
"grad_norm": 0.42778101563453674,
"learning_rate": 7.430472846465856e-05,
"loss": 0.0345,
"step": 9660
},
{
"epoch": 9.271332694151486,
"grad_norm": 0.25153082609176636,
"learning_rate": 7.424913276090176e-05,
"loss": 0.0376,
"step": 9670
},
{
"epoch": 9.280920421860019,
"grad_norm": 0.30971595644950867,
"learning_rate": 7.419349782861882e-05,
"loss": 0.0402,
"step": 9680
},
{
"epoch": 9.290508149568552,
"grad_norm": 0.5045586228370667,
"learning_rate": 7.413782375781198e-05,
"loss": 0.0321,
"step": 9690
},
{
"epoch": 9.300095877277085,
"grad_norm": 0.26688501238822937,
"learning_rate": 7.40821106385468e-05,
"loss": 0.0405,
"step": 9700
},
{
"epoch": 9.309683604985619,
"grad_norm": 0.3186158537864685,
"learning_rate": 7.402635856095202e-05,
"loss": 0.039,
"step": 9710
},
{
"epoch": 9.319271332694152,
"grad_norm": 0.23956236243247986,
"learning_rate": 7.397056761521936e-05,
"loss": 0.0385,
"step": 9720
},
{
"epoch": 9.328859060402685,
"grad_norm": 0.35403645038604736,
"learning_rate": 7.391473789160352e-05,
"loss": 0.037,
"step": 9730
},
{
"epoch": 9.338446788111218,
"grad_norm": 0.30190348625183105,
"learning_rate": 7.38588694804218e-05,
"loss": 0.0402,
"step": 9740
},
{
"epoch": 9.348034515819752,
"grad_norm": 0.45342007279396057,
"learning_rate": 7.380296247205417e-05,
"loss": 0.0385,
"step": 9750
},
{
"epoch": 9.357622243528283,
"grad_norm": 0.355342835187912,
"learning_rate": 7.374701695694304e-05,
"loss": 0.0375,
"step": 9760
},
{
"epoch": 9.367209971236816,
"grad_norm": 0.3231160640716553,
"learning_rate": 7.369103302559308e-05,
"loss": 0.0353,
"step": 9770
},
{
"epoch": 9.37679769894535,
"grad_norm": 0.4163530766963959,
"learning_rate": 7.363501076857112e-05,
"loss": 0.0381,
"step": 9780
},
{
"epoch": 9.386385426653883,
"grad_norm": 0.24439620971679688,
"learning_rate": 7.357895027650598e-05,
"loss": 0.0347,
"step": 9790
},
{
"epoch": 9.395973154362416,
"grad_norm": 0.3847917318344116,
"learning_rate": 7.352285164008838e-05,
"loss": 0.0331,
"step": 9800
},
{
"epoch": 9.405560882070949,
"grad_norm": 0.3097192049026489,
"learning_rate": 7.346671495007068e-05,
"loss": 0.0405,
"step": 9810
},
{
"epoch": 9.415148609779482,
"grad_norm": 0.21436016261577606,
"learning_rate": 7.341054029726685e-05,
"loss": 0.0375,
"step": 9820
},
{
"epoch": 9.424736337488016,
"grad_norm": 0.41024893522262573,
"learning_rate": 7.335432777255225e-05,
"loss": 0.0463,
"step": 9830
},
{
"epoch": 9.434324065196549,
"grad_norm": 0.299177348613739,
"learning_rate": 7.329807746686352e-05,
"loss": 0.0418,
"step": 9840
},
{
"epoch": 9.443911792905082,
"grad_norm": 0.3526586890220642,
"learning_rate": 7.324178947119842e-05,
"loss": 0.0383,
"step": 9850
},
{
"epoch": 9.453499520613615,
"grad_norm": 0.277421772480011,
"learning_rate": 7.318546387661564e-05,
"loss": 0.0512,
"step": 9860
},
{
"epoch": 9.463087248322148,
"grad_norm": 0.24628502130508423,
"learning_rate": 7.312910077423477e-05,
"loss": 0.0367,
"step": 9870
},
{
"epoch": 9.47267497603068,
"grad_norm": 0.5568169951438904,
"learning_rate": 7.307270025523601e-05,
"loss": 0.0396,
"step": 9880
},
{
"epoch": 9.482262703739213,
"grad_norm": 0.30765804648399353,
"learning_rate": 7.301626241086012e-05,
"loss": 0.043,
"step": 9890
},
{
"epoch": 9.491850431447746,
"grad_norm": 0.32168257236480713,
"learning_rate": 7.295978733240827e-05,
"loss": 0.0385,
"step": 9900
},
{
"epoch": 9.50143815915628,
"grad_norm": 0.46826574206352234,
"learning_rate": 7.29032751112418e-05,
"loss": 0.0375,
"step": 9910
},
{
"epoch": 9.511025886864813,
"grad_norm": 0.19892945885658264,
"learning_rate": 7.284672583878219e-05,
"loss": 0.0432,
"step": 9920
},
{
"epoch": 9.520613614573346,
"grad_norm": 0.21767093241214752,
"learning_rate": 7.279013960651083e-05,
"loss": 0.0331,
"step": 9930
},
{
"epoch": 9.53020134228188,
"grad_norm": 0.32079631090164185,
"learning_rate": 7.273351650596889e-05,
"loss": 0.0355,
"step": 9940
},
{
"epoch": 9.539789069990412,
"grad_norm": 0.40111902356147766,
"learning_rate": 7.267685662875725e-05,
"loss": 0.0412,
"step": 9950
},
{
"epoch": 9.549376797698946,
"grad_norm": 0.58073490858078,
"learning_rate": 7.26201600665362e-05,
"loss": 0.0384,
"step": 9960
},
{
"epoch": 9.558964525407479,
"grad_norm": 0.20928962528705597,
"learning_rate": 7.256342691102545e-05,
"loss": 0.0334,
"step": 9970
},
{
"epoch": 9.568552253116012,
"grad_norm": 0.2809102535247803,
"learning_rate": 7.250665725400385e-05,
"loss": 0.0421,
"step": 9980
},
{
"epoch": 9.578139980824545,
"grad_norm": 0.2836989164352417,
"learning_rate": 7.244985118730933e-05,
"loss": 0.0394,
"step": 9990
},
{
"epoch": 9.587727708533077,
"grad_norm": 0.21493583917617798,
"learning_rate": 7.239300880283869e-05,
"loss": 0.0438,
"step": 10000
},
{
"epoch": 9.59731543624161,
"grad_norm": 0.3654724955558777,
"learning_rate": 7.233613019254755e-05,
"loss": 0.0398,
"step": 10010
},
{
"epoch": 9.606903163950143,
"grad_norm": 0.24901500344276428,
"learning_rate": 7.227921544845003e-05,
"loss": 0.0393,
"step": 10020
},
{
"epoch": 9.616490891658676,
"grad_norm": 0.21980980038642883,
"learning_rate": 7.222226466261883e-05,
"loss": 0.0386,
"step": 10030
},
{
"epoch": 9.62607861936721,
"grad_norm": 0.18104171752929688,
"learning_rate": 7.216527792718484e-05,
"loss": 0.0378,
"step": 10040
},
{
"epoch": 9.635666347075743,
"grad_norm": 0.33641284704208374,
"learning_rate": 7.210825533433719e-05,
"loss": 0.0418,
"step": 10050
},
{
"epoch": 9.645254074784276,
"grad_norm": 0.2590009570121765,
"learning_rate": 7.205119697632297e-05,
"loss": 0.0327,
"step": 10060
},
{
"epoch": 9.65484180249281,
"grad_norm": 0.40689241886138916,
"learning_rate": 7.199410294544713e-05,
"loss": 0.0542,
"step": 10070
},
{
"epoch": 9.664429530201343,
"grad_norm": 0.3199746310710907,
"learning_rate": 7.193697333407234e-05,
"loss": 0.0363,
"step": 10080
},
{
"epoch": 9.674017257909876,
"grad_norm": 0.49059638381004333,
"learning_rate": 7.187980823461887e-05,
"loss": 0.0377,
"step": 10090
},
{
"epoch": 9.683604985618409,
"grad_norm": 0.28129157423973083,
"learning_rate": 7.182260773956433e-05,
"loss": 0.0382,
"step": 10100
},
{
"epoch": 9.693192713326942,
"grad_norm": 0.3830220401287079,
"learning_rate": 7.176537194144362e-05,
"loss": 0.0349,
"step": 10110
},
{
"epoch": 9.702780441035475,
"grad_norm": 0.3658897578716278,
"learning_rate": 7.170810093284876e-05,
"loss": 0.0359,
"step": 10120
},
{
"epoch": 9.712368168744007,
"grad_norm": 0.31416580080986023,
"learning_rate": 7.165079480642873e-05,
"loss": 0.0343,
"step": 10130
},
{
"epoch": 9.72195589645254,
"grad_norm": 0.24944183230400085,
"learning_rate": 7.159345365488929e-05,
"loss": 0.0332,
"step": 10140
},
{
"epoch": 9.731543624161073,
"grad_norm": 0.2953116297721863,
"learning_rate": 7.153607757099292e-05,
"loss": 0.0354,
"step": 10150
},
{
"epoch": 9.741131351869607,
"grad_norm": 0.4103414714336395,
"learning_rate": 7.147866664755856e-05,
"loss": 0.036,
"step": 10160
},
{
"epoch": 9.75071907957814,
"grad_norm": 0.28444069623947144,
"learning_rate": 7.142122097746153e-05,
"loss": 0.0389,
"step": 10170
},
{
"epoch": 9.760306807286673,
"grad_norm": 0.2912525534629822,
"learning_rate": 7.136374065363334e-05,
"loss": 0.0345,
"step": 10180
},
{
"epoch": 9.769894534995206,
"grad_norm": 0.25480780005455017,
"learning_rate": 7.13062257690616e-05,
"loss": 0.0355,
"step": 10190
},
{
"epoch": 9.77948226270374,
"grad_norm": 0.305532306432724,
"learning_rate": 7.124867641678981e-05,
"loss": 0.0376,
"step": 10200
},
{
"epoch": 9.789069990412273,
"grad_norm": 0.32806769013404846,
"learning_rate": 7.119109268991723e-05,
"loss": 0.0357,
"step": 10210
},
{
"epoch": 9.798657718120806,
"grad_norm": 0.23281969130039215,
"learning_rate": 7.113347468159871e-05,
"loss": 0.0332,
"step": 10220
},
{
"epoch": 9.808245445829339,
"grad_norm": 0.3487169146537781,
"learning_rate": 7.107582248504458e-05,
"loss": 0.0397,
"step": 10230
},
{
"epoch": 9.817833173537872,
"grad_norm": 0.3124096989631653,
"learning_rate": 7.101813619352048e-05,
"loss": 0.0391,
"step": 10240
},
{
"epoch": 9.827420901246404,
"grad_norm": 0.39542460441589355,
"learning_rate": 7.09604159003472e-05,
"loss": 0.0361,
"step": 10250
},
{
"epoch": 9.837008628954937,
"grad_norm": 0.3044220209121704,
"learning_rate": 7.090266169890051e-05,
"loss": 0.0382,
"step": 10260
},
{
"epoch": 9.84659635666347,
"grad_norm": 0.3320329189300537,
"learning_rate": 7.08448736826111e-05,
"loss": 0.043,
"step": 10270
},
{
"epoch": 9.856184084372003,
"grad_norm": 0.25773710012435913,
"learning_rate": 7.078705194496429e-05,
"loss": 0.0363,
"step": 10280
},
{
"epoch": 9.865771812080537,
"grad_norm": 0.4256868064403534,
"learning_rate": 7.07291965795e-05,
"loss": 0.0388,
"step": 10290
},
{
"epoch": 9.87535953978907,
"grad_norm": 0.48361513018608093,
"learning_rate": 7.067130767981252e-05,
"loss": 0.0387,
"step": 10300
},
{
"epoch": 9.884947267497603,
"grad_norm": 0.3017280697822571,
"learning_rate": 7.061338533955043e-05,
"loss": 0.0334,
"step": 10310
},
{
"epoch": 9.894534995206136,
"grad_norm": 0.3394894599914551,
"learning_rate": 7.055542965241634e-05,
"loss": 0.0402,
"step": 10320
},
{
"epoch": 9.90412272291467,
"grad_norm": 0.3364240527153015,
"learning_rate": 7.049744071216687e-05,
"loss": 0.0332,
"step": 10330
},
{
"epoch": 9.913710450623203,
"grad_norm": 0.2847566604614258,
"learning_rate": 7.043941861261242e-05,
"loss": 0.0372,
"step": 10340
},
{
"epoch": 9.923298178331736,
"grad_norm": 0.6304646730422974,
"learning_rate": 7.038136344761703e-05,
"loss": 0.0338,
"step": 10350
},
{
"epoch": 9.93288590604027,
"grad_norm": 0.37469327449798584,
"learning_rate": 7.03232753110982e-05,
"loss": 0.0377,
"step": 10360
},
{
"epoch": 9.9424736337488,
"grad_norm": 0.3126644790172577,
"learning_rate": 7.026515429702682e-05,
"loss": 0.0313,
"step": 10370
},
{
"epoch": 9.952061361457334,
"grad_norm": 0.22097988426685333,
"learning_rate": 7.020700049942694e-05,
"loss": 0.037,
"step": 10380
},
{
"epoch": 9.961649089165867,
"grad_norm": 0.2554224729537964,
"learning_rate": 7.014881401237563e-05,
"loss": 0.0338,
"step": 10390
},
{
"epoch": 9.9712368168744,
"grad_norm": 0.41450753808021545,
"learning_rate": 7.009059493000285e-05,
"loss": 0.0373,
"step": 10400
},
{
"epoch": 9.980824544582934,
"grad_norm": 0.2980963885784149,
"learning_rate": 7.003234334649133e-05,
"loss": 0.0357,
"step": 10410
},
{
"epoch": 9.990412272291467,
"grad_norm": 0.34623420238494873,
"learning_rate": 6.997405935607635e-05,
"loss": 0.0393,
"step": 10420
},
{
"epoch": 10.0,
"grad_norm": 0.31464067101478577,
"learning_rate": 6.991574305304558e-05,
"loss": 0.0373,
"step": 10430
},
{
"epoch": 10.009587727708533,
"grad_norm": 0.3440396785736084,
"learning_rate": 6.985739453173903e-05,
"loss": 0.0352,
"step": 10440
},
{
"epoch": 10.019175455417066,
"grad_norm": 0.3453032374382019,
"learning_rate": 6.979901388654879e-05,
"loss": 0.0384,
"step": 10450
},
{
"epoch": 10.0287631831256,
"grad_norm": 0.2174844592809677,
"learning_rate": 6.97406012119189e-05,
"loss": 0.033,
"step": 10460
},
{
"epoch": 10.038350910834133,
"grad_norm": 0.34027159214019775,
"learning_rate": 6.968215660234527e-05,
"loss": 0.0439,
"step": 10470
},
{
"epoch": 10.047938638542666,
"grad_norm": 0.29484447836875916,
"learning_rate": 6.962368015237543e-05,
"loss": 0.0406,
"step": 10480
},
{
"epoch": 10.0575263662512,
"grad_norm": 0.2926745116710663,
"learning_rate": 6.956517195660842e-05,
"loss": 0.0366,
"step": 10490
},
{
"epoch": 10.06711409395973,
"grad_norm": 0.25546324253082275,
"learning_rate": 6.950663210969466e-05,
"loss": 0.0387,
"step": 10500
},
{
"epoch": 10.076701821668264,
"grad_norm": 0.19871650636196136,
"learning_rate": 6.944806070633578e-05,
"loss": 0.0408,
"step": 10510
},
{
"epoch": 10.086289549376797,
"grad_norm": 0.432463139295578,
"learning_rate": 6.93894578412844e-05,
"loss": 0.0415,
"step": 10520
},
{
"epoch": 10.09587727708533,
"grad_norm": 0.3453048765659332,
"learning_rate": 6.933082360934408e-05,
"loss": 0.0359,
"step": 10530
},
{
"epoch": 10.105465004793864,
"grad_norm": 0.28228339552879333,
"learning_rate": 6.927215810536915e-05,
"loss": 0.0363,
"step": 10540
},
{
"epoch": 10.115052732502397,
"grad_norm": 0.2979227304458618,
"learning_rate": 6.921346142426448e-05,
"loss": 0.0349,
"step": 10550
},
{
"epoch": 10.12464046021093,
"grad_norm": 0.23034702241420746,
"learning_rate": 6.915473366098541e-05,
"loss": 0.0337,
"step": 10560
},
{
"epoch": 10.134228187919463,
"grad_norm": 0.30385303497314453,
"learning_rate": 6.909597491053751e-05,
"loss": 0.0358,
"step": 10570
},
{
"epoch": 10.143815915627997,
"grad_norm": 0.34254565834999084,
"learning_rate": 6.903718526797658e-05,
"loss": 0.0383,
"step": 10580
},
{
"epoch": 10.15340364333653,
"grad_norm": 0.3243492841720581,
"learning_rate": 6.897836482840828e-05,
"loss": 0.0388,
"step": 10590
},
{
"epoch": 10.162991371045063,
"grad_norm": 0.24607200920581818,
"learning_rate": 6.891951368698815e-05,
"loss": 0.0359,
"step": 10600
},
{
"epoch": 10.172579098753596,
"grad_norm": 0.2082456648349762,
"learning_rate": 6.88606319389214e-05,
"loss": 0.0347,
"step": 10610
},
{
"epoch": 10.182166826462128,
"grad_norm": 0.23741546273231506,
"learning_rate": 6.880171967946273e-05,
"loss": 0.0335,
"step": 10620
},
{
"epoch": 10.191754554170661,
"grad_norm": 0.7699126601219177,
"learning_rate": 6.874277700391623e-05,
"loss": 0.0402,
"step": 10630
},
{
"epoch": 10.201342281879194,
"grad_norm": 0.23752135038375854,
"learning_rate": 6.868380400763516e-05,
"loss": 0.0378,
"step": 10640
},
{
"epoch": 10.210930009587727,
"grad_norm": 0.2777273952960968,
"learning_rate": 6.86248007860219e-05,
"loss": 0.0341,
"step": 10650
},
{
"epoch": 10.22051773729626,
"grad_norm": 0.33273088932037354,
"learning_rate": 6.856576743452761e-05,
"loss": 0.0379,
"step": 10660
},
{
"epoch": 10.230105465004794,
"grad_norm": 0.22550059854984283,
"learning_rate": 6.850670404865227e-05,
"loss": 0.0323,
"step": 10670
},
{
"epoch": 10.239693192713327,
"grad_norm": 0.22732175886631012,
"learning_rate": 6.844761072394446e-05,
"loss": 0.0335,
"step": 10680
},
{
"epoch": 10.24928092042186,
"grad_norm": 0.1689731478691101,
"learning_rate": 6.838848755600114e-05,
"loss": 0.0368,
"step": 10690
},
{
"epoch": 10.258868648130393,
"grad_norm": 0.20502756536006927,
"learning_rate": 6.83293346404676e-05,
"loss": 0.041,
"step": 10700
},
{
"epoch": 10.268456375838927,
"grad_norm": 0.2094731330871582,
"learning_rate": 6.827015207303722e-05,
"loss": 0.0383,
"step": 10710
},
{
"epoch": 10.27804410354746,
"grad_norm": 0.3424762487411499,
"learning_rate": 6.821093994945135e-05,
"loss": 0.0435,
"step": 10720
},
{
"epoch": 10.287631831255993,
"grad_norm": 0.3471381366252899,
"learning_rate": 6.815169836549916e-05,
"loss": 0.04,
"step": 10730
},
{
"epoch": 10.297219558964525,
"grad_norm": 0.2713249623775482,
"learning_rate": 6.80924274170175e-05,
"loss": 0.0313,
"step": 10740
},
{
"epoch": 10.306807286673058,
"grad_norm": 0.24895431101322174,
"learning_rate": 6.803312719989068e-05,
"loss": 0.0371,
"step": 10750
},
{
"epoch": 10.316395014381591,
"grad_norm": 0.3460264205932617,
"learning_rate": 6.797379781005039e-05,
"loss": 0.0312,
"step": 10760
},
{
"epoch": 10.325982742090124,
"grad_norm": 0.36002618074417114,
"learning_rate": 6.791443934347553e-05,
"loss": 0.0443,
"step": 10770
},
{
"epoch": 10.335570469798657,
"grad_norm": 0.46812546253204346,
"learning_rate": 6.785505189619197e-05,
"loss": 0.0417,
"step": 10780
},
{
"epoch": 10.34515819750719,
"grad_norm": 0.3170137107372284,
"learning_rate": 6.779563556427255e-05,
"loss": 0.0413,
"step": 10790
},
{
"epoch": 10.354745925215724,
"grad_norm": 0.27735644578933716,
"learning_rate": 6.773619044383677e-05,
"loss": 0.0411,
"step": 10800
},
{
"epoch": 10.364333652924257,
"grad_norm": 0.2342735081911087,
"learning_rate": 6.767671663105075e-05,
"loss": 0.0327,
"step": 10810
},
{
"epoch": 10.37392138063279,
"grad_norm": 0.31249138712882996,
"learning_rate": 6.761721422212696e-05,
"loss": 0.042,
"step": 10820
},
{
"epoch": 10.383509108341324,
"grad_norm": 0.26663604378700256,
"learning_rate": 6.755768331332424e-05,
"loss": 0.0359,
"step": 10830
},
{
"epoch": 10.393096836049857,
"grad_norm": 0.30388474464416504,
"learning_rate": 6.749812400094742e-05,
"loss": 0.0443,
"step": 10840
},
{
"epoch": 10.40268456375839,
"grad_norm": 0.3067167401313782,
"learning_rate": 6.743853638134734e-05,
"loss": 0.0424,
"step": 10850
},
{
"epoch": 10.412272291466923,
"grad_norm": 0.3138778805732727,
"learning_rate": 6.737892055092064e-05,
"loss": 0.0313,
"step": 10860
},
{
"epoch": 10.421860019175455,
"grad_norm": 0.28191816806793213,
"learning_rate": 6.731927660610954e-05,
"loss": 0.0358,
"step": 10870
},
{
"epoch": 10.431447746883988,
"grad_norm": 0.37692686915397644,
"learning_rate": 6.725960464340182e-05,
"loss": 0.0317,
"step": 10880
},
{
"epoch": 10.441035474592521,
"grad_norm": 0.26821082830429077,
"learning_rate": 6.719990475933053e-05,
"loss": 0.0319,
"step": 10890
},
{
"epoch": 10.450623202301054,
"grad_norm": 0.46883681416511536,
"learning_rate": 6.71401770504739e-05,
"loss": 0.0376,
"step": 10900
},
{
"epoch": 10.460210930009588,
"grad_norm": 0.8076095581054688,
"learning_rate": 6.708042161345521e-05,
"loss": 0.0355,
"step": 10910
},
{
"epoch": 10.46979865771812,
"grad_norm": 0.29810166358947754,
"learning_rate": 6.702063854494254e-05,
"loss": 0.0269,
"step": 10920
},
{
"epoch": 10.479386385426654,
"grad_norm": 0.3273125886917114,
"learning_rate": 6.696082794164868e-05,
"loss": 0.0386,
"step": 10930
},
{
"epoch": 10.488974113135187,
"grad_norm": 0.4401116371154785,
"learning_rate": 6.690098990033102e-05,
"loss": 0.0298,
"step": 10940
},
{
"epoch": 10.49856184084372,
"grad_norm": 0.2832469642162323,
"learning_rate": 6.684112451779127e-05,
"loss": 0.0397,
"step": 10950
},
{
"epoch": 10.508149568552254,
"grad_norm": 0.3664191961288452,
"learning_rate": 6.67812318908754e-05,
"loss": 0.0382,
"step": 10960
},
{
"epoch": 10.517737296260787,
"grad_norm": 0.32039886713027954,
"learning_rate": 6.672131211647344e-05,
"loss": 0.0332,
"step": 10970
},
{
"epoch": 10.527325023969318,
"grad_norm": 0.31571629643440247,
"learning_rate": 6.666136529151938e-05,
"loss": 0.0358,
"step": 10980
},
{
"epoch": 10.536912751677852,
"grad_norm": 0.30983471870422363,
"learning_rate": 6.660139151299093e-05,
"loss": 0.0402,
"step": 10990
},
{
"epoch": 10.546500479386385,
"grad_norm": 0.35966020822525024,
"learning_rate": 6.65413908779094e-05,
"loss": 0.0418,
"step": 11000
},
{
"epoch": 10.556088207094918,
"grad_norm": 0.3868638277053833,
"learning_rate": 6.648136348333954e-05,
"loss": 0.0428,
"step": 11010
},
{
"epoch": 10.565675934803451,
"grad_norm": 0.20595276355743408,
"learning_rate": 6.642130942638945e-05,
"loss": 0.0359,
"step": 11020
},
{
"epoch": 10.575263662511984,
"grad_norm": 0.6492677927017212,
"learning_rate": 6.636122880421032e-05,
"loss": 0.0345,
"step": 11030
},
{
"epoch": 10.584851390220518,
"grad_norm": 0.22226084768772125,
"learning_rate": 6.630112171399628e-05,
"loss": 0.0322,
"step": 11040
},
{
"epoch": 10.594439117929051,
"grad_norm": 0.27300918102264404,
"learning_rate": 6.624098825298436e-05,
"loss": 0.0345,
"step": 11050
},
{
"epoch": 10.604026845637584,
"grad_norm": 0.2507658898830414,
"learning_rate": 6.618082851845417e-05,
"loss": 0.0397,
"step": 11060
},
{
"epoch": 10.613614573346117,
"grad_norm": 0.22898472845554352,
"learning_rate": 6.612064260772788e-05,
"loss": 0.0312,
"step": 11070
},
{
"epoch": 10.62320230105465,
"grad_norm": 0.2579527199268341,
"learning_rate": 6.606043061816998e-05,
"loss": 0.0319,
"step": 11080
},
{
"epoch": 10.632790028763184,
"grad_norm": 0.3027057945728302,
"learning_rate": 6.600019264718713e-05,
"loss": 0.0425,
"step": 11090
},
{
"epoch": 10.642377756471717,
"grad_norm": 0.4396612048149109,
"learning_rate": 6.593992879222808e-05,
"loss": 0.0347,
"step": 11100
},
{
"epoch": 10.651965484180248,
"grad_norm": 0.3383849561214447,
"learning_rate": 6.587963915078342e-05,
"loss": 0.0427,
"step": 11110
},
{
"epoch": 10.661553211888782,
"grad_norm": 0.39786002039909363,
"learning_rate": 6.581932382038542e-05,
"loss": 0.0325,
"step": 11120
},
{
"epoch": 10.671140939597315,
"grad_norm": 0.29470136761665344,
"learning_rate": 6.575898289860798e-05,
"loss": 0.0327,
"step": 11130
},
{
"epoch": 10.680728667305848,
"grad_norm": 0.33293044567108154,
"learning_rate": 6.569861648306632e-05,
"loss": 0.0372,
"step": 11140
},
{
"epoch": 10.690316395014381,
"grad_norm": 0.2922416627407074,
"learning_rate": 6.563822467141697e-05,
"loss": 0.0371,
"step": 11150
},
{
"epoch": 10.699904122722915,
"grad_norm": 0.37106814980506897,
"learning_rate": 6.557780756135749e-05,
"loss": 0.0358,
"step": 11160
},
{
"epoch": 10.709491850431448,
"grad_norm": 0.2364514172077179,
"learning_rate": 6.551736525062645e-05,
"loss": 0.038,
"step": 11170
},
{
"epoch": 10.719079578139981,
"grad_norm": 0.327987939119339,
"learning_rate": 6.545689783700307e-05,
"loss": 0.0399,
"step": 11180
},
{
"epoch": 10.728667305848514,
"grad_norm": 0.25306403636932373,
"learning_rate": 6.539640541830728e-05,
"loss": 0.0319,
"step": 11190
},
{
"epoch": 10.738255033557047,
"grad_norm": 0.301178902387619,
"learning_rate": 6.533588809239941e-05,
"loss": 0.0408,
"step": 11200
},
{
"epoch": 10.74784276126558,
"grad_norm": 0.2662244439125061,
"learning_rate": 6.527534595718007e-05,
"loss": 0.0381,
"step": 11210
},
{
"epoch": 10.757430488974114,
"grad_norm": 0.3115426301956177,
"learning_rate": 6.521477911059008e-05,
"loss": 0.0368,
"step": 11220
},
{
"epoch": 10.767018216682647,
"grad_norm": 0.4020492136478424,
"learning_rate": 6.515418765061015e-05,
"loss": 0.0346,
"step": 11230
},
{
"epoch": 10.776605944391179,
"grad_norm": 0.49596187472343445,
"learning_rate": 6.509357167526084e-05,
"loss": 0.0376,
"step": 11240
},
{
"epoch": 10.786193672099712,
"grad_norm": 0.33604878187179565,
"learning_rate": 6.50329312826024e-05,
"loss": 0.0395,
"step": 11250
},
{
"epoch": 10.795781399808245,
"grad_norm": 0.2914005219936371,
"learning_rate": 6.497226657073454e-05,
"loss": 0.0371,
"step": 11260
},
{
"epoch": 10.805369127516778,
"grad_norm": 0.34624671936035156,
"learning_rate": 6.491157763779632e-05,
"loss": 0.0281,
"step": 11270
},
{
"epoch": 10.814956855225311,
"grad_norm": 0.30700233578681946,
"learning_rate": 6.485086458196602e-05,
"loss": 0.0331,
"step": 11280
},
{
"epoch": 10.824544582933845,
"grad_norm": 0.3025294244289398,
"learning_rate": 6.479012750146087e-05,
"loss": 0.0341,
"step": 11290
},
{
"epoch": 10.834132310642378,
"grad_norm": 0.23997899889945984,
"learning_rate": 6.472936649453701e-05,
"loss": 0.0383,
"step": 11300
},
{
"epoch": 10.843720038350911,
"grad_norm": 0.24672740697860718,
"learning_rate": 6.466858165948933e-05,
"loss": 0.0313,
"step": 11310
},
{
"epoch": 10.853307766059444,
"grad_norm": 0.2887534201145172,
"learning_rate": 6.460777309465118e-05,
"loss": 0.039,
"step": 11320
},
{
"epoch": 10.862895493767978,
"grad_norm": 0.24179044365882874,
"learning_rate": 6.454694089839436e-05,
"loss": 0.032,
"step": 11330
},
{
"epoch": 10.87248322147651,
"grad_norm": 0.47962746024131775,
"learning_rate": 6.448608516912888e-05,
"loss": 0.0368,
"step": 11340
},
{
"epoch": 10.882070949185042,
"grad_norm": 0.26336967945098877,
"learning_rate": 6.44252060053028e-05,
"loss": 0.045,
"step": 11350
},
{
"epoch": 10.891658676893575,
"grad_norm": 0.2424604296684265,
"learning_rate": 6.436430350540215e-05,
"loss": 0.0321,
"step": 11360
},
{
"epoch": 10.901246404602109,
"grad_norm": 0.25244084000587463,
"learning_rate": 6.430337776795064e-05,
"loss": 0.0346,
"step": 11370
},
{
"epoch": 10.910834132310642,
"grad_norm": 0.30204179883003235,
"learning_rate": 6.42424288915096e-05,
"loss": 0.0362,
"step": 11380
},
{
"epoch": 10.920421860019175,
"grad_norm": 0.3095405697822571,
"learning_rate": 6.418145697467784e-05,
"loss": 0.036,
"step": 11390
},
{
"epoch": 10.930009587727708,
"grad_norm": 0.22773784399032593,
"learning_rate": 6.412046211609134e-05,
"loss": 0.0399,
"step": 11400
},
{
"epoch": 10.939597315436242,
"grad_norm": 0.3239744007587433,
"learning_rate": 6.40594444144233e-05,
"loss": 0.0374,
"step": 11410
},
{
"epoch": 10.949185043144775,
"grad_norm": 0.28157058358192444,
"learning_rate": 6.399840396838382e-05,
"loss": 0.0352,
"step": 11420
},
{
"epoch": 10.958772770853308,
"grad_norm": 0.31856581568717957,
"learning_rate": 6.393734087671979e-05,
"loss": 0.0379,
"step": 11430
},
{
"epoch": 10.968360498561841,
"grad_norm": 0.2937244772911072,
"learning_rate": 6.387625523821474e-05,
"loss": 0.0322,
"step": 11440
},
{
"epoch": 10.977948226270374,
"grad_norm": 0.2260034680366516,
"learning_rate": 6.38151471516887e-05,
"loss": 0.0319,
"step": 11450
},
{
"epoch": 10.987535953978908,
"grad_norm": 0.42635470628738403,
"learning_rate": 6.375401671599798e-05,
"loss": 0.0383,
"step": 11460
},
{
"epoch": 10.997123681687441,
"grad_norm": 0.288327693939209,
"learning_rate": 6.369286403003509e-05,
"loss": 0.0406,
"step": 11470
},
{
"epoch": 11.006711409395972,
"grad_norm": 0.2826128900051117,
"learning_rate": 6.363168919272846e-05,
"loss": 0.0356,
"step": 11480
},
{
"epoch": 11.016299137104506,
"grad_norm": 0.2275691032409668,
"learning_rate": 6.357049230304244e-05,
"loss": 0.0336,
"step": 11490
},
{
"epoch": 11.025886864813039,
"grad_norm": 0.24633708596229553,
"learning_rate": 6.3509273459977e-05,
"loss": 0.0353,
"step": 11500
},
{
"epoch": 11.035474592521572,
"grad_norm": 0.3283119201660156,
"learning_rate": 6.344803276256764e-05,
"loss": 0.0324,
"step": 11510
},
{
"epoch": 11.045062320230105,
"grad_norm": 0.5711014270782471,
"learning_rate": 6.338677030988521e-05,
"loss": 0.033,
"step": 11520
},
{
"epoch": 11.054650047938638,
"grad_norm": 0.3481939435005188,
"learning_rate": 6.332548620103575e-05,
"loss": 0.0398,
"step": 11530
},
{
"epoch": 11.064237775647172,
"grad_norm": 0.24051983654499054,
"learning_rate": 6.326418053516037e-05,
"loss": 0.04,
"step": 11540
},
{
"epoch": 11.073825503355705,
"grad_norm": 0.4249405264854431,
"learning_rate": 6.320285341143501e-05,
"loss": 0.0389,
"step": 11550
},
{
"epoch": 11.083413231064238,
"grad_norm": 0.24299634993076324,
"learning_rate": 6.314150492907034e-05,
"loss": 0.0323,
"step": 11560
},
{
"epoch": 11.093000958772771,
"grad_norm": 0.2705395817756653,
"learning_rate": 6.308013518731157e-05,
"loss": 0.0358,
"step": 11570
},
{
"epoch": 11.102588686481305,
"grad_norm": 0.3055950105190277,
"learning_rate": 6.301874428543833e-05,
"loss": 0.0299,
"step": 11580
},
{
"epoch": 11.112176414189838,
"grad_norm": 0.35363319516181946,
"learning_rate": 6.295733232276447e-05,
"loss": 0.0361,
"step": 11590
},
{
"epoch": 11.12176414189837,
"grad_norm": 0.4558916985988617,
"learning_rate": 6.28958993986379e-05,
"loss": 0.0391,
"step": 11600
},
{
"epoch": 11.131351869606902,
"grad_norm": 0.26662135124206543,
"learning_rate": 6.283444561244042e-05,
"loss": 0.0372,
"step": 11610
},
{
"epoch": 11.140939597315436,
"grad_norm": 0.24726532399654388,
"learning_rate": 6.27729710635876e-05,
"loss": 0.0346,
"step": 11620
},
{
"epoch": 11.150527325023969,
"grad_norm": 0.2278524488210678,
"learning_rate": 6.271147585152866e-05,
"loss": 0.0338,
"step": 11630
},
{
"epoch": 11.160115052732502,
"grad_norm": 0.3538067042827606,
"learning_rate": 6.264996007574615e-05,
"loss": 0.0388,
"step": 11640
},
{
"epoch": 11.169702780441035,
"grad_norm": 0.3667300045490265,
"learning_rate": 6.258842383575591e-05,
"loss": 0.0367,
"step": 11650
},
{
"epoch": 11.179290508149569,
"grad_norm": 0.29877883195877075,
"learning_rate": 6.252686723110696e-05,
"loss": 0.0348,
"step": 11660
},
{
"epoch": 11.188878235858102,
"grad_norm": 0.2846558392047882,
"learning_rate": 6.246529036138116e-05,
"loss": 0.0341,
"step": 11670
},
{
"epoch": 11.198465963566635,
"grad_norm": 0.2631428837776184,
"learning_rate": 6.24036933261932e-05,
"loss": 0.0356,
"step": 11680
},
{
"epoch": 11.208053691275168,
"grad_norm": 0.34309467673301697,
"learning_rate": 6.23420762251904e-05,
"loss": 0.0365,
"step": 11690
},
{
"epoch": 11.217641418983701,
"grad_norm": 0.2427697777748108,
"learning_rate": 6.228043915805254e-05,
"loss": 0.0378,
"step": 11700
},
{
"epoch": 11.227229146692235,
"grad_norm": 0.31478065252304077,
"learning_rate": 6.221878222449169e-05,
"loss": 0.0404,
"step": 11710
},
{
"epoch": 11.236816874400766,
"grad_norm": 0.27574971318244934,
"learning_rate": 6.215710552425206e-05,
"loss": 0.0311,
"step": 11720
},
{
"epoch": 11.2464046021093,
"grad_norm": 0.7589734792709351,
"learning_rate": 6.209540915710985e-05,
"loss": 0.0331,
"step": 11730
},
{
"epoch": 11.255992329817833,
"grad_norm": 0.2826196551322937,
"learning_rate": 6.203369322287306e-05,
"loss": 0.04,
"step": 11740
},
{
"epoch": 11.265580057526366,
"grad_norm": 0.6920874714851379,
"learning_rate": 6.197195782138132e-05,
"loss": 0.0367,
"step": 11750
},
{
"epoch": 11.275167785234899,
"grad_norm": 0.29903581738471985,
"learning_rate": 6.191020305250582e-05,
"loss": 0.0385,
"step": 11760
},
{
"epoch": 11.284755512943432,
"grad_norm": 0.2374860942363739,
"learning_rate": 6.184842901614902e-05,
"loss": 0.0349,
"step": 11770
},
{
"epoch": 11.294343240651965,
"grad_norm": 0.44580623507499695,
"learning_rate": 6.178663581224458e-05,
"loss": 0.0333,
"step": 11780
},
{
"epoch": 11.303930968360499,
"grad_norm": 0.2667308747768402,
"learning_rate": 6.172482354075716e-05,
"loss": 0.0359,
"step": 11790
},
{
"epoch": 11.313518696069032,
"grad_norm": 0.21850627660751343,
"learning_rate": 6.166299230168228e-05,
"loss": 0.0381,
"step": 11800
},
{
"epoch": 11.323106423777565,
"grad_norm": 0.27936065196990967,
"learning_rate": 6.16011421950461e-05,
"loss": 0.0371,
"step": 11810
},
{
"epoch": 11.332694151486098,
"grad_norm": 0.3284420371055603,
"learning_rate": 6.153927332090537e-05,
"loss": 0.0373,
"step": 11820
},
{
"epoch": 11.342281879194632,
"grad_norm": 0.2999724745750427,
"learning_rate": 6.147738577934711e-05,
"loss": 0.0376,
"step": 11830
},
{
"epoch": 11.351869606903165,
"grad_norm": 0.27732089161872864,
"learning_rate": 6.141547967048867e-05,
"loss": 0.0281,
"step": 11840
},
{
"epoch": 11.361457334611696,
"grad_norm": 0.22769756615161896,
"learning_rate": 6.135355509447727e-05,
"loss": 0.0407,
"step": 11850
},
{
"epoch": 11.37104506232023,
"grad_norm": 0.2970350682735443,
"learning_rate": 6.129161215149016e-05,
"loss": 0.0355,
"step": 11860
},
{
"epoch": 11.380632790028763,
"grad_norm": 0.319409042596817,
"learning_rate": 6.122965094173424e-05,
"loss": 0.0387,
"step": 11870
},
{
"epoch": 11.390220517737296,
"grad_norm": 0.31056809425354004,
"learning_rate": 6.116767156544592e-05,
"loss": 0.0353,
"step": 11880
},
{
"epoch": 11.39980824544583,
"grad_norm": 0.2925516366958618,
"learning_rate": 6.110567412289106e-05,
"loss": 0.0313,
"step": 11890
},
{
"epoch": 11.409395973154362,
"grad_norm": 0.2066742330789566,
"learning_rate": 6.10436587143647e-05,
"loss": 0.031,
"step": 11900
},
{
"epoch": 11.418983700862896,
"grad_norm": 0.2351049929857254,
"learning_rate": 6.0981625440191e-05,
"loss": 0.0384,
"step": 11910
},
{
"epoch": 11.428571428571429,
"grad_norm": 0.2299109846353531,
"learning_rate": 6.091957440072297e-05,
"loss": 0.029,
"step": 11920
},
{
"epoch": 11.438159156279962,
"grad_norm": 0.27398043870925903,
"learning_rate": 6.0857505696342376e-05,
"loss": 0.0334,
"step": 11930
},
{
"epoch": 11.447746883988495,
"grad_norm": 0.2886539697647095,
"learning_rate": 6.0795419427459564e-05,
"loss": 0.0403,
"step": 11940
},
{
"epoch": 11.457334611697028,
"grad_norm": 0.1952909678220749,
"learning_rate": 6.0733315694513306e-05,
"loss": 0.0342,
"step": 11950
},
{
"epoch": 11.466922339405562,
"grad_norm": 0.3800734281539917,
"learning_rate": 6.067119459797061e-05,
"loss": 0.0345,
"step": 11960
},
{
"epoch": 11.476510067114093,
"grad_norm": 0.2989748418331146,
"learning_rate": 6.060905623832656e-05,
"loss": 0.0397,
"step": 11970
},
{
"epoch": 11.486097794822626,
"grad_norm": 0.410169392824173,
"learning_rate": 6.0546900716104206e-05,
"loss": 0.0343,
"step": 11980
},
{
"epoch": 11.49568552253116,
"grad_norm": 0.3879852890968323,
"learning_rate": 6.048472813185433e-05,
"loss": 0.0328,
"step": 11990
},
{
"epoch": 11.505273250239693,
"grad_norm": 0.27426809072494507,
"learning_rate": 6.042253858615532e-05,
"loss": 0.0334,
"step": 12000
},
{
"epoch": 11.514860977948226,
"grad_norm": 0.3014174699783325,
"learning_rate": 6.036033217961303e-05,
"loss": 0.0305,
"step": 12010
},
{
"epoch": 11.52444870565676,
"grad_norm": 0.17615869641304016,
"learning_rate": 6.029810901286056e-05,
"loss": 0.04,
"step": 12020
},
{
"epoch": 11.534036433365292,
"grad_norm": 0.2742109000682831,
"learning_rate": 6.0235869186558125e-05,
"loss": 0.0349,
"step": 12030
},
{
"epoch": 11.543624161073826,
"grad_norm": 0.22772598266601562,
"learning_rate": 6.017361280139292e-05,
"loss": 0.0322,
"step": 12040
},
{
"epoch": 11.553211888782359,
"grad_norm": 0.2431521862745285,
"learning_rate": 6.011133995807888e-05,
"loss": 0.0334,
"step": 12050
},
{
"epoch": 11.562799616490892,
"grad_norm": 0.2893143594264984,
"learning_rate": 6.004905075735662e-05,
"loss": 0.0354,
"step": 12060
},
{
"epoch": 11.572387344199425,
"grad_norm": 0.26321181654930115,
"learning_rate": 5.998674529999316e-05,
"loss": 0.0364,
"step": 12070
},
{
"epoch": 11.581975071907959,
"grad_norm": 0.5845431685447693,
"learning_rate": 5.992442368678187e-05,
"loss": 0.0341,
"step": 12080
},
{
"epoch": 11.59156279961649,
"grad_norm": 0.23230616748332977,
"learning_rate": 5.986208601854222e-05,
"loss": 0.0316,
"step": 12090
},
{
"epoch": 11.601150527325023,
"grad_norm": 0.2684799134731293,
"learning_rate": 5.979973239611967e-05,
"loss": 0.0399,
"step": 12100
},
{
"epoch": 11.610738255033556,
"grad_norm": 0.19658780097961426,
"learning_rate": 5.973736292038549e-05,
"loss": 0.0396,
"step": 12110
},
{
"epoch": 11.62032598274209,
"grad_norm": 0.3254534602165222,
"learning_rate": 5.967497769223659e-05,
"loss": 0.0366,
"step": 12120
},
{
"epoch": 11.629913710450623,
"grad_norm": 0.573215663433075,
"learning_rate": 5.961257681259535e-05,
"loss": 0.0371,
"step": 12130
},
{
"epoch": 11.639501438159156,
"grad_norm": 0.24387991428375244,
"learning_rate": 5.955016038240951e-05,
"loss": 0.0314,
"step": 12140
},
{
"epoch": 11.64908916586769,
"grad_norm": 0.3126358091831207,
"learning_rate": 5.948772850265193e-05,
"loss": 0.0388,
"step": 12150
},
{
"epoch": 11.658676893576223,
"grad_norm": 0.2461678385734558,
"learning_rate": 5.9425281274320466e-05,
"loss": 0.0389,
"step": 12160
},
{
"epoch": 11.668264621284756,
"grad_norm": 0.2887043058872223,
"learning_rate": 5.936281879843782e-05,
"loss": 0.0316,
"step": 12170
},
{
"epoch": 11.677852348993289,
"grad_norm": 0.4977504014968872,
"learning_rate": 5.9300341176051364e-05,
"loss": 0.0369,
"step": 12180
},
{
"epoch": 11.687440076701822,
"grad_norm": 0.1966911256313324,
"learning_rate": 5.923784850823294e-05,
"loss": 0.0354,
"step": 12190
},
{
"epoch": 11.697027804410356,
"grad_norm": 0.28435948491096497,
"learning_rate": 5.917534089607877e-05,
"loss": 0.0347,
"step": 12200
},
{
"epoch": 11.706615532118889,
"grad_norm": 0.26728013157844543,
"learning_rate": 5.911281844070923e-05,
"loss": 0.0292,
"step": 12210
},
{
"epoch": 11.71620325982742,
"grad_norm": 0.24896536767482758,
"learning_rate": 5.905028124326869e-05,
"loss": 0.04,
"step": 12220
},
{
"epoch": 11.725790987535953,
"grad_norm": 0.394512414932251,
"learning_rate": 5.8987729404925405e-05,
"loss": 0.0394,
"step": 12230
},
{
"epoch": 11.735378715244487,
"grad_norm": 0.27139657735824585,
"learning_rate": 5.892516302687131e-05,
"loss": 0.0367,
"step": 12240
},
{
"epoch": 11.74496644295302,
"grad_norm": 0.30433669686317444,
"learning_rate": 5.886258221032184e-05,
"loss": 0.0315,
"step": 12250
},
{
"epoch": 11.754554170661553,
"grad_norm": 0.387657105922699,
"learning_rate": 5.8799987056515804e-05,
"loss": 0.0367,
"step": 12260
},
{
"epoch": 11.764141898370086,
"grad_norm": 0.7159243226051331,
"learning_rate": 5.87373776667152e-05,
"loss": 0.037,
"step": 12270
},
{
"epoch": 11.77372962607862,
"grad_norm": 0.4516725540161133,
"learning_rate": 5.867475414220506e-05,
"loss": 0.0389,
"step": 12280
},
{
"epoch": 11.783317353787153,
"grad_norm": 0.4054473638534546,
"learning_rate": 5.8612116584293266e-05,
"loss": 0.0349,
"step": 12290
},
{
"epoch": 11.792905081495686,
"grad_norm": 0.7706658244132996,
"learning_rate": 5.854946509431042e-05,
"loss": 0.0304,
"step": 12300
},
{
"epoch": 11.80249280920422,
"grad_norm": 0.2988179624080658,
"learning_rate": 5.848679977360963e-05,
"loss": 0.0308,
"step": 12310
},
{
"epoch": 11.812080536912752,
"grad_norm": 0.3133019804954529,
"learning_rate": 5.8424120723566453e-05,
"loss": 0.0341,
"step": 12320
},
{
"epoch": 11.821668264621284,
"grad_norm": 0.28148677945137024,
"learning_rate": 5.8361428045578595e-05,
"loss": 0.0365,
"step": 12330
},
{
"epoch": 11.831255992329817,
"grad_norm": 0.2674432098865509,
"learning_rate": 5.829872184106579e-05,
"loss": 0.0335,
"step": 12340
},
{
"epoch": 11.84084372003835,
"grad_norm": 0.2875913679599762,
"learning_rate": 5.823600221146974e-05,
"loss": 0.0324,
"step": 12350
},
{
"epoch": 11.850431447746884,
"grad_norm": 0.39202550053596497,
"learning_rate": 5.817326925825376e-05,
"loss": 0.0309,
"step": 12360
},
{
"epoch": 11.860019175455417,
"grad_norm": 0.2087734192609787,
"learning_rate": 5.811052308290279e-05,
"loss": 0.033,
"step": 12370
},
{
"epoch": 11.86960690316395,
"grad_norm": 0.2347189038991928,
"learning_rate": 5.804776378692313e-05,
"loss": 0.0357,
"step": 12380
},
{
"epoch": 11.879194630872483,
"grad_norm": 0.18789781630039215,
"learning_rate": 5.798499147184233e-05,
"loss": 0.0362,
"step": 12390
},
{
"epoch": 11.888782358581016,
"grad_norm": 0.44185203313827515,
"learning_rate": 5.792220623920898e-05,
"loss": 0.0353,
"step": 12400
},
{
"epoch": 11.89837008628955,
"grad_norm": 0.34168651700019836,
"learning_rate": 5.785940819059259e-05,
"loss": 0.0399,
"step": 12410
},
{
"epoch": 11.907957813998083,
"grad_norm": 0.3143576979637146,
"learning_rate": 5.779659742758336e-05,
"loss": 0.042,
"step": 12420
},
{
"epoch": 11.917545541706616,
"grad_norm": 0.2344186156988144,
"learning_rate": 5.773377405179209e-05,
"loss": 0.0377,
"step": 12430
},
{
"epoch": 11.92713326941515,
"grad_norm": 0.19894208014011383,
"learning_rate": 5.767093816484999e-05,
"loss": 0.0335,
"step": 12440
},
{
"epoch": 11.936720997123683,
"grad_norm": 0.332093745470047,
"learning_rate": 5.7608089868408486e-05,
"loss": 0.0324,
"step": 12450
},
{
"epoch": 11.946308724832214,
"grad_norm": 0.2045692652463913,
"learning_rate": 5.75452292641391e-05,
"loss": 0.0348,
"step": 12460
},
{
"epoch": 11.955896452540747,
"grad_norm": 0.2825275659561157,
"learning_rate": 5.748235645373325e-05,
"loss": 0.0385,
"step": 12470
},
{
"epoch": 11.96548418024928,
"grad_norm": 0.3274647891521454,
"learning_rate": 5.741947153890215e-05,
"loss": 0.0338,
"step": 12480
},
{
"epoch": 11.975071907957814,
"grad_norm": 0.29837775230407715,
"learning_rate": 5.7356574621376493e-05,
"loss": 0.0406,
"step": 12490
},
{
"epoch": 11.984659635666347,
"grad_norm": 0.3342297673225403,
"learning_rate": 5.729366580290646e-05,
"loss": 0.0325,
"step": 12500
},
{
"epoch": 11.99424736337488,
"grad_norm": 0.2670736014842987,
"learning_rate": 5.7230745185261505e-05,
"loss": 0.0361,
"step": 12510
},
{
"epoch": 12.003835091083413,
"grad_norm": 0.24446439743041992,
"learning_rate": 5.7167812870230094e-05,
"loss": 0.0298,
"step": 12520
},
{
"epoch": 12.013422818791947,
"grad_norm": 0.24025262892246246,
"learning_rate": 5.710486895961971e-05,
"loss": 0.0285,
"step": 12530
},
{
"epoch": 12.02301054650048,
"grad_norm": 0.20725701749324799,
"learning_rate": 5.7041913555256506e-05,
"loss": 0.0319,
"step": 12540
},
{
"epoch": 12.032598274209013,
"grad_norm": 0.24926726520061493,
"learning_rate": 5.6978946758985285e-05,
"loss": 0.0358,
"step": 12550
},
{
"epoch": 12.042186001917546,
"grad_norm": 0.22566866874694824,
"learning_rate": 5.691596867266925e-05,
"loss": 0.0353,
"step": 12560
},
{
"epoch": 12.05177372962608,
"grad_norm": 0.2323976457118988,
"learning_rate": 5.68529793981899e-05,
"loss": 0.0347,
"step": 12570
},
{
"epoch": 12.06136145733461,
"grad_norm": 0.2751142680644989,
"learning_rate": 5.6789979037446784e-05,
"loss": 0.0343,
"step": 12580
},
{
"epoch": 12.070949185043144,
"grad_norm": 0.20366577804088593,
"learning_rate": 5.672696769235744e-05,
"loss": 0.0346,
"step": 12590
},
{
"epoch": 12.080536912751677,
"grad_norm": 0.30414018034935,
"learning_rate": 5.666394546485714e-05,
"loss": 0.0335,
"step": 12600
},
{
"epoch": 12.09012464046021,
"grad_norm": 0.24006792902946472,
"learning_rate": 5.660091245689878e-05,
"loss": 0.0332,
"step": 12610
},
{
"epoch": 12.099712368168744,
"grad_norm": 0.25928163528442383,
"learning_rate": 5.653786877045266e-05,
"loss": 0.0345,
"step": 12620
},
{
"epoch": 12.109300095877277,
"grad_norm": 0.3005020320415497,
"learning_rate": 5.6474814507506426e-05,
"loss": 0.0279,
"step": 12630
},
{
"epoch": 12.11888782358581,
"grad_norm": 0.2962352931499481,
"learning_rate": 5.641174977006476e-05,
"loss": 0.0349,
"step": 12640
},
{
"epoch": 12.128475551294343,
"grad_norm": 0.3519500195980072,
"learning_rate": 5.634867466014932e-05,
"loss": 0.0322,
"step": 12650
},
{
"epoch": 12.138063279002877,
"grad_norm": 0.3588416576385498,
"learning_rate": 5.628558927979854e-05,
"loss": 0.0324,
"step": 12660
},
{
"epoch": 12.14765100671141,
"grad_norm": 0.29862353205680847,
"learning_rate": 5.622249373106748e-05,
"loss": 0.037,
"step": 12670
},
{
"epoch": 12.157238734419943,
"grad_norm": 0.3698887526988983,
"learning_rate": 5.6159388116027654e-05,
"loss": 0.0336,
"step": 12680
},
{
"epoch": 12.166826462128476,
"grad_norm": 0.268628865480423,
"learning_rate": 5.609627253676682e-05,
"loss": 0.0373,
"step": 12690
},
{
"epoch": 12.176414189837008,
"grad_norm": 0.23115096986293793,
"learning_rate": 5.603314709538891e-05,
"loss": 0.0393,
"step": 12700
},
{
"epoch": 12.186001917545541,
"grad_norm": 0.26541295647621155,
"learning_rate": 5.597001189401376e-05,
"loss": 0.0367,
"step": 12710
},
{
"epoch": 12.195589645254074,
"grad_norm": 0.28933706879615784,
"learning_rate": 5.5906867034777046e-05,
"loss": 0.0332,
"step": 12720
},
{
"epoch": 12.205177372962607,
"grad_norm": 0.320468008518219,
"learning_rate": 5.584371261983e-05,
"loss": 0.0351,
"step": 12730
},
{
"epoch": 12.21476510067114,
"grad_norm": 0.24627713859081268,
"learning_rate": 5.578054875133939e-05,
"loss": 0.032,
"step": 12740
},
{
"epoch": 12.224352828379674,
"grad_norm": 0.19859549403190613,
"learning_rate": 5.571737553148723e-05,
"loss": 0.0338,
"step": 12750
},
{
"epoch": 12.233940556088207,
"grad_norm": 0.2559930086135864,
"learning_rate": 5.565419306247065e-05,
"loss": 0.0372,
"step": 12760
},
{
"epoch": 12.24352828379674,
"grad_norm": 0.1816064417362213,
"learning_rate": 5.559100144650179e-05,
"loss": 0.0325,
"step": 12770
},
{
"epoch": 12.253116011505274,
"grad_norm": 0.5027087330818176,
"learning_rate": 5.552780078580756e-05,
"loss": 0.0357,
"step": 12780
},
{
"epoch": 12.262703739213807,
"grad_norm": 0.4723157584667206,
"learning_rate": 5.54645911826295e-05,
"loss": 0.0301,
"step": 12790
},
{
"epoch": 12.27229146692234,
"grad_norm": 0.18510127067565918,
"learning_rate": 5.5401372739223615e-05,
"loss": 0.0393,
"step": 12800
},
{
"epoch": 12.281879194630873,
"grad_norm": 0.19757391512393951,
"learning_rate": 5.533814555786021e-05,
"loss": 0.0319,
"step": 12810
},
{
"epoch": 12.291466922339406,
"grad_norm": 0.25884294509887695,
"learning_rate": 5.527490974082376e-05,
"loss": 0.0319,
"step": 12820
},
{
"epoch": 12.301054650047938,
"grad_norm": 0.29503270983695984,
"learning_rate": 5.521166539041266e-05,
"loss": 0.0405,
"step": 12830
},
{
"epoch": 12.310642377756471,
"grad_norm": 0.3443543016910553,
"learning_rate": 5.514841260893913e-05,
"loss": 0.0345,
"step": 12840
},
{
"epoch": 12.320230105465004,
"grad_norm": 0.3162010610103607,
"learning_rate": 5.508515149872903e-05,
"loss": 0.0374,
"step": 12850
},
{
"epoch": 12.329817833173538,
"grad_norm": 0.37343630194664,
"learning_rate": 5.502188216212172e-05,
"loss": 0.0339,
"step": 12860
},
{
"epoch": 12.33940556088207,
"grad_norm": 0.4099912643432617,
"learning_rate": 5.4958604701469804e-05,
"loss": 0.0348,
"step": 12870
},
{
"epoch": 12.348993288590604,
"grad_norm": 0.3237497806549072,
"learning_rate": 5.489531921913911e-05,
"loss": 0.0277,
"step": 12880
},
{
"epoch": 12.358581016299137,
"grad_norm": 0.2685404121875763,
"learning_rate": 5.483202581750838e-05,
"loss": 0.0326,
"step": 12890
},
{
"epoch": 12.36816874400767,
"grad_norm": 0.28428301215171814,
"learning_rate": 5.476872459896918e-05,
"loss": 0.0372,
"step": 12900
},
{
"epoch": 12.377756471716204,
"grad_norm": 0.34229859709739685,
"learning_rate": 5.470541566592573e-05,
"loss": 0.0324,
"step": 12910
},
{
"epoch": 12.387344199424737,
"grad_norm": 0.3393026292324066,
"learning_rate": 5.464209912079472e-05,
"loss": 0.034,
"step": 12920
},
{
"epoch": 12.39693192713327,
"grad_norm": 0.3479039967060089,
"learning_rate": 5.4578775066005196e-05,
"loss": 0.0384,
"step": 12930
},
{
"epoch": 12.406519654841803,
"grad_norm": 0.22416572272777557,
"learning_rate": 5.4515443603998304e-05,
"loss": 0.0339,
"step": 12940
},
{
"epoch": 12.416107382550335,
"grad_norm": 0.3005695343017578,
"learning_rate": 5.445210483722719e-05,
"loss": 0.0374,
"step": 12950
},
{
"epoch": 12.425695110258868,
"grad_norm": 0.2770855724811554,
"learning_rate": 5.438875886815682e-05,
"loss": 0.0407,
"step": 12960
},
{
"epoch": 12.435282837967401,
"grad_norm": 0.3203631043434143,
"learning_rate": 5.4325405799263786e-05,
"loss": 0.0381,
"step": 12970
},
{
"epoch": 12.444870565675934,
"grad_norm": 0.32981497049331665,
"learning_rate": 5.4262045733036204e-05,
"loss": 0.0389,
"step": 12980
},
{
"epoch": 12.454458293384468,
"grad_norm": 0.24350851774215698,
"learning_rate": 5.4198678771973475e-05,
"loss": 0.0377,
"step": 12990
},
{
"epoch": 12.464046021093,
"grad_norm": 0.25702494382858276,
"learning_rate": 5.413530501858621e-05,
"loss": 0.0308,
"step": 13000
},
{
"epoch": 12.473633748801534,
"grad_norm": 0.25904905796051025,
"learning_rate": 5.407192457539594e-05,
"loss": 0.0327,
"step": 13010
},
{
"epoch": 12.483221476510067,
"grad_norm": 0.29727786779403687,
"learning_rate": 5.4008537544935066e-05,
"loss": 0.0376,
"step": 13020
},
{
"epoch": 12.4928092042186,
"grad_norm": 0.21568791568279266,
"learning_rate": 5.394514402974661e-05,
"loss": 0.029,
"step": 13030
},
{
"epoch": 12.502396931927134,
"grad_norm": 0.31120288372039795,
"learning_rate": 5.3881744132384104e-05,
"loss": 0.0289,
"step": 13040
},
{
"epoch": 12.511984659635667,
"grad_norm": 0.3262520134449005,
"learning_rate": 5.381833795541141e-05,
"loss": 0.0458,
"step": 13050
},
{
"epoch": 12.5215723873442,
"grad_norm": 0.27970728278160095,
"learning_rate": 5.375492560140254e-05,
"loss": 0.0411,
"step": 13060
},
{
"epoch": 12.531160115052732,
"grad_norm": 0.25999292731285095,
"learning_rate": 5.3691507172941493e-05,
"loss": 0.0367,
"step": 13070
},
{
"epoch": 12.540747842761265,
"grad_norm": 0.32972532510757446,
"learning_rate": 5.362808277262211e-05,
"loss": 0.0336,
"step": 13080
},
{
"epoch": 12.550335570469798,
"grad_norm": 0.21841417253017426,
"learning_rate": 5.3564652503047895e-05,
"loss": 0.0383,
"step": 13090
},
{
"epoch": 12.559923298178331,
"grad_norm": 0.5416061878204346,
"learning_rate": 5.350121646683183e-05,
"loss": 0.0385,
"step": 13100
},
{
"epoch": 12.569511025886865,
"grad_norm": 0.28985804319381714,
"learning_rate": 5.343777476659621e-05,
"loss": 0.0334,
"step": 13110
},
{
"epoch": 12.579098753595398,
"grad_norm": 0.7717734575271606,
"learning_rate": 5.3374327504972544e-05,
"loss": 0.035,
"step": 13120
},
{
"epoch": 12.588686481303931,
"grad_norm": 0.38980838656425476,
"learning_rate": 5.331087478460129e-05,
"loss": 0.04,
"step": 13130
},
{
"epoch": 12.598274209012464,
"grad_norm": 0.43601536750793457,
"learning_rate": 5.324741670813178e-05,
"loss": 0.034,
"step": 13140
},
{
"epoch": 12.607861936720997,
"grad_norm": 0.27574917674064636,
"learning_rate": 5.318395337822195e-05,
"loss": 0.0328,
"step": 13150
},
{
"epoch": 12.61744966442953,
"grad_norm": 0.19968970119953156,
"learning_rate": 5.312048489753833e-05,
"loss": 0.0311,
"step": 13160
},
{
"epoch": 12.627037392138064,
"grad_norm": 0.4505964517593384,
"learning_rate": 5.305701136875566e-05,
"loss": 0.0326,
"step": 13170
},
{
"epoch": 12.636625119846597,
"grad_norm": 0.24829363822937012,
"learning_rate": 5.299353289455694e-05,
"loss": 0.0344,
"step": 13180
},
{
"epoch": 12.64621284755513,
"grad_norm": 0.2600340247154236,
"learning_rate": 5.2930049577633146e-05,
"loss": 0.0309,
"step": 13190
},
{
"epoch": 12.655800575263662,
"grad_norm": 0.2981709837913513,
"learning_rate": 5.2866561520683065e-05,
"loss": 0.0314,
"step": 13200
},
{
"epoch": 12.665388302972195,
"grad_norm": 0.22709107398986816,
"learning_rate": 5.280306882641319e-05,
"loss": 0.0323,
"step": 13210
},
{
"epoch": 12.674976030680728,
"grad_norm": 0.31488150358200073,
"learning_rate": 5.273957159753749e-05,
"loss": 0.0357,
"step": 13220
},
{
"epoch": 12.684563758389261,
"grad_norm": 0.5378819704055786,
"learning_rate": 5.2676069936777264e-05,
"loss": 0.0341,
"step": 13230
},
{
"epoch": 12.694151486097795,
"grad_norm": 0.3149401843547821,
"learning_rate": 5.2612563946861e-05,
"loss": 0.0369,
"step": 13240
},
{
"epoch": 12.703739213806328,
"grad_norm": 0.2183138132095337,
"learning_rate": 5.254905373052419e-05,
"loss": 0.0349,
"step": 13250
},
{
"epoch": 12.713326941514861,
"grad_norm": 1.1205395460128784,
"learning_rate": 5.2485539390509156e-05,
"loss": 0.0368,
"step": 13260
},
{
"epoch": 12.722914669223394,
"grad_norm": 0.21172510087490082,
"learning_rate": 5.242202102956486e-05,
"loss": 0.0402,
"step": 13270
},
{
"epoch": 12.732502396931928,
"grad_norm": 0.25088265538215637,
"learning_rate": 5.2358498750446835e-05,
"loss": 0.0356,
"step": 13280
},
{
"epoch": 12.74209012464046,
"grad_norm": 0.36349666118621826,
"learning_rate": 5.229497265591689e-05,
"loss": 0.0292,
"step": 13290
},
{
"epoch": 12.751677852348994,
"grad_norm": 0.3626287877559662,
"learning_rate": 5.2231442848743064e-05,
"loss": 0.0402,
"step": 13300
},
{
"epoch": 12.761265580057525,
"grad_norm": 0.18637891113758087,
"learning_rate": 5.2167909431699344e-05,
"loss": 0.0324,
"step": 13310
},
{
"epoch": 12.770853307766059,
"grad_norm": 0.28557726740837097,
"learning_rate": 5.2104372507565593e-05,
"loss": 0.0289,
"step": 13320
},
{
"epoch": 12.780441035474592,
"grad_norm": 0.3556912839412689,
"learning_rate": 5.204083217912732e-05,
"loss": 0.0343,
"step": 13330
},
{
"epoch": 12.790028763183125,
"grad_norm": 0.16443754732608795,
"learning_rate": 5.197728854917558e-05,
"loss": 0.0337,
"step": 13340
},
{
"epoch": 12.799616490891658,
"grad_norm": 0.19597065448760986,
"learning_rate": 5.191374172050676e-05,
"loss": 0.0293,
"step": 13350
},
{
"epoch": 12.809204218600192,
"grad_norm": 0.38750675320625305,
"learning_rate": 5.185019179592238e-05,
"loss": 0.0404,
"step": 13360
},
{
"epoch": 12.818791946308725,
"grad_norm": 0.25635913014411926,
"learning_rate": 5.178663887822901e-05,
"loss": 0.0259,
"step": 13370
},
{
"epoch": 12.828379674017258,
"grad_norm": 0.21815137565135956,
"learning_rate": 5.172308307023805e-05,
"loss": 0.0296,
"step": 13380
},
{
"epoch": 12.837967401725791,
"grad_norm": 0.3391851782798767,
"learning_rate": 5.165952447476559e-05,
"loss": 0.0312,
"step": 13390
},
{
"epoch": 12.847555129434324,
"grad_norm": 0.38378575444221497,
"learning_rate": 5.159596319463219e-05,
"loss": 0.0301,
"step": 13400
},
{
"epoch": 12.857142857142858,
"grad_norm": 0.29647505283355713,
"learning_rate": 5.15323993326628e-05,
"loss": 0.0355,
"step": 13410
},
{
"epoch": 12.86673058485139,
"grad_norm": 0.3213365972042084,
"learning_rate": 5.146883299168651e-05,
"loss": 0.0309,
"step": 13420
},
{
"epoch": 12.876318312559924,
"grad_norm": 0.21259522438049316,
"learning_rate": 5.1405264274536445e-05,
"loss": 0.0361,
"step": 13430
},
{
"epoch": 12.885906040268456,
"grad_norm": 0.41032230854034424,
"learning_rate": 5.134169328404956e-05,
"loss": 0.0347,
"step": 13440
},
{
"epoch": 12.895493767976989,
"grad_norm": 0.3352082371711731,
"learning_rate": 5.127812012306649e-05,
"loss": 0.0329,
"step": 13450
},
{
"epoch": 12.905081495685522,
"grad_norm": 2.1955349445343018,
"learning_rate": 5.1214544894431396e-05,
"loss": 0.036,
"step": 13460
},
{
"epoch": 12.914669223394055,
"grad_norm": 0.19683793187141418,
"learning_rate": 5.115096770099175e-05,
"loss": 0.0344,
"step": 13470
},
{
"epoch": 12.924256951102588,
"grad_norm": 0.2288978546857834,
"learning_rate": 5.1087388645598235e-05,
"loss": 0.0289,
"step": 13480
},
{
"epoch": 12.933844678811122,
"grad_norm": 0.3008512556552887,
"learning_rate": 5.1023807831104544e-05,
"loss": 0.0421,
"step": 13490
},
{
"epoch": 12.943432406519655,
"grad_norm": 0.29300564527511597,
"learning_rate": 5.096022536036721e-05,
"loss": 0.0374,
"step": 13500
},
{
"epoch": 12.953020134228188,
"grad_norm": 0.2803822457790375,
"learning_rate": 5.089664133624541e-05,
"loss": 0.0349,
"step": 13510
},
{
"epoch": 12.962607861936721,
"grad_norm": 0.35536760091781616,
"learning_rate": 5.083305586160089e-05,
"loss": 0.0311,
"step": 13520
},
{
"epoch": 12.972195589645255,
"grad_norm": 0.290683776140213,
"learning_rate": 5.07694690392977e-05,
"loss": 0.0315,
"step": 13530
},
{
"epoch": 12.981783317353788,
"grad_norm": 0.19355502724647522,
"learning_rate": 5.070588097220213e-05,
"loss": 0.0297,
"step": 13540
},
{
"epoch": 12.991371045062321,
"grad_norm": 0.2547348439693451,
"learning_rate": 5.06422917631824e-05,
"loss": 0.0325,
"step": 13550
},
{
"epoch": 13.000958772770852,
"grad_norm": 0.1929698884487152,
"learning_rate": 5.057870151510864e-05,
"loss": 0.0329,
"step": 13560
},
{
"epoch": 13.010546500479386,
"grad_norm": 0.29264265298843384,
"learning_rate": 5.051511033085264e-05,
"loss": 0.0319,
"step": 13570
},
{
"epoch": 13.020134228187919,
"grad_norm": 0.28177183866500854,
"learning_rate": 5.0451518313287704e-05,
"loss": 0.038,
"step": 13580
},
{
"epoch": 13.029721955896452,
"grad_norm": 0.3331814110279083,
"learning_rate": 5.0387925565288485e-05,
"loss": 0.0307,
"step": 13590
},
{
"epoch": 13.039309683604985,
"grad_norm": 0.297892689704895,
"learning_rate": 5.0324332189730796e-05,
"loss": 0.0339,
"step": 13600
},
{
"epoch": 13.048897411313519,
"grad_norm": 0.2248513251543045,
"learning_rate": 5.0260738289491516e-05,
"loss": 0.0227,
"step": 13610
},
{
"epoch": 13.058485139022052,
"grad_norm": 0.24514958262443542,
"learning_rate": 5.0197143967448335e-05,
"loss": 0.0335,
"step": 13620
},
{
"epoch": 13.068072866730585,
"grad_norm": 0.29958298802375793,
"learning_rate": 5.0133549326479645e-05,
"loss": 0.0305,
"step": 13630
},
{
"epoch": 13.077660594439118,
"grad_norm": 3.086843252182007,
"learning_rate": 5.006995446946433e-05,
"loss": 0.0377,
"step": 13640
},
{
"epoch": 13.087248322147651,
"grad_norm": 0.31443238258361816,
"learning_rate": 5.000635949928163e-05,
"loss": 0.0344,
"step": 13650
},
{
"epoch": 13.096836049856185,
"grad_norm": 0.27507051825523376,
"learning_rate": 4.994276451881098e-05,
"loss": 0.034,
"step": 13660
},
{
"epoch": 13.106423777564718,
"grad_norm": 0.2578774094581604,
"learning_rate": 4.987916963093184e-05,
"loss": 0.0328,
"step": 13670
},
{
"epoch": 13.116011505273251,
"grad_norm": 0.28767842054367065,
"learning_rate": 4.981557493852349e-05,
"loss": 0.0332,
"step": 13680
},
{
"epoch": 13.125599232981783,
"grad_norm": 0.17203165590763092,
"learning_rate": 4.975198054446492e-05,
"loss": 0.0327,
"step": 13690
},
{
"epoch": 13.135186960690316,
"grad_norm": 0.2606458067893982,
"learning_rate": 4.968838655163462e-05,
"loss": 0.0321,
"step": 13700
},
{
"epoch": 13.144774688398849,
"grad_norm": 0.3137904703617096,
"learning_rate": 4.9624793062910445e-05,
"loss": 0.0376,
"step": 13710
},
{
"epoch": 13.154362416107382,
"grad_norm": 0.255403608083725,
"learning_rate": 4.956120018116941e-05,
"loss": 0.0304,
"step": 13720
},
{
"epoch": 13.163950143815915,
"grad_norm": 0.32765787839889526,
"learning_rate": 4.94976080092876e-05,
"loss": 0.0352,
"step": 13730
},
{
"epoch": 13.173537871524449,
"grad_norm": 0.36302298307418823,
"learning_rate": 4.94340166501399e-05,
"loss": 0.0287,
"step": 13740
},
{
"epoch": 13.183125599232982,
"grad_norm": 0.1956561803817749,
"learning_rate": 4.93704262065999e-05,
"loss": 0.0299,
"step": 13750
},
{
"epoch": 13.192713326941515,
"grad_norm": 0.28090646862983704,
"learning_rate": 4.930683678153971e-05,
"loss": 0.0294,
"step": 13760
},
{
"epoch": 13.202301054650048,
"grad_norm": 0.3016568422317505,
"learning_rate": 4.9243248477829786e-05,
"loss": 0.0339,
"step": 13770
},
{
"epoch": 13.211888782358582,
"grad_norm": 0.34404152631759644,
"learning_rate": 4.9179661398338764e-05,
"loss": 0.0268,
"step": 13780
},
{
"epoch": 13.221476510067115,
"grad_norm": 0.35919350385665894,
"learning_rate": 4.911607564593331e-05,
"loss": 0.0308,
"step": 13790
},
{
"epoch": 13.231064237775648,
"grad_norm": 0.23123154044151306,
"learning_rate": 4.905249132347796e-05,
"loss": 0.0293,
"step": 13800
},
{
"epoch": 13.24065196548418,
"grad_norm": 0.2878974378108978,
"learning_rate": 4.89889085338349e-05,
"loss": 0.0366,
"step": 13810
},
{
"epoch": 13.250239693192713,
"grad_norm": 0.1915551722049713,
"learning_rate": 4.892532737986387e-05,
"loss": 0.0326,
"step": 13820
},
{
"epoch": 13.259827420901246,
"grad_norm": 0.29005202651023865,
"learning_rate": 4.886174796442193e-05,
"loss": 0.0332,
"step": 13830
},
{
"epoch": 13.269415148609779,
"grad_norm": 0.335665225982666,
"learning_rate": 4.879817039036336e-05,
"loss": 0.0254,
"step": 13840
},
{
"epoch": 13.279002876318312,
"grad_norm": 0.1871231645345688,
"learning_rate": 4.873459476053946e-05,
"loss": 0.0288,
"step": 13850
},
{
"epoch": 13.288590604026846,
"grad_norm": 0.26077544689178467,
"learning_rate": 4.867102117779834e-05,
"loss": 0.031,
"step": 13860
},
{
"epoch": 13.298178331735379,
"grad_norm": 0.46799513697624207,
"learning_rate": 4.8607449744984836e-05,
"loss": 0.0292,
"step": 13870
},
{
"epoch": 13.307766059443912,
"grad_norm": 0.24073362350463867,
"learning_rate": 4.8543880564940327e-05,
"loss": 0.0268,
"step": 13880
},
{
"epoch": 13.317353787152445,
"grad_norm": 0.22020606696605682,
"learning_rate": 4.848031374050251e-05,
"loss": 0.0339,
"step": 13890
},
{
"epoch": 13.326941514860978,
"grad_norm": 0.20859257876873016,
"learning_rate": 4.8416749374505285e-05,
"loss": 0.0319,
"step": 13900
},
{
"epoch": 13.336529242569512,
"grad_norm": 4.301571846008301,
"learning_rate": 4.835318756977856e-05,
"loss": 0.0382,
"step": 13910
},
{
"epoch": 13.346116970278045,
"grad_norm": 0.33860668540000916,
"learning_rate": 4.828962842914812e-05,
"loss": 0.0334,
"step": 13920
},
{
"epoch": 13.355704697986576,
"grad_norm": 0.24827070534229279,
"learning_rate": 4.8226072055435425e-05,
"loss": 0.0267,
"step": 13930
},
{
"epoch": 13.36529242569511,
"grad_norm": 0.22739817202091217,
"learning_rate": 4.816251855145748e-05,
"loss": 0.0308,
"step": 13940
},
{
"epoch": 13.374880153403643,
"grad_norm": 0.33846351504325867,
"learning_rate": 4.809896802002662e-05,
"loss": 0.0337,
"step": 13950
},
{
"epoch": 13.384467881112176,
"grad_norm": 0.2737593352794647,
"learning_rate": 4.8035420563950395e-05,
"loss": 0.0358,
"step": 13960
},
{
"epoch": 13.39405560882071,
"grad_norm": 0.3176287114620209,
"learning_rate": 4.797187628603136e-05,
"loss": 0.0273,
"step": 13970
},
{
"epoch": 13.403643336529242,
"grad_norm": 0.2898380756378174,
"learning_rate": 4.790833528906696e-05,
"loss": 0.0324,
"step": 13980
},
{
"epoch": 13.413231064237776,
"grad_norm": 0.48169559240341187,
"learning_rate": 4.784479767584929e-05,
"loss": 0.0269,
"step": 13990
},
{
"epoch": 13.422818791946309,
"grad_norm": 0.23410825431346893,
"learning_rate": 4.778126354916498e-05,
"loss": 0.0307,
"step": 14000
},
{
"epoch": 13.432406519654842,
"grad_norm": 0.39884692430496216,
"learning_rate": 4.771773301179506e-05,
"loss": 0.0324,
"step": 14010
},
{
"epoch": 13.441994247363375,
"grad_norm": 0.26422742009162903,
"learning_rate": 4.765420616651468e-05,
"loss": 0.0318,
"step": 14020
},
{
"epoch": 13.451581975071909,
"grad_norm": 0.261283278465271,
"learning_rate": 4.7590683116093135e-05,
"loss": 0.0312,
"step": 14030
},
{
"epoch": 13.461169702780442,
"grad_norm": 0.28744202852249146,
"learning_rate": 4.752716396329346e-05,
"loss": 0.0349,
"step": 14040
},
{
"epoch": 13.470757430488973,
"grad_norm": 0.2296159714460373,
"learning_rate": 4.746364881087244e-05,
"loss": 0.0329,
"step": 14050
},
{
"epoch": 13.480345158197506,
"grad_norm": 0.2238318920135498,
"learning_rate": 4.7400137761580376e-05,
"loss": 0.0287,
"step": 14060
},
{
"epoch": 13.48993288590604,
"grad_norm": 0.3209201395511627,
"learning_rate": 4.733663091816095e-05,
"loss": 0.0312,
"step": 14070
},
{
"epoch": 13.499520613614573,
"grad_norm": 4.273186206817627,
"learning_rate": 4.7273128383351015e-05,
"loss": 0.0292,
"step": 14080
},
{
"epoch": 13.509108341323106,
"grad_norm": 0.2698652446269989,
"learning_rate": 4.720963025988047e-05,
"loss": 0.0319,
"step": 14090
},
{
"epoch": 13.51869606903164,
"grad_norm": 0.28722748160362244,
"learning_rate": 4.714613665047207e-05,
"loss": 0.0285,
"step": 14100
},
{
"epoch": 13.528283796740173,
"grad_norm": 0.2316875010728836,
"learning_rate": 4.708264765784129e-05,
"loss": 0.0368,
"step": 14110
},
{
"epoch": 13.537871524448706,
"grad_norm": 0.5195225477218628,
"learning_rate": 4.701916338469608e-05,
"loss": 0.031,
"step": 14120
},
{
"epoch": 13.547459252157239,
"grad_norm": 0.40332475304603577,
"learning_rate": 4.6955683933736814e-05,
"loss": 0.032,
"step": 14130
},
{
"epoch": 13.557046979865772,
"grad_norm": 0.2699570059776306,
"learning_rate": 4.689220940765605e-05,
"loss": 0.0334,
"step": 14140
},
{
"epoch": 13.566634707574305,
"grad_norm": 0.26050880551338196,
"learning_rate": 4.682873990913835e-05,
"loss": 0.0333,
"step": 14150
},
{
"epoch": 13.576222435282839,
"grad_norm": 0.2826980650424957,
"learning_rate": 4.676527554086018e-05,
"loss": 0.0282,
"step": 14160
},
{
"epoch": 13.585810162991372,
"grad_norm": 0.17002440989017487,
"learning_rate": 4.6701816405489686e-05,
"loss": 0.0325,
"step": 14170
},
{
"epoch": 13.595397890699903,
"grad_norm": 0.33742156624794006,
"learning_rate": 4.6638362605686555e-05,
"loss": 0.0283,
"step": 14180
},
{
"epoch": 13.604985618408437,
"grad_norm": 0.29989632964134216,
"learning_rate": 4.657491424410185e-05,
"loss": 0.0327,
"step": 14190
},
{
"epoch": 13.61457334611697,
"grad_norm": 0.2583453357219696,
"learning_rate": 4.6511471423377815e-05,
"loss": 0.0285,
"step": 14200
},
{
"epoch": 13.624161073825503,
"grad_norm": 0.2405027151107788,
"learning_rate": 4.6448034246147754e-05,
"loss": 0.0262,
"step": 14210
},
{
"epoch": 13.633748801534036,
"grad_norm": 0.3429577052593231,
"learning_rate": 4.638460281503582e-05,
"loss": 0.0429,
"step": 14220
},
{
"epoch": 13.64333652924257,
"grad_norm": 0.30057376623153687,
"learning_rate": 4.6321177232656894e-05,
"loss": 0.0255,
"step": 14230
},
{
"epoch": 13.652924256951103,
"grad_norm": 0.25279220938682556,
"learning_rate": 4.6257757601616364e-05,
"loss": 0.0333,
"step": 14240
},
{
"epoch": 13.662511984659636,
"grad_norm": 1.2111369371414185,
"learning_rate": 4.6194344024510036e-05,
"loss": 0.0325,
"step": 14250
},
{
"epoch": 13.67209971236817,
"grad_norm": 1.4284824132919312,
"learning_rate": 4.613093660392386e-05,
"loss": 0.0368,
"step": 14260
},
{
"epoch": 13.681687440076702,
"grad_norm": 1.6276288032531738,
"learning_rate": 4.6067535442433885e-05,
"loss": 0.0343,
"step": 14270
},
{
"epoch": 13.691275167785236,
"grad_norm": 1.3329591751098633,
"learning_rate": 4.6004140642606e-05,
"loss": 0.0322,
"step": 14280
},
{
"epoch": 13.700862895493769,
"grad_norm": 0.2651832103729248,
"learning_rate": 4.5940752306995824e-05,
"loss": 0.0337,
"step": 14290
},
{
"epoch": 13.7104506232023,
"grad_norm": 0.1592620313167572,
"learning_rate": 4.58773705381485e-05,
"loss": 0.0309,
"step": 14300
},
{
"epoch": 13.720038350910833,
"grad_norm": 0.7592516541481018,
"learning_rate": 4.581399543859855e-05,
"loss": 0.0355,
"step": 14310
},
{
"epoch": 13.729626078619367,
"grad_norm": 0.27996954321861267,
"learning_rate": 4.5750627110869724e-05,
"loss": 0.0299,
"step": 14320
},
{
"epoch": 13.7392138063279,
"grad_norm": 0.19375735521316528,
"learning_rate": 4.5687265657474797e-05,
"loss": 0.0354,
"step": 14330
},
{
"epoch": 13.748801534036433,
"grad_norm": 0.263683944940567,
"learning_rate": 4.562391118091544e-05,
"loss": 0.0342,
"step": 14340
},
{
"epoch": 13.758389261744966,
"grad_norm": 0.4312153160572052,
"learning_rate": 4.556056378368203e-05,
"loss": 0.0349,
"step": 14350
},
{
"epoch": 13.7679769894535,
"grad_norm": 0.3268071413040161,
"learning_rate": 4.549722356825349e-05,
"loss": 0.0297,
"step": 14360
},
{
"epoch": 13.777564717162033,
"grad_norm": 0.43241703510284424,
"learning_rate": 4.543389063709712e-05,
"loss": 0.0333,
"step": 14370
},
{
"epoch": 13.787152444870566,
"grad_norm": 0.2650851905345917,
"learning_rate": 4.537056509266845e-05,
"loss": 0.0338,
"step": 14380
},
{
"epoch": 13.7967401725791,
"grad_norm": 0.34464454650878906,
"learning_rate": 4.530724703741104e-05,
"loss": 0.0334,
"step": 14390
},
{
"epoch": 13.806327900287632,
"grad_norm": 0.2718554735183716,
"learning_rate": 4.524393657375635e-05,
"loss": 0.0295,
"step": 14400
},
{
"epoch": 13.815915627996166,
"grad_norm": 0.27128416299819946,
"learning_rate": 4.5180633804123555e-05,
"loss": 0.0367,
"step": 14410
},
{
"epoch": 13.825503355704697,
"grad_norm": 0.190488800406456,
"learning_rate": 4.511733883091939e-05,
"loss": 0.0273,
"step": 14420
},
{
"epoch": 13.83509108341323,
"grad_norm": 0.45956146717071533,
"learning_rate": 4.5054051756537965e-05,
"loss": 0.0333,
"step": 14430
},
{
"epoch": 13.844678811121764,
"grad_norm": 0.2585156559944153,
"learning_rate": 4.499077268336063e-05,
"loss": 0.0277,
"step": 14440
},
{
"epoch": 13.854266538830297,
"grad_norm": 0.209930419921875,
"learning_rate": 4.492750171375576e-05,
"loss": 0.0317,
"step": 14450
},
{
"epoch": 13.86385426653883,
"grad_norm": 0.25458142161369324,
"learning_rate": 4.486423895007866e-05,
"loss": 0.0402,
"step": 14460
},
{
"epoch": 13.873441994247363,
"grad_norm": 0.2012961506843567,
"learning_rate": 4.480098449467132e-05,
"loss": 0.031,
"step": 14470
},
{
"epoch": 13.883029721955896,
"grad_norm": 0.2313721477985382,
"learning_rate": 4.473773844986229e-05,
"loss": 0.0278,
"step": 14480
},
{
"epoch": 13.89261744966443,
"grad_norm": 0.3655869960784912,
"learning_rate": 4.467450091796658e-05,
"loss": 0.0356,
"step": 14490
},
{
"epoch": 13.902205177372963,
"grad_norm": 0.2222936451435089,
"learning_rate": 4.461127200128536e-05,
"loss": 0.0335,
"step": 14500
},
{
"epoch": 13.911792905081496,
"grad_norm": 0.2714097797870636,
"learning_rate": 4.4548051802105914e-05,
"loss": 0.0289,
"step": 14510
},
{
"epoch": 13.92138063279003,
"grad_norm": 0.28923454880714417,
"learning_rate": 4.448484042270134e-05,
"loss": 0.0321,
"step": 14520
},
{
"epoch": 13.930968360498563,
"grad_norm": 0.3318518400192261,
"learning_rate": 4.4421637965330554e-05,
"loss": 0.0302,
"step": 14530
},
{
"epoch": 13.940556088207096,
"grad_norm": 0.21569694578647614,
"learning_rate": 4.4358444532237996e-05,
"loss": 0.0347,
"step": 14540
},
{
"epoch": 13.950143815915627,
"grad_norm": 0.24663789570331573,
"learning_rate": 4.429526022565352e-05,
"loss": 0.0293,
"step": 14550
},
{
"epoch": 13.95973154362416,
"grad_norm": 0.17170065641403198,
"learning_rate": 4.423208514779222e-05,
"loss": 0.0383,
"step": 14560
},
{
"epoch": 13.969319271332694,
"grad_norm": 0.2217435985803604,
"learning_rate": 4.4168919400854245e-05,
"loss": 0.0357,
"step": 14570
},
{
"epoch": 13.978906999041227,
"grad_norm": 0.18699301779270172,
"learning_rate": 4.4105763087024666e-05,
"loss": 0.0261,
"step": 14580
},
{
"epoch": 13.98849472674976,
"grad_norm": 0.35671454668045044,
"learning_rate": 4.404261630847329e-05,
"loss": 0.0356,
"step": 14590
},
{
"epoch": 13.998082454458293,
"grad_norm": 0.33537557721138,
"learning_rate": 4.3979479167354477e-05,
"loss": 0.0317,
"step": 14600
},
{
"epoch": 14.007670182166827,
"grad_norm": 0.25765296816825867,
"learning_rate": 4.391635176580702e-05,
"loss": 0.0314,
"step": 14610
},
{
"epoch": 14.01725790987536,
"grad_norm": 0.18932734429836273,
"learning_rate": 4.385323420595395e-05,
"loss": 0.036,
"step": 14620
},
{
"epoch": 14.026845637583893,
"grad_norm": 0.2255479097366333,
"learning_rate": 4.3790126589902344e-05,
"loss": 0.0329,
"step": 14630
},
{
"epoch": 14.036433365292426,
"grad_norm": 0.19790147244930267,
"learning_rate": 4.372702901974331e-05,
"loss": 0.032,
"step": 14640
},
{
"epoch": 14.04602109300096,
"grad_norm": 0.16959276795387268,
"learning_rate": 4.366394159755155e-05,
"loss": 0.0328,
"step": 14650
},
{
"epoch": 14.055608820709493,
"grad_norm": 0.36921027302742004,
"learning_rate": 4.3600864425385434e-05,
"loss": 0.0313,
"step": 14660
},
{
"epoch": 14.065196548418024,
"grad_norm": 0.1770399957895279,
"learning_rate": 4.3537797605286736e-05,
"loss": 0.0265,
"step": 14670
},
{
"epoch": 14.074784276126557,
"grad_norm": 0.28713101148605347,
"learning_rate": 4.347474123928048e-05,
"loss": 0.0282,
"step": 14680
},
{
"epoch": 14.08437200383509,
"grad_norm": 0.1728815734386444,
"learning_rate": 4.3411695429374793e-05,
"loss": 0.03,
"step": 14690
},
{
"epoch": 14.093959731543624,
"grad_norm": 0.2004602998495102,
"learning_rate": 4.3348660277560694e-05,
"loss": 0.0301,
"step": 14700
},
{
"epoch": 14.103547459252157,
"grad_norm": 0.24591505527496338,
"learning_rate": 4.328563588581199e-05,
"loss": 0.0384,
"step": 14710
},
{
"epoch": 14.11313518696069,
"grad_norm": 0.3375163674354553,
"learning_rate": 4.322262235608508e-05,
"loss": 0.0339,
"step": 14720
},
{
"epoch": 14.122722914669223,
"grad_norm": 0.22719378769397736,
"learning_rate": 4.315961979031875e-05,
"loss": 0.0323,
"step": 14730
},
{
"epoch": 14.132310642377757,
"grad_norm": 0.34426233172416687,
"learning_rate": 4.30966282904341e-05,
"loss": 0.0335,
"step": 14740
},
{
"epoch": 14.14189837008629,
"grad_norm": 0.30899283289909363,
"learning_rate": 4.3033647958334306e-05,
"loss": 0.0334,
"step": 14750
},
{
"epoch": 14.151486097794823,
"grad_norm": 0.3567700684070587,
"learning_rate": 4.2970678895904476e-05,
"loss": 0.0356,
"step": 14760
},
{
"epoch": 14.161073825503356,
"grad_norm": 0.22836564481258392,
"learning_rate": 4.29077212050115e-05,
"loss": 0.0321,
"step": 14770
},
{
"epoch": 14.17066155321189,
"grad_norm": 0.17751692235469818,
"learning_rate": 4.284477498750383e-05,
"loss": 0.0302,
"step": 14780
},
{
"epoch": 14.180249280920421,
"grad_norm": 0.3431791067123413,
"learning_rate": 4.278184034521144e-05,
"loss": 0.0332,
"step": 14790
},
{
"epoch": 14.189837008628954,
"grad_norm": 0.26100659370422363,
"learning_rate": 4.27189173799455e-05,
"loss": 0.0315,
"step": 14800
},
{
"epoch": 14.199424736337487,
"grad_norm": 0.2879122197628021,
"learning_rate": 4.265600619349832e-05,
"loss": 0.0277,
"step": 14810
},
{
"epoch": 14.20901246404602,
"grad_norm": 0.26366403698921204,
"learning_rate": 4.2593106887643156e-05,
"loss": 0.0332,
"step": 14820
},
{
"epoch": 14.218600191754554,
"grad_norm": 0.25366711616516113,
"learning_rate": 4.2530219564134046e-05,
"loss": 0.0309,
"step": 14830
},
{
"epoch": 14.228187919463087,
"grad_norm": 0.259772926568985,
"learning_rate": 4.246734432470563e-05,
"loss": 0.0302,
"step": 14840
},
{
"epoch": 14.23777564717162,
"grad_norm": 0.32079434394836426,
"learning_rate": 4.240448127107301e-05,
"loss": 0.0293,
"step": 14850
},
{
"epoch": 14.247363374880154,
"grad_norm": 0.25380274653434753,
"learning_rate": 4.234163050493158e-05,
"loss": 0.0287,
"step": 14860
},
{
"epoch": 14.256951102588687,
"grad_norm": 0.26985570788383484,
"learning_rate": 4.2278792127956846e-05,
"loss": 0.0307,
"step": 14870
},
{
"epoch": 14.26653883029722,
"grad_norm": 0.2960470914840698,
"learning_rate": 4.221596624180426e-05,
"loss": 0.0313,
"step": 14880
},
{
"epoch": 14.276126558005753,
"grad_norm": 0.41474372148513794,
"learning_rate": 4.21531529481091e-05,
"loss": 0.0287,
"step": 14890
},
{
"epoch": 14.285714285714286,
"grad_norm": 0.2426476627588272,
"learning_rate": 4.2090352348486256e-05,
"loss": 0.0272,
"step": 14900
},
{
"epoch": 14.29530201342282,
"grad_norm": 0.2811989486217499,
"learning_rate": 4.202756454453007e-05,
"loss": 0.0328,
"step": 14910
},
{
"epoch": 14.304889741131351,
"grad_norm": 0.20871858298778534,
"learning_rate": 4.196478963781421e-05,
"loss": 0.028,
"step": 14920
},
{
"epoch": 14.314477468839884,
"grad_norm": 0.1654272824525833,
"learning_rate": 4.190202772989144e-05,
"loss": 0.0301,
"step": 14930
},
{
"epoch": 14.324065196548418,
"grad_norm": 0.6324641108512878,
"learning_rate": 4.183927892229354e-05,
"loss": 0.0284,
"step": 14940
},
{
"epoch": 14.33365292425695,
"grad_norm": 1.3468248844146729,
"learning_rate": 4.177654331653108e-05,
"loss": 0.0331,
"step": 14950
},
{
"epoch": 14.343240651965484,
"grad_norm": 0.16660985350608826,
"learning_rate": 4.171382101409327e-05,
"loss": 0.0262,
"step": 14960
},
{
"epoch": 14.352828379674017,
"grad_norm": 0.32994958758354187,
"learning_rate": 4.165111211644779e-05,
"loss": 0.0259,
"step": 14970
},
{
"epoch": 14.36241610738255,
"grad_norm": 0.20298174023628235,
"learning_rate": 4.158841672504066e-05,
"loss": 0.0298,
"step": 14980
},
{
"epoch": 14.372003835091084,
"grad_norm": 0.23911802470684052,
"learning_rate": 4.1525734941296026e-05,
"loss": 0.0315,
"step": 14990
},
{
"epoch": 14.381591562799617,
"grad_norm": 0.22921425104141235,
"learning_rate": 4.146306686661602e-05,
"loss": 0.0336,
"step": 15000
},
{
"epoch": 14.39117929050815,
"grad_norm": 0.24981558322906494,
"learning_rate": 4.140041260238062e-05,
"loss": 0.0326,
"step": 15010
},
{
"epoch": 14.400767018216683,
"grad_norm": 0.24186521768569946,
"learning_rate": 4.1337772249947435e-05,
"loss": 0.0264,
"step": 15020
},
{
"epoch": 14.410354745925215,
"grad_norm": 0.26285290718078613,
"learning_rate": 4.1275145910651603e-05,
"loss": 0.0304,
"step": 15030
},
{
"epoch": 14.419942473633748,
"grad_norm": 0.2739505469799042,
"learning_rate": 4.121253368580555e-05,
"loss": 0.0263,
"step": 15040
},
{
"epoch": 14.429530201342281,
"grad_norm": 0.6612746119499207,
"learning_rate": 4.1149935676698904e-05,
"loss": 0.0395,
"step": 15050
},
{
"epoch": 14.439117929050814,
"grad_norm": 0.2866060435771942,
"learning_rate": 4.108735198459827e-05,
"loss": 0.025,
"step": 15060
},
{
"epoch": 14.448705656759348,
"grad_norm": 0.27634814381599426,
"learning_rate": 4.102478271074712e-05,
"loss": 0.0278,
"step": 15070
},
{
"epoch": 14.458293384467881,
"grad_norm": 0.25131815671920776,
"learning_rate": 4.0962227956365574e-05,
"loss": 0.0271,
"step": 15080
},
{
"epoch": 14.467881112176414,
"grad_norm": 0.3638950288295746,
"learning_rate": 4.089968782265025e-05,
"loss": 0.0297,
"step": 15090
},
{
"epoch": 14.477468839884947,
"grad_norm": 0.2399180382490158,
"learning_rate": 4.083716241077419e-05,
"loss": 0.0284,
"step": 15100
},
{
"epoch": 14.48705656759348,
"grad_norm": 0.27603140473365784,
"learning_rate": 4.077465182188654e-05,
"loss": 0.0302,
"step": 15110
},
{
"epoch": 14.496644295302014,
"grad_norm": 0.17177820205688477,
"learning_rate": 4.07121561571125e-05,
"loss": 0.0382,
"step": 15120
},
{
"epoch": 14.506232023010547,
"grad_norm": 0.26461273431777954,
"learning_rate": 4.064967551755312e-05,
"loss": 0.0328,
"step": 15130
},
{
"epoch": 14.51581975071908,
"grad_norm": 0.31283822655677795,
"learning_rate": 4.058721000428514e-05,
"loss": 0.025,
"step": 15140
},
{
"epoch": 14.525407478427613,
"grad_norm": 0.18203134834766388,
"learning_rate": 4.052475971836083e-05,
"loss": 0.0286,
"step": 15150
},
{
"epoch": 14.534995206136145,
"grad_norm": 0.295449435710907,
"learning_rate": 4.0462324760807846e-05,
"loss": 0.033,
"step": 15160
},
{
"epoch": 14.544582933844678,
"grad_norm": 0.16782015562057495,
"learning_rate": 4.039990523262902e-05,
"loss": 0.0278,
"step": 15170
},
{
"epoch": 14.554170661553211,
"grad_norm": 0.23671366274356842,
"learning_rate": 4.033750123480224e-05,
"loss": 0.0319,
"step": 15180
},
{
"epoch": 14.563758389261745,
"grad_norm": 0.18487486243247986,
"learning_rate": 4.027511286828028e-05,
"loss": 0.0297,
"step": 15190
},
{
"epoch": 14.573346116970278,
"grad_norm": 0.19782863557338715,
"learning_rate": 4.0212740233990587e-05,
"loss": 0.0316,
"step": 15200
},
{
"epoch": 14.582933844678811,
"grad_norm": 0.30595293641090393,
"learning_rate": 4.0150383432835186e-05,
"loss": 0.0282,
"step": 15210
},
{
"epoch": 14.592521572387344,
"grad_norm": 0.2661206126213074,
"learning_rate": 4.00880425656905e-05,
"loss": 0.0286,
"step": 15220
},
{
"epoch": 14.602109300095877,
"grad_norm": 0.2635152339935303,
"learning_rate": 4.002571773340714e-05,
"loss": 0.0334,
"step": 15230
},
{
"epoch": 14.61169702780441,
"grad_norm": 0.27702832221984863,
"learning_rate": 3.996340903680979e-05,
"loss": 0.0304,
"step": 15240
},
{
"epoch": 14.621284755512944,
"grad_norm": 0.21685196459293365,
"learning_rate": 3.9901116576697083e-05,
"loss": 0.0394,
"step": 15250
},
{
"epoch": 14.630872483221477,
"grad_norm": 0.2177799493074417,
"learning_rate": 3.983884045384131e-05,
"loss": 0.0321,
"step": 15260
},
{
"epoch": 14.64046021093001,
"grad_norm": 0.21278002858161926,
"learning_rate": 3.977658076898836e-05,
"loss": 0.0329,
"step": 15270
},
{
"epoch": 14.650047938638544,
"grad_norm": 0.4188462495803833,
"learning_rate": 3.971433762285754e-05,
"loss": 0.0324,
"step": 15280
},
{
"epoch": 14.659635666347075,
"grad_norm": 0.4150042235851288,
"learning_rate": 3.965211111614139e-05,
"loss": 0.0311,
"step": 15290
},
{
"epoch": 14.669223394055608,
"grad_norm": 0.5566287040710449,
"learning_rate": 3.958990134950555e-05,
"loss": 0.028,
"step": 15300
},
{
"epoch": 14.678811121764141,
"grad_norm": 0.2592385411262512,
"learning_rate": 3.9527708423588546e-05,
"loss": 0.0354,
"step": 15310
},
{
"epoch": 14.688398849472675,
"grad_norm": 0.20564644038677216,
"learning_rate": 3.946553243900169e-05,
"loss": 0.0359,
"step": 15320
},
{
"epoch": 14.697986577181208,
"grad_norm": 0.27093440294265747,
"learning_rate": 3.9403373496328885e-05,
"loss": 0.0377,
"step": 15330
},
{
"epoch": 14.707574304889741,
"grad_norm": 0.35600170493125916,
"learning_rate": 3.934123169612645e-05,
"loss": 0.0323,
"step": 15340
},
{
"epoch": 14.717162032598274,
"grad_norm": 0.3020756244659424,
"learning_rate": 3.927910713892298e-05,
"loss": 0.0313,
"step": 15350
},
{
"epoch": 14.726749760306808,
"grad_norm": 0.26487666368484497,
"learning_rate": 3.921699992521917e-05,
"loss": 0.0322,
"step": 15360
},
{
"epoch": 14.73633748801534,
"grad_norm": 0.2509137988090515,
"learning_rate": 3.915491015548766e-05,
"loss": 0.0249,
"step": 15370
},
{
"epoch": 14.745925215723874,
"grad_norm": 0.2903117537498474,
"learning_rate": 3.9092837930172884e-05,
"loss": 0.0325,
"step": 15380
},
{
"epoch": 14.755512943432407,
"grad_norm": 2.1292974948883057,
"learning_rate": 3.903078334969087e-05,
"loss": 0.0352,
"step": 15390
},
{
"epoch": 14.765100671140939,
"grad_norm": 0.18879927694797516,
"learning_rate": 3.8968746514429134e-05,
"loss": 0.0348,
"step": 15400
},
{
"epoch": 14.774688398849472,
"grad_norm": 0.27570220828056335,
"learning_rate": 3.890672752474646e-05,
"loss": 0.0267,
"step": 15410
},
{
"epoch": 14.784276126558005,
"grad_norm": 0.28451746702194214,
"learning_rate": 3.884472648097276e-05,
"loss": 0.029,
"step": 15420
},
{
"epoch": 14.793863854266538,
"grad_norm": 0.2464732676744461,
"learning_rate": 3.878274348340892e-05,
"loss": 0.027,
"step": 15430
},
{
"epoch": 14.803451581975072,
"grad_norm": 0.1651841551065445,
"learning_rate": 3.872077863232665e-05,
"loss": 0.0275,
"step": 15440
},
{
"epoch": 14.813039309683605,
"grad_norm": 0.1864641159772873,
"learning_rate": 3.865883202796829e-05,
"loss": 0.028,
"step": 15450
},
{
"epoch": 14.822627037392138,
"grad_norm": 0.40212348103523254,
"learning_rate": 3.8596903770546636e-05,
"loss": 0.0296,
"step": 15460
},
{
"epoch": 14.832214765100671,
"grad_norm": 0.34442323446273804,
"learning_rate": 3.853499396024486e-05,
"loss": 0.0279,
"step": 15470
},
{
"epoch": 14.841802492809204,
"grad_norm": 0.21626895666122437,
"learning_rate": 3.8473102697216226e-05,
"loss": 0.0298,
"step": 15480
},
{
"epoch": 14.851390220517738,
"grad_norm": 0.22285476326942444,
"learning_rate": 3.841123008158405e-05,
"loss": 0.0265,
"step": 15490
},
{
"epoch": 14.860977948226271,
"grad_norm": 0.330901563167572,
"learning_rate": 3.8349376213441444e-05,
"loss": 0.032,
"step": 15500
},
{
"epoch": 14.870565675934804,
"grad_norm": 0.3265020251274109,
"learning_rate": 3.828754119285123e-05,
"loss": 0.0291,
"step": 15510
},
{
"epoch": 14.880153403643337,
"grad_norm": 0.2532041668891907,
"learning_rate": 3.822572511984569e-05,
"loss": 0.0267,
"step": 15520
},
{
"epoch": 14.889741131351869,
"grad_norm": 0.3086365759372711,
"learning_rate": 3.816392809442649e-05,
"loss": 0.036,
"step": 15530
},
{
"epoch": 14.899328859060402,
"grad_norm": 0.22954832017421722,
"learning_rate": 3.8102150216564484e-05,
"loss": 0.0302,
"step": 15540
},
{
"epoch": 14.908916586768935,
"grad_norm": 0.2649918496608734,
"learning_rate": 3.804039158619951e-05,
"loss": 0.037,
"step": 15550
},
{
"epoch": 14.918504314477468,
"grad_norm": 0.22433148324489594,
"learning_rate": 3.797865230324033e-05,
"loss": 0.0258,
"step": 15560
},
{
"epoch": 14.928092042186002,
"grad_norm": 0.2442513406276703,
"learning_rate": 3.791693246756436e-05,
"loss": 0.0289,
"step": 15570
},
{
"epoch": 14.937679769894535,
"grad_norm": 0.22684846818447113,
"learning_rate": 3.785523217901757e-05,
"loss": 0.032,
"step": 15580
},
{
"epoch": 14.947267497603068,
"grad_norm": 0.27900537848472595,
"learning_rate": 3.7793551537414313e-05,
"loss": 0.0284,
"step": 15590
},
{
"epoch": 14.956855225311601,
"grad_norm": 0.29420506954193115,
"learning_rate": 3.7731890642537154e-05,
"loss": 0.0278,
"step": 15600
},
{
"epoch": 14.966442953020135,
"grad_norm": 0.23950040340423584,
"learning_rate": 3.76702495941367e-05,
"loss": 0.03,
"step": 15610
},
{
"epoch": 14.976030680728668,
"grad_norm": 0.3971647322177887,
"learning_rate": 3.760862849193148e-05,
"loss": 0.0324,
"step": 15620
},
{
"epoch": 14.985618408437201,
"grad_norm": 0.18756671249866486,
"learning_rate": 3.754702743560773e-05,
"loss": 0.026,
"step": 15630
},
{
"epoch": 14.995206136145734,
"grad_norm": 0.24370504915714264,
"learning_rate": 3.748544652481927e-05,
"loss": 0.0353,
"step": 15640
},
{
"epoch": 15.004793863854266,
"grad_norm": 0.26173216104507446,
"learning_rate": 3.742388585918733e-05,
"loss": 0.0356,
"step": 15650
},
{
"epoch": 15.014381591562799,
"grad_norm": 0.22543974220752716,
"learning_rate": 3.736234553830038e-05,
"loss": 0.0314,
"step": 15660
},
{
"epoch": 15.023969319271332,
"grad_norm": 0.1632285714149475,
"learning_rate": 3.7300825661714e-05,
"loss": 0.0267,
"step": 15670
},
{
"epoch": 15.033557046979865,
"grad_norm": 0.2474079430103302,
"learning_rate": 3.723932632895067e-05,
"loss": 0.0289,
"step": 15680
},
{
"epoch": 15.043144774688399,
"grad_norm": 0.21004092693328857,
"learning_rate": 3.717784763949964e-05,
"loss": 0.0272,
"step": 15690
},
{
"epoch": 15.052732502396932,
"grad_norm": 0.20469725131988525,
"learning_rate": 3.7116389692816754e-05,
"loss": 0.0282,
"step": 15700
},
{
"epoch": 15.062320230105465,
"grad_norm": 0.4098300337791443,
"learning_rate": 3.7054952588324364e-05,
"loss": 0.0318,
"step": 15710
},
{
"epoch": 15.071907957813998,
"grad_norm": 0.1645730435848236,
"learning_rate": 3.699353642541103e-05,
"loss": 0.0307,
"step": 15720
},
{
"epoch": 15.081495685522532,
"grad_norm": 0.16053102910518646,
"learning_rate": 3.693214130343148e-05,
"loss": 0.0263,
"step": 15730
},
{
"epoch": 15.091083413231065,
"grad_norm": 0.2607749104499817,
"learning_rate": 3.687076732170635e-05,
"loss": 0.0279,
"step": 15740
},
{
"epoch": 15.100671140939598,
"grad_norm": 0.20249375700950623,
"learning_rate": 3.680941457952214e-05,
"loss": 0.031,
"step": 15750
},
{
"epoch": 15.110258868648131,
"grad_norm": 0.17298898100852966,
"learning_rate": 3.6748083176130955e-05,
"loss": 0.0304,
"step": 15760
},
{
"epoch": 15.119846596356663,
"grad_norm": 0.3816901743412018,
"learning_rate": 3.6686773210750385e-05,
"loss": 0.0267,
"step": 15770
},
{
"epoch": 15.129434324065196,
"grad_norm": 0.26607292890548706,
"learning_rate": 3.6625484782563345e-05,
"loss": 0.0285,
"step": 15780
},
{
"epoch": 15.139022051773729,
"grad_norm": 0.24211320281028748,
"learning_rate": 3.656421799071791e-05,
"loss": 0.0325,
"step": 15790
},
{
"epoch": 15.148609779482262,
"grad_norm": 0.3071950376033783,
"learning_rate": 3.650297293432713e-05,
"loss": 0.0344,
"step": 15800
},
{
"epoch": 15.158197507190796,
"grad_norm": 0.3314298689365387,
"learning_rate": 3.6441749712468944e-05,
"loss": 0.0297,
"step": 15810
},
{
"epoch": 15.167785234899329,
"grad_norm": 0.2220297008752823,
"learning_rate": 3.6380548424185894e-05,
"loss": 0.0328,
"step": 15820
},
{
"epoch": 15.177372962607862,
"grad_norm": 0.15199415385723114,
"learning_rate": 3.6319369168485104e-05,
"loss": 0.025,
"step": 15830
},
{
"epoch": 15.186960690316395,
"grad_norm": 0.2900523841381073,
"learning_rate": 3.625821204433803e-05,
"loss": 0.0261,
"step": 15840
},
{
"epoch": 15.196548418024928,
"grad_norm": 0.17855972051620483,
"learning_rate": 3.61970771506803e-05,
"loss": 0.034,
"step": 15850
},
{
"epoch": 15.206136145733462,
"grad_norm": 0.35416078567504883,
"learning_rate": 3.613596458641167e-05,
"loss": 0.0362,
"step": 15860
},
{
"epoch": 15.215723873441995,
"grad_norm": 0.21492497622966766,
"learning_rate": 3.6074874450395666e-05,
"loss": 0.0259,
"step": 15870
},
{
"epoch": 15.225311601150528,
"grad_norm": 0.2749202847480774,
"learning_rate": 3.6013806841459586e-05,
"loss": 0.0257,
"step": 15880
},
{
"epoch": 15.234899328859061,
"grad_norm": 0.16736426949501038,
"learning_rate": 3.595276185839426e-05,
"loss": 0.0328,
"step": 15890
},
{
"epoch": 15.244487056567593,
"grad_norm": 0.2754712998867035,
"learning_rate": 3.5891739599953945e-05,
"loss": 0.0276,
"step": 15900
},
{
"epoch": 15.254074784276126,
"grad_norm": 0.29541146755218506,
"learning_rate": 3.583074016485611e-05,
"loss": 0.031,
"step": 15910
},
{
"epoch": 15.26366251198466,
"grad_norm": 0.26210564374923706,
"learning_rate": 3.576976365178132e-05,
"loss": 0.0325,
"step": 15920
},
{
"epoch": 15.273250239693192,
"grad_norm": 0.2595176100730896,
"learning_rate": 3.5708810159373044e-05,
"loss": 0.0375,
"step": 15930
},
{
"epoch": 15.282837967401726,
"grad_norm": 0.21411257982254028,
"learning_rate": 3.564787978623753e-05,
"loss": 0.0277,
"step": 15940
},
{
"epoch": 15.292425695110259,
"grad_norm": 0.2823658585548401,
"learning_rate": 3.5586972630943594e-05,
"loss": 0.0259,
"step": 15950
},
{
"epoch": 15.302013422818792,
"grad_norm": 0.2719429135322571,
"learning_rate": 3.552608879202252e-05,
"loss": 0.0295,
"step": 15960
},
{
"epoch": 15.311601150527325,
"grad_norm": 0.2882955074310303,
"learning_rate": 3.5465228367967854e-05,
"loss": 0.0297,
"step": 15970
},
{
"epoch": 15.321188878235859,
"grad_norm": 0.22343681752681732,
"learning_rate": 3.540439145723529e-05,
"loss": 0.0241,
"step": 15980
},
{
"epoch": 15.330776605944392,
"grad_norm": 0.18314386904239655,
"learning_rate": 3.534357815824243e-05,
"loss": 0.0345,
"step": 15990
},
{
"epoch": 15.340364333652925,
"grad_norm": 0.22451230883598328,
"learning_rate": 3.528278856936874e-05,
"loss": 0.0259,
"step": 16000
},
{
"epoch": 15.349952061361458,
"grad_norm": 0.23394083976745605,
"learning_rate": 3.52220227889553e-05,
"loss": 0.0235,
"step": 16010
},
{
"epoch": 15.35953978906999,
"grad_norm": 0.21090802550315857,
"learning_rate": 3.516128091530469e-05,
"loss": 0.0259,
"step": 16020
},
{
"epoch": 15.369127516778523,
"grad_norm": 0.42782530188560486,
"learning_rate": 3.5100563046680764e-05,
"loss": 0.0297,
"step": 16030
},
{
"epoch": 15.378715244487056,
"grad_norm": 0.2408047765493393,
"learning_rate": 3.503986928130862e-05,
"loss": 0.0287,
"step": 16040
},
{
"epoch": 15.38830297219559,
"grad_norm": 0.24126370251178741,
"learning_rate": 3.49791997173743e-05,
"loss": 0.0295,
"step": 16050
},
{
"epoch": 15.397890699904123,
"grad_norm": 0.28855326771736145,
"learning_rate": 3.4918554453024746e-05,
"loss": 0.0272,
"step": 16060
},
{
"epoch": 15.407478427612656,
"grad_norm": 0.2622244358062744,
"learning_rate": 3.485793358636753e-05,
"loss": 0.0264,
"step": 16070
},
{
"epoch": 15.417066155321189,
"grad_norm": 0.433159202337265,
"learning_rate": 3.479733721547082e-05,
"loss": 0.0331,
"step": 16080
},
{
"epoch": 15.426653883029722,
"grad_norm": 0.35671567916870117,
"learning_rate": 3.47367654383631e-05,
"loss": 0.0309,
"step": 16090
},
{
"epoch": 15.436241610738255,
"grad_norm": 0.2572173476219177,
"learning_rate": 3.467621835303309e-05,
"loss": 0.0299,
"step": 16100
},
{
"epoch": 15.445829338446789,
"grad_norm": 0.3275107145309448,
"learning_rate": 3.461569605742958e-05,
"loss": 0.0258,
"step": 16110
},
{
"epoch": 15.455417066155322,
"grad_norm": 0.38686898350715637,
"learning_rate": 3.455519864946125e-05,
"loss": 0.0281,
"step": 16120
},
{
"epoch": 15.465004793863855,
"grad_norm": 0.5980708003044128,
"learning_rate": 3.449472622699651e-05,
"loss": 0.0266,
"step": 16130
},
{
"epoch": 15.474592521572387,
"grad_norm": 0.1607155054807663,
"learning_rate": 3.443427888786335e-05,
"loss": 0.0286,
"step": 16140
},
{
"epoch": 15.48418024928092,
"grad_norm": 0.19821766018867493,
"learning_rate": 3.437385672984918e-05,
"loss": 0.0299,
"step": 16150
},
{
"epoch": 15.493767976989453,
"grad_norm": 0.27373266220092773,
"learning_rate": 3.431345985070067e-05,
"loss": 0.0387,
"step": 16160
},
{
"epoch": 15.503355704697986,
"grad_norm": 0.24755899608135223,
"learning_rate": 3.425308834812364e-05,
"loss": 0.0268,
"step": 16170
},
{
"epoch": 15.51294343240652,
"grad_norm": 0.34930139780044556,
"learning_rate": 3.4192742319782805e-05,
"loss": 0.0358,
"step": 16180
},
{
"epoch": 15.522531160115053,
"grad_norm": 0.21849294006824493,
"learning_rate": 3.413242186330168e-05,
"loss": 0.0327,
"step": 16190
},
{
"epoch": 15.532118887823586,
"grad_norm": 0.2413625419139862,
"learning_rate": 3.407212707626243e-05,
"loss": 0.0283,
"step": 16200
},
{
"epoch": 15.541706615532119,
"grad_norm": 0.27283817529678345,
"learning_rate": 3.401185805620568e-05,
"loss": 0.0295,
"step": 16210
},
{
"epoch": 15.551294343240652,
"grad_norm": 0.3242924213409424,
"learning_rate": 3.395161490063037e-05,
"loss": 0.0328,
"step": 16220
},
{
"epoch": 15.560882070949186,
"grad_norm": 0.2872219383716583,
"learning_rate": 3.38913977069936e-05,
"loss": 0.0273,
"step": 16230
},
{
"epoch": 15.570469798657719,
"grad_norm": 0.14021213352680206,
"learning_rate": 3.3831206572710464e-05,
"loss": 0.0271,
"step": 16240
},
{
"epoch": 15.580057526366252,
"grad_norm": 0.19898459315299988,
"learning_rate": 3.377104159515393e-05,
"loss": 0.0299,
"step": 16250
},
{
"epoch": 15.589645254074785,
"grad_norm": 0.2079470306634903,
"learning_rate": 3.371090287165462e-05,
"loss": 0.031,
"step": 16260
},
{
"epoch": 15.599232981783317,
"grad_norm": 0.2817933261394501,
"learning_rate": 3.3650790499500675e-05,
"loss": 0.0273,
"step": 16270
},
{
"epoch": 15.60882070949185,
"grad_norm": 0.20972701907157898,
"learning_rate": 3.3590704575937655e-05,
"loss": 0.0279,
"step": 16280
},
{
"epoch": 15.618408437200383,
"grad_norm": 0.21050924062728882,
"learning_rate": 3.3530645198168295e-05,
"loss": 0.0327,
"step": 16290
},
{
"epoch": 15.627996164908916,
"grad_norm": 0.33600106835365295,
"learning_rate": 3.3470612463352376e-05,
"loss": 0.0314,
"step": 16300
},
{
"epoch": 15.63758389261745,
"grad_norm": 0.33707502484321594,
"learning_rate": 3.341060646860659e-05,
"loss": 0.029,
"step": 16310
},
{
"epoch": 15.647171620325983,
"grad_norm": 0.2761129140853882,
"learning_rate": 3.335062731100441e-05,
"loss": 0.0271,
"step": 16320
},
{
"epoch": 15.656759348034516,
"grad_norm": 0.2787131369113922,
"learning_rate": 3.3290675087575856e-05,
"loss": 0.0252,
"step": 16330
},
{
"epoch": 15.66634707574305,
"grad_norm": 0.23235364258289337,
"learning_rate": 3.3230749895307375e-05,
"loss": 0.0357,
"step": 16340
},
{
"epoch": 15.675934803451582,
"grad_norm": 0.22941578924655914,
"learning_rate": 3.317085183114168e-05,
"loss": 0.0223,
"step": 16350
},
{
"epoch": 15.685522531160116,
"grad_norm": 0.2411498874425888,
"learning_rate": 3.311098099197761e-05,
"loss": 0.0271,
"step": 16360
},
{
"epoch": 15.695110258868649,
"grad_norm": 0.35220983624458313,
"learning_rate": 3.3051137474669966e-05,
"loss": 0.0262,
"step": 16370
},
{
"epoch": 15.70469798657718,
"grad_norm": 0.28711986541748047,
"learning_rate": 3.299132137602934e-05,
"loss": 0.0342,
"step": 16380
},
{
"epoch": 15.714285714285714,
"grad_norm": 0.1615312546491623,
"learning_rate": 3.293153279282199e-05,
"loss": 0.0334,
"step": 16390
},
{
"epoch": 15.723873441994247,
"grad_norm": 0.17363496124744415,
"learning_rate": 3.287177182176961e-05,
"loss": 0.0279,
"step": 16400
},
{
"epoch": 15.73346116970278,
"grad_norm": 0.3049766421318054,
"learning_rate": 3.2812038559549275e-05,
"loss": 0.032,
"step": 16410
},
{
"epoch": 15.743048897411313,
"grad_norm": 0.3206036686897278,
"learning_rate": 3.275233310279321e-05,
"loss": 0.0281,
"step": 16420
},
{
"epoch": 15.752636625119846,
"grad_norm": 0.20691925287246704,
"learning_rate": 3.2692655548088704e-05,
"loss": 0.026,
"step": 16430
},
{
"epoch": 15.76222435282838,
"grad_norm": 0.2701127827167511,
"learning_rate": 3.263300599197781e-05,
"loss": 0.0247,
"step": 16440
},
{
"epoch": 15.771812080536913,
"grad_norm": 0.18183131515979767,
"learning_rate": 3.2573384530957384e-05,
"loss": 0.0249,
"step": 16450
},
{
"epoch": 15.781399808245446,
"grad_norm": 0.260061115026474,
"learning_rate": 3.251379126147877e-05,
"loss": 0.0249,
"step": 16460
},
{
"epoch": 15.79098753595398,
"grad_norm": 0.2887513041496277,
"learning_rate": 3.245422627994777e-05,
"loss": 0.0333,
"step": 16470
},
{
"epoch": 15.800575263662513,
"grad_norm": 0.3020176887512207,
"learning_rate": 3.239468968272436e-05,
"loss": 0.0289,
"step": 16480
},
{
"epoch": 15.810162991371046,
"grad_norm": 0.23766952753067017,
"learning_rate": 3.233518156612262e-05,
"loss": 0.0302,
"step": 16490
},
{
"epoch": 15.819750719079579,
"grad_norm": 0.31386175751686096,
"learning_rate": 3.227570202641056e-05,
"loss": 0.0287,
"step": 16500
},
{
"epoch": 15.82933844678811,
"grad_norm": 0.2746824026107788,
"learning_rate": 3.2216251159809955e-05,
"loss": 0.0293,
"step": 16510
},
{
"epoch": 15.838926174496644,
"grad_norm": 0.21857379376888275,
"learning_rate": 3.215682906249621e-05,
"loss": 0.0294,
"step": 16520
},
{
"epoch": 15.848513902205177,
"grad_norm": 0.16576367616653442,
"learning_rate": 3.209743583059817e-05,
"loss": 0.0271,
"step": 16530
},
{
"epoch": 15.85810162991371,
"grad_norm": 0.31498968601226807,
"learning_rate": 3.203807156019798e-05,
"loss": 0.0312,
"step": 16540
},
{
"epoch": 15.867689357622243,
"grad_norm": 0.2268988937139511,
"learning_rate": 3.197873634733096e-05,
"loss": 0.0309,
"step": 16550
},
{
"epoch": 15.877277085330777,
"grad_norm": 0.2843955159187317,
"learning_rate": 3.1919430287985415e-05,
"loss": 0.0271,
"step": 16560
},
{
"epoch": 15.88686481303931,
"grad_norm": 0.270082026720047,
"learning_rate": 3.186015347810245e-05,
"loss": 0.0267,
"step": 16570
},
{
"epoch": 15.896452540747843,
"grad_norm": 0.13555888831615448,
"learning_rate": 3.18009060135759e-05,
"loss": 0.0303,
"step": 16580
},
{
"epoch": 15.906040268456376,
"grad_norm": 0.5174959301948547,
"learning_rate": 3.17416879902521e-05,
"loss": 0.0298,
"step": 16590
},
{
"epoch": 15.91562799616491,
"grad_norm": 0.23616893589496613,
"learning_rate": 3.168249950392978e-05,
"loss": 0.026,
"step": 16600
},
{
"epoch": 15.925215723873443,
"grad_norm": 0.2044319212436676,
"learning_rate": 3.162334065035985e-05,
"loss": 0.0294,
"step": 16610
},
{
"epoch": 15.934803451581976,
"grad_norm": 0.2839745879173279,
"learning_rate": 3.156421152524532e-05,
"loss": 0.0311,
"step": 16620
},
{
"epoch": 15.944391179290509,
"grad_norm": 0.28521618247032166,
"learning_rate": 3.150511222424111e-05,
"loss": 0.029,
"step": 16630
},
{
"epoch": 15.95397890699904,
"grad_norm": 0.4045862555503845,
"learning_rate": 3.1446042842953845e-05,
"loss": 0.0347,
"step": 16640
},
{
"epoch": 15.963566634707574,
"grad_norm": 0.2557837963104248,
"learning_rate": 3.138700347694179e-05,
"loss": 0.0211,
"step": 16650
},
{
"epoch": 15.973154362416107,
"grad_norm": 0.23164719343185425,
"learning_rate": 3.132799422171464e-05,
"loss": 0.0273,
"step": 16660
},
{
"epoch": 15.98274209012464,
"grad_norm": 0.17888516187667847,
"learning_rate": 3.126901517273339e-05,
"loss": 0.0252,
"step": 16670
},
{
"epoch": 15.992329817833173,
"grad_norm": 0.2732132077217102,
"learning_rate": 3.121006642541014e-05,
"loss": 0.0259,
"step": 16680
},
{
"epoch": 16.001917545541705,
"grad_norm": 0.3599238693714142,
"learning_rate": 3.115114807510803e-05,
"loss": 0.0292,
"step": 16690
},
{
"epoch": 16.01150527325024,
"grad_norm": 0.18428216874599457,
"learning_rate": 3.109226021714093e-05,
"loss": 0.0238,
"step": 16700
},
{
"epoch": 16.02109300095877,
"grad_norm": 0.1668870896100998,
"learning_rate": 3.1033402946773474e-05,
"loss": 0.0276,
"step": 16710
},
{
"epoch": 16.030680728667306,
"grad_norm": 0.2498198300600052,
"learning_rate": 3.097457635922077e-05,
"loss": 0.0326,
"step": 16720
},
{
"epoch": 16.040268456375838,
"grad_norm": 0.27348780632019043,
"learning_rate": 3.09157805496483e-05,
"loss": 0.0337,
"step": 16730
},
{
"epoch": 16.049856184084373,
"grad_norm": 0.3426136076450348,
"learning_rate": 3.085701561317174e-05,
"loss": 0.027,
"step": 16740
},
{
"epoch": 16.059443911792904,
"grad_norm": 0.1942438781261444,
"learning_rate": 3.079828164485684e-05,
"loss": 0.0231,
"step": 16750
},
{
"epoch": 16.06903163950144,
"grad_norm": 0.3608817160129547,
"learning_rate": 3.073957873971925e-05,
"loss": 0.0246,
"step": 16760
},
{
"epoch": 16.07861936720997,
"grad_norm": 0.2943773567676544,
"learning_rate": 3.068090699272436e-05,
"loss": 0.033,
"step": 16770
},
{
"epoch": 16.088207094918506,
"grad_norm": 0.3121021091938019,
"learning_rate": 3.062226649878717e-05,
"loss": 0.0228,
"step": 16780
},
{
"epoch": 16.097794822627037,
"grad_norm": 0.2769118547439575,
"learning_rate": 3.056365735277209e-05,
"loss": 0.0228,
"step": 16790
},
{
"epoch": 16.107382550335572,
"grad_norm": 0.2802489995956421,
"learning_rate": 3.0505079649492853e-05,
"loss": 0.0281,
"step": 16800
},
{
"epoch": 16.116970278044104,
"grad_norm": 0.27936017513275146,
"learning_rate": 3.0446533483712304e-05,
"loss": 0.0285,
"step": 16810
},
{
"epoch": 16.126558005752635,
"grad_norm": 0.2072148621082306,
"learning_rate": 3.038801895014229e-05,
"loss": 0.0295,
"step": 16820
},
{
"epoch": 16.13614573346117,
"grad_norm": 0.2498210370540619,
"learning_rate": 3.0329536143443444e-05,
"loss": 0.0292,
"step": 16830
},
{
"epoch": 16.1457334611697,
"grad_norm": 0.274496853351593,
"learning_rate": 3.027108515822511e-05,
"loss": 0.0292,
"step": 16840
},
{
"epoch": 16.155321188878236,
"grad_norm": 0.40636447072029114,
"learning_rate": 3.0212666089045155e-05,
"loss": 0.0281,
"step": 16850
},
{
"epoch": 16.164908916586768,
"grad_norm": 0.22214102745056152,
"learning_rate": 3.0154279030409794e-05,
"loss": 0.0218,
"step": 16860
},
{
"epoch": 16.174496644295303,
"grad_norm": 0.26967325806617737,
"learning_rate": 3.0095924076773467e-05,
"loss": 0.0255,
"step": 16870
},
{
"epoch": 16.184084372003834,
"grad_norm": 0.23795704543590546,
"learning_rate": 3.003760132253868e-05,
"loss": 0.0327,
"step": 16880
},
{
"epoch": 16.19367209971237,
"grad_norm": 0.1818399578332901,
"learning_rate": 2.9979310862055842e-05,
"loss": 0.0312,
"step": 16890
},
{
"epoch": 16.2032598274209,
"grad_norm": 0.23240040242671967,
"learning_rate": 2.9921052789623137e-05,
"loss": 0.0294,
"step": 16900
},
{
"epoch": 16.212847555129436,
"grad_norm": 0.20948819816112518,
"learning_rate": 2.9862827199486327e-05,
"loss": 0.0271,
"step": 16910
},
{
"epoch": 16.222435282837967,
"grad_norm": 0.20456750690937042,
"learning_rate": 2.9804634185838614e-05,
"loss": 0.0258,
"step": 16920
},
{
"epoch": 16.232023010546502,
"grad_norm": 0.2674747705459595,
"learning_rate": 2.9746473842820578e-05,
"loss": 0.0287,
"step": 16930
},
{
"epoch": 16.241610738255034,
"grad_norm": 0.19764818251132965,
"learning_rate": 2.9688346264519866e-05,
"loss": 0.0284,
"step": 16940
},
{
"epoch": 16.251198465963565,
"grad_norm": 0.3688560426235199,
"learning_rate": 2.9630251544971165e-05,
"loss": 0.0289,
"step": 16950
},
{
"epoch": 16.2607861936721,
"grad_norm": 0.34308168292045593,
"learning_rate": 2.957218977815598e-05,
"loss": 0.0289,
"step": 16960
},
{
"epoch": 16.27037392138063,
"grad_norm": 0.3008866608142853,
"learning_rate": 2.9514161058002498e-05,
"loss": 0.0307,
"step": 16970
},
{
"epoch": 16.279961649089167,
"grad_norm": 0.12983451783657074,
"learning_rate": 2.9456165478385494e-05,
"loss": 0.0232,
"step": 16980
},
{
"epoch": 16.289549376797698,
"grad_norm": 0.14965233206748962,
"learning_rate": 2.9398203133126085e-05,
"loss": 0.0248,
"step": 16990
},
{
"epoch": 16.299137104506233,
"grad_norm": 0.256956547498703,
"learning_rate": 2.9340274115991638e-05,
"loss": 0.0348,
"step": 17000
},
{
"epoch": 16.308724832214764,
"grad_norm": 0.18191608786582947,
"learning_rate": 2.9282378520695618e-05,
"loss": 0.0292,
"step": 17010
},
{
"epoch": 16.3183125599233,
"grad_norm": 0.20375274121761322,
"learning_rate": 2.922451644089741e-05,
"loss": 0.0282,
"step": 17020
},
{
"epoch": 16.32790028763183,
"grad_norm": 0.24703994393348694,
"learning_rate": 2.9166687970202177e-05,
"loss": 0.0335,
"step": 17030
},
{
"epoch": 16.337488015340366,
"grad_norm": 0.266993910074234,
"learning_rate": 2.9108893202160702e-05,
"loss": 0.021,
"step": 17040
},
{
"epoch": 16.347075743048897,
"grad_norm": 0.42793118953704834,
"learning_rate": 2.9051132230269272e-05,
"loss": 0.0257,
"step": 17050
},
{
"epoch": 16.35666347075743,
"grad_norm": 0.36531713604927063,
"learning_rate": 2.8993405147969493e-05,
"loss": 0.0322,
"step": 17060
},
{
"epoch": 16.366251198465964,
"grad_norm": 0.21013452112674713,
"learning_rate": 2.8935712048648112e-05,
"loss": 0.0278,
"step": 17070
},
{
"epoch": 16.375838926174495,
"grad_norm": 0.1972169280052185,
"learning_rate": 2.8878053025636975e-05,
"loss": 0.025,
"step": 17080
},
{
"epoch": 16.38542665388303,
"grad_norm": 0.2844037115573883,
"learning_rate": 2.882042817221273e-05,
"loss": 0.0265,
"step": 17090
},
{
"epoch": 16.39501438159156,
"grad_norm": 0.18470896780490875,
"learning_rate": 2.8762837581596792e-05,
"loss": 0.0234,
"step": 17100
},
{
"epoch": 16.404602109300097,
"grad_norm": 0.27581846714019775,
"learning_rate": 2.8705281346955116e-05,
"loss": 0.0303,
"step": 17110
},
{
"epoch": 16.414189837008628,
"grad_norm": 0.27025681734085083,
"learning_rate": 2.86477595613981e-05,
"loss": 0.0309,
"step": 17120
},
{
"epoch": 16.423777564717163,
"grad_norm": 0.35465800762176514,
"learning_rate": 2.8590272317980437e-05,
"loss": 0.0318,
"step": 17130
},
{
"epoch": 16.433365292425695,
"grad_norm": 0.2873314917087555,
"learning_rate": 2.8532819709700854e-05,
"loss": 0.0335,
"step": 17140
},
{
"epoch": 16.44295302013423,
"grad_norm": 0.3287470042705536,
"learning_rate": 2.8475401829502124e-05,
"loss": 0.0308,
"step": 17150
},
{
"epoch": 16.45254074784276,
"grad_norm": 0.18719346821308136,
"learning_rate": 2.841801877027083e-05,
"loss": 0.0297,
"step": 17160
},
{
"epoch": 16.462128475551296,
"grad_norm": 0.16801686584949493,
"learning_rate": 2.836067062483721e-05,
"loss": 0.026,
"step": 17170
},
{
"epoch": 16.471716203259827,
"grad_norm": 0.3017866909503937,
"learning_rate": 2.830335748597502e-05,
"loss": 0.0298,
"step": 17180
},
{
"epoch": 16.48130393096836,
"grad_norm": 0.16507741808891296,
"learning_rate": 2.8246079446401386e-05,
"loss": 0.028,
"step": 17190
},
{
"epoch": 16.490891658676894,
"grad_norm": 0.25729814171791077,
"learning_rate": 2.8188836598776662e-05,
"loss": 0.0291,
"step": 17200
},
{
"epoch": 16.500479386385425,
"grad_norm": 0.36721915006637573,
"learning_rate": 2.8131629035704264e-05,
"loss": 0.0324,
"step": 17210
},
{
"epoch": 16.51006711409396,
"grad_norm": 5.430606365203857,
"learning_rate": 2.8074456849730507e-05,
"loss": 0.026,
"step": 17220
},
{
"epoch": 16.51965484180249,
"grad_norm": 0.18490955233573914,
"learning_rate": 2.8017320133344533e-05,
"loss": 0.0265,
"step": 17230
},
{
"epoch": 16.529242569511027,
"grad_norm": 0.17146821320056915,
"learning_rate": 2.7960218978978047e-05,
"loss": 0.0293,
"step": 17240
},
{
"epoch": 16.538830297219558,
"grad_norm": 0.21457697451114655,
"learning_rate": 2.7903153479005255e-05,
"loss": 0.0294,
"step": 17250
},
{
"epoch": 16.548418024928093,
"grad_norm": 0.2303658127784729,
"learning_rate": 2.7846123725742678e-05,
"loss": 0.0278,
"step": 17260
},
{
"epoch": 16.558005752636625,
"grad_norm": 0.20711682736873627,
"learning_rate": 2.778912981144898e-05,
"loss": 0.0245,
"step": 17270
},
{
"epoch": 16.56759348034516,
"grad_norm": 0.2282470464706421,
"learning_rate": 2.7732171828324872e-05,
"loss": 0.029,
"step": 17280
},
{
"epoch": 16.57718120805369,
"grad_norm": 0.27450570464134216,
"learning_rate": 2.7675249868512954e-05,
"loss": 0.036,
"step": 17290
},
{
"epoch": 16.586768935762223,
"grad_norm": 0.18990963697433472,
"learning_rate": 2.761836402409752e-05,
"loss": 0.0362,
"step": 17300
},
{
"epoch": 16.596356663470758,
"grad_norm": 0.19880448281764984,
"learning_rate": 2.7561514387104464e-05,
"loss": 0.0283,
"step": 17310
},
{
"epoch": 16.60594439117929,
"grad_norm": 0.2031632959842682,
"learning_rate": 2.750470104950109e-05,
"loss": 0.0253,
"step": 17320
},
{
"epoch": 16.615532118887824,
"grad_norm": 0.5270239114761353,
"learning_rate": 2.7447924103195976e-05,
"loss": 0.0278,
"step": 17330
},
{
"epoch": 16.625119846596355,
"grad_norm": 0.29472750425338745,
"learning_rate": 2.7391183640038847e-05,
"loss": 0.0284,
"step": 17340
},
{
"epoch": 16.63470757430489,
"grad_norm": 0.21734996140003204,
"learning_rate": 2.7334479751820396e-05,
"loss": 0.0294,
"step": 17350
},
{
"epoch": 16.644295302013422,
"grad_norm": 0.29278430342674255,
"learning_rate": 2.7277812530272147e-05,
"loss": 0.0297,
"step": 17360
},
{
"epoch": 16.653883029721957,
"grad_norm": 0.2573314309120178,
"learning_rate": 2.7221182067066307e-05,
"loss": 0.0241,
"step": 17370
},
{
"epoch": 16.66347075743049,
"grad_norm": 0.23133955895900726,
"learning_rate": 2.7164588453815602e-05,
"loss": 0.0258,
"step": 17380
},
{
"epoch": 16.673058485139023,
"grad_norm": 0.20745334029197693,
"learning_rate": 2.710803178207323e-05,
"loss": 0.0242,
"step": 17390
},
{
"epoch": 16.682646212847555,
"grad_norm": 0.22852954268455505,
"learning_rate": 2.7051512143332492e-05,
"loss": 0.027,
"step": 17400
},
{
"epoch": 16.69223394055609,
"grad_norm": 0.25844722986221313,
"learning_rate": 2.6995029629026874e-05,
"loss": 0.0244,
"step": 17410
},
{
"epoch": 16.70182166826462,
"grad_norm": 0.23631109297275543,
"learning_rate": 2.6938584330529782e-05,
"loss": 0.0215,
"step": 17420
},
{
"epoch": 16.711409395973153,
"grad_norm": 0.27872714400291443,
"learning_rate": 2.6882176339154404e-05,
"loss": 0.0308,
"step": 17430
},
{
"epoch": 16.720997123681688,
"grad_norm": 0.23717211186885834,
"learning_rate": 2.6825805746153594e-05,
"loss": 0.0266,
"step": 17440
},
{
"epoch": 16.73058485139022,
"grad_norm": 0.281259685754776,
"learning_rate": 2.6769472642719695e-05,
"loss": 0.0329,
"step": 17450
},
{
"epoch": 16.740172579098754,
"grad_norm": 0.257068932056427,
"learning_rate": 2.67131771199844e-05,
"loss": 0.0245,
"step": 17460
},
{
"epoch": 16.749760306807286,
"grad_norm": 0.18098169565200806,
"learning_rate": 2.665691926901862e-05,
"loss": 0.0284,
"step": 17470
},
{
"epoch": 16.75934803451582,
"grad_norm": 0.23477615416049957,
"learning_rate": 2.6600699180832307e-05,
"loss": 0.026,
"step": 17480
},
{
"epoch": 16.768935762224352,
"grad_norm": 0.24687384068965912,
"learning_rate": 2.654451694637433e-05,
"loss": 0.0255,
"step": 17490
},
{
"epoch": 16.778523489932887,
"grad_norm": 0.2607274651527405,
"learning_rate": 2.6488372656532322e-05,
"loss": 0.0294,
"step": 17500
},
{
"epoch": 16.78811121764142,
"grad_norm": 0.4215647578239441,
"learning_rate": 2.6432266402132532e-05,
"loss": 0.0283,
"step": 17510
},
{
"epoch": 16.797698945349953,
"grad_norm": 0.20454095304012299,
"learning_rate": 2.637619827393968e-05,
"loss": 0.0306,
"step": 17520
},
{
"epoch": 16.807286673058485,
"grad_norm": 0.19789418578147888,
"learning_rate": 2.6320168362656796e-05,
"loss": 0.025,
"step": 17530
},
{
"epoch": 16.81687440076702,
"grad_norm": 0.34662866592407227,
"learning_rate": 2.6264176758925098e-05,
"loss": 0.0317,
"step": 17540
},
{
"epoch": 16.82646212847555,
"grad_norm": 0.20395246148109436,
"learning_rate": 2.620822355332383e-05,
"loss": 0.0306,
"step": 17550
},
{
"epoch": 16.836049856184083,
"grad_norm": 0.39246705174446106,
"learning_rate": 2.615230883637012e-05,
"loss": 0.0259,
"step": 17560
},
{
"epoch": 16.845637583892618,
"grad_norm": 0.22869329154491425,
"learning_rate": 2.609643269851883e-05,
"loss": 0.0285,
"step": 17570
},
{
"epoch": 16.85522531160115,
"grad_norm": 0.3232511281967163,
"learning_rate": 2.60405952301624e-05,
"loss": 0.0288,
"step": 17580
},
{
"epoch": 16.864813039309684,
"grad_norm": 0.2171912044286728,
"learning_rate": 2.5984796521630737e-05,
"loss": 0.0249,
"step": 17590
},
{
"epoch": 16.874400767018216,
"grad_norm": 0.28310737013816833,
"learning_rate": 2.592903666319103e-05,
"loss": 0.0295,
"step": 17600
},
{
"epoch": 16.88398849472675,
"grad_norm": 0.19829969108104706,
"learning_rate": 2.587331574504761e-05,
"loss": 0.025,
"step": 17610
},
{
"epoch": 16.893576222435282,
"grad_norm": 0.1657049059867859,
"learning_rate": 2.581763385734183e-05,
"loss": 0.0244,
"step": 17620
},
{
"epoch": 16.903163950143817,
"grad_norm": 0.256913959980011,
"learning_rate": 2.5761991090151906e-05,
"loss": 0.0306,
"step": 17630
},
{
"epoch": 16.91275167785235,
"grad_norm": 0.2738933861255646,
"learning_rate": 2.5706387533492737e-05,
"loss": 0.0326,
"step": 17640
},
{
"epoch": 16.922339405560884,
"grad_norm": 0.2700929343700409,
"learning_rate": 2.5650823277315837e-05,
"loss": 0.0313,
"step": 17650
},
{
"epoch": 16.931927133269415,
"grad_norm": 0.2965131103992462,
"learning_rate": 2.5595298411509094e-05,
"loss": 0.0275,
"step": 17660
},
{
"epoch": 16.941514860977946,
"grad_norm": 0.3247256278991699,
"learning_rate": 2.553981302589671e-05,
"loss": 0.0326,
"step": 17670
},
{
"epoch": 16.95110258868648,
"grad_norm": 0.30926892161369324,
"learning_rate": 2.5484367210239e-05,
"loss": 0.0297,
"step": 17680
},
{
"epoch": 16.960690316395013,
"grad_norm": 0.15213845670223236,
"learning_rate": 2.5428961054232264e-05,
"loss": 0.0271,
"step": 17690
},
{
"epoch": 16.970278044103548,
"grad_norm": 0.20840811729431152,
"learning_rate": 2.537359464750866e-05,
"loss": 0.0273,
"step": 17700
},
{
"epoch": 16.97986577181208,
"grad_norm": 0.21467389166355133,
"learning_rate": 2.5318268079636022e-05,
"loss": 0.0314,
"step": 17710
},
{
"epoch": 16.989453499520614,
"grad_norm": 0.2677682638168335,
"learning_rate": 2.526298144011775e-05,
"loss": 0.0238,
"step": 17720
},
{
"epoch": 16.999041227229146,
"grad_norm": 0.16417664289474487,
"learning_rate": 2.5207734818392648e-05,
"loss": 0.0258,
"step": 17730
},
{
"epoch": 17.00862895493768,
"grad_norm": 0.20820669829845428,
"learning_rate": 2.5152528303834777e-05,
"loss": 0.0329,
"step": 17740
},
{
"epoch": 17.018216682646212,
"grad_norm": 0.19568829238414764,
"learning_rate": 2.5097361985753316e-05,
"loss": 0.0269,
"step": 17750
},
{
"epoch": 17.027804410354747,
"grad_norm": 0.1650926023721695,
"learning_rate": 2.5042235953392423e-05,
"loss": 0.026,
"step": 17760
},
{
"epoch": 17.03739213806328,
"grad_norm": 0.16357482969760895,
"learning_rate": 2.4987150295931082e-05,
"loss": 0.0305,
"step": 17770
},
{
"epoch": 17.046979865771814,
"grad_norm": 0.22878289222717285,
"learning_rate": 2.4932105102482955e-05,
"loss": 0.0276,
"step": 17780
},
{
"epoch": 17.056567593480345,
"grad_norm": 0.2666637599468231,
"learning_rate": 2.487710046209626e-05,
"loss": 0.0278,
"step": 17790
},
{
"epoch": 17.066155321188877,
"grad_norm": 0.2581173777580261,
"learning_rate": 2.4822136463753594e-05,
"loss": 0.0285,
"step": 17800
},
{
"epoch": 17.07574304889741,
"grad_norm": 0.19729219377040863,
"learning_rate": 2.4767213196371813e-05,
"loss": 0.0251,
"step": 17810
},
{
"epoch": 17.085330776605943,
"grad_norm": 0.21068008244037628,
"learning_rate": 2.47123307488019e-05,
"loss": 0.0218,
"step": 17820
},
{
"epoch": 17.094918504314478,
"grad_norm": 0.21502196788787842,
"learning_rate": 2.465748920982873e-05,
"loss": 0.0244,
"step": 17830
},
{
"epoch": 17.10450623202301,
"grad_norm": 0.20099669694900513,
"learning_rate": 2.4602688668171103e-05,
"loss": 0.0299,
"step": 17840
},
{
"epoch": 17.114093959731544,
"grad_norm": 0.6751896739006042,
"learning_rate": 2.4547929212481435e-05,
"loss": 0.0386,
"step": 17850
},
{
"epoch": 17.123681687440076,
"grad_norm": 0.32390302419662476,
"learning_rate": 2.4493210931345684e-05,
"loss": 0.029,
"step": 17860
},
{
"epoch": 17.13326941514861,
"grad_norm": 0.31073060631752014,
"learning_rate": 2.4438533913283206e-05,
"loss": 0.0232,
"step": 17870
},
{
"epoch": 17.142857142857142,
"grad_norm": 0.17248332500457764,
"learning_rate": 2.4383898246746596e-05,
"loss": 0.0214,
"step": 17880
},
{
"epoch": 17.152444870565677,
"grad_norm": 0.33149340748786926,
"learning_rate": 2.4329304020121558e-05,
"loss": 0.0298,
"step": 17890
},
{
"epoch": 17.16203259827421,
"grad_norm": 0.2364264875650406,
"learning_rate": 2.4274751321726762e-05,
"loss": 0.0333,
"step": 17900
},
{
"epoch": 17.171620325982744,
"grad_norm": 0.15520252287387848,
"learning_rate": 2.4220240239813684e-05,
"loss": 0.0196,
"step": 17910
},
{
"epoch": 17.181208053691275,
"grad_norm": 0.23256506025791168,
"learning_rate": 2.4165770862566494e-05,
"loss": 0.029,
"step": 17920
},
{
"epoch": 17.190795781399807,
"grad_norm": 0.17074307799339294,
"learning_rate": 2.4111343278101884e-05,
"loss": 0.0302,
"step": 17930
},
{
"epoch": 17.20038350910834,
"grad_norm": 0.24341343343257904,
"learning_rate": 2.4056957574468932e-05,
"loss": 0.0296,
"step": 17940
},
{
"epoch": 17.209971236816873,
"grad_norm": 0.1940905898809433,
"learning_rate": 2.4002613839648987e-05,
"loss": 0.029,
"step": 17950
},
{
"epoch": 17.219558964525408,
"grad_norm": 0.194035604596138,
"learning_rate": 2.3948312161555453e-05,
"loss": 0.0297,
"step": 17960
},
{
"epoch": 17.22914669223394,
"grad_norm": 0.14753536880016327,
"learning_rate": 2.389405262803375e-05,
"loss": 0.0259,
"step": 17970
},
{
"epoch": 17.238734419942475,
"grad_norm": 0.18068645894527435,
"learning_rate": 2.3839835326861104e-05,
"loss": 0.0284,
"step": 17980
},
{
"epoch": 17.248322147651006,
"grad_norm": 0.33698755502700806,
"learning_rate": 2.378566034574639e-05,
"loss": 0.0289,
"step": 17990
},
{
"epoch": 17.25790987535954,
"grad_norm": 0.2708437144756317,
"learning_rate": 2.3731527772330098e-05,
"loss": 0.0252,
"step": 18000
},
{
"epoch": 17.267497603068072,
"grad_norm": 0.37091711163520813,
"learning_rate": 2.367743769418403e-05,
"loss": 0.031,
"step": 18010
},
{
"epoch": 17.277085330776607,
"grad_norm": 0.22311721742153168,
"learning_rate": 2.362339019881129e-05,
"loss": 0.0356,
"step": 18020
},
{
"epoch": 17.28667305848514,
"grad_norm": 0.3006376624107361,
"learning_rate": 2.3569385373646068e-05,
"loss": 0.0283,
"step": 18030
},
{
"epoch": 17.29626078619367,
"grad_norm": 0.2278210073709488,
"learning_rate": 2.351542330605355e-05,
"loss": 0.0292,
"step": 18040
},
{
"epoch": 17.305848513902205,
"grad_norm": 0.1900917887687683,
"learning_rate": 2.3461504083329732e-05,
"loss": 0.0293,
"step": 18050
},
{
"epoch": 17.315436241610737,
"grad_norm": 0.36089229583740234,
"learning_rate": 2.340762779270131e-05,
"loss": 0.0335,
"step": 18060
},
{
"epoch": 17.325023969319272,
"grad_norm": 0.20157793164253235,
"learning_rate": 2.3353794521325516e-05,
"loss": 0.0224,
"step": 18070
},
{
"epoch": 17.334611697027803,
"grad_norm": 0.25802189111709595,
"learning_rate": 2.330000435629002e-05,
"loss": 0.0241,
"step": 18080
},
{
"epoch": 17.34419942473634,
"grad_norm": 0.19763995707035065,
"learning_rate": 2.32462573846127e-05,
"loss": 0.0324,
"step": 18090
},
{
"epoch": 17.35378715244487,
"grad_norm": 0.24877896904945374,
"learning_rate": 2.319255369324161e-05,
"loss": 0.0297,
"step": 18100
},
{
"epoch": 17.363374880153405,
"grad_norm": 0.23094792664051056,
"learning_rate": 2.3138893369054766e-05,
"loss": 0.0279,
"step": 18110
},
{
"epoch": 17.372962607861936,
"grad_norm": 0.1878676414489746,
"learning_rate": 2.3085276498860032e-05,
"loss": 0.0278,
"step": 18120
},
{
"epoch": 17.38255033557047,
"grad_norm": 0.20479904115200043,
"learning_rate": 2.3031703169394985e-05,
"loss": 0.0263,
"step": 18130
},
{
"epoch": 17.392138063279003,
"grad_norm": 0.3048153519630432,
"learning_rate": 2.2978173467326724e-05,
"loss": 0.0282,
"step": 18140
},
{
"epoch": 17.401725790987538,
"grad_norm": 0.2260926365852356,
"learning_rate": 2.292468747925185e-05,
"loss": 0.0282,
"step": 18150
},
{
"epoch": 17.41131351869607,
"grad_norm": 0.23683381080627441,
"learning_rate": 2.287124529169618e-05,
"loss": 0.0255,
"step": 18160
},
{
"epoch": 17.4209012464046,
"grad_norm": 0.21933788061141968,
"learning_rate": 2.2817846991114684e-05,
"loss": 0.0259,
"step": 18170
},
{
"epoch": 17.430488974113135,
"grad_norm": 0.2983873784542084,
"learning_rate": 2.2764492663891353e-05,
"loss": 0.0294,
"step": 18180
},
{
"epoch": 17.440076701821667,
"grad_norm": 0.2740059792995453,
"learning_rate": 2.271118239633902e-05,
"loss": 0.0292,
"step": 18190
},
{
"epoch": 17.449664429530202,
"grad_norm": 0.18633967638015747,
"learning_rate": 2.2657916274699265e-05,
"loss": 0.024,
"step": 18200
},
{
"epoch": 17.459252157238733,
"grad_norm": 0.21379147469997406,
"learning_rate": 2.2604694385142233e-05,
"loss": 0.0245,
"step": 18210
},
{
"epoch": 17.46883988494727,
"grad_norm": 0.2814527153968811,
"learning_rate": 2.2551516813766538e-05,
"loss": 0.0264,
"step": 18220
},
{
"epoch": 17.4784276126558,
"grad_norm": 0.18947578966617584,
"learning_rate": 2.2498383646599048e-05,
"loss": 0.0222,
"step": 18230
},
{
"epoch": 17.488015340364335,
"grad_norm": 0.41355225443840027,
"learning_rate": 2.2445294969594844e-05,
"loss": 0.0285,
"step": 18240
},
{
"epoch": 17.497603068072866,
"grad_norm": 0.4395101070404053,
"learning_rate": 2.2392250868637026e-05,
"loss": 0.0301,
"step": 18250
},
{
"epoch": 17.5071907957814,
"grad_norm": 0.1704569309949875,
"learning_rate": 2.233925142953657e-05,
"loss": 0.0236,
"step": 18260
},
{
"epoch": 17.516778523489933,
"grad_norm": 6.209451198577881,
"learning_rate": 2.2286296738032214e-05,
"loss": 0.03,
"step": 18270
},
{
"epoch": 17.526366251198468,
"grad_norm": 0.5336940884590149,
"learning_rate": 2.223338687979029e-05,
"loss": 0.024,
"step": 18280
},
{
"epoch": 17.535953978907,
"grad_norm": 0.2711230516433716,
"learning_rate": 2.2180521940404607e-05,
"loss": 0.025,
"step": 18290
},
{
"epoch": 17.54554170661553,
"grad_norm": 0.35838785767555237,
"learning_rate": 2.212770200539634e-05,
"loss": 0.0328,
"step": 18300
},
{
"epoch": 17.555129434324066,
"grad_norm": 0.2138790637254715,
"learning_rate": 2.207492716021381e-05,
"loss": 0.0272,
"step": 18310
},
{
"epoch": 17.564717162032597,
"grad_norm": 0.18834197521209717,
"learning_rate": 2.2022197490232427e-05,
"loss": 0.0266,
"step": 18320
},
{
"epoch": 17.574304889741132,
"grad_norm": 0.28788337111473083,
"learning_rate": 2.1969513080754504e-05,
"loss": 0.0247,
"step": 18330
},
{
"epoch": 17.583892617449663,
"grad_norm": 0.1590379774570465,
"learning_rate": 2.1916874017009136e-05,
"loss": 0.0233,
"step": 18340
},
{
"epoch": 17.5934803451582,
"grad_norm": 0.2774651050567627,
"learning_rate": 2.186428038415209e-05,
"loss": 0.022,
"step": 18350
},
{
"epoch": 17.60306807286673,
"grad_norm": 0.18108907341957092,
"learning_rate": 2.1811732267265577e-05,
"loss": 0.0228,
"step": 18360
},
{
"epoch": 17.612655800575265,
"grad_norm": 0.2790849208831787,
"learning_rate": 2.1759229751358217e-05,
"loss": 0.0295,
"step": 18370
},
{
"epoch": 17.622243528283796,
"grad_norm": 0.1974640190601349,
"learning_rate": 2.170677292136487e-05,
"loss": 0.0265,
"step": 18380
},
{
"epoch": 17.63183125599233,
"grad_norm": 0.2952618896961212,
"learning_rate": 2.1654361862146465e-05,
"loss": 0.0257,
"step": 18390
},
{
"epoch": 17.641418983700863,
"grad_norm": 0.21564097702503204,
"learning_rate": 2.160199665848989e-05,
"loss": 0.0286,
"step": 18400
},
{
"epoch": 17.651006711409394,
"grad_norm": 0.2616369426250458,
"learning_rate": 2.154967739510787e-05,
"loss": 0.0265,
"step": 18410
},
{
"epoch": 17.66059443911793,
"grad_norm": 0.22359015047550201,
"learning_rate": 2.1497404156638784e-05,
"loss": 0.0217,
"step": 18420
},
{
"epoch": 17.67018216682646,
"grad_norm": 0.26012542843818665,
"learning_rate": 2.144517702764657e-05,
"loss": 0.0265,
"step": 18430
},
{
"epoch": 17.679769894534996,
"grad_norm": 0.13236083090305328,
"learning_rate": 2.1392996092620555e-05,
"loss": 0.0203,
"step": 18440
},
{
"epoch": 17.689357622243527,
"grad_norm": 0.23233279585838318,
"learning_rate": 2.1340861435975384e-05,
"loss": 0.0239,
"step": 18450
},
{
"epoch": 17.698945349952062,
"grad_norm": 0.22985659539699554,
"learning_rate": 2.1288773142050794e-05,
"loss": 0.026,
"step": 18460
},
{
"epoch": 17.708533077660594,
"grad_norm": 0.2680293321609497,
"learning_rate": 2.123673129511152e-05,
"loss": 0.0307,
"step": 18470
},
{
"epoch": 17.71812080536913,
"grad_norm": 0.23979081213474274,
"learning_rate": 2.1184735979347205e-05,
"loss": 0.0251,
"step": 18480
},
{
"epoch": 17.72770853307766,
"grad_norm": 0.2722991704940796,
"learning_rate": 2.113278727887213e-05,
"loss": 0.0301,
"step": 18490
},
{
"epoch": 17.737296260786195,
"grad_norm": 0.22843940556049347,
"learning_rate": 2.1080885277725236e-05,
"loss": 0.0228,
"step": 18500
},
{
"epoch": 17.746883988494726,
"grad_norm": 0.34953558444976807,
"learning_rate": 2.1029030059869898e-05,
"loss": 0.0296,
"step": 18510
},
{
"epoch": 17.75647171620326,
"grad_norm": 0.12219765037298203,
"learning_rate": 2.0977221709193813e-05,
"loss": 0.0271,
"step": 18520
},
{
"epoch": 17.766059443911793,
"grad_norm": 0.33025461435317993,
"learning_rate": 2.0925460309508843e-05,
"loss": 0.0305,
"step": 18530
},
{
"epoch": 17.775647171620324,
"grad_norm": 0.3049762547016144,
"learning_rate": 2.087374594455092e-05,
"loss": 0.0316,
"step": 18540
},
{
"epoch": 17.78523489932886,
"grad_norm": 0.3146844506263733,
"learning_rate": 2.082207869797987e-05,
"loss": 0.0272,
"step": 18550
},
{
"epoch": 17.79482262703739,
"grad_norm": 0.18491698801517487,
"learning_rate": 2.0770458653379286e-05,
"loss": 0.0281,
"step": 18560
},
{
"epoch": 17.804410354745926,
"grad_norm": 0.21474412083625793,
"learning_rate": 2.0718885894256428e-05,
"loss": 0.0238,
"step": 18570
},
{
"epoch": 17.813998082454457,
"grad_norm": 0.1813114583492279,
"learning_rate": 2.0667360504042045e-05,
"loss": 0.027,
"step": 18580
},
{
"epoch": 17.823585810162992,
"grad_norm": 0.36077314615249634,
"learning_rate": 2.0615882566090243e-05,
"loss": 0.0311,
"step": 18590
},
{
"epoch": 17.833173537871524,
"grad_norm": 0.1905115246772766,
"learning_rate": 2.0564452163678378e-05,
"loss": 0.0254,
"step": 18600
},
{
"epoch": 17.84276126558006,
"grad_norm": 0.22948439419269562,
"learning_rate": 2.0513069380006943e-05,
"loss": 0.0296,
"step": 18610
},
{
"epoch": 17.85234899328859,
"grad_norm": 0.27490001916885376,
"learning_rate": 2.046173429819931e-05,
"loss": 0.0239,
"step": 18620
},
{
"epoch": 17.861936720997125,
"grad_norm": 0.21853777766227722,
"learning_rate": 2.0410447001301753e-05,
"loss": 0.028,
"step": 18630
},
{
"epoch": 17.871524448705657,
"grad_norm": 0.20548582077026367,
"learning_rate": 2.0359207572283224e-05,
"loss": 0.0225,
"step": 18640
},
{
"epoch": 17.88111217641419,
"grad_norm": 0.14802424609661102,
"learning_rate": 2.0308016094035226e-05,
"loss": 0.0295,
"step": 18650
},
{
"epoch": 17.890699904122723,
"grad_norm": 0.32737597823143005,
"learning_rate": 2.02568726493717e-05,
"loss": 0.0242,
"step": 18660
},
{
"epoch": 17.900287631831254,
"grad_norm": 0.21833331882953644,
"learning_rate": 2.020577732102889e-05,
"loss": 0.0273,
"step": 18670
},
{
"epoch": 17.90987535953979,
"grad_norm": 0.24916410446166992,
"learning_rate": 2.015473019166519e-05,
"loss": 0.0305,
"step": 18680
},
{
"epoch": 17.91946308724832,
"grad_norm": 0.18901677429676056,
"learning_rate": 2.0103731343861014e-05,
"loss": 0.0256,
"step": 18690
},
{
"epoch": 17.929050814956856,
"grad_norm": 0.20720627903938293,
"learning_rate": 2.0052780860118692e-05,
"loss": 0.0262,
"step": 18700
},
{
"epoch": 17.938638542665387,
"grad_norm": 0.20290115475654602,
"learning_rate": 2.0001878822862292e-05,
"loss": 0.0302,
"step": 18710
},
{
"epoch": 17.948226270373922,
"grad_norm": 0.28782570362091064,
"learning_rate": 1.995102531443752e-05,
"loss": 0.0272,
"step": 18720
},
{
"epoch": 17.957813998082454,
"grad_norm": 0.19285361468791962,
"learning_rate": 1.9900220417111577e-05,
"loss": 0.0226,
"step": 18730
},
{
"epoch": 17.96740172579099,
"grad_norm": 0.2487422674894333,
"learning_rate": 1.984946421307301e-05,
"loss": 0.0259,
"step": 18740
},
{
"epoch": 17.97698945349952,
"grad_norm": 0.20847800374031067,
"learning_rate": 1.9798756784431616e-05,
"loss": 0.0248,
"step": 18750
},
{
"epoch": 17.986577181208055,
"grad_norm": 0.29753822088241577,
"learning_rate": 1.974809821321827e-05,
"loss": 0.0307,
"step": 18760
},
{
"epoch": 17.996164908916587,
"grad_norm": 0.2475176304578781,
"learning_rate": 1.969748858138481e-05,
"loss": 0.0192,
"step": 18770
},
{
"epoch": 18.005752636625118,
"grad_norm": 0.24821995198726654,
"learning_rate": 1.9646927970803913e-05,
"loss": 0.0217,
"step": 18780
},
{
"epoch": 18.015340364333653,
"grad_norm": 0.24269837141036987,
"learning_rate": 1.959641646326894e-05,
"loss": 0.0267,
"step": 18790
},
{
"epoch": 18.024928092042185,
"grad_norm": 0.4261660575866699,
"learning_rate": 1.9545954140493828e-05,
"loss": 0.028,
"step": 18800
},
{
"epoch": 18.03451581975072,
"grad_norm": 0.27009981870651245,
"learning_rate": 1.9495541084112945e-05,
"loss": 0.0261,
"step": 18810
},
{
"epoch": 18.04410354745925,
"grad_norm": 0.4468768537044525,
"learning_rate": 1.9445177375680944e-05,
"loss": 0.0237,
"step": 18820
},
{
"epoch": 18.053691275167786,
"grad_norm": 0.34373733401298523,
"learning_rate": 1.939486309667267e-05,
"loss": 0.0283,
"step": 18830
},
{
"epoch": 18.063279002876317,
"grad_norm": 0.3583851456642151,
"learning_rate": 1.9344598328482994e-05,
"loss": 0.0239,
"step": 18840
},
{
"epoch": 18.072866730584852,
"grad_norm": 0.2819909453392029,
"learning_rate": 1.9294383152426682e-05,
"loss": 0.0228,
"step": 18850
},
{
"epoch": 18.082454458293384,
"grad_norm": 0.21321451663970947,
"learning_rate": 1.924421764973829e-05,
"loss": 0.0257,
"step": 18860
},
{
"epoch": 18.09204218600192,
"grad_norm": 0.20414310693740845,
"learning_rate": 1.9194101901572e-05,
"loss": 0.027,
"step": 18870
},
{
"epoch": 18.10162991371045,
"grad_norm": 0.1880536824464798,
"learning_rate": 1.9144035989001518e-05,
"loss": 0.0236,
"step": 18880
},
{
"epoch": 18.111217641418985,
"grad_norm": 0.15333381295204163,
"learning_rate": 1.909401999301993e-05,
"loss": 0.0285,
"step": 18890
},
{
"epoch": 18.120805369127517,
"grad_norm": 0.25423663854599,
"learning_rate": 1.904405399453955e-05,
"loss": 0.0253,
"step": 18900
},
{
"epoch": 18.13039309683605,
"grad_norm": 0.16123837232589722,
"learning_rate": 1.8994138074391843e-05,
"loss": 0.0269,
"step": 18910
},
{
"epoch": 18.139980824544583,
"grad_norm": 0.28160786628723145,
"learning_rate": 1.8944272313327226e-05,
"loss": 0.0289,
"step": 18920
},
{
"epoch": 18.149568552253115,
"grad_norm": 0.17112663388252258,
"learning_rate": 1.8894456792014996e-05,
"loss": 0.0273,
"step": 18930
},
{
"epoch": 18.15915627996165,
"grad_norm": 0.19048067927360535,
"learning_rate": 1.8844691591043173e-05,
"loss": 0.0225,
"step": 18940
},
{
"epoch": 18.16874400767018,
"grad_norm": 0.22992561757564545,
"learning_rate": 1.8794976790918363e-05,
"loss": 0.0229,
"step": 18950
},
{
"epoch": 18.178331735378716,
"grad_norm": 0.30747804045677185,
"learning_rate": 1.8745312472065635e-05,
"loss": 0.0259,
"step": 18960
},
{
"epoch": 18.187919463087248,
"grad_norm": 0.2523973882198334,
"learning_rate": 1.8695698714828406e-05,
"loss": 0.0249,
"step": 18970
},
{
"epoch": 18.197507190795783,
"grad_norm": 0.2866404056549072,
"learning_rate": 1.8646135599468297e-05,
"loss": 0.0252,
"step": 18980
},
{
"epoch": 18.207094918504314,
"grad_norm": 0.1944408118724823,
"learning_rate": 1.8596623206164987e-05,
"loss": 0.0265,
"step": 18990
},
{
"epoch": 18.21668264621285,
"grad_norm": 0.22918511927127838,
"learning_rate": 1.8547161615016116e-05,
"loss": 0.0272,
"step": 19000
},
{
"epoch": 18.22627037392138,
"grad_norm": 0.2857123911380768,
"learning_rate": 1.8497750906037148e-05,
"loss": 0.0307,
"step": 19010
},
{
"epoch": 18.235858101629915,
"grad_norm": 0.17393842339515686,
"learning_rate": 1.8448391159161204e-05,
"loss": 0.029,
"step": 19020
},
{
"epoch": 18.245445829338447,
"grad_norm": 0.2042463719844818,
"learning_rate": 1.839908245423899e-05,
"loss": 0.0251,
"step": 19030
},
{
"epoch": 18.25503355704698,
"grad_norm": 0.277891606092453,
"learning_rate": 1.8349824871038644e-05,
"loss": 0.0237,
"step": 19040
},
{
"epoch": 18.264621284755513,
"grad_norm": 0.16384513676166534,
"learning_rate": 1.8300618489245537e-05,
"loss": 0.0239,
"step": 19050
},
{
"epoch": 18.274209012464045,
"grad_norm": 0.27934807538986206,
"learning_rate": 1.8251463388462315e-05,
"loss": 0.0258,
"step": 19060
},
{
"epoch": 18.28379674017258,
"grad_norm": 0.28241196274757385,
"learning_rate": 1.8202359648208593e-05,
"loss": 0.0244,
"step": 19070
},
{
"epoch": 18.29338446788111,
"grad_norm": 0.22691746056079865,
"learning_rate": 1.8153307347920918e-05,
"loss": 0.0279,
"step": 19080
},
{
"epoch": 18.302972195589646,
"grad_norm": 0.31362423300743103,
"learning_rate": 1.8104306566952618e-05,
"loss": 0.0235,
"step": 19090
},
{
"epoch": 18.312559923298178,
"grad_norm": 0.5029933452606201,
"learning_rate": 1.805535738457368e-05,
"loss": 0.027,
"step": 19100
},
{
"epoch": 18.322147651006713,
"grad_norm": 0.23722821474075317,
"learning_rate": 1.8006459879970622e-05,
"loss": 0.0309,
"step": 19110
},
{
"epoch": 18.331735378715244,
"grad_norm": 0.2513883113861084,
"learning_rate": 1.7957614132246347e-05,
"loss": 0.0263,
"step": 19120
},
{
"epoch": 18.34132310642378,
"grad_norm": 0.24489589035511017,
"learning_rate": 1.7908820220420052e-05,
"loss": 0.0251,
"step": 19130
},
{
"epoch": 18.35091083413231,
"grad_norm": 0.2208951860666275,
"learning_rate": 1.7860078223427056e-05,
"loss": 0.0267,
"step": 19140
},
{
"epoch": 18.360498561840842,
"grad_norm": 0.2466048002243042,
"learning_rate": 1.7811388220118707e-05,
"loss": 0.0246,
"step": 19150
},
{
"epoch": 18.370086289549377,
"grad_norm": 0.1647568643093109,
"learning_rate": 1.7762750289262238e-05,
"loss": 0.0221,
"step": 19160
},
{
"epoch": 18.37967401725791,
"grad_norm": 0.20359550416469574,
"learning_rate": 1.7714164509540654e-05,
"loss": 0.024,
"step": 19170
},
{
"epoch": 18.389261744966444,
"grad_norm": 0.15871766209602356,
"learning_rate": 1.7665630959552548e-05,
"loss": 0.0252,
"step": 19180
},
{
"epoch": 18.398849472674975,
"grad_norm": 0.2411220222711563,
"learning_rate": 1.7617149717812076e-05,
"loss": 0.0225,
"step": 19190
},
{
"epoch": 18.40843720038351,
"grad_norm": 0.3407461643218994,
"learning_rate": 1.7568720862748744e-05,
"loss": 0.0312,
"step": 19200
},
{
"epoch": 18.41802492809204,
"grad_norm": 0.21590691804885864,
"learning_rate": 1.75203444727073e-05,
"loss": 0.0248,
"step": 19210
},
{
"epoch": 18.427612655800576,
"grad_norm": 0.17383931577205658,
"learning_rate": 1.7472020625947678e-05,
"loss": 0.0231,
"step": 19220
},
{
"epoch": 18.437200383509108,
"grad_norm": 0.31559276580810547,
"learning_rate": 1.742374940064474e-05,
"loss": 0.0263,
"step": 19230
},
{
"epoch": 18.446788111217643,
"grad_norm": 0.23316271603107452,
"learning_rate": 1.737553087488825e-05,
"loss": 0.0316,
"step": 19240
},
{
"epoch": 18.456375838926174,
"grad_norm": 0.21858806908130646,
"learning_rate": 1.7327365126682726e-05,
"loss": 0.0272,
"step": 19250
},
{
"epoch": 18.46596356663471,
"grad_norm": 0.2449788898229599,
"learning_rate": 1.7279252233947286e-05,
"loss": 0.0286,
"step": 19260
},
{
"epoch": 18.47555129434324,
"grad_norm": 0.21250544488430023,
"learning_rate": 1.7231192274515562e-05,
"loss": 0.0247,
"step": 19270
},
{
"epoch": 18.485139022051772,
"grad_norm": 0.2528996169567108,
"learning_rate": 1.7183185326135543e-05,
"loss": 0.0253,
"step": 19280
},
{
"epoch": 18.494726749760307,
"grad_norm": 0.2549261748790741,
"learning_rate": 1.7135231466469463e-05,
"loss": 0.0294,
"step": 19290
},
{
"epoch": 18.50431447746884,
"grad_norm": 0.352224200963974,
"learning_rate": 1.7087330773093673e-05,
"loss": 0.0228,
"step": 19300
},
{
"epoch": 18.513902205177374,
"grad_norm": 0.18530428409576416,
"learning_rate": 1.7039483323498534e-05,
"loss": 0.0258,
"step": 19310
},
{
"epoch": 18.523489932885905,
"grad_norm": 0.14298230409622192,
"learning_rate": 1.6991689195088217e-05,
"loss": 0.0236,
"step": 19320
},
{
"epoch": 18.53307766059444,
"grad_norm": 0.2754952311515808,
"learning_rate": 1.6943948465180693e-05,
"loss": 0.0235,
"step": 19330
},
{
"epoch": 18.54266538830297,
"grad_norm": 0.2274174690246582,
"learning_rate": 1.6896261211007518e-05,
"loss": 0.0305,
"step": 19340
},
{
"epoch": 18.552253116011507,
"grad_norm": 0.3091070055961609,
"learning_rate": 1.684862750971376e-05,
"loss": 0.0227,
"step": 19350
},
{
"epoch": 18.561840843720038,
"grad_norm": 0.15530341863632202,
"learning_rate": 1.6801047438357818e-05,
"loss": 0.0224,
"step": 19360
},
{
"epoch": 18.571428571428573,
"grad_norm": 0.25515303015708923,
"learning_rate": 1.675352107391139e-05,
"loss": 0.0291,
"step": 19370
},
{
"epoch": 18.581016299137104,
"grad_norm": 0.27960437536239624,
"learning_rate": 1.670604849325923e-05,
"loss": 0.0258,
"step": 19380
},
{
"epoch": 18.59060402684564,
"grad_norm": 0.2250082641839981,
"learning_rate": 1.6658629773199124e-05,
"loss": 0.0232,
"step": 19390
},
{
"epoch": 18.60019175455417,
"grad_norm": 0.27883338928222656,
"learning_rate": 1.6611264990441706e-05,
"loss": 0.0343,
"step": 19400
},
{
"epoch": 18.609779482262702,
"grad_norm": 0.17993752658367157,
"learning_rate": 1.6563954221610355e-05,
"loss": 0.0278,
"step": 19410
},
{
"epoch": 18.619367209971237,
"grad_norm": 0.26345837116241455,
"learning_rate": 1.6516697543241083e-05,
"loss": 0.026,
"step": 19420
},
{
"epoch": 18.62895493767977,
"grad_norm": 0.22277230024337769,
"learning_rate": 1.646949503178239e-05,
"loss": 0.0213,
"step": 19430
},
{
"epoch": 18.638542665388304,
"grad_norm": 0.2661077082157135,
"learning_rate": 1.642234676359516e-05,
"loss": 0.0243,
"step": 19440
},
{
"epoch": 18.648130393096835,
"grad_norm": 0.28437718749046326,
"learning_rate": 1.6375252814952487e-05,
"loss": 0.0263,
"step": 19450
},
{
"epoch": 18.65771812080537,
"grad_norm": 0.20834000408649445,
"learning_rate": 1.6328213262039637e-05,
"loss": 0.0225,
"step": 19460
},
{
"epoch": 18.6673058485139,
"grad_norm": 0.24616943299770355,
"learning_rate": 1.6281228180953857e-05,
"loss": 0.0233,
"step": 19470
},
{
"epoch": 18.676893576222437,
"grad_norm": 0.21522550284862518,
"learning_rate": 1.623429764770428e-05,
"loss": 0.0233,
"step": 19480
},
{
"epoch": 18.686481303930968,
"grad_norm": 0.2068173587322235,
"learning_rate": 1.618742173821179e-05,
"loss": 0.0308,
"step": 19490
},
{
"epoch": 18.696069031639503,
"grad_norm": 0.5226014256477356,
"learning_rate": 1.614060052830891e-05,
"loss": 0.031,
"step": 19500
},
{
"epoch": 18.705656759348035,
"grad_norm": 0.18240250647068024,
"learning_rate": 1.6093834093739647e-05,
"loss": 0.0245,
"step": 19510
},
{
"epoch": 18.715244487056566,
"grad_norm": 0.2039356231689453,
"learning_rate": 1.6047122510159458e-05,
"loss": 0.02,
"step": 19520
},
{
"epoch": 18.7248322147651,
"grad_norm": 0.2688858211040497,
"learning_rate": 1.600046585313501e-05,
"loss": 0.0232,
"step": 19530
},
{
"epoch": 18.734419942473632,
"grad_norm": 0.3605387806892395,
"learning_rate": 1.5953864198144135e-05,
"loss": 0.0285,
"step": 19540
},
{
"epoch": 18.744007670182167,
"grad_norm": 0.19552724063396454,
"learning_rate": 1.5907317620575686e-05,
"loss": 0.0249,
"step": 19550
},
{
"epoch": 18.7535953978907,
"grad_norm": 0.2785275876522064,
"learning_rate": 1.58608261957294e-05,
"loss": 0.0301,
"step": 19560
},
{
"epoch": 18.763183125599234,
"grad_norm": 0.265622079372406,
"learning_rate": 1.5814389998815836e-05,
"loss": 0.0244,
"step": 19570
},
{
"epoch": 18.772770853307765,
"grad_norm": 0.22419176995754242,
"learning_rate": 1.5768009104956137e-05,
"loss": 0.0197,
"step": 19580
},
{
"epoch": 18.7823585810163,
"grad_norm": 0.22098082304000854,
"learning_rate": 1.572168358918204e-05,
"loss": 0.0219,
"step": 19590
},
{
"epoch": 18.79194630872483,
"grad_norm": 0.26601535081863403,
"learning_rate": 1.5675413526435677e-05,
"loss": 0.0234,
"step": 19600
},
{
"epoch": 18.801534036433367,
"grad_norm": 0.2946853041648865,
"learning_rate": 1.562919899156947e-05,
"loss": 0.0246,
"step": 19610
},
{
"epoch": 18.811121764141898,
"grad_norm": 0.3101515471935272,
"learning_rate": 1.558304005934602e-05,
"loss": 0.0244,
"step": 19620
},
{
"epoch": 18.82070949185043,
"grad_norm": 0.24001409113407135,
"learning_rate": 1.5536936804437963e-05,
"loss": 0.0254,
"step": 19630
},
{
"epoch": 18.830297219558965,
"grad_norm": 0.1419634222984314,
"learning_rate": 1.549088930142788e-05,
"loss": 0.0231,
"step": 19640
},
{
"epoch": 18.839884947267496,
"grad_norm": 0.24882347881793976,
"learning_rate": 1.544489762480815e-05,
"loss": 0.0201,
"step": 19650
},
{
"epoch": 18.84947267497603,
"grad_norm": 0.22982530295848846,
"learning_rate": 1.5398961848980838e-05,
"loss": 0.0218,
"step": 19660
},
{
"epoch": 18.859060402684563,
"grad_norm": 0.1851414293050766,
"learning_rate": 1.5353082048257596e-05,
"loss": 0.0267,
"step": 19670
},
{
"epoch": 18.868648130393098,
"grad_norm": 0.23806796967983246,
"learning_rate": 1.53072582968595e-05,
"loss": 0.0273,
"step": 19680
},
{
"epoch": 18.87823585810163,
"grad_norm": 0.2619253396987915,
"learning_rate": 1.526149066891697e-05,
"loss": 0.0263,
"step": 19690
},
{
"epoch": 18.887823585810164,
"grad_norm": 0.24336743354797363,
"learning_rate": 1.5215779238469641e-05,
"loss": 0.0344,
"step": 19700
},
{
"epoch": 18.897411313518695,
"grad_norm": 0.21095559000968933,
"learning_rate": 1.5170124079466186e-05,
"loss": 0.0239,
"step": 19710
},
{
"epoch": 18.90699904122723,
"grad_norm": 0.3702682554721832,
"learning_rate": 1.51245252657643e-05,
"loss": 0.0265,
"step": 19720
},
{
"epoch": 18.916586768935762,
"grad_norm": 0.45442819595336914,
"learning_rate": 1.5078982871130504e-05,
"loss": 0.0284,
"step": 19730
},
{
"epoch": 18.926174496644297,
"grad_norm": 0.1986912339925766,
"learning_rate": 1.5033496969240057e-05,
"loss": 0.025,
"step": 19740
},
{
"epoch": 18.93576222435283,
"grad_norm": 0.18418286740779877,
"learning_rate": 1.4988067633676816e-05,
"loss": 0.0255,
"step": 19750
},
{
"epoch": 18.94534995206136,
"grad_norm": 0.16016803681850433,
"learning_rate": 1.4942694937933144e-05,
"loss": 0.0224,
"step": 19760
},
{
"epoch": 18.954937679769895,
"grad_norm": 0.2799144387245178,
"learning_rate": 1.4897378955409763e-05,
"loss": 0.0279,
"step": 19770
},
{
"epoch": 18.964525407478426,
"grad_norm": 0.17058733105659485,
"learning_rate": 1.4852119759415661e-05,
"loss": 0.0257,
"step": 19780
},
{
"epoch": 18.97411313518696,
"grad_norm": 0.24392423033714294,
"learning_rate": 1.4806917423167944e-05,
"loss": 0.0237,
"step": 19790
},
{
"epoch": 18.983700862895493,
"grad_norm": 0.19233231246471405,
"learning_rate": 1.4761772019791748e-05,
"loss": 0.0296,
"step": 19800
},
{
"epoch": 18.993288590604028,
"grad_norm": 0.2076229453086853,
"learning_rate": 1.4716683622320105e-05,
"loss": 0.026,
"step": 19810
},
{
"epoch": 19.00287631831256,
"grad_norm": 0.22467122972011566,
"learning_rate": 1.4671652303693806e-05,
"loss": 0.028,
"step": 19820
},
{
"epoch": 19.012464046021094,
"grad_norm": 0.16231553256511688,
"learning_rate": 1.4626678136761369e-05,
"loss": 0.0248,
"step": 19830
},
{
"epoch": 19.022051773729626,
"grad_norm": 0.24173732101917267,
"learning_rate": 1.4581761194278765e-05,
"loss": 0.0318,
"step": 19840
},
{
"epoch": 19.03163950143816,
"grad_norm": 0.21880550682544708,
"learning_rate": 1.4536901548909448e-05,
"loss": 0.0299,
"step": 19850
},
{
"epoch": 19.041227229146692,
"grad_norm": 0.3532547950744629,
"learning_rate": 1.4492099273224174e-05,
"loss": 0.0282,
"step": 19860
},
{
"epoch": 19.050814956855227,
"grad_norm": 0.20322856307029724,
"learning_rate": 1.4447354439700889e-05,
"loss": 0.0235,
"step": 19870
},
{
"epoch": 19.06040268456376,
"grad_norm": 0.2487279176712036,
"learning_rate": 1.4402667120724594e-05,
"loss": 0.0285,
"step": 19880
},
{
"epoch": 19.06999041227229,
"grad_norm": 0.19251792132854462,
"learning_rate": 1.4358037388587281e-05,
"loss": 0.0269,
"step": 19890
},
{
"epoch": 19.079578139980825,
"grad_norm": 0.2209775298833847,
"learning_rate": 1.4313465315487745e-05,
"loss": 0.0209,
"step": 19900
},
{
"epoch": 19.089165867689356,
"grad_norm": 0.15831854939460754,
"learning_rate": 1.4268950973531536e-05,
"loss": 0.023,
"step": 19910
},
{
"epoch": 19.09875359539789,
"grad_norm": 0.21416033804416656,
"learning_rate": 1.4224494434730794e-05,
"loss": 0.0217,
"step": 19920
},
{
"epoch": 19.108341323106423,
"grad_norm": 0.15104466676712036,
"learning_rate": 1.4180095771004154e-05,
"loss": 0.0212,
"step": 19930
},
{
"epoch": 19.117929050814958,
"grad_norm": 0.19750936329364777,
"learning_rate": 1.413575505417662e-05,
"loss": 0.0172,
"step": 19940
},
{
"epoch": 19.12751677852349,
"grad_norm": 0.26296430826187134,
"learning_rate": 1.4091472355979463e-05,
"loss": 0.0248,
"step": 19950
},
{
"epoch": 19.137104506232024,
"grad_norm": 0.20639511942863464,
"learning_rate": 1.404724774805008e-05,
"loss": 0.0255,
"step": 19960
},
{
"epoch": 19.146692233940556,
"grad_norm": 0.14402848482131958,
"learning_rate": 1.4003081301931909e-05,
"loss": 0.0273,
"step": 19970
},
{
"epoch": 19.15627996164909,
"grad_norm": 0.14169853925704956,
"learning_rate": 1.395897308907429e-05,
"loss": 0.0256,
"step": 19980
},
{
"epoch": 19.165867689357622,
"grad_norm": 0.17262916266918182,
"learning_rate": 1.3914923180832368e-05,
"loss": 0.0264,
"step": 19990
},
{
"epoch": 19.175455417066157,
"grad_norm": 0.13429339230060577,
"learning_rate": 1.3870931648466945e-05,
"loss": 0.0246,
"step": 20000
},
{
"epoch": 19.18504314477469,
"grad_norm": 0.2229502946138382,
"learning_rate": 1.3826998563144411e-05,
"loss": 0.0238,
"step": 20010
},
{
"epoch": 19.19463087248322,
"grad_norm": 0.1920672059059143,
"learning_rate": 1.3783123995936587e-05,
"loss": 0.0239,
"step": 20020
},
{
"epoch": 19.204218600191755,
"grad_norm": 0.22073961794376373,
"learning_rate": 1.373930801782064e-05,
"loss": 0.021,
"step": 20030
},
{
"epoch": 19.213806327900286,
"grad_norm": 0.3254948556423187,
"learning_rate": 1.369555069967895e-05,
"loss": 0.0253,
"step": 20040
},
{
"epoch": 19.22339405560882,
"grad_norm": 0.14167852699756622,
"learning_rate": 1.3651852112298995e-05,
"loss": 0.0207,
"step": 20050
},
{
"epoch": 19.232981783317353,
"grad_norm": 0.2205292135477066,
"learning_rate": 1.3608212326373249e-05,
"loss": 0.0266,
"step": 20060
},
{
"epoch": 19.242569511025888,
"grad_norm": 0.268951416015625,
"learning_rate": 1.3564631412499067e-05,
"loss": 0.0187,
"step": 20070
},
{
"epoch": 19.25215723873442,
"grad_norm": 0.18108440935611725,
"learning_rate": 1.3521109441178559e-05,
"loss": 0.0258,
"step": 20080
},
{
"epoch": 19.261744966442954,
"grad_norm": 0.2803739905357361,
"learning_rate": 1.3477646482818474e-05,
"loss": 0.0263,
"step": 20090
},
{
"epoch": 19.271332694151486,
"grad_norm": 0.2689793109893799,
"learning_rate": 1.3434242607730108e-05,
"loss": 0.0254,
"step": 20100
},
{
"epoch": 19.28092042186002,
"grad_norm": 0.3495311439037323,
"learning_rate": 1.3390897886129162e-05,
"loss": 0.0277,
"step": 20110
},
{
"epoch": 19.290508149568552,
"grad_norm": 0.17431464791297913,
"learning_rate": 1.334761238813566e-05,
"loss": 0.0211,
"step": 20120
},
{
"epoch": 19.300095877277084,
"grad_norm": 0.3109664022922516,
"learning_rate": 1.3304386183773809e-05,
"loss": 0.0239,
"step": 20130
},
{
"epoch": 19.30968360498562,
"grad_norm": 0.15485496819019318,
"learning_rate": 1.3261219342971887e-05,
"loss": 0.0283,
"step": 20140
},
{
"epoch": 19.31927133269415,
"grad_norm": 0.18140093982219696,
"learning_rate": 1.3218111935562149e-05,
"loss": 0.0275,
"step": 20150
},
{
"epoch": 19.328859060402685,
"grad_norm": 0.22020739316940308,
"learning_rate": 1.3175064031280703e-05,
"loss": 0.0318,
"step": 20160
},
{
"epoch": 19.338446788111217,
"grad_norm": 0.27302905917167664,
"learning_rate": 1.3132075699767393e-05,
"loss": 0.0204,
"step": 20170
},
{
"epoch": 19.34803451581975,
"grad_norm": 0.20312833786010742,
"learning_rate": 1.3089147010565689e-05,
"loss": 0.0243,
"step": 20180
},
{
"epoch": 19.357622243528283,
"grad_norm": 0.2138754427433014,
"learning_rate": 1.3046278033122577e-05,
"loss": 0.0246,
"step": 20190
},
{
"epoch": 19.367209971236818,
"grad_norm": 0.15521451830863953,
"learning_rate": 1.3003468836788446e-05,
"loss": 0.0202,
"step": 20200
},
{
"epoch": 19.37679769894535,
"grad_norm": 0.14165331423282623,
"learning_rate": 1.296071949081698e-05,
"loss": 0.029,
"step": 20210
},
{
"epoch": 19.386385426653884,
"grad_norm": 0.17273680865764618,
"learning_rate": 1.2918030064365034e-05,
"loss": 0.0248,
"step": 20220
},
{
"epoch": 19.395973154362416,
"grad_norm": 0.1982639878988266,
"learning_rate": 1.2875400626492534e-05,
"loss": 0.0218,
"step": 20230
},
{
"epoch": 19.40556088207095,
"grad_norm": 0.20939846336841583,
"learning_rate": 1.2832831246162359e-05,
"loss": 0.0221,
"step": 20240
},
{
"epoch": 19.415148609779482,
"grad_norm": 0.2230292111635208,
"learning_rate": 1.2790321992240228e-05,
"loss": 0.0262,
"step": 20250
},
{
"epoch": 19.424736337488014,
"grad_norm": 0.17387695610523224,
"learning_rate": 1.2747872933494615e-05,
"loss": 0.0231,
"step": 20260
},
{
"epoch": 19.43432406519655,
"grad_norm": 0.2639104723930359,
"learning_rate": 1.2705484138596552e-05,
"loss": 0.0227,
"step": 20270
},
{
"epoch": 19.44391179290508,
"grad_norm": 0.2716933488845825,
"learning_rate": 1.2663155676119665e-05,
"loss": 0.025,
"step": 20280
},
{
"epoch": 19.453499520613615,
"grad_norm": 0.2254800945520401,
"learning_rate": 1.2620887614539917e-05,
"loss": 0.0236,
"step": 20290
},
{
"epoch": 19.463087248322147,
"grad_norm": 0.1728450208902359,
"learning_rate": 1.2578680022235585e-05,
"loss": 0.0237,
"step": 20300
},
{
"epoch": 19.47267497603068,
"grad_norm": 0.2077593207359314,
"learning_rate": 1.253653296748712e-05,
"loss": 0.026,
"step": 20310
},
{
"epoch": 19.482262703739213,
"grad_norm": 0.20576708018779755,
"learning_rate": 1.2494446518477022e-05,
"loss": 0.021,
"step": 20320
},
{
"epoch": 19.491850431447748,
"grad_norm": 0.2826680839061737,
"learning_rate": 1.2452420743289778e-05,
"loss": 0.0241,
"step": 20330
},
{
"epoch": 19.50143815915628,
"grad_norm": 0.3109418451786041,
"learning_rate": 1.2410455709911694e-05,
"loss": 0.0262,
"step": 20340
},
{
"epoch": 19.511025886864815,
"grad_norm": 0.6010233759880066,
"learning_rate": 1.2368551486230828e-05,
"loss": 0.0261,
"step": 20350
},
{
"epoch": 19.520613614573346,
"grad_norm": 0.33683139085769653,
"learning_rate": 1.2326708140036852e-05,
"loss": 0.0268,
"step": 20360
},
{
"epoch": 19.530201342281877,
"grad_norm": 0.1394880712032318,
"learning_rate": 1.2284925739020974e-05,
"loss": 0.0287,
"step": 20370
},
{
"epoch": 19.539789069990412,
"grad_norm": 0.2836284935474396,
"learning_rate": 1.2243204350775789e-05,
"loss": 0.0215,
"step": 20380
},
{
"epoch": 19.549376797698944,
"grad_norm": 0.9439190626144409,
"learning_rate": 1.2201544042795198e-05,
"loss": 0.0254,
"step": 20390
},
{
"epoch": 19.55896452540748,
"grad_norm": 0.18774332106113434,
"learning_rate": 1.215994488247431e-05,
"loss": 0.0273,
"step": 20400
},
{
"epoch": 19.56855225311601,
"grad_norm": 0.4038194715976715,
"learning_rate": 1.211840693710926e-05,
"loss": 0.0186,
"step": 20410
},
{
"epoch": 19.578139980824545,
"grad_norm": 0.2532286047935486,
"learning_rate": 1.2076930273897214e-05,
"loss": 0.0303,
"step": 20420
},
{
"epoch": 19.587727708533077,
"grad_norm": 0.23393119871616364,
"learning_rate": 1.2035514959936144e-05,
"loss": 0.0223,
"step": 20430
},
{
"epoch": 19.59731543624161,
"grad_norm": 0.17693249881267548,
"learning_rate": 1.199416106222484e-05,
"loss": 0.0222,
"step": 20440
},
{
"epoch": 19.606903163950143,
"grad_norm": 0.4991660714149475,
"learning_rate": 1.1952868647662696e-05,
"loss": 0.0255,
"step": 20450
},
{
"epoch": 19.616490891658678,
"grad_norm": 0.24061451852321625,
"learning_rate": 1.1911637783049645e-05,
"loss": 0.0271,
"step": 20460
},
{
"epoch": 19.62607861936721,
"grad_norm": 0.20236246287822723,
"learning_rate": 1.1870468535086054e-05,
"loss": 0.0274,
"step": 20470
},
{
"epoch": 19.635666347075745,
"grad_norm": 0.16982276737689972,
"learning_rate": 1.1829360970372604e-05,
"loss": 0.0275,
"step": 20480
},
{
"epoch": 19.645254074784276,
"grad_norm": 0.17934417724609375,
"learning_rate": 1.1788315155410212e-05,
"loss": 0.02,
"step": 20490
},
{
"epoch": 19.654841802492808,
"grad_norm": 0.2388330101966858,
"learning_rate": 1.1747331156599873e-05,
"loss": 0.0192,
"step": 20500
},
{
"epoch": 19.664429530201343,
"grad_norm": 0.19787994027137756,
"learning_rate": 1.1706409040242588e-05,
"loss": 0.0261,
"step": 20510
},
{
"epoch": 19.674017257909874,
"grad_norm": 0.2273687720298767,
"learning_rate": 1.166554887253926e-05,
"loss": 0.0246,
"step": 20520
},
{
"epoch": 19.68360498561841,
"grad_norm": 0.33494409918785095,
"learning_rate": 1.1624750719590588e-05,
"loss": 0.0249,
"step": 20530
},
{
"epoch": 19.69319271332694,
"grad_norm": 0.2192111760377884,
"learning_rate": 1.158401464739689e-05,
"loss": 0.0239,
"step": 20540
},
{
"epoch": 19.702780441035475,
"grad_norm": 0.2234772890806198,
"learning_rate": 1.154334072185811e-05,
"loss": 0.0227,
"step": 20550
},
{
"epoch": 19.712368168744007,
"grad_norm": 0.3074262738227844,
"learning_rate": 1.1502729008773639e-05,
"loss": 0.027,
"step": 20560
},
{
"epoch": 19.721955896452542,
"grad_norm": 0.22344590723514557,
"learning_rate": 1.146217957384223e-05,
"loss": 0.0251,
"step": 20570
},
{
"epoch": 19.731543624161073,
"grad_norm": 0.12177485972642899,
"learning_rate": 1.1421692482661856e-05,
"loss": 0.0246,
"step": 20580
},
{
"epoch": 19.74113135186961,
"grad_norm": 0.12450756132602692,
"learning_rate": 1.1381267800729695e-05,
"loss": 0.0247,
"step": 20590
},
{
"epoch": 19.75071907957814,
"grad_norm": 0.26811161637306213,
"learning_rate": 1.1340905593441914e-05,
"loss": 0.0263,
"step": 20600
},
{
"epoch": 19.760306807286675,
"grad_norm": 0.18584440648555756,
"learning_rate": 1.1300605926093627e-05,
"loss": 0.0259,
"step": 20610
},
{
"epoch": 19.769894534995206,
"grad_norm": 0.15904641151428223,
"learning_rate": 1.1260368863878778e-05,
"loss": 0.0239,
"step": 20620
},
{
"epoch": 19.779482262703738,
"grad_norm": 0.22534583508968353,
"learning_rate": 1.1220194471890027e-05,
"loss": 0.0234,
"step": 20630
},
{
"epoch": 19.789069990412273,
"grad_norm": 0.22182218730449677,
"learning_rate": 1.1180082815118659e-05,
"loss": 0.0255,
"step": 20640
},
{
"epoch": 19.798657718120804,
"grad_norm": 0.13675539195537567,
"learning_rate": 1.114003395845446e-05,
"loss": 0.0203,
"step": 20650
},
{
"epoch": 19.80824544582934,
"grad_norm": 0.153213232755661,
"learning_rate": 1.1100047966685645e-05,
"loss": 0.0235,
"step": 20660
},
{
"epoch": 19.81783317353787,
"grad_norm": 0.23550502955913544,
"learning_rate": 1.1060124904498686e-05,
"loss": 0.0262,
"step": 20670
},
{
"epoch": 19.827420901246406,
"grad_norm": 0.16561271250247955,
"learning_rate": 1.10202648364783e-05,
"loss": 0.0252,
"step": 20680
},
{
"epoch": 19.837008628954937,
"grad_norm": 0.21752074360847473,
"learning_rate": 1.0980467827107265e-05,
"loss": 0.025,
"step": 20690
},
{
"epoch": 19.846596356663472,
"grad_norm": 0.3683970868587494,
"learning_rate": 1.0940733940766367e-05,
"loss": 0.0275,
"step": 20700
},
{
"epoch": 19.856184084372003,
"grad_norm": 0.19650644063949585,
"learning_rate": 1.090106324173426e-05,
"loss": 0.0227,
"step": 20710
},
{
"epoch": 19.86577181208054,
"grad_norm": 0.3195613622665405,
"learning_rate": 1.0861455794187398e-05,
"loss": 0.0246,
"step": 20720
},
{
"epoch": 19.87535953978907,
"grad_norm": 0.30373549461364746,
"learning_rate": 1.0821911662199874e-05,
"loss": 0.0289,
"step": 20730
},
{
"epoch": 19.8849472674976,
"grad_norm": 0.23305653035640717,
"learning_rate": 1.0782430909743407e-05,
"loss": 0.0317,
"step": 20740
},
{
"epoch": 19.894534995206136,
"grad_norm": 0.19694805145263672,
"learning_rate": 1.0743013600687146e-05,
"loss": 0.021,
"step": 20750
},
{
"epoch": 19.904122722914668,
"grad_norm": 0.18307265639305115,
"learning_rate": 1.0703659798797616e-05,
"loss": 0.0215,
"step": 20760
},
{
"epoch": 19.913710450623203,
"grad_norm": 0.15986226499080658,
"learning_rate": 1.0664369567738608e-05,
"loss": 0.0254,
"step": 20770
},
{
"epoch": 19.923298178331734,
"grad_norm": 0.22868862748146057,
"learning_rate": 1.0625142971071067e-05,
"loss": 0.0209,
"step": 20780
},
{
"epoch": 19.93288590604027,
"grad_norm": 0.23605976998806,
"learning_rate": 1.0585980072253005e-05,
"loss": 0.0251,
"step": 20790
},
{
"epoch": 19.9424736337488,
"grad_norm": 0.40362289547920227,
"learning_rate": 1.0546880934639364e-05,
"loss": 0.0291,
"step": 20800
},
{
"epoch": 19.952061361457336,
"grad_norm": 0.23181037604808807,
"learning_rate": 1.0507845621481954e-05,
"loss": 0.0239,
"step": 20810
},
{
"epoch": 19.961649089165867,
"grad_norm": 0.17109474539756775,
"learning_rate": 1.046887419592935e-05,
"loss": 0.0234,
"step": 20820
},
{
"epoch": 19.971236816874402,
"grad_norm": 0.19465407729148865,
"learning_rate": 1.0429966721026751e-05,
"loss": 0.0215,
"step": 20830
},
{
"epoch": 19.980824544582934,
"grad_norm": 0.22324107587337494,
"learning_rate": 1.0391123259715906e-05,
"loss": 0.0196,
"step": 20840
},
{
"epoch": 19.99041227229147,
"grad_norm": 0.3217203915119171,
"learning_rate": 1.0352343874835018e-05,
"loss": 0.0234,
"step": 20850
},
{
"epoch": 20.0,
"grad_norm": 0.33047834038734436,
"learning_rate": 1.0313628629118616e-05,
"loss": 0.0276,
"step": 20860
},
{
"epoch": 20.00958772770853,
"grad_norm": 0.23915094137191772,
"learning_rate": 1.0274977585197482e-05,
"loss": 0.0225,
"step": 20870
},
{
"epoch": 20.019175455417066,
"grad_norm": 0.19355012476444244,
"learning_rate": 1.0236390805598516e-05,
"loss": 0.0232,
"step": 20880
},
{
"epoch": 20.028763183125598,
"grad_norm": 0.4942316710948944,
"learning_rate": 1.01978683527447e-05,
"loss": 0.0262,
"step": 20890
},
{
"epoch": 20.038350910834133,
"grad_norm": 0.17617733776569366,
"learning_rate": 1.0159410288954912e-05,
"loss": 0.023,
"step": 20900
},
{
"epoch": 20.047938638542664,
"grad_norm": 0.21215124428272247,
"learning_rate": 1.0121016676443878e-05,
"loss": 0.0187,
"step": 20910
},
{
"epoch": 20.0575263662512,
"grad_norm": 0.23251253366470337,
"learning_rate": 1.008268757732207e-05,
"loss": 0.027,
"step": 20920
},
{
"epoch": 20.06711409395973,
"grad_norm": 0.16768194735050201,
"learning_rate": 1.0044423053595559e-05,
"loss": 0.0182,
"step": 20930
},
{
"epoch": 20.076701821668266,
"grad_norm": 0.19847920536994934,
"learning_rate": 1.000622316716599e-05,
"loss": 0.0284,
"step": 20940
},
{
"epoch": 20.086289549376797,
"grad_norm": 0.13995741307735443,
"learning_rate": 9.968087979830432e-06,
"loss": 0.0192,
"step": 20950
},
{
"epoch": 20.095877277085332,
"grad_norm": 0.1870267242193222,
"learning_rate": 9.930017553281279e-06,
"loss": 0.0259,
"step": 20960
},
{
"epoch": 20.105465004793864,
"grad_norm": 0.22275184094905853,
"learning_rate": 9.892011949106172e-06,
"loss": 0.0248,
"step": 20970
},
{
"epoch": 20.1150527325024,
"grad_norm": 0.14587286114692688,
"learning_rate": 9.854071228787875e-06,
"loss": 0.0234,
"step": 20980
},
{
"epoch": 20.12464046021093,
"grad_norm": 0.32902705669403076,
"learning_rate": 9.816195453704191e-06,
"loss": 0.0233,
"step": 20990
},
{
"epoch": 20.13422818791946,
"grad_norm": 0.17466923594474792,
"learning_rate": 9.778384685127867e-06,
"loss": 0.023,
"step": 21000
},
{
"epoch": 20.143815915627997,
"grad_norm": 0.17678095400333405,
"learning_rate": 9.740638984226481e-06,
"loss": 0.0265,
"step": 21010
},
{
"epoch": 20.153403643336528,
"grad_norm": 0.16016939282417297,
"learning_rate": 9.70295841206234e-06,
"loss": 0.0248,
"step": 21020
},
{
"epoch": 20.162991371045063,
"grad_norm": 0.2382485419511795,
"learning_rate": 9.665343029592417e-06,
"loss": 0.0233,
"step": 21030
},
{
"epoch": 20.172579098753594,
"grad_norm": 0.24307946860790253,
"learning_rate": 9.627792897668175e-06,
"loss": 0.025,
"step": 21040
},
{
"epoch": 20.18216682646213,
"grad_norm": 0.4551367461681366,
"learning_rate": 9.590308077035592e-06,
"loss": 0.0211,
"step": 21050
},
{
"epoch": 20.19175455417066,
"grad_norm": 0.2893312871456146,
"learning_rate": 9.55288862833495e-06,
"loss": 0.0206,
"step": 21060
},
{
"epoch": 20.201342281879196,
"grad_norm": 0.16855131089687347,
"learning_rate": 9.515534612100746e-06,
"loss": 0.027,
"step": 21070
},
{
"epoch": 20.210930009587727,
"grad_norm": 0.3295097053050995,
"learning_rate": 9.478246088761671e-06,
"loss": 0.0282,
"step": 21080
},
{
"epoch": 20.220517737296262,
"grad_norm": 0.1354684680700302,
"learning_rate": 9.441023118640457e-06,
"loss": 0.0278,
"step": 21090
},
{
"epoch": 20.230105465004794,
"grad_norm": 0.16148221492767334,
"learning_rate": 9.403865761953779e-06,
"loss": 0.0287,
"step": 21100
},
{
"epoch": 20.239693192713325,
"grad_norm": 0.3596087098121643,
"learning_rate": 9.366774078812174e-06,
"loss": 0.0273,
"step": 21110
},
{
"epoch": 20.24928092042186,
"grad_norm": 0.24658294022083282,
"learning_rate": 9.329748129219934e-06,
"loss": 0.0224,
"step": 21120
},
{
"epoch": 20.25886864813039,
"grad_norm": 0.2896967828273773,
"learning_rate": 9.292787973075007e-06,
"loss": 0.0203,
"step": 21130
},
{
"epoch": 20.268456375838927,
"grad_norm": 0.2359636425971985,
"learning_rate": 9.255893670168919e-06,
"loss": 0.0241,
"step": 21140
},
{
"epoch": 20.278044103547458,
"grad_norm": 0.24353083968162537,
"learning_rate": 9.219065280186656e-06,
"loss": 0.0247,
"step": 21150
},
{
"epoch": 20.287631831255993,
"grad_norm": 0.14789700508117676,
"learning_rate": 9.182302862706566e-06,
"loss": 0.0191,
"step": 21160
},
{
"epoch": 20.297219558964525,
"grad_norm": 0.27849042415618896,
"learning_rate": 9.145606477200286e-06,
"loss": 0.0217,
"step": 21170
},
{
"epoch": 20.30680728667306,
"grad_norm": 0.4151756763458252,
"learning_rate": 9.108976183032613e-06,
"loss": 0.0233,
"step": 21180
},
{
"epoch": 20.31639501438159,
"grad_norm": 0.2625637948513031,
"learning_rate": 9.072412039461453e-06,
"loss": 0.0223,
"step": 21190
},
{
"epoch": 20.325982742090126,
"grad_norm": 0.22075968980789185,
"learning_rate": 9.035914105637678e-06,
"loss": 0.0239,
"step": 21200
},
{
"epoch": 20.335570469798657,
"grad_norm": 0.22036759555339813,
"learning_rate": 8.99948244060505e-06,
"loss": 0.0243,
"step": 21210
},
{
"epoch": 20.345158197507192,
"grad_norm": 0.4981054663658142,
"learning_rate": 8.963117103300134e-06,
"loss": 0.0207,
"step": 21220
},
{
"epoch": 20.354745925215724,
"grad_norm": 0.20227645337581635,
"learning_rate": 8.92681815255219e-06,
"loss": 0.0198,
"step": 21230
},
{
"epoch": 20.364333652924255,
"grad_norm": 0.24407237768173218,
"learning_rate": 8.890585647083088e-06,
"loss": 0.0292,
"step": 21240
},
{
"epoch": 20.37392138063279,
"grad_norm": 0.4346962869167328,
"learning_rate": 8.8544196455072e-06,
"loss": 0.0237,
"step": 21250
},
{
"epoch": 20.383509108341322,
"grad_norm": 0.32540345191955566,
"learning_rate": 8.818320206331327e-06,
"loss": 0.0237,
"step": 21260
},
{
"epoch": 20.393096836049857,
"grad_norm": 0.2086063176393509,
"learning_rate": 8.782287387954563e-06,
"loss": 0.0215,
"step": 21270
},
{
"epoch": 20.40268456375839,
"grad_norm": 0.2799685001373291,
"learning_rate": 8.74632124866826e-06,
"loss": 0.0283,
"step": 21280
},
{
"epoch": 20.412272291466923,
"grad_norm": 0.40446561574935913,
"learning_rate": 8.71042184665588e-06,
"loss": 0.0244,
"step": 21290
},
{
"epoch": 20.421860019175455,
"grad_norm": 0.20995816588401794,
"learning_rate": 8.674589239992931e-06,
"loss": 0.0301,
"step": 21300
},
{
"epoch": 20.43144774688399,
"grad_norm": 0.25973740220069885,
"learning_rate": 8.638823486646853e-06,
"loss": 0.0241,
"step": 21310
},
{
"epoch": 20.44103547459252,
"grad_norm": 0.31719037890434265,
"learning_rate": 8.603124644476945e-06,
"loss": 0.0207,
"step": 21320
},
{
"epoch": 20.450623202301056,
"grad_norm": 0.1637444943189621,
"learning_rate": 8.56749277123427e-06,
"loss": 0.0264,
"step": 21330
},
{
"epoch": 20.460210930009588,
"grad_norm": 0.3017114996910095,
"learning_rate": 8.531927924561538e-06,
"loss": 0.0271,
"step": 21340
},
{
"epoch": 20.469798657718123,
"grad_norm": 0.20100443065166473,
"learning_rate": 8.496430161993036e-06,
"loss": 0.0247,
"step": 21350
},
{
"epoch": 20.479386385426654,
"grad_norm": 0.2818273603916168,
"learning_rate": 8.460999540954517e-06,
"loss": 0.0278,
"step": 21360
},
{
"epoch": 20.488974113135185,
"grad_norm": 0.22835665941238403,
"learning_rate": 8.425636118763136e-06,
"loss": 0.0228,
"step": 21370
},
{
"epoch": 20.49856184084372,
"grad_norm": 0.24139605462551117,
"learning_rate": 8.390339952627324e-06,
"loss": 0.0279,
"step": 21380
},
{
"epoch": 20.508149568552252,
"grad_norm": 0.17489181458950043,
"learning_rate": 8.355111099646712e-06,
"loss": 0.0255,
"step": 21390
},
{
"epoch": 20.517737296260787,
"grad_norm": 0.14566893875598907,
"learning_rate": 8.319949616812039e-06,
"loss": 0.0222,
"step": 21400
},
{
"epoch": 20.52732502396932,
"grad_norm": 0.2523178160190582,
"learning_rate": 8.284855561005062e-06,
"loss": 0.0194,
"step": 21410
},
{
"epoch": 20.536912751677853,
"grad_norm": 0.20255376398563385,
"learning_rate": 8.249828988998448e-06,
"loss": 0.0233,
"step": 21420
},
{
"epoch": 20.546500479386385,
"grad_norm": 0.2267649918794632,
"learning_rate": 8.214869957455694e-06,
"loss": 0.0247,
"step": 21430
},
{
"epoch": 20.55608820709492,
"grad_norm": 0.20469827950000763,
"learning_rate": 8.179978522931058e-06,
"loss": 0.0196,
"step": 21440
},
{
"epoch": 20.56567593480345,
"grad_norm": 0.2033228874206543,
"learning_rate": 8.14515474186941e-06,
"loss": 0.0284,
"step": 21450
},
{
"epoch": 20.575263662511986,
"grad_norm": 0.20115645229816437,
"learning_rate": 8.1103986706062e-06,
"loss": 0.0263,
"step": 21460
},
{
"epoch": 20.584851390220518,
"grad_norm": 0.12906615436077118,
"learning_rate": 8.075710365367328e-06,
"loss": 0.0207,
"step": 21470
},
{
"epoch": 20.59443911792905,
"grad_norm": 0.2021467238664627,
"learning_rate": 8.041089882269082e-06,
"loss": 0.0286,
"step": 21480
},
{
"epoch": 20.604026845637584,
"grad_norm": 0.21031218767166138,
"learning_rate": 8.00653727731801e-06,
"loss": 0.0211,
"step": 21490
},
{
"epoch": 20.613614573346116,
"grad_norm": 0.19011792540550232,
"learning_rate": 7.972052606410873e-06,
"loss": 0.024,
"step": 21500
},
{
"epoch": 20.62320230105465,
"grad_norm": 0.18954189121723175,
"learning_rate": 7.937635925334525e-06,
"loss": 0.0273,
"step": 21510
},
{
"epoch": 20.632790028763182,
"grad_norm": 0.14838428795337677,
"learning_rate": 7.903287289765826e-06,
"loss": 0.0218,
"step": 21520
},
{
"epoch": 20.642377756471717,
"grad_norm": 0.5171023607254028,
"learning_rate": 7.869006755271568e-06,
"loss": 0.0221,
"step": 21530
},
{
"epoch": 20.65196548418025,
"grad_norm": 0.21473269164562225,
"learning_rate": 7.834794377308358e-06,
"loss": 0.0247,
"step": 21540
},
{
"epoch": 20.661553211888783,
"grad_norm": 0.37152165174484253,
"learning_rate": 7.800650211222554e-06,
"loss": 0.0258,
"step": 21550
},
{
"epoch": 20.671140939597315,
"grad_norm": 0.23063114285469055,
"learning_rate": 7.766574312250168e-06,
"loss": 0.0208,
"step": 21560
},
{
"epoch": 20.68072866730585,
"grad_norm": 0.12213137745857239,
"learning_rate": 7.732566735516777e-06,
"loss": 0.0229,
"step": 21570
},
{
"epoch": 20.69031639501438,
"grad_norm": 0.22326141595840454,
"learning_rate": 7.698627536037411e-06,
"loss": 0.0248,
"step": 21580
},
{
"epoch": 20.699904122722916,
"grad_norm": 0.21889561414718628,
"learning_rate": 7.664756768716513e-06,
"loss": 0.0218,
"step": 21590
},
{
"epoch": 20.709491850431448,
"grad_norm": 0.16505682468414307,
"learning_rate": 7.630954488347797e-06,
"loss": 0.0247,
"step": 21600
},
{
"epoch": 20.71907957813998,
"grad_norm": 0.18476873636245728,
"learning_rate": 7.5972207496142036e-06,
"loss": 0.0279,
"step": 21610
},
{
"epoch": 20.728667305848514,
"grad_norm": 0.18805384635925293,
"learning_rate": 7.56355560708778e-06,
"loss": 0.0269,
"step": 21620
},
{
"epoch": 20.738255033557046,
"grad_norm": 0.19865307211875916,
"learning_rate": 7.52995911522959e-06,
"loss": 0.0231,
"step": 21630
},
{
"epoch": 20.74784276126558,
"grad_norm": 0.25113141536712646,
"learning_rate": 7.496431328389658e-06,
"loss": 0.02,
"step": 21640
},
{
"epoch": 20.757430488974112,
"grad_norm": 0.2101268321275711,
"learning_rate": 7.4629723008068584e-06,
"loss": 0.0275,
"step": 21650
},
{
"epoch": 20.767018216682647,
"grad_norm": 0.2648563086986542,
"learning_rate": 7.429582086608849e-06,
"loss": 0.0239,
"step": 21660
},
{
"epoch": 20.77660594439118,
"grad_norm": 0.19610466063022614,
"learning_rate": 7.396260739811933e-06,
"loss": 0.0248,
"step": 21670
},
{
"epoch": 20.786193672099714,
"grad_norm": 0.22526168823242188,
"learning_rate": 7.363008314321024e-06,
"loss": 0.0185,
"step": 21680
},
{
"epoch": 20.795781399808245,
"grad_norm": 0.20826375484466553,
"learning_rate": 7.3298248639295405e-06,
"loss": 0.0215,
"step": 21690
},
{
"epoch": 20.80536912751678,
"grad_norm": 0.2082778960466385,
"learning_rate": 7.296710442319305e-06,
"loss": 0.0246,
"step": 21700
},
{
"epoch": 20.81495685522531,
"grad_norm": 0.19884945452213287,
"learning_rate": 7.2636651030604855e-06,
"loss": 0.0197,
"step": 21710
},
{
"epoch": 20.824544582933846,
"grad_norm": 0.30188602209091187,
"learning_rate": 7.230688899611487e-06,
"loss": 0.0188,
"step": 21720
},
{
"epoch": 20.834132310642378,
"grad_norm": 0.2548470199108124,
"learning_rate": 7.197781885318866e-06,
"loss": 0.0223,
"step": 21730
},
{
"epoch": 20.84372003835091,
"grad_norm": 0.42960646748542786,
"learning_rate": 7.16494411341726e-06,
"loss": 0.0265,
"step": 21740
},
{
"epoch": 20.853307766059444,
"grad_norm": 0.1879805475473404,
"learning_rate": 7.132175637029293e-06,
"loss": 0.0225,
"step": 21750
},
{
"epoch": 20.862895493767976,
"grad_norm": 0.29146450757980347,
"learning_rate": 7.099476509165459e-06,
"loss": 0.0254,
"step": 21760
},
{
"epoch": 20.87248322147651,
"grad_norm": 0.27178776264190674,
"learning_rate": 7.066846782724107e-06,
"loss": 0.0253,
"step": 21770
},
{
"epoch": 20.882070949185042,
"grad_norm": 0.1681770384311676,
"learning_rate": 7.034286510491278e-06,
"loss": 0.02,
"step": 21780
},
{
"epoch": 20.891658676893577,
"grad_norm": 0.17788025736808777,
"learning_rate": 7.001795745140683e-06,
"loss": 0.0265,
"step": 21790
},
{
"epoch": 20.90124640460211,
"grad_norm": 0.29857704043388367,
"learning_rate": 6.969374539233553e-06,
"loss": 0.0193,
"step": 21800
},
{
"epoch": 20.910834132310644,
"grad_norm": 0.3122943937778473,
"learning_rate": 6.937022945218647e-06,
"loss": 0.0252,
"step": 21810
},
{
"epoch": 20.920421860019175,
"grad_norm": 0.22873246669769287,
"learning_rate": 6.904741015432059e-06,
"loss": 0.0292,
"step": 21820
},
{
"epoch": 20.93000958772771,
"grad_norm": 0.23916493356227875,
"learning_rate": 6.872528802097211e-06,
"loss": 0.0224,
"step": 21830
},
{
"epoch": 20.93959731543624,
"grad_norm": 0.16214150190353394,
"learning_rate": 6.84038635732473e-06,
"loss": 0.0225,
"step": 21840
},
{
"epoch": 20.949185043144773,
"grad_norm": 0.2523308992385864,
"learning_rate": 6.808313733112387e-06,
"loss": 0.0237,
"step": 21850
},
{
"epoch": 20.958772770853308,
"grad_norm": 0.1933407187461853,
"learning_rate": 6.776310981344996e-06,
"loss": 0.021,
"step": 21860
},
{
"epoch": 20.96836049856184,
"grad_norm": 0.1911810338497162,
"learning_rate": 6.744378153794334e-06,
"loss": 0.0242,
"step": 21870
},
{
"epoch": 20.977948226270374,
"grad_norm": 0.2139926254749298,
"learning_rate": 6.712515302119077e-06,
"loss": 0.021,
"step": 21880
},
{
"epoch": 20.987535953978906,
"grad_norm": 0.3361279368400574,
"learning_rate": 6.680722477864665e-06,
"loss": 0.0263,
"step": 21890
},
{
"epoch": 20.99712368168744,
"grad_norm": 0.14909407496452332,
"learning_rate": 6.648999732463284e-06,
"loss": 0.0214,
"step": 21900
},
{
"epoch": 21.006711409395972,
"grad_norm": 0.318256139755249,
"learning_rate": 6.617347117233735e-06,
"loss": 0.0296,
"step": 21910
},
{
"epoch": 21.016299137104507,
"grad_norm": 0.15366911888122559,
"learning_rate": 6.585764683381379e-06,
"loss": 0.0262,
"step": 21920
},
{
"epoch": 21.02588686481304,
"grad_norm": 0.3859999179840088,
"learning_rate": 6.554252481998035e-06,
"loss": 0.0229,
"step": 21930
},
{
"epoch": 21.035474592521574,
"grad_norm": 0.22637765109539032,
"learning_rate": 6.522810564061899e-06,
"loss": 0.0284,
"step": 21940
},
{
"epoch": 21.045062320230105,
"grad_norm": 0.2992878556251526,
"learning_rate": 6.491438980437475e-06,
"loss": 0.0254,
"step": 21950
},
{
"epoch": 21.05465004793864,
"grad_norm": 0.2881068289279938,
"learning_rate": 6.460137781875497e-06,
"loss": 0.029,
"step": 21960
},
{
"epoch": 21.06423777564717,
"grad_norm": 0.19176606833934784,
"learning_rate": 6.4289070190128196e-06,
"loss": 0.0232,
"step": 21970
},
{
"epoch": 21.073825503355703,
"grad_norm": 0.1914961189031601,
"learning_rate": 6.3977467423723516e-06,
"loss": 0.0245,
"step": 21980
},
{
"epoch": 21.083413231064238,
"grad_norm": 0.31313207745552063,
"learning_rate": 6.366657002362975e-06,
"loss": 0.0296,
"step": 21990
},
{
"epoch": 21.09300095877277,
"grad_norm": 0.2076486051082611,
"learning_rate": 6.335637849279464e-06,
"loss": 0.0236,
"step": 22000
},
{
"epoch": 21.102588686481305,
"grad_norm": 0.27381840348243713,
"learning_rate": 6.304689333302416e-06,
"loss": 0.027,
"step": 22010
},
{
"epoch": 21.112176414189836,
"grad_norm": 0.24494417011737823,
"learning_rate": 6.2738115044981225e-06,
"loss": 0.0248,
"step": 22020
},
{
"epoch": 21.12176414189837,
"grad_norm": 0.18255186080932617,
"learning_rate": 6.24300441281856e-06,
"loss": 0.0209,
"step": 22030
},
{
"epoch": 21.131351869606902,
"grad_norm": 0.2896704077720642,
"learning_rate": 6.212268108101249e-06,
"loss": 0.0254,
"step": 22040
},
{
"epoch": 21.140939597315437,
"grad_norm": 0.19372302293777466,
"learning_rate": 6.1816026400692006e-06,
"loss": 0.0275,
"step": 22050
},
{
"epoch": 21.15052732502397,
"grad_norm": 0.19685117900371552,
"learning_rate": 6.151008058330832e-06,
"loss": 0.0244,
"step": 22060
},
{
"epoch": 21.160115052732504,
"grad_norm": 0.18076103925704956,
"learning_rate": 6.120484412379896e-06,
"loss": 0.0191,
"step": 22070
},
{
"epoch": 21.169702780441035,
"grad_norm": 0.37527182698249817,
"learning_rate": 6.090031751595371e-06,
"loss": 0.029,
"step": 22080
},
{
"epoch": 21.179290508149567,
"grad_norm": 0.1315630078315735,
"learning_rate": 6.059650125241412e-06,
"loss": 0.0222,
"step": 22090
},
{
"epoch": 21.188878235858102,
"grad_norm": 0.15893638134002686,
"learning_rate": 6.029339582467253e-06,
"loss": 0.0204,
"step": 22100
},
{
"epoch": 21.198465963566633,
"grad_norm": 0.27391794323921204,
"learning_rate": 5.999100172307154e-06,
"loss": 0.0251,
"step": 22110
},
{
"epoch": 21.20805369127517,
"grad_norm": 0.1774057298898697,
"learning_rate": 5.968931943680284e-06,
"loss": 0.0214,
"step": 22120
},
{
"epoch": 21.2176414189837,
"grad_norm": 0.18632689118385315,
"learning_rate": 5.938834945390653e-06,
"loss": 0.0222,
"step": 22130
},
{
"epoch": 21.227229146692235,
"grad_norm": 0.19212083518505096,
"learning_rate": 5.908809226127054e-06,
"loss": 0.0233,
"step": 22140
},
{
"epoch": 21.236816874400766,
"grad_norm": 0.1936277598142624,
"learning_rate": 5.878854834462977e-06,
"loss": 0.0188,
"step": 22150
},
{
"epoch": 21.2464046021093,
"grad_norm": 0.23681025207042694,
"learning_rate": 5.848971818856486e-06,
"loss": 0.0231,
"step": 22160
},
{
"epoch": 21.255992329817833,
"grad_norm": 0.13978900015354156,
"learning_rate": 5.819160227650216e-06,
"loss": 0.0176,
"step": 22170
},
{
"epoch": 21.265580057526368,
"grad_norm": 0.20834662020206451,
"learning_rate": 5.789420109071242e-06,
"loss": 0.0256,
"step": 22180
},
{
"epoch": 21.2751677852349,
"grad_norm": 0.15531818568706512,
"learning_rate": 5.759751511231021e-06,
"loss": 0.0237,
"step": 22190
},
{
"epoch": 21.284755512943434,
"grad_norm": 0.38519012928009033,
"learning_rate": 5.7301544821253054e-06,
"loss": 0.0213,
"step": 22200
},
{
"epoch": 21.294343240651965,
"grad_norm": 0.17564308643341064,
"learning_rate": 5.700629069634061e-06,
"loss": 0.0224,
"step": 22210
},
{
"epoch": 21.303930968360497,
"grad_norm": 0.21635904908180237,
"learning_rate": 5.67117532152141e-06,
"loss": 0.0231,
"step": 22220
},
{
"epoch": 21.313518696069032,
"grad_norm": 0.2563495934009552,
"learning_rate": 5.641793285435537e-06,
"loss": 0.03,
"step": 22230
},
{
"epoch": 21.323106423777563,
"grad_norm": 0.1372513771057129,
"learning_rate": 5.612483008908609e-06,
"loss": 0.0205,
"step": 22240
},
{
"epoch": 21.3326941514861,
"grad_norm": 0.4297633767127991,
"learning_rate": 5.583244539356719e-06,
"loss": 0.0283,
"step": 22250
},
{
"epoch": 21.34228187919463,
"grad_norm": 0.18425339460372925,
"learning_rate": 5.554077924079776e-06,
"loss": 0.0254,
"step": 22260
},
{
"epoch": 21.351869606903165,
"grad_norm": 0.24806487560272217,
"learning_rate": 5.524983210261481e-06,
"loss": 0.0186,
"step": 22270
},
{
"epoch": 21.361457334611696,
"grad_norm": 0.12550103664398193,
"learning_rate": 5.495960444969189e-06,
"loss": 0.0221,
"step": 22280
},
{
"epoch": 21.37104506232023,
"grad_norm": 0.40927961468696594,
"learning_rate": 5.467009675153861e-06,
"loss": 0.0215,
"step": 22290
},
{
"epoch": 21.380632790028763,
"grad_norm": 0.3342265188694,
"learning_rate": 5.438130947650006e-06,
"loss": 0.0274,
"step": 22300
},
{
"epoch": 21.390220517737298,
"grad_norm": 0.23332703113555908,
"learning_rate": 5.409324309175573e-06,
"loss": 0.0213,
"step": 22310
},
{
"epoch": 21.39980824544583,
"grad_norm": 0.6654828786849976,
"learning_rate": 5.380589806331904e-06,
"loss": 0.0305,
"step": 22320
},
{
"epoch": 21.409395973154364,
"grad_norm": 0.18974971771240234,
"learning_rate": 5.3519274856036414e-06,
"loss": 0.0255,
"step": 22330
},
{
"epoch": 21.418983700862896,
"grad_norm": 0.2501462697982788,
"learning_rate": 5.3233373933586405e-06,
"loss": 0.0273,
"step": 22340
},
{
"epoch": 21.428571428571427,
"grad_norm": 0.205572247505188,
"learning_rate": 5.294819575847937e-06,
"loss": 0.0234,
"step": 22350
},
{
"epoch": 21.438159156279962,
"grad_norm": 0.2879716157913208,
"learning_rate": 5.266374079205627e-06,
"loss": 0.0218,
"step": 22360
},
{
"epoch": 21.447746883988493,
"grad_norm": 0.2402382642030716,
"learning_rate": 5.238000949448818e-06,
"loss": 0.0261,
"step": 22370
},
{
"epoch": 21.45733461169703,
"grad_norm": 0.23849613964557648,
"learning_rate": 5.209700232477543e-06,
"loss": 0.0244,
"step": 22380
},
{
"epoch": 21.46692233940556,
"grad_norm": 0.21024039387702942,
"learning_rate": 5.181471974074692e-06,
"loss": 0.0229,
"step": 22390
},
{
"epoch": 21.476510067114095,
"grad_norm": 0.3411503732204437,
"learning_rate": 5.153316219905946e-06,
"loss": 0.0259,
"step": 22400
},
{
"epoch": 21.486097794822626,
"grad_norm": 0.22467151284217834,
"learning_rate": 5.1252330155196756e-06,
"loss": 0.0234,
"step": 22410
},
{
"epoch": 21.49568552253116,
"grad_norm": 0.29987016320228577,
"learning_rate": 5.097222406346908e-06,
"loss": 0.0273,
"step": 22420
},
{
"epoch": 21.505273250239693,
"grad_norm": 0.26795509457588196,
"learning_rate": 5.06928443770121e-06,
"loss": 0.0217,
"step": 22430
},
{
"epoch": 21.514860977948228,
"grad_norm": 0.1902526170015335,
"learning_rate": 5.041419154778648e-06,
"loss": 0.0237,
"step": 22440
},
{
"epoch": 21.52444870565676,
"grad_norm": 0.27109450101852417,
"learning_rate": 5.0136266026577e-06,
"loss": 0.0239,
"step": 22450
},
{
"epoch": 21.53403643336529,
"grad_norm": 0.3726276457309723,
"learning_rate": 4.9859068262991805e-06,
"loss": 0.0297,
"step": 22460
},
{
"epoch": 21.543624161073826,
"grad_norm": 0.19683849811553955,
"learning_rate": 4.958259870546178e-06,
"loss": 0.0246,
"step": 22470
},
{
"epoch": 21.553211888782357,
"grad_norm": 0.2269980013370514,
"learning_rate": 4.930685780123978e-06,
"loss": 0.0209,
"step": 22480
},
{
"epoch": 21.562799616490892,
"grad_norm": 0.19175530970096588,
"learning_rate": 4.903184599639987e-06,
"loss": 0.0231,
"step": 22490
},
{
"epoch": 21.572387344199424,
"grad_norm": 0.14310085773468018,
"learning_rate": 4.875756373583662e-06,
"loss": 0.0313,
"step": 22500
},
{
"epoch": 21.58197507190796,
"grad_norm": 0.18566519021987915,
"learning_rate": 4.848401146326442e-06,
"loss": 0.0204,
"step": 22510
},
{
"epoch": 21.59156279961649,
"grad_norm": 0.1244194358587265,
"learning_rate": 4.821118962121668e-06,
"loss": 0.022,
"step": 22520
},
{
"epoch": 21.601150527325025,
"grad_norm": 0.3905356228351593,
"learning_rate": 4.7939098651045235e-06,
"loss": 0.0243,
"step": 22530
},
{
"epoch": 21.610738255033556,
"grad_norm": 0.19283372163772583,
"learning_rate": 4.76677389929196e-06,
"loss": 0.0224,
"step": 22540
},
{
"epoch": 21.62032598274209,
"grad_norm": 0.27373161911964417,
"learning_rate": 4.739711108582612e-06,
"loss": 0.0234,
"step": 22550
},
{
"epoch": 21.629913710450623,
"grad_norm": 0.23337620496749878,
"learning_rate": 4.712721536756743e-06,
"loss": 0.0185,
"step": 22560
},
{
"epoch": 21.639501438159158,
"grad_norm": 0.22057722508907318,
"learning_rate": 4.685805227476164e-06,
"loss": 0.019,
"step": 22570
},
{
"epoch": 21.64908916586769,
"grad_norm": 0.18951620161533356,
"learning_rate": 4.65896222428418e-06,
"loss": 0.0239,
"step": 22580
},
{
"epoch": 21.65867689357622,
"grad_norm": 0.19423332810401917,
"learning_rate": 4.632192570605481e-06,
"loss": 0.024,
"step": 22590
},
{
"epoch": 21.668264621284756,
"grad_norm": 0.21294209361076355,
"learning_rate": 4.605496309746127e-06,
"loss": 0.0248,
"step": 22600
},
{
"epoch": 21.677852348993287,
"grad_norm": 0.18906791508197784,
"learning_rate": 4.578873484893431e-06,
"loss": 0.0173,
"step": 22610
},
{
"epoch": 21.687440076701822,
"grad_norm": 0.24385139346122742,
"learning_rate": 4.552324139115905e-06,
"loss": 0.026,
"step": 22620
},
{
"epoch": 21.697027804410354,
"grad_norm": 0.14667080342769623,
"learning_rate": 4.525848315363196e-06,
"loss": 0.0178,
"step": 22630
},
{
"epoch": 21.70661553211889,
"grad_norm": 0.15309500694274902,
"learning_rate": 4.499446056466022e-06,
"loss": 0.0225,
"step": 22640
},
{
"epoch": 21.71620325982742,
"grad_norm": 0.7658588886260986,
"learning_rate": 4.473117405136073e-06,
"loss": 0.025,
"step": 22650
},
{
"epoch": 21.725790987535955,
"grad_norm": 0.27880871295928955,
"learning_rate": 4.446862403965984e-06,
"loss": 0.0179,
"step": 22660
},
{
"epoch": 21.735378715244487,
"grad_norm": 0.24886669218540192,
"learning_rate": 4.420681095429219e-06,
"loss": 0.0261,
"step": 22670
},
{
"epoch": 21.74496644295302,
"grad_norm": 0.21482285857200623,
"learning_rate": 4.394573521880052e-06,
"loss": 0.019,
"step": 22680
},
{
"epoch": 21.754554170661553,
"grad_norm": 0.2901485860347748,
"learning_rate": 4.368539725553461e-06,
"loss": 0.0284,
"step": 22690
},
{
"epoch": 21.764141898370085,
"grad_norm": 0.2559397518634796,
"learning_rate": 4.342579748565068e-06,
"loss": 0.0241,
"step": 22700
},
{
"epoch": 21.77372962607862,
"grad_norm": 0.24078018963336945,
"learning_rate": 4.316693632911089e-06,
"loss": 0.0234,
"step": 22710
},
{
"epoch": 21.78331735378715,
"grad_norm": 0.23349763453006744,
"learning_rate": 4.2908814204682405e-06,
"loss": 0.0204,
"step": 22720
},
{
"epoch": 21.792905081495686,
"grad_norm": 0.2100505232810974,
"learning_rate": 4.265143152993695e-06,
"loss": 0.0271,
"step": 22730
},
{
"epoch": 21.802492809204217,
"grad_norm": 0.41341936588287354,
"learning_rate": 4.23947887212498e-06,
"loss": 0.0319,
"step": 22740
},
{
"epoch": 21.812080536912752,
"grad_norm": 0.17779189348220825,
"learning_rate": 4.213888619379963e-06,
"loss": 0.0163,
"step": 22750
},
{
"epoch": 21.821668264621284,
"grad_norm": 0.15619170665740967,
"learning_rate": 4.188372436156734e-06,
"loss": 0.0233,
"step": 22760
},
{
"epoch": 21.83125599232982,
"grad_norm": 0.22723935544490814,
"learning_rate": 4.162930363733558e-06,
"loss": 0.0236,
"step": 22770
},
{
"epoch": 21.84084372003835,
"grad_norm": 0.16326063871383667,
"learning_rate": 4.137562443268822e-06,
"loss": 0.0195,
"step": 22780
},
{
"epoch": 21.850431447746885,
"grad_norm": 0.2659025490283966,
"learning_rate": 4.112268715800943e-06,
"loss": 0.0242,
"step": 22790
},
{
"epoch": 21.860019175455417,
"grad_norm": 0.22254447638988495,
"learning_rate": 4.087049222248324e-06,
"loss": 0.0217,
"step": 22800
},
{
"epoch": 21.86960690316395,
"grad_norm": 0.19777144491672516,
"learning_rate": 4.061904003409261e-06,
"loss": 0.0195,
"step": 22810
},
{
"epoch": 21.879194630872483,
"grad_norm": 0.251908540725708,
"learning_rate": 4.036833099961912e-06,
"loss": 0.0232,
"step": 22820
},
{
"epoch": 21.888782358581015,
"grad_norm": 0.1795634925365448,
"learning_rate": 4.0118365524642095e-06,
"loss": 0.0274,
"step": 22830
},
{
"epoch": 21.89837008628955,
"grad_norm": 0.2238956242799759,
"learning_rate": 3.986914401353797e-06,
"loss": 0.0198,
"step": 22840
},
{
"epoch": 21.90795781399808,
"grad_norm": 0.13608454167842865,
"learning_rate": 3.96206668694794e-06,
"loss": 0.0206,
"step": 22850
},
{
"epoch": 21.917545541706616,
"grad_norm": 0.32671546936035156,
"learning_rate": 3.93729344944353e-06,
"loss": 0.0222,
"step": 22860
},
{
"epoch": 21.927133269415148,
"grad_norm": 0.15036197006702423,
"learning_rate": 3.912594728916929e-06,
"loss": 0.0259,
"step": 22870
},
{
"epoch": 21.936720997123683,
"grad_norm": 0.2388329952955246,
"learning_rate": 3.887970565324006e-06,
"loss": 0.0273,
"step": 22880
},
{
"epoch": 21.946308724832214,
"grad_norm": 0.2434564232826233,
"learning_rate": 3.8634209984999615e-06,
"loss": 0.018,
"step": 22890
},
{
"epoch": 21.95589645254075,
"grad_norm": 0.16361406445503235,
"learning_rate": 3.8389460681593545e-06,
"loss": 0.0264,
"step": 22900
},
{
"epoch": 21.96548418024928,
"grad_norm": 0.4144115149974823,
"learning_rate": 3.8145458138959865e-06,
"loss": 0.0238,
"step": 22910
},
{
"epoch": 21.975071907957815,
"grad_norm": 0.1729598045349121,
"learning_rate": 3.790220275182854e-06,
"loss": 0.0276,
"step": 22920
},
{
"epoch": 21.984659635666347,
"grad_norm": 0.2305474728345871,
"learning_rate": 3.7659694913720956e-06,
"loss": 0.0209,
"step": 22930
},
{
"epoch": 21.994247363374882,
"grad_norm": 0.11657913029193878,
"learning_rate": 3.741793501694901e-06,
"loss": 0.0214,
"step": 22940
},
{
"epoch": 22.003835091083413,
"grad_norm": 0.15219245851039886,
"learning_rate": 3.71769234526147e-06,
"loss": 0.0187,
"step": 22950
},
{
"epoch": 22.013422818791945,
"grad_norm": 0.2325374186038971,
"learning_rate": 3.6936660610609465e-06,
"loss": 0.0256,
"step": 22960
},
{
"epoch": 22.02301054650048,
"grad_norm": 0.15524373948574066,
"learning_rate": 3.6697146879613564e-06,
"loss": 0.0209,
"step": 22970
},
{
"epoch": 22.03259827420901,
"grad_norm": 0.2640591263771057,
"learning_rate": 3.645838264709517e-06,
"loss": 0.0192,
"step": 22980
},
{
"epoch": 22.042186001917546,
"grad_norm": 0.15865999460220337,
"learning_rate": 3.6220368299310136e-06,
"loss": 0.0207,
"step": 22990
},
{
"epoch": 22.051773729626078,
"grad_norm": 0.20118165016174316,
"learning_rate": 3.5983104221301244e-06,
"loss": 0.0217,
"step": 23000
},
{
"epoch": 22.061361457334613,
"grad_norm": 0.40763455629348755,
"learning_rate": 3.5746590796897404e-06,
"loss": 0.0239,
"step": 23010
},
{
"epoch": 22.070949185043144,
"grad_norm": 0.18958386778831482,
"learning_rate": 3.551082840871328e-06,
"loss": 0.025,
"step": 23020
},
{
"epoch": 22.08053691275168,
"grad_norm": 0.2477806657552719,
"learning_rate": 3.5275817438148616e-06,
"loss": 0.0189,
"step": 23030
},
{
"epoch": 22.09012464046021,
"grad_norm": 0.1568249762058258,
"learning_rate": 3.504155826538741e-06,
"loss": 0.0222,
"step": 23040
},
{
"epoch": 22.099712368168746,
"grad_norm": 0.2600797116756439,
"learning_rate": 3.4808051269397512e-06,
"loss": 0.0238,
"step": 23050
},
{
"epoch": 22.109300095877277,
"grad_norm": 0.18155524134635925,
"learning_rate": 3.457529682793004e-06,
"loss": 0.022,
"step": 23060
},
{
"epoch": 22.11888782358581,
"grad_norm": 0.1558566689491272,
"learning_rate": 3.4343295317518565e-06,
"loss": 0.0225,
"step": 23070
},
{
"epoch": 22.128475551294343,
"grad_norm": 0.23820623755455017,
"learning_rate": 3.4112047113478653e-06,
"loss": 0.0242,
"step": 23080
},
{
"epoch": 22.138063279002875,
"grad_norm": 0.1538003832101822,
"learning_rate": 3.3881552589907216e-06,
"loss": 0.0214,
"step": 23090
},
{
"epoch": 22.14765100671141,
"grad_norm": 0.21073570847511292,
"learning_rate": 3.36518121196821e-06,
"loss": 0.0243,
"step": 23100
},
{
"epoch": 22.15723873441994,
"grad_norm": 0.1772642582654953,
"learning_rate": 3.34228260744609e-06,
"loss": 0.0228,
"step": 23110
},
{
"epoch": 22.166826462128476,
"grad_norm": 0.18608751893043518,
"learning_rate": 3.3194594824681123e-06,
"loss": 0.0229,
"step": 23120
},
{
"epoch": 22.176414189837008,
"grad_norm": 0.14585159718990326,
"learning_rate": 3.2967118739559045e-06,
"loss": 0.0216,
"step": 23130
},
{
"epoch": 22.186001917545543,
"grad_norm": 0.18371617794036865,
"learning_rate": 3.2740398187089405e-06,
"loss": 0.021,
"step": 23140
},
{
"epoch": 22.195589645254074,
"grad_norm": 0.11511317640542984,
"learning_rate": 3.2514433534044544e-06,
"loss": 0.023,
"step": 23150
},
{
"epoch": 22.20517737296261,
"grad_norm": 0.22962027788162231,
"learning_rate": 3.2289225145974046e-06,
"loss": 0.0187,
"step": 23160
},
{
"epoch": 22.21476510067114,
"grad_norm": 0.1875505894422531,
"learning_rate": 3.2064773387203984e-06,
"loss": 0.0298,
"step": 23170
},
{
"epoch": 22.224352828379676,
"grad_norm": 0.1442171037197113,
"learning_rate": 3.1841078620836683e-06,
"loss": 0.0231,
"step": 23180
},
{
"epoch": 22.233940556088207,
"grad_norm": 0.22087924182415009,
"learning_rate": 3.1618141208749617e-06,
"loss": 0.0229,
"step": 23190
},
{
"epoch": 22.24352828379674,
"grad_norm": 0.18746836483478546,
"learning_rate": 3.139596151159502e-06,
"loss": 0.0197,
"step": 23200
},
{
"epoch": 22.253116011505274,
"grad_norm": 0.23875057697296143,
"learning_rate": 3.1174539888799425e-06,
"loss": 0.0202,
"step": 23210
},
{
"epoch": 22.262703739213805,
"grad_norm": 0.3558003902435303,
"learning_rate": 3.0953876698563144e-06,
"loss": 0.0209,
"step": 23220
},
{
"epoch": 22.27229146692234,
"grad_norm": 0.32513800263404846,
"learning_rate": 3.0733972297859294e-06,
"loss": 0.0306,
"step": 23230
},
{
"epoch": 22.28187919463087,
"grad_norm": 0.14356492459774017,
"learning_rate": 3.0514827042433804e-06,
"loss": 0.0263,
"step": 23240
},
{
"epoch": 22.291466922339406,
"grad_norm": 0.28215450048446655,
"learning_rate": 3.029644128680409e-06,
"loss": 0.0266,
"step": 23250
},
{
"epoch": 22.301054650047938,
"grad_norm": 0.27818936109542847,
"learning_rate": 3.0078815384259163e-06,
"loss": 0.0242,
"step": 23260
},
{
"epoch": 22.310642377756473,
"grad_norm": 0.21585923433303833,
"learning_rate": 2.9861949686858903e-06,
"loss": 0.0225,
"step": 23270
},
{
"epoch": 22.320230105465004,
"grad_norm": 0.2902468144893646,
"learning_rate": 2.964584454543312e-06,
"loss": 0.0302,
"step": 23280
},
{
"epoch": 22.32981783317354,
"grad_norm": 0.15972809493541718,
"learning_rate": 2.9430500309581387e-06,
"loss": 0.0265,
"step": 23290
},
{
"epoch": 22.33940556088207,
"grad_norm": 0.29737722873687744,
"learning_rate": 2.9215917327672426e-06,
"loss": 0.0187,
"step": 23300
},
{
"epoch": 22.348993288590606,
"grad_norm": 0.24347999691963196,
"learning_rate": 2.9002095946843277e-06,
"loss": 0.0233,
"step": 23310
},
{
"epoch": 22.358581016299137,
"grad_norm": 0.38935643434524536,
"learning_rate": 2.878903651299891e-06,
"loss": 0.0225,
"step": 23320
},
{
"epoch": 22.36816874400767,
"grad_norm": 0.32759687304496765,
"learning_rate": 2.8576739370811957e-06,
"loss": 0.0218,
"step": 23330
},
{
"epoch": 22.377756471716204,
"grad_norm": 0.11117486655712128,
"learning_rate": 2.8365204863721573e-06,
"loss": 0.023,
"step": 23340
},
{
"epoch": 22.387344199424735,
"grad_norm": 0.2842784523963928,
"learning_rate": 2.815443333393325e-06,
"loss": 0.0238,
"step": 23350
},
{
"epoch": 22.39693192713327,
"grad_norm": 0.23901750147342682,
"learning_rate": 2.794442512241824e-06,
"loss": 0.0237,
"step": 23360
},
{
"epoch": 22.4065196548418,
"grad_norm": 0.23337188363075256,
"learning_rate": 2.7735180568912943e-06,
"loss": 0.0188,
"step": 23370
},
{
"epoch": 22.416107382550337,
"grad_norm": 0.2736450433731079,
"learning_rate": 2.7526700011918316e-06,
"loss": 0.0247,
"step": 23380
},
{
"epoch": 22.425695110258868,
"grad_norm": 0.13441747426986694,
"learning_rate": 2.731898378869935e-06,
"loss": 0.0229,
"step": 23390
},
{
"epoch": 22.435282837967403,
"grad_norm": 0.1925612986087799,
"learning_rate": 2.7112032235284744e-06,
"loss": 0.0268,
"step": 23400
},
{
"epoch": 22.444870565675934,
"grad_norm": 0.2339990884065628,
"learning_rate": 2.6905845686465924e-06,
"loss": 0.0239,
"step": 23410
},
{
"epoch": 22.45445829338447,
"grad_norm": 0.17596887052059174,
"learning_rate": 2.6700424475796905e-06,
"loss": 0.0222,
"step": 23420
},
{
"epoch": 22.464046021093,
"grad_norm": 0.3486236035823822,
"learning_rate": 2.6495768935593525e-06,
"loss": 0.0222,
"step": 23430
},
{
"epoch": 22.473633748801532,
"grad_norm": 0.290425181388855,
"learning_rate": 2.6291879396933004e-06,
"loss": 0.0259,
"step": 23440
},
{
"epoch": 22.483221476510067,
"grad_norm": 0.18840055167675018,
"learning_rate": 2.6088756189653397e-06,
"loss": 0.0291,
"step": 23450
},
{
"epoch": 22.4928092042186,
"grad_norm": 0.2448890209197998,
"learning_rate": 2.588639964235301e-06,
"loss": 0.0244,
"step": 23460
},
{
"epoch": 22.502396931927134,
"grad_norm": 0.49676454067230225,
"learning_rate": 2.568481008238982e-06,
"loss": 0.0248,
"step": 23470
},
{
"epoch": 22.511984659635665,
"grad_norm": 0.15404298901557922,
"learning_rate": 2.5483987835881127e-06,
"loss": 0.0215,
"step": 23480
},
{
"epoch": 22.5215723873442,
"grad_norm": 0.13425439596176147,
"learning_rate": 2.528393322770306e-06,
"loss": 0.0236,
"step": 23490
},
{
"epoch": 22.53116011505273,
"grad_norm": 0.23426268994808197,
"learning_rate": 2.508464658148968e-06,
"loss": 0.0218,
"step": 23500
},
{
"epoch": 22.540747842761267,
"grad_norm": 0.21721801161766052,
"learning_rate": 2.488612821963271e-06,
"loss": 0.0229,
"step": 23510
},
{
"epoch": 22.550335570469798,
"grad_norm": 0.2656913697719574,
"learning_rate": 2.4688378463281146e-06,
"loss": 0.0248,
"step": 23520
},
{
"epoch": 22.559923298178333,
"grad_norm": 0.15874969959259033,
"learning_rate": 2.4491397632340487e-06,
"loss": 0.0244,
"step": 23530
},
{
"epoch": 22.569511025886865,
"grad_norm": 0.26198479533195496,
"learning_rate": 2.429518604547232e-06,
"loss": 0.0248,
"step": 23540
},
{
"epoch": 22.5790987535954,
"grad_norm": 0.2117781937122345,
"learning_rate": 2.409974402009385e-06,
"loss": 0.0177,
"step": 23550
},
{
"epoch": 22.58868648130393,
"grad_norm": 0.3583064675331116,
"learning_rate": 2.390507187237734e-06,
"loss": 0.0242,
"step": 23560
},
{
"epoch": 22.598274209012462,
"grad_norm": 0.26825496554374695,
"learning_rate": 2.371116991724953e-06,
"loss": 0.0242,
"step": 23570
},
{
"epoch": 22.607861936720997,
"grad_norm": 0.2770189344882965,
"learning_rate": 2.3518038468391236e-06,
"loss": 0.0198,
"step": 23580
},
{
"epoch": 22.61744966442953,
"grad_norm": 0.13024599850177765,
"learning_rate": 2.332567783823686e-06,
"loss": 0.0221,
"step": 23590
},
{
"epoch": 22.627037392138064,
"grad_norm": 0.16579675674438477,
"learning_rate": 2.313408833797376e-06,
"loss": 0.0198,
"step": 23600
},
{
"epoch": 22.636625119846595,
"grad_norm": 0.23456168174743652,
"learning_rate": 2.294327027754184e-06,
"loss": 0.0242,
"step": 23610
},
{
"epoch": 22.64621284755513,
"grad_norm": 0.2899184226989746,
"learning_rate": 2.275322396563301e-06,
"loss": 0.0268,
"step": 23620
},
{
"epoch": 22.65580057526366,
"grad_norm": 0.19349917769432068,
"learning_rate": 2.2563949709690725e-06,
"loss": 0.0224,
"step": 23630
},
{
"epoch": 22.665388302972197,
"grad_norm": 0.32430675625801086,
"learning_rate": 2.2375447815909388e-06,
"loss": 0.0241,
"step": 23640
},
{
"epoch": 22.674976030680728,
"grad_norm": 0.18584056198596954,
"learning_rate": 2.218771858923402e-06,
"loss": 0.0231,
"step": 23650
},
{
"epoch": 22.684563758389263,
"grad_norm": 0.21064673364162445,
"learning_rate": 2.2000762333359625e-06,
"loss": 0.0294,
"step": 23660
},
{
"epoch": 22.694151486097795,
"grad_norm": 0.2811007797718048,
"learning_rate": 2.1814579350730835e-06,
"loss": 0.023,
"step": 23670
},
{
"epoch": 22.70373921380633,
"grad_norm": 0.17581719160079956,
"learning_rate": 2.162916994254116e-06,
"loss": 0.0212,
"step": 23680
},
{
"epoch": 22.71332694151486,
"grad_norm": 0.22076162695884705,
"learning_rate": 2.1444534408732898e-06,
"loss": 0.026,
"step": 23690
},
{
"epoch": 22.722914669223393,
"grad_norm": 0.17099499702453613,
"learning_rate": 2.1260673047996227e-06,
"loss": 0.0231,
"step": 23700
},
{
"epoch": 22.732502396931928,
"grad_norm": 0.2862556576728821,
"learning_rate": 2.1077586157769e-06,
"loss": 0.0212,
"step": 23710
},
{
"epoch": 22.74209012464046,
"grad_norm": 0.257538378238678,
"learning_rate": 2.0895274034236245e-06,
"loss": 0.022,
"step": 23720
},
{
"epoch": 22.751677852348994,
"grad_norm": 0.12845216691493988,
"learning_rate": 2.071373697232959e-06,
"loss": 0.0332,
"step": 23730
},
{
"epoch": 22.761265580057525,
"grad_norm": 0.2115718573331833,
"learning_rate": 2.0532975265726786e-06,
"loss": 0.0295,
"step": 23740
},
{
"epoch": 22.77085330776606,
"grad_norm": 0.24508948624134064,
"learning_rate": 2.0352989206851303e-06,
"loss": 0.0219,
"step": 23750
},
{
"epoch": 22.780441035474592,
"grad_norm": 0.16549085080623627,
"learning_rate": 2.0173779086871735e-06,
"loss": 0.0228,
"step": 23760
},
{
"epoch": 22.790028763183127,
"grad_norm": 0.2713741362094879,
"learning_rate": 1.999534519570162e-06,
"loss": 0.0253,
"step": 23770
},
{
"epoch": 22.79961649089166,
"grad_norm": 0.30385440587997437,
"learning_rate": 1.981768782199861e-06,
"loss": 0.0219,
"step": 23780
},
{
"epoch": 22.809204218600193,
"grad_norm": 0.13380350172519684,
"learning_rate": 1.964080725316414e-06,
"loss": 0.0223,
"step": 23790
},
{
"epoch": 22.818791946308725,
"grad_norm": 0.2257150113582611,
"learning_rate": 1.9464703775343096e-06,
"loss": 0.0246,
"step": 23800
},
{
"epoch": 22.828379674017256,
"grad_norm": 0.18876703083515167,
"learning_rate": 1.928937767342315e-06,
"loss": 0.0216,
"step": 23810
},
{
"epoch": 22.83796740172579,
"grad_norm": 0.21190357208251953,
"learning_rate": 1.911482923103447e-06,
"loss": 0.0236,
"step": 23820
},
{
"epoch": 22.847555129434323,
"grad_norm": 0.20915569365024567,
"learning_rate": 1.8941058730549132e-06,
"loss": 0.0202,
"step": 23830
},
{
"epoch": 22.857142857142858,
"grad_norm": 0.11660904437303543,
"learning_rate": 1.8768066453080657e-06,
"loss": 0.0227,
"step": 23840
},
{
"epoch": 22.86673058485139,
"grad_norm": 0.3356838524341583,
"learning_rate": 1.8595852678483738e-06,
"loss": 0.0264,
"step": 23850
},
{
"epoch": 22.876318312559924,
"grad_norm": 0.26690155267715454,
"learning_rate": 1.8424417685353634e-06,
"loss": 0.0249,
"step": 23860
},
{
"epoch": 22.885906040268456,
"grad_norm": 0.23408538103103638,
"learning_rate": 1.825376175102561e-06,
"loss": 0.0249,
"step": 23870
},
{
"epoch": 22.89549376797699,
"grad_norm": 0.40512949228286743,
"learning_rate": 1.8083885151574775e-06,
"loss": 0.0259,
"step": 23880
},
{
"epoch": 22.905081495685522,
"grad_norm": 0.20047682523727417,
"learning_rate": 1.7914788161815466e-06,
"loss": 0.025,
"step": 23890
},
{
"epoch": 22.914669223394057,
"grad_norm": 0.2298455536365509,
"learning_rate": 1.7746471055300751e-06,
"loss": 0.0208,
"step": 23900
},
{
"epoch": 22.92425695110259,
"grad_norm": 0.13947898149490356,
"learning_rate": 1.7578934104322097e-06,
"loss": 0.0201,
"step": 23910
},
{
"epoch": 22.933844678811123,
"grad_norm": 0.22104570269584656,
"learning_rate": 1.741217757990893e-06,
"loss": 0.0233,
"step": 23920
},
{
"epoch": 22.943432406519655,
"grad_norm": 0.2084132432937622,
"learning_rate": 1.7246201751828117e-06,
"loss": 0.0269,
"step": 23930
},
{
"epoch": 22.953020134228186,
"grad_norm": 0.2208138108253479,
"learning_rate": 1.7081006888583495e-06,
"loss": 0.023,
"step": 23940
},
{
"epoch": 22.96260786193672,
"grad_norm": 0.20159326493740082,
"learning_rate": 1.6916593257415735e-06,
"loss": 0.0181,
"step": 23950
},
{
"epoch": 22.972195589645253,
"grad_norm": 0.16700109839439392,
"learning_rate": 1.6752961124301415e-06,
"loss": 0.0225,
"step": 23960
},
{
"epoch": 22.981783317353788,
"grad_norm": 0.16461221873760223,
"learning_rate": 1.6590110753953058e-06,
"loss": 0.0267,
"step": 23970
},
{
"epoch": 22.99137104506232,
"grad_norm": 0.30045902729034424,
"learning_rate": 1.6428042409818434e-06,
"loss": 0.0252,
"step": 23980
},
{
"epoch": 23.000958772770854,
"grad_norm": 0.2971097528934479,
"learning_rate": 1.6266756354080148e-06,
"loss": 0.021,
"step": 23990
},
{
"epoch": 23.010546500479386,
"grad_norm": 0.18751384317874908,
"learning_rate": 1.610625284765538e-06,
"loss": 0.0225,
"step": 24000
},
{
"epoch": 23.02013422818792,
"grad_norm": 0.2587001621723175,
"learning_rate": 1.5946532150195315e-06,
"loss": 0.024,
"step": 24010
},
{
"epoch": 23.029721955896452,
"grad_norm": 0.2185692936182022,
"learning_rate": 1.578759452008477e-06,
"loss": 0.0269,
"step": 24020
},
{
"epoch": 23.039309683604987,
"grad_norm": 0.17715659737586975,
"learning_rate": 1.5629440214441737e-06,
"loss": 0.0227,
"step": 24030
},
{
"epoch": 23.04889741131352,
"grad_norm": 0.27682605385780334,
"learning_rate": 1.5472069489117058e-06,
"loss": 0.0261,
"step": 24040
},
{
"epoch": 23.058485139022054,
"grad_norm": 0.20817138254642487,
"learning_rate": 1.531548259869392e-06,
"loss": 0.0165,
"step": 24050
},
{
"epoch": 23.068072866730585,
"grad_norm": 0.2723507583141327,
"learning_rate": 1.515967979648747e-06,
"loss": 0.0228,
"step": 24060
},
{
"epoch": 23.077660594439116,
"grad_norm": 0.1344461739063263,
"learning_rate": 1.5004661334544422e-06,
"loss": 0.0207,
"step": 24070
},
{
"epoch": 23.08724832214765,
"grad_norm": 0.16644145548343658,
"learning_rate": 1.4850427463642568e-06,
"loss": 0.0282,
"step": 24080
},
{
"epoch": 23.096836049856183,
"grad_norm": 0.22762452065944672,
"learning_rate": 1.4696978433290653e-06,
"loss": 0.0273,
"step": 24090
},
{
"epoch": 23.106423777564718,
"grad_norm": 0.1904400885105133,
"learning_rate": 1.4544314491727607e-06,
"loss": 0.0216,
"step": 24100
},
{
"epoch": 23.11601150527325,
"grad_norm": 0.20357421040534973,
"learning_rate": 1.4392435885922262e-06,
"loss": 0.0176,
"step": 24110
},
{
"epoch": 23.125599232981784,
"grad_norm": 0.14489389955997467,
"learning_rate": 1.4241342861573081e-06,
"loss": 0.0241,
"step": 24120
},
{
"epoch": 23.135186960690316,
"grad_norm": 0.42258408665657043,
"learning_rate": 1.4091035663107599e-06,
"loss": 0.0261,
"step": 24130
},
{
"epoch": 23.14477468839885,
"grad_norm": 0.33509764075279236,
"learning_rate": 1.39415145336822e-06,
"loss": 0.0225,
"step": 24140
},
{
"epoch": 23.154362416107382,
"grad_norm": 0.1689392775297165,
"learning_rate": 1.3792779715181503e-06,
"loss": 0.0239,
"step": 24150
},
{
"epoch": 23.163950143815917,
"grad_norm": 0.16699060797691345,
"learning_rate": 1.3644831448218154e-06,
"loss": 0.018,
"step": 24160
},
{
"epoch": 23.17353787152445,
"grad_norm": 0.18517597019672394,
"learning_rate": 1.349766997213242e-06,
"loss": 0.0241,
"step": 24170
},
{
"epoch": 23.18312559923298,
"grad_norm": 0.25795888900756836,
"learning_rate": 1.3351295524991592e-06,
"loss": 0.019,
"step": 24180
},
{
"epoch": 23.192713326941515,
"grad_norm": 0.14099453389644623,
"learning_rate": 1.3205708343589973e-06,
"loss": 0.0202,
"step": 24190
},
{
"epoch": 23.202301054650047,
"grad_norm": 0.1665448546409607,
"learning_rate": 1.3060908663448057e-06,
"loss": 0.0227,
"step": 24200
},
{
"epoch": 23.21188878235858,
"grad_norm": 0.23710502684116364,
"learning_rate": 1.2916896718812577e-06,
"loss": 0.0206,
"step": 24210
},
{
"epoch": 23.221476510067113,
"grad_norm": 0.20079617202281952,
"learning_rate": 1.2773672742655784e-06,
"loss": 0.0254,
"step": 24220
},
{
"epoch": 23.231064237775648,
"grad_norm": 0.19830353558063507,
"learning_rate": 1.2631236966675287e-06,
"loss": 0.0192,
"step": 24230
},
{
"epoch": 23.24065196548418,
"grad_norm": 0.14918625354766846,
"learning_rate": 1.2489589621293485e-06,
"loss": 0.019,
"step": 24240
},
{
"epoch": 23.250239693192714,
"grad_norm": 0.2350005954504013,
"learning_rate": 1.2348730935657582e-06,
"loss": 0.0234,
"step": 24250
},
{
"epoch": 23.259827420901246,
"grad_norm": 0.25462934374809265,
"learning_rate": 1.2208661137638687e-06,
"loss": 0.0205,
"step": 24260
},
{
"epoch": 23.26941514860978,
"grad_norm": 0.18844899535179138,
"learning_rate": 1.2069380453831768e-06,
"loss": 0.0264,
"step": 24270
},
{
"epoch": 23.279002876318312,
"grad_norm": 0.247798353433609,
"learning_rate": 1.19308891095552e-06,
"loss": 0.0228,
"step": 24280
},
{
"epoch": 23.288590604026847,
"grad_norm": 0.19767391681671143,
"learning_rate": 1.1793187328850485e-06,
"loss": 0.0214,
"step": 24290
},
{
"epoch": 23.29817833173538,
"grad_norm": 0.30730581283569336,
"learning_rate": 1.165627533448177e-06,
"loss": 0.0214,
"step": 24300
},
{
"epoch": 23.30776605944391,
"grad_norm": 0.3338695466518402,
"learning_rate": 1.1520153347935658e-06,
"loss": 0.0286,
"step": 24310
},
{
"epoch": 23.317353787152445,
"grad_norm": 0.17891177535057068,
"learning_rate": 1.1384821589420502e-06,
"loss": 0.0213,
"step": 24320
},
{
"epoch": 23.326941514860977,
"grad_norm": 0.2726079225540161,
"learning_rate": 1.1250280277866509e-06,
"loss": 0.0247,
"step": 24330
},
{
"epoch": 23.33652924256951,
"grad_norm": 0.15224401652812958,
"learning_rate": 1.1116529630925022e-06,
"loss": 0.0221,
"step": 24340
},
{
"epoch": 23.346116970278043,
"grad_norm": 0.3975865840911865,
"learning_rate": 1.0983569864968346e-06,
"loss": 0.024,
"step": 24350
},
{
"epoch": 23.355704697986578,
"grad_norm": 0.1999419927597046,
"learning_rate": 1.0851401195089316e-06,
"loss": 0.0247,
"step": 24360
},
{
"epoch": 23.36529242569511,
"grad_norm": 0.22919629514217377,
"learning_rate": 1.072002383510118e-06,
"loss": 0.0218,
"step": 24370
},
{
"epoch": 23.374880153403645,
"grad_norm": 0.11979561299085617,
"learning_rate": 1.05894379975367e-06,
"loss": 0.0154,
"step": 24380
},
{
"epoch": 23.384467881112176,
"grad_norm": 0.16747049987316132,
"learning_rate": 1.0459643893648507e-06,
"loss": 0.0228,
"step": 24390
},
{
"epoch": 23.39405560882071,
"grad_norm": 0.2855249345302582,
"learning_rate": 1.0330641733408309e-06,
"loss": 0.0268,
"step": 24400
},
{
"epoch": 23.403643336529242,
"grad_norm": 0.39327678084373474,
"learning_rate": 1.0202431725506556e-06,
"loss": 0.0241,
"step": 24410
},
{
"epoch": 23.413231064237777,
"grad_norm": 0.21982307732105255,
"learning_rate": 1.0075014077352396e-06,
"loss": 0.0157,
"step": 24420
},
{
"epoch": 23.42281879194631,
"grad_norm": 0.18971124291419983,
"learning_rate": 9.948388995072943e-07,
"loss": 0.0256,
"step": 24430
},
{
"epoch": 23.43240651965484,
"grad_norm": 0.2546897530555725,
"learning_rate": 9.822556683513395e-07,
"loss": 0.02,
"step": 24440
},
{
"epoch": 23.441994247363375,
"grad_norm": 0.23896943032741547,
"learning_rate": 9.69751734623625e-07,
"loss": 0.02,
"step": 24450
},
{
"epoch": 23.451581975071907,
"grad_norm": 0.2220362275838852,
"learning_rate": 9.57327118552137e-07,
"loss": 0.0228,
"step": 24460
},
{
"epoch": 23.461169702780442,
"grad_norm": 0.20060044527053833,
"learning_rate": 9.449818402365251e-07,
"loss": 0.0198,
"step": 24470
},
{
"epoch": 23.470757430488973,
"grad_norm": 0.22660043835639954,
"learning_rate": 9.327159196481138e-07,
"loss": 0.0225,
"step": 24480
},
{
"epoch": 23.48034515819751,
"grad_norm": 0.20978592336177826,
"learning_rate": 9.205293766298307e-07,
"loss": 0.0201,
"step": 24490
},
{
"epoch": 23.48993288590604,
"grad_norm": 0.205510213971138,
"learning_rate": 9.084222308962053e-07,
"loss": 0.0257,
"step": 24500
},
{
"epoch": 23.499520613614575,
"grad_norm": 0.15999889373779297,
"learning_rate": 8.963945020333209e-07,
"loss": 0.0242,
"step": 24510
},
{
"epoch": 23.509108341323106,
"grad_norm": 0.16011640429496765,
"learning_rate": 8.844462094987793e-07,
"loss": 0.0243,
"step": 24520
},
{
"epoch": 23.51869606903164,
"grad_norm": 0.1832507699728012,
"learning_rate": 8.725773726216801e-07,
"loss": 0.0199,
"step": 24530
},
{
"epoch": 23.528283796740173,
"grad_norm": 0.1802021861076355,
"learning_rate": 8.607880106025868e-07,
"loss": 0.0228,
"step": 24540
},
{
"epoch": 23.537871524448704,
"grad_norm": 0.3221112787723541,
"learning_rate": 8.49078142513493e-07,
"loss": 0.0253,
"step": 24550
},
{
"epoch": 23.54745925215724,
"grad_norm": 0.16884173452854156,
"learning_rate": 8.37447787297796e-07,
"loss": 0.0232,
"step": 24560
},
{
"epoch": 23.55704697986577,
"grad_norm": 0.26041117310523987,
"learning_rate": 8.258969637702563e-07,
"loss": 0.023,
"step": 24570
},
{
"epoch": 23.566634707574305,
"grad_norm": 0.3258151710033417,
"learning_rate": 8.144256906169767e-07,
"loss": 0.0211,
"step": 24580
},
{
"epoch": 23.576222435282837,
"grad_norm": 0.2306542694568634,
"learning_rate": 8.030339863953684e-07,
"loss": 0.0213,
"step": 24590
},
{
"epoch": 23.585810162991372,
"grad_norm": 0.2112276405096054,
"learning_rate": 7.917218695341178e-07,
"loss": 0.0251,
"step": 24600
},
{
"epoch": 23.595397890699903,
"grad_norm": 0.20719598233699799,
"learning_rate": 7.804893583331696e-07,
"loss": 0.0226,
"step": 24610
},
{
"epoch": 23.60498561840844,
"grad_norm": 0.3165718913078308,
"learning_rate": 7.693364709636886e-07,
"loss": 0.0287,
"step": 24620
},
{
"epoch": 23.61457334611697,
"grad_norm": 0.24788926541805267,
"learning_rate": 7.582632254680089e-07,
"loss": 0.0228,
"step": 24630
},
{
"epoch": 23.624161073825505,
"grad_norm": 0.2913201153278351,
"learning_rate": 7.472696397596568e-07,
"loss": 0.0216,
"step": 24640
},
{
"epoch": 23.633748801534036,
"grad_norm": 0.16556118428707123,
"learning_rate": 7.363557316232673e-07,
"loss": 0.0192,
"step": 24650
},
{
"epoch": 23.64333652924257,
"grad_norm": 0.1945585161447525,
"learning_rate": 7.255215187145892e-07,
"loss": 0.023,
"step": 24660
},
{
"epoch": 23.652924256951103,
"grad_norm": 0.26050955057144165,
"learning_rate": 7.147670185604361e-07,
"loss": 0.019,
"step": 24670
},
{
"epoch": 23.662511984659634,
"grad_norm": 0.20233625173568726,
"learning_rate": 7.04092248558691e-07,
"loss": 0.0239,
"step": 24680
},
{
"epoch": 23.67209971236817,
"grad_norm": 0.19280561804771423,
"learning_rate": 6.93497225978218e-07,
"loss": 0.0211,
"step": 24690
},
{
"epoch": 23.6816874400767,
"grad_norm": 0.15425735712051392,
"learning_rate": 6.829819679589122e-07,
"loss": 0.0265,
"step": 24700
},
{
"epoch": 23.691275167785236,
"grad_norm": 0.20832641422748566,
"learning_rate": 6.725464915115997e-07,
"loss": 0.0204,
"step": 24710
},
{
"epoch": 23.700862895493767,
"grad_norm": 0.1475028246641159,
"learning_rate": 6.621908135180655e-07,
"loss": 0.022,
"step": 24720
},
{
"epoch": 23.710450623202302,
"grad_norm": 0.18368731439113617,
"learning_rate": 6.519149507309807e-07,
"loss": 0.0222,
"step": 24730
},
{
"epoch": 23.720038350910833,
"grad_norm": 0.2757015824317932,
"learning_rate": 6.417189197739093e-07,
"loss": 0.0198,
"step": 24740
},
{
"epoch": 23.72962607861937,
"grad_norm": 0.25313273072242737,
"learning_rate": 6.316027371412625e-07,
"loss": 0.0287,
"step": 24750
},
{
"epoch": 23.7392138063279,
"grad_norm": 0.30066144466400146,
"learning_rate": 6.215664191982884e-07,
"loss": 0.0214,
"step": 24760
},
{
"epoch": 23.748801534036435,
"grad_norm": 0.4100160002708435,
"learning_rate": 6.116099821810272e-07,
"loss": 0.0223,
"step": 24770
},
{
"epoch": 23.758389261744966,
"grad_norm": 0.231341153383255,
"learning_rate": 6.017334421963006e-07,
"loss": 0.0241,
"step": 24780
},
{
"epoch": 23.7679769894535,
"grad_norm": 0.19291090965270996,
"learning_rate": 5.919368152216664e-07,
"loss": 0.0233,
"step": 24790
},
{
"epoch": 23.777564717162033,
"grad_norm": 0.20064283907413483,
"learning_rate": 5.822201171054197e-07,
"loss": 0.0186,
"step": 24800
},
{
"epoch": 23.787152444870564,
"grad_norm": 0.22854574024677277,
"learning_rate": 5.725833635665423e-07,
"loss": 0.0148,
"step": 24810
},
{
"epoch": 23.7967401725791,
"grad_norm": 0.2627497613430023,
"learning_rate": 5.630265701946912e-07,
"loss": 0.0229,
"step": 24820
},
{
"epoch": 23.80632790028763,
"grad_norm": 0.19262003898620605,
"learning_rate": 5.535497524501665e-07,
"loss": 0.0195,
"step": 24830
},
{
"epoch": 23.815915627996166,
"grad_norm": 0.2796723246574402,
"learning_rate": 5.441529256638933e-07,
"loss": 0.0249,
"step": 24840
},
{
"epoch": 23.825503355704697,
"grad_norm": 0.2186604142189026,
"learning_rate": 5.348361050373896e-07,
"loss": 0.0213,
"step": 24850
},
{
"epoch": 23.835091083413232,
"grad_norm": 0.14878273010253906,
"learning_rate": 5.255993056427433e-07,
"loss": 0.0204,
"step": 24860
},
{
"epoch": 23.844678811121764,
"grad_norm": 0.2015840858221054,
"learning_rate": 5.164425424226016e-07,
"loss": 0.0183,
"step": 24870
},
{
"epoch": 23.8542665388303,
"grad_norm": 0.3008297383785248,
"learning_rate": 5.073658301901207e-07,
"loss": 0.0228,
"step": 24880
},
{
"epoch": 23.86385426653883,
"grad_norm": 0.1776721477508545,
"learning_rate": 4.983691836289606e-07,
"loss": 0.025,
"step": 24890
},
{
"epoch": 23.873441994247365,
"grad_norm": 0.27587175369262695,
"learning_rate": 4.894526172932623e-07,
"loss": 0.021,
"step": 24900
},
{
"epoch": 23.883029721955896,
"grad_norm": 0.13460497558116913,
"learning_rate": 4.806161456076097e-07,
"loss": 0.0215,
"step": 24910
},
{
"epoch": 23.892617449664428,
"grad_norm": 0.23786711692810059,
"learning_rate": 4.718597828670235e-07,
"loss": 0.0228,
"step": 24920
},
{
"epoch": 23.902205177372963,
"grad_norm": 0.18216513097286224,
"learning_rate": 4.6318354323692246e-07,
"loss": 0.0194,
"step": 24930
},
{
"epoch": 23.911792905081494,
"grad_norm": 0.10642199218273163,
"learning_rate": 4.5458744075311253e-07,
"loss": 0.0225,
"step": 24940
},
{
"epoch": 23.92138063279003,
"grad_norm": 0.2601510286331177,
"learning_rate": 4.460714893217588e-07,
"loss": 0.0237,
"step": 24950
},
{
"epoch": 23.93096836049856,
"grad_norm": 0.2652058005332947,
"learning_rate": 4.376357027193634e-07,
"loss": 0.0209,
"step": 24960
},
{
"epoch": 23.940556088207096,
"grad_norm": 0.1884078085422516,
"learning_rate": 4.292800945927378e-07,
"loss": 0.0241,
"step": 24970
},
{
"epoch": 23.950143815915627,
"grad_norm": 0.3231159448623657,
"learning_rate": 4.210046784590027e-07,
"loss": 0.0274,
"step": 24980
},
{
"epoch": 23.959731543624162,
"grad_norm": 0.2526688575744629,
"learning_rate": 4.128094677055272e-07,
"loss": 0.0229,
"step": 24990
},
{
"epoch": 23.969319271332694,
"grad_norm": 0.13271057605743408,
"learning_rate": 4.0469447558995065e-07,
"loss": 0.021,
"step": 25000
},
{
"epoch": 23.97890699904123,
"grad_norm": 0.24924740195274353,
"learning_rate": 3.9665971524012747e-07,
"loss": 0.0283,
"step": 25010
},
{
"epoch": 23.98849472674976,
"grad_norm": 0.29960912466049194,
"learning_rate": 3.8870519965412135e-07,
"loss": 0.0239,
"step": 25020
},
{
"epoch": 23.99808245445829,
"grad_norm": 0.17576931416988373,
"learning_rate": 3.8083094170018875e-07,
"loss": 0.0207,
"step": 25030
},
{
"epoch": 24.007670182166827,
"grad_norm": 0.21935302019119263,
"learning_rate": 3.7303695411674e-07,
"loss": 0.0222,
"step": 25040
},
{
"epoch": 24.017257909875358,
"grad_norm": 0.2815200686454773,
"learning_rate": 3.6532324951233934e-07,
"loss": 0.0238,
"step": 25050
},
{
"epoch": 24.026845637583893,
"grad_norm": 0.245747908949852,
"learning_rate": 3.576898403656659e-07,
"loss": 0.018,
"step": 25060
},
{
"epoch": 24.036433365292424,
"grad_norm": 0.20894894003868103,
"learning_rate": 3.501367390255139e-07,
"loss": 0.0214,
"step": 25070
},
{
"epoch": 24.04602109300096,
"grad_norm": 0.16371525824069977,
"learning_rate": 3.426639577107427e-07,
"loss": 0.019,
"step": 25080
},
{
"epoch": 24.05560882070949,
"grad_norm": 0.15699537098407745,
"learning_rate": 3.352715085103042e-07,
"loss": 0.0221,
"step": 25090
},
{
"epoch": 24.065196548418026,
"grad_norm": 0.29383793473243713,
"learning_rate": 3.279594033831601e-07,
"loss": 0.0197,
"step": 25100
},
{
"epoch": 24.074784276126557,
"grad_norm": 0.25920745730400085,
"learning_rate": 3.2072765415833153e-07,
"loss": 0.0217,
"step": 25110
},
{
"epoch": 24.084372003835092,
"grad_norm": 0.1622145175933838,
"learning_rate": 3.1357627253482127e-07,
"loss": 0.0242,
"step": 25120
},
{
"epoch": 24.093959731543624,
"grad_norm": 0.26235565543174744,
"learning_rate": 3.0650527008162513e-07,
"loss": 0.0224,
"step": 25130
},
{
"epoch": 24.10354745925216,
"grad_norm": 0.24994677305221558,
"learning_rate": 2.9951465823771505e-07,
"loss": 0.0243,
"step": 25140
},
{
"epoch": 24.11313518696069,
"grad_norm": 0.35253608226776123,
"learning_rate": 2.926044483120005e-07,
"loss": 0.0271,
"step": 25150
},
{
"epoch": 24.12272291466922,
"grad_norm": 0.12745951116085052,
"learning_rate": 2.857746514833337e-07,
"loss": 0.0182,
"step": 25160
},
{
"epoch": 24.132310642377757,
"grad_norm": 0.16648143529891968,
"learning_rate": 2.79025278800471e-07,
"loss": 0.0211,
"step": 25170
},
{
"epoch": 24.141898370086288,
"grad_norm": 0.15013748407363892,
"learning_rate": 2.7235634118207286e-07,
"loss": 0.0203,
"step": 25180
},
{
"epoch": 24.151486097794823,
"grad_norm": 0.2093784213066101,
"learning_rate": 2.6576784941667045e-07,
"loss": 0.0232,
"step": 25190
},
{
"epoch": 24.161073825503355,
"grad_norm": 0.23546428978443146,
"learning_rate": 2.592598141626601e-07,
"loss": 0.0243,
"step": 25200
},
{
"epoch": 24.17066155321189,
"grad_norm": 0.20667380094528198,
"learning_rate": 2.528322459482757e-07,
"loss": 0.0225,
"step": 25210
},
{
"epoch": 24.18024928092042,
"grad_norm": 0.26651787757873535,
"learning_rate": 2.4648515517158297e-07,
"loss": 0.0196,
"step": 25220
},
{
"epoch": 24.189837008628956,
"grad_norm": 0.2723236083984375,
"learning_rate": 2.402185521004574e-07,
"loss": 0.0199,
"step": 25230
},
{
"epoch": 24.199424736337487,
"grad_norm": 0.2087775468826294,
"learning_rate": 2.3403244687256743e-07,
"loss": 0.0251,
"step": 25240
},
{
"epoch": 24.209012464046022,
"grad_norm": 0.1503581702709198,
"learning_rate": 2.279268494953468e-07,
"loss": 0.0172,
"step": 25250
},
{
"epoch": 24.218600191754554,
"grad_norm": 0.1757836937904358,
"learning_rate": 2.219017698460002e-07,
"loss": 0.0227,
"step": 25260
},
{
"epoch": 24.22818791946309,
"grad_norm": 0.22135482728481293,
"learning_rate": 2.1595721767147526e-07,
"loss": 0.0182,
"step": 25270
},
{
"epoch": 24.23777564717162,
"grad_norm": 0.19286498427391052,
"learning_rate": 2.1009320258845167e-07,
"loss": 0.0265,
"step": 25280
},
{
"epoch": 24.247363374880152,
"grad_norm": 0.12808747589588165,
"learning_rate": 2.0430973408330778e-07,
"loss": 0.0201,
"step": 25290
},
{
"epoch": 24.256951102588687,
"grad_norm": 0.15946893393993378,
"learning_rate": 1.9860682151212616e-07,
"loss": 0.026,
"step": 25300
},
{
"epoch": 24.26653883029722,
"grad_norm": 0.2374187558889389,
"learning_rate": 1.929844741006881e-07,
"loss": 0.0193,
"step": 25310
},
{
"epoch": 24.276126558005753,
"grad_norm": 0.2157372087240219,
"learning_rate": 1.8744270094441796e-07,
"loss": 0.0266,
"step": 25320
},
{
"epoch": 24.285714285714285,
"grad_norm": 0.27296164631843567,
"learning_rate": 1.819815110084111e-07,
"loss": 0.0211,
"step": 25330
},
{
"epoch": 24.29530201342282,
"grad_norm": 0.16994787752628326,
"learning_rate": 1.766009131273838e-07,
"loss": 0.0188,
"step": 25340
},
{
"epoch": 24.30488974113135,
"grad_norm": 0.2888137102127075,
"learning_rate": 1.7130091600568443e-07,
"loss": 0.0247,
"step": 25350
},
{
"epoch": 24.314477468839886,
"grad_norm": 0.26905524730682373,
"learning_rate": 1.660815282172823e-07,
"loss": 0.026,
"step": 25360
},
{
"epoch": 24.324065196548418,
"grad_norm": 0.28536051511764526,
"learning_rate": 1.609427582057288e-07,
"loss": 0.0221,
"step": 25370
},
{
"epoch": 24.333652924256953,
"grad_norm": 0.26181870698928833,
"learning_rate": 1.5588461428415745e-07,
"loss": 0.0248,
"step": 25380
},
{
"epoch": 24.343240651965484,
"grad_norm": 0.20964038372039795,
"learning_rate": 1.5090710463527836e-07,
"loss": 0.0222,
"step": 25390
},
{
"epoch": 24.352828379674015,
"grad_norm": 0.22509586811065674,
"learning_rate": 1.4601023731135034e-07,
"loss": 0.0196,
"step": 25400
},
{
"epoch": 24.36241610738255,
"grad_norm": 0.13734106719493866,
"learning_rate": 1.4119402023418106e-07,
"loss": 0.0249,
"step": 25410
},
{
"epoch": 24.372003835091082,
"grad_norm": 0.2952769100666046,
"learning_rate": 1.3645846119510474e-07,
"loss": 0.0204,
"step": 25420
},
{
"epoch": 24.381591562799617,
"grad_norm": 0.33259129524230957,
"learning_rate": 1.3180356785496562e-07,
"loss": 0.0267,
"step": 25430
},
{
"epoch": 24.39117929050815,
"grad_norm": 0.1688985675573349,
"learning_rate": 1.2722934774412887e-07,
"loss": 0.0208,
"step": 25440
},
{
"epoch": 24.400767018216683,
"grad_norm": 0.13669002056121826,
"learning_rate": 1.2273580826244192e-07,
"loss": 0.0238,
"step": 25450
},
{
"epoch": 24.410354745925215,
"grad_norm": 0.14696350693702698,
"learning_rate": 1.1832295667922876e-07,
"loss": 0.0219,
"step": 25460
},
{
"epoch": 24.41994247363375,
"grad_norm": 0.20755600929260254,
"learning_rate": 1.139908001332901e-07,
"loss": 0.0186,
"step": 25470
},
{
"epoch": 24.42953020134228,
"grad_norm": 0.19683778285980225,
"learning_rate": 1.0973934563288658e-07,
"loss": 0.0211,
"step": 25480
},
{
"epoch": 24.439117929050816,
"grad_norm": 0.2026386559009552,
"learning_rate": 1.0556860005571101e-07,
"loss": 0.0238,
"step": 25490
},
{
"epoch": 24.448705656759348,
"grad_norm": 0.4480651617050171,
"learning_rate": 1.0147857014890516e-07,
"loss": 0.021,
"step": 25500
},
{
"epoch": 24.458293384467883,
"grad_norm": 0.31666049361228943,
"learning_rate": 9.746926252902633e-08,
"loss": 0.032,
"step": 25510
},
{
"epoch": 24.467881112176414,
"grad_norm": 0.5467284321784973,
"learning_rate": 9.354068368204739e-08,
"loss": 0.0209,
"step": 25520
},
{
"epoch": 24.477468839884946,
"grad_norm": 0.15496346354484558,
"learning_rate": 8.969283996335121e-08,
"loss": 0.0224,
"step": 25530
},
{
"epoch": 24.48705656759348,
"grad_norm": 0.210786372423172,
"learning_rate": 8.59257375976974e-08,
"loss": 0.025,
"step": 25540
},
{
"epoch": 24.496644295302012,
"grad_norm": 0.12938974797725677,
"learning_rate": 8.223938267924446e-08,
"loss": 0.0176,
"step": 25550
},
{
"epoch": 24.506232023010547,
"grad_norm": 0.22987248003482819,
"learning_rate": 7.863378117151099e-08,
"loss": 0.0231,
"step": 25560
},
{
"epoch": 24.51581975071908,
"grad_norm": 0.3242381811141968,
"learning_rate": 7.510893890738113e-08,
"loss": 0.023,
"step": 25570
},
{
"epoch": 24.525407478427613,
"grad_norm": 0.2817991375923157,
"learning_rate": 7.166486158909913e-08,
"loss": 0.0231,
"step": 25580
},
{
"epoch": 24.534995206136145,
"grad_norm": 0.20501790940761566,
"learning_rate": 6.830155478824707e-08,
"loss": 0.0191,
"step": 25590
},
{
"epoch": 24.54458293384468,
"grad_norm": 0.1096939668059349,
"learning_rate": 6.501902394574488e-08,
"loss": 0.0273,
"step": 25600
},
{
"epoch": 24.55417066155321,
"grad_norm": 0.1630508154630661,
"learning_rate": 6.181727437183372e-08,
"loss": 0.0209,
"step": 25610
},
{
"epoch": 24.563758389261746,
"grad_norm": 0.28238698840141296,
"learning_rate": 5.8696311246081436e-08,
"loss": 0.0251,
"step": 25620
},
{
"epoch": 24.573346116970278,
"grad_norm": 0.11937420070171356,
"learning_rate": 5.5656139617366045e-08,
"loss": 0.0185,
"step": 25630
},
{
"epoch": 24.582933844678813,
"grad_norm": 0.17204758524894714,
"learning_rate": 5.2696764403847855e-08,
"loss": 0.0229,
"step": 25640
},
{
"epoch": 24.592521572387344,
"grad_norm": 0.17664316296577454,
"learning_rate": 4.981819039300284e-08,
"loss": 0.019,
"step": 25650
},
{
"epoch": 24.602109300095876,
"grad_norm": 0.14691434800624847,
"learning_rate": 4.702042224158931e-08,
"loss": 0.0272,
"step": 25660
},
{
"epoch": 24.61169702780441,
"grad_norm": 0.21293459832668304,
"learning_rate": 4.430346447562572e-08,
"loss": 0.0174,
"step": 25670
},
{
"epoch": 24.621284755512942,
"grad_norm": 0.17576336860656738,
"learning_rate": 4.166732149041841e-08,
"loss": 0.0257,
"step": 25680
},
{
"epoch": 24.630872483221477,
"grad_norm": 0.19463558495044708,
"learning_rate": 3.911199755053385e-08,
"loss": 0.0212,
"step": 25690
},
{
"epoch": 24.64046021093001,
"grad_norm": 0.17403477430343628,
"learning_rate": 3.663749678979311e-08,
"loss": 0.0202,
"step": 25700
},
{
"epoch": 24.650047938638544,
"grad_norm": 0.3777727782726288,
"learning_rate": 3.424382321126629e-08,
"loss": 0.024,
"step": 25710
},
{
"epoch": 24.659635666347075,
"grad_norm": 0.14289294183254242,
"learning_rate": 3.193098068727252e-08,
"loss": 0.0244,
"step": 25720
},
{
"epoch": 24.66922339405561,
"grad_norm": 0.17767243087291718,
"learning_rate": 2.9698972959357753e-08,
"loss": 0.0241,
"step": 25730
},
{
"epoch": 24.67881112176414,
"grad_norm": 0.2469603717327118,
"learning_rate": 2.7547803638311442e-08,
"loss": 0.0244,
"step": 25740
},
{
"epoch": 24.688398849472676,
"grad_norm": 0.1393066793680191,
"learning_rate": 2.5477476204144314e-08,
"loss": 0.0237,
"step": 25750
},
{
"epoch": 24.697986577181208,
"grad_norm": 0.2745441794395447,
"learning_rate": 2.3487994006077263e-08,
"loss": 0.0192,
"step": 25760
},
{
"epoch": 24.70757430488974,
"grad_norm": 0.19631850719451904,
"learning_rate": 2.1579360262558025e-08,
"loss": 0.0228,
"step": 25770
},
{
"epoch": 24.717162032598274,
"grad_norm": 0.4640311300754547,
"learning_rate": 1.9751578061244504e-08,
"loss": 0.0216,
"step": 25780
},
{
"epoch": 24.726749760306806,
"grad_norm": 0.262236088514328,
"learning_rate": 1.8004650358982578e-08,
"loss": 0.0243,
"step": 25790
},
{
"epoch": 24.73633748801534,
"grad_norm": 0.1786222904920578,
"learning_rate": 1.6338579981833856e-08,
"loss": 0.0165,
"step": 25800
},
{
"epoch": 24.745925215723872,
"grad_norm": 0.2555926442146301,
"learning_rate": 1.475336962504792e-08,
"loss": 0.0201,
"step": 25810
},
{
"epoch": 24.755512943432407,
"grad_norm": 0.16927938163280487,
"learning_rate": 1.3249021853062315e-08,
"loss": 0.0225,
"step": 25820
},
{
"epoch": 24.76510067114094,
"grad_norm": 0.17081952095031738,
"learning_rate": 1.182553909950812e-08,
"loss": 0.0206,
"step": 25830
},
{
"epoch": 24.774688398849474,
"grad_norm": 0.2548944056034088,
"learning_rate": 1.048292366719883e-08,
"loss": 0.0238,
"step": 25840
},
{
"epoch": 24.784276126558005,
"grad_norm": 0.14747904241085052,
"learning_rate": 9.221177728108154e-09,
"loss": 0.0218,
"step": 25850
},
{
"epoch": 24.79386385426654,
"grad_norm": 0.2064131796360016,
"learning_rate": 8.040303323414433e-09,
"loss": 0.0275,
"step": 25860
},
{
"epoch": 24.80345158197507,
"grad_norm": 0.1762009561061859,
"learning_rate": 6.940302363445117e-09,
"loss": 0.0183,
"step": 25870
},
{
"epoch": 24.813039309683607,
"grad_norm": 0.36469346284866333,
"learning_rate": 5.9211766277045276e-09,
"loss": 0.0226,
"step": 25880
},
{
"epoch": 24.822627037392138,
"grad_norm": 0.23766785860061646,
"learning_rate": 4.982927764862755e-09,
"loss": 0.0225,
"step": 25890
},
{
"epoch": 24.83221476510067,
"grad_norm": 0.277342826128006,
"learning_rate": 4.125557292750104e-09,
"loss": 0.0245,
"step": 25900
},
{
"epoch": 24.841802492809204,
"grad_norm": 0.2073160707950592,
"learning_rate": 3.349066598362649e-09,
"loss": 0.0213,
"step": 25910
},
{
"epoch": 24.851390220517736,
"grad_norm": 0.24508048593997955,
"learning_rate": 2.6534569378455776e-09,
"loss": 0.0217,
"step": 25920
},
{
"epoch": 24.86097794822627,
"grad_norm": 0.11171819269657135,
"learning_rate": 2.0387294365209475e-09,
"loss": 0.0213,
"step": 25930
},
{
"epoch": 24.870565675934802,
"grad_norm": 0.24452242255210876,
"learning_rate": 1.5048850888377265e-09,
"loss": 0.0172,
"step": 25940
},
{
"epoch": 24.880153403643337,
"grad_norm": 0.2535116374492645,
"learning_rate": 1.0519247584106495e-09,
"loss": 0.0252,
"step": 25950
},
{
"epoch": 24.88974113135187,
"grad_norm": 0.2804677486419678,
"learning_rate": 6.798491780202199e-10,
"loss": 0.0258,
"step": 25960
},
{
"epoch": 24.899328859060404,
"grad_norm": 0.21948733925819397,
"learning_rate": 3.8865894956829905e-10,
"loss": 0.0209,
"step": 25970
},
{
"epoch": 24.908916586768935,
"grad_norm": 0.21812966465950012,
"learning_rate": 1.7835454413361875e-10,
"loss": 0.025,
"step": 25980
},
{
"epoch": 24.91850431447747,
"grad_norm": 0.14540976285934448,
"learning_rate": 4.893630192737142e-11,
"loss": 0.0231,
"step": 25990
},
{
"epoch": 24.928092042186,
"grad_norm": 0.1897832155227661,
"learning_rate": 4.0443231541509307e-13,
"loss": 0.025,
"step": 26000
},
{
"epoch": 24.928092042186,
"step": 26000,
"total_flos": 0.0,
"train_loss": 0.037590215687568374,
"train_runtime": 11164.0416,
"train_samples_per_second": 74.525,
"train_steps_per_second": 2.329
}
],
"logging_steps": 10,
"max_steps": 26000,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 20000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}