{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 24.928092042186, "eval_steps": 500, "global_step": 26000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009587727708533078, "grad_norm": 16.952646255493164, "learning_rate": 6.923076923076923e-07, "loss": 1.0981, "step": 10 }, { "epoch": 0.019175455417066157, "grad_norm": 10.460495948791504, "learning_rate": 1.4615384615384616e-06, "loss": 1.0812, "step": 20 }, { "epoch": 0.028763183125599234, "grad_norm": 5.855829238891602, "learning_rate": 2.2307692307692307e-06, "loss": 0.8546, "step": 30 }, { "epoch": 0.038350910834132314, "grad_norm": 3.5648763179779053, "learning_rate": 3e-06, "loss": 0.4892, "step": 40 }, { "epoch": 0.04793863854266539, "grad_norm": 1.662581205368042, "learning_rate": 3.7692307692307694e-06, "loss": 0.3639, "step": 50 }, { "epoch": 0.05752636625119847, "grad_norm": 1.4345495700836182, "learning_rate": 4.538461538461539e-06, "loss": 0.2922, "step": 60 }, { "epoch": 0.06711409395973154, "grad_norm": 0.9792284369468689, "learning_rate": 5.307692307692308e-06, "loss": 0.2074, "step": 70 }, { "epoch": 0.07670182166826463, "grad_norm": 0.9946898818016052, "learning_rate": 6.0769230769230775e-06, "loss": 0.2146, "step": 80 }, { "epoch": 0.0862895493767977, "grad_norm": 1.3608415126800537, "learning_rate": 6.846153846153847e-06, "loss": 0.1745, "step": 90 }, { "epoch": 0.09587727708533078, "grad_norm": 1.0509544610977173, "learning_rate": 7.615384615384616e-06, "loss": 0.1748, "step": 100 }, { "epoch": 0.10546500479386385, "grad_norm": 0.9403872489929199, "learning_rate": 8.384615384615385e-06, "loss": 0.1564, "step": 110 }, { "epoch": 0.11505273250239693, "grad_norm": 0.9270913004875183, "learning_rate": 9.153846153846155e-06, "loss": 0.1696, "step": 120 }, { "epoch": 0.12464046021093, "grad_norm": 1.0389190912246704, "learning_rate": 9.923076923076923e-06, "loss": 0.1392, "step": 130 }, { "epoch": 0.1342281879194631, "grad_norm": 0.9624547958374023, "learning_rate": 1.0692307692307694e-05, "loss": 0.1311, "step": 140 }, { "epoch": 0.14381591562799617, "grad_norm": 1.0129961967468262, "learning_rate": 1.1461538461538462e-05, "loss": 0.1212, "step": 150 }, { "epoch": 0.15340364333652926, "grad_norm": 1.2572994232177734, "learning_rate": 1.2230769230769232e-05, "loss": 0.1267, "step": 160 }, { "epoch": 0.1629913710450623, "grad_norm": 1.06370210647583, "learning_rate": 1.3000000000000001e-05, "loss": 0.1241, "step": 170 }, { "epoch": 0.1725790987535954, "grad_norm": 1.2056634426116943, "learning_rate": 1.3769230769230771e-05, "loss": 0.1207, "step": 180 }, { "epoch": 0.18216682646212848, "grad_norm": 1.5257799625396729, "learning_rate": 1.453846153846154e-05, "loss": 0.1078, "step": 190 }, { "epoch": 0.19175455417066156, "grad_norm": 1.0377438068389893, "learning_rate": 1.5307692307692308e-05, "loss": 0.1191, "step": 200 }, { "epoch": 0.20134228187919462, "grad_norm": 1.1318110227584839, "learning_rate": 1.607692307692308e-05, "loss": 0.1211, "step": 210 }, { "epoch": 0.2109300095877277, "grad_norm": 0.8394863605499268, "learning_rate": 1.684615384615385e-05, "loss": 0.1103, "step": 220 }, { "epoch": 0.22051773729626079, "grad_norm": 0.6863590478897095, "learning_rate": 1.7615384615384615e-05, "loss": 0.101, "step": 230 }, { "epoch": 0.23010546500479387, "grad_norm": 0.9169079661369324, "learning_rate": 1.8384615384615386e-05, "loss": 0.1034, "step": 240 }, { "epoch": 0.23969319271332695, "grad_norm": 1.088216781616211, "learning_rate": 1.9153846153846156e-05, "loss": 0.0964, "step": 250 }, { "epoch": 0.24928092042186, "grad_norm": 0.8121523261070251, "learning_rate": 1.9923076923076926e-05, "loss": 0.0959, "step": 260 }, { "epoch": 0.2588686481303931, "grad_norm": 1.408576250076294, "learning_rate": 2.0692307692307693e-05, "loss": 0.1017, "step": 270 }, { "epoch": 0.2684563758389262, "grad_norm": 1.0147638320922852, "learning_rate": 2.1461538461538463e-05, "loss": 0.0935, "step": 280 }, { "epoch": 0.27804410354745923, "grad_norm": 1.0343986749649048, "learning_rate": 2.2230769230769233e-05, "loss": 0.1039, "step": 290 }, { "epoch": 0.28763183125599234, "grad_norm": 1.32474684715271, "learning_rate": 2.3000000000000003e-05, "loss": 0.0926, "step": 300 }, { "epoch": 0.2972195589645254, "grad_norm": 0.9797191619873047, "learning_rate": 2.376923076923077e-05, "loss": 0.0967, "step": 310 }, { "epoch": 0.3068072866730585, "grad_norm": 0.6505740880966187, "learning_rate": 2.453846153846154e-05, "loss": 0.0792, "step": 320 }, { "epoch": 0.31639501438159157, "grad_norm": 0.9509547352790833, "learning_rate": 2.530769230769231e-05, "loss": 0.0827, "step": 330 }, { "epoch": 0.3259827420901246, "grad_norm": 1.0206745862960815, "learning_rate": 2.6076923076923077e-05, "loss": 0.0771, "step": 340 }, { "epoch": 0.33557046979865773, "grad_norm": 1.0349384546279907, "learning_rate": 2.6846153846153848e-05, "loss": 0.0799, "step": 350 }, { "epoch": 0.3451581975071908, "grad_norm": 0.9108182191848755, "learning_rate": 2.7615384615384614e-05, "loss": 0.0873, "step": 360 }, { "epoch": 0.3547459252157239, "grad_norm": 0.5712908506393433, "learning_rate": 2.8384615384615388e-05, "loss": 0.0831, "step": 370 }, { "epoch": 0.36433365292425696, "grad_norm": 0.9796934127807617, "learning_rate": 2.9153846153846155e-05, "loss": 0.0719, "step": 380 }, { "epoch": 0.37392138063279, "grad_norm": 1.1480381488800049, "learning_rate": 2.9923076923076925e-05, "loss": 0.0828, "step": 390 }, { "epoch": 0.3835091083413231, "grad_norm": 0.5862910151481628, "learning_rate": 3.069230769230769e-05, "loss": 0.0709, "step": 400 }, { "epoch": 0.3930968360498562, "grad_norm": 0.7163400650024414, "learning_rate": 3.146153846153846e-05, "loss": 0.0721, "step": 410 }, { "epoch": 0.40268456375838924, "grad_norm": 0.7817345261573792, "learning_rate": 3.223076923076923e-05, "loss": 0.0717, "step": 420 }, { "epoch": 0.41227229146692235, "grad_norm": 0.6121333837509155, "learning_rate": 3.3e-05, "loss": 0.0665, "step": 430 }, { "epoch": 0.4218600191754554, "grad_norm": 0.774795413017273, "learning_rate": 3.376923076923077e-05, "loss": 0.0691, "step": 440 }, { "epoch": 0.4314477468839885, "grad_norm": 0.898847222328186, "learning_rate": 3.453846153846154e-05, "loss": 0.0741, "step": 450 }, { "epoch": 0.44103547459252157, "grad_norm": 0.7293726801872253, "learning_rate": 3.5307692307692306e-05, "loss": 0.0645, "step": 460 }, { "epoch": 0.4506232023010546, "grad_norm": 0.5674548149108887, "learning_rate": 3.607692307692308e-05, "loss": 0.0629, "step": 470 }, { "epoch": 0.46021093000958774, "grad_norm": 0.7961140275001526, "learning_rate": 3.684615384615385e-05, "loss": 0.0628, "step": 480 }, { "epoch": 0.4697986577181208, "grad_norm": 0.6253398656845093, "learning_rate": 3.761538461538462e-05, "loss": 0.0645, "step": 490 }, { "epoch": 0.4793863854266539, "grad_norm": 0.9273212552070618, "learning_rate": 3.838461538461539e-05, "loss": 0.062, "step": 500 }, { "epoch": 0.48897411313518696, "grad_norm": 0.6789622902870178, "learning_rate": 3.915384615384616e-05, "loss": 0.0654, "step": 510 }, { "epoch": 0.49856184084372, "grad_norm": 0.8248144388198853, "learning_rate": 3.992307692307692e-05, "loss": 0.0665, "step": 520 }, { "epoch": 0.5081495685522531, "grad_norm": 0.8695188164710999, "learning_rate": 4.06923076923077e-05, "loss": 0.0717, "step": 530 }, { "epoch": 0.5177372962607862, "grad_norm": 0.6595909595489502, "learning_rate": 4.146153846153846e-05, "loss": 0.0628, "step": 540 }, { "epoch": 0.5273250239693192, "grad_norm": 0.7226746678352356, "learning_rate": 4.223076923076924e-05, "loss": 0.0657, "step": 550 }, { "epoch": 0.5369127516778524, "grad_norm": 0.6370866298675537, "learning_rate": 4.3e-05, "loss": 0.0581, "step": 560 }, { "epoch": 0.5465004793863855, "grad_norm": 0.47755175828933716, "learning_rate": 4.376923076923077e-05, "loss": 0.052, "step": 570 }, { "epoch": 0.5560882070949185, "grad_norm": 0.7424858808517456, "learning_rate": 4.453846153846154e-05, "loss": 0.0606, "step": 580 }, { "epoch": 0.5656759348034516, "grad_norm": 0.4627436399459839, "learning_rate": 4.530769230769231e-05, "loss": 0.0618, "step": 590 }, { "epoch": 0.5752636625119847, "grad_norm": 0.5372833609580994, "learning_rate": 4.6076923076923076e-05, "loss": 0.0616, "step": 600 }, { "epoch": 0.5848513902205177, "grad_norm": 0.8923951387405396, "learning_rate": 4.684615384615385e-05, "loss": 0.0659, "step": 610 }, { "epoch": 0.5944391179290508, "grad_norm": 0.9428364038467407, "learning_rate": 4.7615384615384616e-05, "loss": 0.0707, "step": 620 }, { "epoch": 0.6040268456375839, "grad_norm": 0.7362667322158813, "learning_rate": 4.8384615384615386e-05, "loss": 0.062, "step": 630 }, { "epoch": 0.613614573346117, "grad_norm": 0.7807226181030273, "learning_rate": 4.9153846153846157e-05, "loss": 0.0662, "step": 640 }, { "epoch": 0.62320230105465, "grad_norm": 0.5898621678352356, "learning_rate": 4.992307692307693e-05, "loss": 0.0594, "step": 650 }, { "epoch": 0.6327900287631831, "grad_norm": 0.4694168269634247, "learning_rate": 5.06923076923077e-05, "loss": 0.0572, "step": 660 }, { "epoch": 0.6423777564717162, "grad_norm": 0.6720401048660278, "learning_rate": 5.146153846153846e-05, "loss": 0.0697, "step": 670 }, { "epoch": 0.6519654841802492, "grad_norm": 0.5371865034103394, "learning_rate": 5.223076923076924e-05, "loss": 0.059, "step": 680 }, { "epoch": 0.6615532118887824, "grad_norm": 0.6751993894577026, "learning_rate": 5.300000000000001e-05, "loss": 0.0566, "step": 690 }, { "epoch": 0.6711409395973155, "grad_norm": 0.7496346831321716, "learning_rate": 5.376923076923077e-05, "loss": 0.0592, "step": 700 }, { "epoch": 0.6807286673058485, "grad_norm": 0.7620933055877686, "learning_rate": 5.453846153846154e-05, "loss": 0.0645, "step": 710 }, { "epoch": 0.6903163950143816, "grad_norm": 0.9095701575279236, "learning_rate": 5.5307692307692305e-05, "loss": 0.0568, "step": 720 }, { "epoch": 0.6999041227229147, "grad_norm": 0.7606950998306274, "learning_rate": 5.607692307692308e-05, "loss": 0.0624, "step": 730 }, { "epoch": 0.7094918504314478, "grad_norm": 1.0387766361236572, "learning_rate": 5.684615384615385e-05, "loss": 0.0584, "step": 740 }, { "epoch": 0.7190795781399808, "grad_norm": 0.7113978862762451, "learning_rate": 5.7615384615384615e-05, "loss": 0.0652, "step": 750 }, { "epoch": 0.7286673058485139, "grad_norm": 0.604448139667511, "learning_rate": 5.838461538461538e-05, "loss": 0.0654, "step": 760 }, { "epoch": 0.738255033557047, "grad_norm": 0.8723410367965698, "learning_rate": 5.915384615384616e-05, "loss": 0.0531, "step": 770 }, { "epoch": 0.74784276126558, "grad_norm": 0.5730307102203369, "learning_rate": 5.9923076923076926e-05, "loss": 0.0559, "step": 780 }, { "epoch": 0.7574304889741131, "grad_norm": 0.7451117634773254, "learning_rate": 6.0692307692307696e-05, "loss": 0.0643, "step": 790 }, { "epoch": 0.7670182166826462, "grad_norm": 0.3902491331100464, "learning_rate": 6.146153846153846e-05, "loss": 0.0611, "step": 800 }, { "epoch": 0.7766059443911792, "grad_norm": 0.6148221492767334, "learning_rate": 6.223076923076924e-05, "loss": 0.0549, "step": 810 }, { "epoch": 0.7861936720997124, "grad_norm": 0.5791975259780884, "learning_rate": 6.3e-05, "loss": 0.0589, "step": 820 }, { "epoch": 0.7957813998082455, "grad_norm": 0.5318537950515747, "learning_rate": 6.376923076923077e-05, "loss": 0.0618, "step": 830 }, { "epoch": 0.8053691275167785, "grad_norm": 0.8901371359825134, "learning_rate": 6.453846153846154e-05, "loss": 0.0563, "step": 840 }, { "epoch": 0.8149568552253116, "grad_norm": 0.8964536786079407, "learning_rate": 6.530769230769231e-05, "loss": 0.0656, "step": 850 }, { "epoch": 0.8245445829338447, "grad_norm": 0.5159094929695129, "learning_rate": 6.607692307692308e-05, "loss": 0.0582, "step": 860 }, { "epoch": 0.8341323106423778, "grad_norm": 0.6684253811836243, "learning_rate": 6.684615384615385e-05, "loss": 0.0569, "step": 870 }, { "epoch": 0.8437200383509108, "grad_norm": 0.5698950290679932, "learning_rate": 6.761538461538461e-05, "loss": 0.0549, "step": 880 }, { "epoch": 0.8533077660594439, "grad_norm": 0.44796323776245117, "learning_rate": 6.838461538461539e-05, "loss": 0.0557, "step": 890 }, { "epoch": 0.862895493767977, "grad_norm": 0.7032187581062317, "learning_rate": 6.915384615384616e-05, "loss": 0.069, "step": 900 }, { "epoch": 0.87248322147651, "grad_norm": 0.538271963596344, "learning_rate": 6.992307692307692e-05, "loss": 0.0568, "step": 910 }, { "epoch": 0.8820709491850431, "grad_norm": 0.46786853671073914, "learning_rate": 7.069230769230769e-05, "loss": 0.0623, "step": 920 }, { "epoch": 0.8916586768935763, "grad_norm": 0.6529656052589417, "learning_rate": 7.146153846153847e-05, "loss": 0.064, "step": 930 }, { "epoch": 0.9012464046021093, "grad_norm": 0.9618151187896729, "learning_rate": 7.223076923076923e-05, "loss": 0.0557, "step": 940 }, { "epoch": 0.9108341323106424, "grad_norm": 0.5643552541732788, "learning_rate": 7.3e-05, "loss": 0.0651, "step": 950 }, { "epoch": 0.9204218600191755, "grad_norm": 0.7007706761360168, "learning_rate": 7.376923076923077e-05, "loss": 0.0514, "step": 960 }, { "epoch": 0.9300095877277086, "grad_norm": 0.4530331492424011, "learning_rate": 7.453846153846154e-05, "loss": 0.0563, "step": 970 }, { "epoch": 0.9395973154362416, "grad_norm": 0.6113521456718445, "learning_rate": 7.530769230769231e-05, "loss": 0.0606, "step": 980 }, { "epoch": 0.9491850431447747, "grad_norm": 0.5007736682891846, "learning_rate": 7.607692307692308e-05, "loss": 0.0561, "step": 990 }, { "epoch": 0.9587727708533078, "grad_norm": 0.49903005361557007, "learning_rate": 7.684615384615385e-05, "loss": 0.0578, "step": 1000 }, { "epoch": 0.9683604985618408, "grad_norm": 0.629622220993042, "learning_rate": 7.761538461538462e-05, "loss": 0.0572, "step": 1010 }, { "epoch": 0.9779482262703739, "grad_norm": 0.5830038785934448, "learning_rate": 7.838461538461539e-05, "loss": 0.0586, "step": 1020 }, { "epoch": 0.987535953978907, "grad_norm": 0.502075731754303, "learning_rate": 7.915384615384616e-05, "loss": 0.052, "step": 1030 }, { "epoch": 0.99712368168744, "grad_norm": 0.6076005101203918, "learning_rate": 7.992307692307692e-05, "loss": 0.0536, "step": 1040 }, { "epoch": 1.0067114093959733, "grad_norm": 0.6297442317008972, "learning_rate": 8.06923076923077e-05, "loss": 0.0565, "step": 1050 }, { "epoch": 1.0162991371045063, "grad_norm": 0.6776733994483948, "learning_rate": 8.146153846153847e-05, "loss": 0.0556, "step": 1060 }, { "epoch": 1.0258868648130393, "grad_norm": 0.5691619515419006, "learning_rate": 8.223076923076923e-05, "loss": 0.0528, "step": 1070 }, { "epoch": 1.0354745925215725, "grad_norm": 0.7027555108070374, "learning_rate": 8.3e-05, "loss": 0.0614, "step": 1080 }, { "epoch": 1.0450623202301055, "grad_norm": 0.7508878111839294, "learning_rate": 8.376923076923078e-05, "loss": 0.0496, "step": 1090 }, { "epoch": 1.0546500479386385, "grad_norm": 0.6663224101066589, "learning_rate": 8.453846153846154e-05, "loss": 0.0507, "step": 1100 }, { "epoch": 1.0642377756471717, "grad_norm": 0.5372412204742432, "learning_rate": 8.530769230769231e-05, "loss": 0.0547, "step": 1110 }, { "epoch": 1.0738255033557047, "grad_norm": 0.6460400223731995, "learning_rate": 8.607692307692308e-05, "loss": 0.0598, "step": 1120 }, { "epoch": 1.0834132310642377, "grad_norm": 0.5155197381973267, "learning_rate": 8.684615384615385e-05, "loss": 0.0601, "step": 1130 }, { "epoch": 1.093000958772771, "grad_norm": 0.42931079864501953, "learning_rate": 8.761538461538462e-05, "loss": 0.0602, "step": 1140 }, { "epoch": 1.102588686481304, "grad_norm": 0.5317569971084595, "learning_rate": 8.838461538461539e-05, "loss": 0.0577, "step": 1150 }, { "epoch": 1.112176414189837, "grad_norm": 0.6564596891403198, "learning_rate": 8.915384615384616e-05, "loss": 0.0596, "step": 1160 }, { "epoch": 1.1217641418983701, "grad_norm": 0.43666043877601624, "learning_rate": 8.992307692307693e-05, "loss": 0.0549, "step": 1170 }, { "epoch": 1.1313518696069031, "grad_norm": 0.6105823516845703, "learning_rate": 9.06923076923077e-05, "loss": 0.0641, "step": 1180 }, { "epoch": 1.1409395973154361, "grad_norm": 0.5657874345779419, "learning_rate": 9.146153846153847e-05, "loss": 0.0591, "step": 1190 }, { "epoch": 1.1505273250239694, "grad_norm": 0.5609491467475891, "learning_rate": 9.223076923076923e-05, "loss": 0.0622, "step": 1200 }, { "epoch": 1.1601150527325024, "grad_norm": 0.6493374705314636, "learning_rate": 9.300000000000001e-05, "loss": 0.0589, "step": 1210 }, { "epoch": 1.1697027804410354, "grad_norm": 0.7406426072120667, "learning_rate": 9.376923076923078e-05, "loss": 0.0579, "step": 1220 }, { "epoch": 1.1792905081495686, "grad_norm": 0.6438266634941101, "learning_rate": 9.453846153846154e-05, "loss": 0.058, "step": 1230 }, { "epoch": 1.1888782358581016, "grad_norm": 0.49737435579299927, "learning_rate": 9.530769230769231e-05, "loss": 0.0599, "step": 1240 }, { "epoch": 1.1984659635666346, "grad_norm": 0.5221342444419861, "learning_rate": 9.607692307692309e-05, "loss": 0.0613, "step": 1250 }, { "epoch": 1.2080536912751678, "grad_norm": 0.5636175870895386, "learning_rate": 9.684615384615385e-05, "loss": 0.054, "step": 1260 }, { "epoch": 1.2176414189837008, "grad_norm": 0.6858579516410828, "learning_rate": 9.761538461538462e-05, "loss": 0.0633, "step": 1270 }, { "epoch": 1.2272291466922338, "grad_norm": 0.5884243845939636, "learning_rate": 9.838461538461539e-05, "loss": 0.0576, "step": 1280 }, { "epoch": 1.236816874400767, "grad_norm": 0.753278374671936, "learning_rate": 9.915384615384616e-05, "loss": 0.0624, "step": 1290 }, { "epoch": 1.2464046021093, "grad_norm": 0.5968719720840454, "learning_rate": 9.992307692307693e-05, "loss": 0.0615, "step": 1300 }, { "epoch": 1.255992329817833, "grad_norm": 0.4386919140815735, "learning_rate": 9.99999672409862e-05, "loss": 0.0612, "step": 1310 }, { "epoch": 1.2655800575263663, "grad_norm": 0.7106592655181885, "learning_rate": 9.999985400000595e-05, "loss": 0.0582, "step": 1320 }, { "epoch": 1.2751677852348993, "grad_norm": 0.642195463180542, "learning_rate": 9.999965987281012e-05, "loss": 0.0539, "step": 1330 }, { "epoch": 1.2847555129434325, "grad_norm": 0.8102270364761353, "learning_rate": 9.999938485971279e-05, "loss": 0.0571, "step": 1340 }, { "epoch": 1.2943432406519655, "grad_norm": 0.5724937319755554, "learning_rate": 9.999902896115882e-05, "loss": 0.059, "step": 1350 }, { "epoch": 1.3039309683604985, "grad_norm": 0.5850300788879395, "learning_rate": 9.999859217772396e-05, "loss": 0.0546, "step": 1360 }, { "epoch": 1.3135186960690317, "grad_norm": 0.5836851000785828, "learning_rate": 9.999807451011483e-05, "loss": 0.0574, "step": 1370 }, { "epoch": 1.3231064237775647, "grad_norm": 0.4875651001930237, "learning_rate": 9.999747595916886e-05, "loss": 0.0584, "step": 1380 }, { "epoch": 1.332694151486098, "grad_norm": 0.6385061144828796, "learning_rate": 9.999679652585436e-05, "loss": 0.0551, "step": 1390 }, { "epoch": 1.342281879194631, "grad_norm": 0.6868314743041992, "learning_rate": 9.999603621127043e-05, "loss": 0.0644, "step": 1400 }, { "epoch": 1.351869606903164, "grad_norm": 0.879398763179779, "learning_rate": 9.99951950166471e-05, "loss": 0.0556, "step": 1410 }, { "epoch": 1.3614573346116972, "grad_norm": 0.5804061889648438, "learning_rate": 9.999427294334516e-05, "loss": 0.066, "step": 1420 }, { "epoch": 1.3710450623202302, "grad_norm": 0.8581869602203369, "learning_rate": 9.999326999285628e-05, "loss": 0.0604, "step": 1430 }, { "epoch": 1.3806327900287632, "grad_norm": 0.5264695882797241, "learning_rate": 9.999218616680299e-05, "loss": 0.0616, "step": 1440 }, { "epoch": 1.3902205177372964, "grad_norm": 0.9933851957321167, "learning_rate": 9.999102146693859e-05, "loss": 0.0593, "step": 1450 }, { "epoch": 1.3998082454458294, "grad_norm": 0.4718506932258606, "learning_rate": 9.998977589514729e-05, "loss": 0.056, "step": 1460 }, { "epoch": 1.4093959731543624, "grad_norm": 0.46576133370399475, "learning_rate": 9.998844945344405e-05, "loss": 0.0547, "step": 1470 }, { "epoch": 1.4189837008628956, "grad_norm": 0.8062454462051392, "learning_rate": 9.99870421439747e-05, "loss": 0.0624, "step": 1480 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6128931641578674, "learning_rate": 9.99855539690159e-05, "loss": 0.054, "step": 1490 }, { "epoch": 1.4381591562799616, "grad_norm": 0.781894326210022, "learning_rate": 9.998398493097511e-05, "loss": 0.0593, "step": 1500 }, { "epoch": 1.4477468839884948, "grad_norm": 0.4165836572647095, "learning_rate": 9.998233503239059e-05, "loss": 0.0467, "step": 1510 }, { "epoch": 1.4573346116970278, "grad_norm": 0.2851610779762268, "learning_rate": 9.998060427593146e-05, "loss": 0.0544, "step": 1520 }, { "epoch": 1.4669223394055608, "grad_norm": 0.578106164932251, "learning_rate": 9.997879266439758e-05, "loss": 0.0518, "step": 1530 }, { "epoch": 1.476510067114094, "grad_norm": 0.5424726009368896, "learning_rate": 9.997690020071968e-05, "loss": 0.0589, "step": 1540 }, { "epoch": 1.486097794822627, "grad_norm": 0.3104839622974396, "learning_rate": 9.997492688795924e-05, "loss": 0.0675, "step": 1550 }, { "epoch": 1.49568552253116, "grad_norm": 0.3031683564186096, "learning_rate": 9.997287272930854e-05, "loss": 0.0568, "step": 1560 }, { "epoch": 1.5052732502396933, "grad_norm": 0.3921635150909424, "learning_rate": 9.997073772809065e-05, "loss": 0.0475, "step": 1570 }, { "epoch": 1.5148609779482263, "grad_norm": 0.32904309034347534, "learning_rate": 9.996852188775942e-05, "loss": 0.0475, "step": 1580 }, { "epoch": 1.5244487056567593, "grad_norm": 0.5768727660179138, "learning_rate": 9.996622521189952e-05, "loss": 0.0471, "step": 1590 }, { "epoch": 1.5340364333652925, "grad_norm": 0.5629034042358398, "learning_rate": 9.996384770422629e-05, "loss": 0.0498, "step": 1600 }, { "epoch": 1.5436241610738255, "grad_norm": 0.5300479531288147, "learning_rate": 9.996138936858593e-05, "loss": 0.0498, "step": 1610 }, { "epoch": 1.5532118887823585, "grad_norm": 0.5257939696311951, "learning_rate": 9.995885020895536e-05, "loss": 0.0512, "step": 1620 }, { "epoch": 1.5627996164908917, "grad_norm": 0.5948016047477722, "learning_rate": 9.995623022944223e-05, "loss": 0.0512, "step": 1630 }, { "epoch": 1.5723873441994247, "grad_norm": 0.5377443432807922, "learning_rate": 9.995352943428497e-05, "loss": 0.0563, "step": 1640 }, { "epoch": 1.5819750719079577, "grad_norm": 0.658616840839386, "learning_rate": 9.995074782785275e-05, "loss": 0.0591, "step": 1650 }, { "epoch": 1.591562799616491, "grad_norm": 0.5048621296882629, "learning_rate": 9.994788541464543e-05, "loss": 0.0461, "step": 1660 }, { "epoch": 1.601150527325024, "grad_norm": 0.30649903416633606, "learning_rate": 9.994494219929365e-05, "loss": 0.0472, "step": 1670 }, { "epoch": 1.610738255033557, "grad_norm": 0.5432398319244385, "learning_rate": 9.99419181865587e-05, "loss": 0.0516, "step": 1680 }, { "epoch": 1.6203259827420902, "grad_norm": 0.458732545375824, "learning_rate": 9.993881338133261e-05, "loss": 0.0471, "step": 1690 }, { "epoch": 1.6299137104506232, "grad_norm": 0.4103093445301056, "learning_rate": 9.993562778863817e-05, "loss": 0.0533, "step": 1700 }, { "epoch": 1.6395014381591562, "grad_norm": 0.7458987832069397, "learning_rate": 9.993236141362874e-05, "loss": 0.0533, "step": 1710 }, { "epoch": 1.6490891658676894, "grad_norm": 0.4409146010875702, "learning_rate": 9.992901426158848e-05, "loss": 0.0574, "step": 1720 }, { "epoch": 1.6586768935762224, "grad_norm": 0.43476009368896484, "learning_rate": 9.992558633793212e-05, "loss": 0.0551, "step": 1730 }, { "epoch": 1.6682646212847554, "grad_norm": 0.5552487373352051, "learning_rate": 9.992207764820516e-05, "loss": 0.0544, "step": 1740 }, { "epoch": 1.6778523489932886, "grad_norm": 0.3948347270488739, "learning_rate": 9.99184881980837e-05, "loss": 0.0549, "step": 1750 }, { "epoch": 1.6874400767018218, "grad_norm": 0.36312541365623474, "learning_rate": 9.991481799337448e-05, "loss": 0.0628, "step": 1760 }, { "epoch": 1.6970278044103546, "grad_norm": 0.48039504885673523, "learning_rate": 9.991106704001491e-05, "loss": 0.0518, "step": 1770 }, { "epoch": 1.7066155321188878, "grad_norm": 0.43102404475212097, "learning_rate": 9.990723534407302e-05, "loss": 0.0531, "step": 1780 }, { "epoch": 1.716203259827421, "grad_norm": 0.635412335395813, "learning_rate": 9.990332291174747e-05, "loss": 0.0623, "step": 1790 }, { "epoch": 1.7257909875359538, "grad_norm": 0.41768330335617065, "learning_rate": 9.989932974936746e-05, "loss": 0.0489, "step": 1800 }, { "epoch": 1.735378715244487, "grad_norm": 0.4321722984313965, "learning_rate": 9.98952558633929e-05, "loss": 0.0578, "step": 1810 }, { "epoch": 1.7449664429530203, "grad_norm": 0.5160396099090576, "learning_rate": 9.98911012604142e-05, "loss": 0.0538, "step": 1820 }, { "epoch": 1.754554170661553, "grad_norm": 0.5091599822044373, "learning_rate": 9.98868659471524e-05, "loss": 0.062, "step": 1830 }, { "epoch": 1.7641418983700863, "grad_norm": 0.289798378944397, "learning_rate": 9.988254993045908e-05, "loss": 0.0561, "step": 1840 }, { "epoch": 1.7737296260786195, "grad_norm": 0.6626523733139038, "learning_rate": 9.98781532173164e-05, "loss": 0.0584, "step": 1850 }, { "epoch": 1.7833173537871523, "grad_norm": 0.4821811020374298, "learning_rate": 9.987367581483705e-05, "loss": 0.0597, "step": 1860 }, { "epoch": 1.7929050814956855, "grad_norm": 0.45109039545059204, "learning_rate": 9.986911773026422e-05, "loss": 0.0618, "step": 1870 }, { "epoch": 1.8024928092042187, "grad_norm": 0.5203428864479065, "learning_rate": 9.98644789709717e-05, "loss": 0.054, "step": 1880 }, { "epoch": 1.8120805369127517, "grad_norm": 0.3689659833908081, "learning_rate": 9.985975954446372e-05, "loss": 0.0506, "step": 1890 }, { "epoch": 1.8216682646212847, "grad_norm": 0.5378998517990112, "learning_rate": 9.985495945837504e-05, "loss": 0.0527, "step": 1900 }, { "epoch": 1.831255992329818, "grad_norm": 0.36838144063949585, "learning_rate": 9.985007872047088e-05, "loss": 0.0484, "step": 1910 }, { "epoch": 1.840843720038351, "grad_norm": 0.3217353820800781, "learning_rate": 9.984511733864698e-05, "loss": 0.0495, "step": 1920 }, { "epoch": 1.850431447746884, "grad_norm": 0.5914832353591919, "learning_rate": 9.984007532092951e-05, "loss": 0.0562, "step": 1930 }, { "epoch": 1.8600191754554172, "grad_norm": 0.44079649448394775, "learning_rate": 9.983495267547508e-05, "loss": 0.0515, "step": 1940 }, { "epoch": 1.8696069031639502, "grad_norm": 0.5204843878746033, "learning_rate": 9.982974941057073e-05, "loss": 0.0547, "step": 1950 }, { "epoch": 1.8791946308724832, "grad_norm": 0.505711555480957, "learning_rate": 9.982446553463397e-05, "loss": 0.0445, "step": 1960 }, { "epoch": 1.8887823585810164, "grad_norm": 0.3592546582221985, "learning_rate": 9.981910105621262e-05, "loss": 0.0586, "step": 1970 }, { "epoch": 1.8983700862895494, "grad_norm": 0.3347618281841278, "learning_rate": 9.9813655983985e-05, "loss": 0.0616, "step": 1980 }, { "epoch": 1.9079578139980824, "grad_norm": 0.6229729056358337, "learning_rate": 9.980813032675974e-05, "loss": 0.0486, "step": 1990 }, { "epoch": 1.9175455417066156, "grad_norm": 0.4660274386405945, "learning_rate": 9.980252409347588e-05, "loss": 0.0481, "step": 2000 }, { "epoch": 1.9271332694151486, "grad_norm": 0.2956122159957886, "learning_rate": 9.979683729320275e-05, "loss": 0.0511, "step": 2010 }, { "epoch": 1.9367209971236816, "grad_norm": 0.45697900652885437, "learning_rate": 9.97910699351401e-05, "loss": 0.0519, "step": 2020 }, { "epoch": 1.9463087248322148, "grad_norm": 0.5107268691062927, "learning_rate": 9.97852220286179e-05, "loss": 0.0563, "step": 2030 }, { "epoch": 1.9558964525407478, "grad_norm": 0.3761272728443146, "learning_rate": 9.97792935830965e-05, "loss": 0.0532, "step": 2040 }, { "epoch": 1.9654841802492808, "grad_norm": 0.4759978950023651, "learning_rate": 9.977328460816654e-05, "loss": 0.0588, "step": 2050 }, { "epoch": 1.975071907957814, "grad_norm": 0.4457103908061981, "learning_rate": 9.976719511354889e-05, "loss": 0.0459, "step": 2060 }, { "epoch": 1.984659635666347, "grad_norm": 0.31241118907928467, "learning_rate": 9.976102510909469e-05, "loss": 0.0521, "step": 2070 }, { "epoch": 1.99424736337488, "grad_norm": 0.5308888554573059, "learning_rate": 9.975477460478538e-05, "loss": 0.0514, "step": 2080 }, { "epoch": 2.0038350910834133, "grad_norm": 0.35070937871932983, "learning_rate": 9.974844361073252e-05, "loss": 0.0524, "step": 2090 }, { "epoch": 2.0134228187919465, "grad_norm": 0.47052425146102905, "learning_rate": 9.9742032137178e-05, "loss": 0.0476, "step": 2100 }, { "epoch": 2.0230105465004793, "grad_norm": 0.6150134205818176, "learning_rate": 9.973554019449383e-05, "loss": 0.0412, "step": 2110 }, { "epoch": 2.0325982742090125, "grad_norm": 0.5497679114341736, "learning_rate": 9.972896779318219e-05, "loss": 0.0592, "step": 2120 }, { "epoch": 2.0421860019175457, "grad_norm": 0.5127347111701965, "learning_rate": 9.972231494387547e-05, "loss": 0.0468, "step": 2130 }, { "epoch": 2.0517737296260785, "grad_norm": 0.43948736786842346, "learning_rate": 9.971558165733619e-05, "loss": 0.0484, "step": 2140 }, { "epoch": 2.0613614573346117, "grad_norm": 0.47324222326278687, "learning_rate": 9.970876794445694e-05, "loss": 0.0517, "step": 2150 }, { "epoch": 2.070949185043145, "grad_norm": 0.34907156229019165, "learning_rate": 9.970187381626048e-05, "loss": 0.0566, "step": 2160 }, { "epoch": 2.0805369127516777, "grad_norm": 0.51346355676651, "learning_rate": 9.969489928389965e-05, "loss": 0.0409, "step": 2170 }, { "epoch": 2.090124640460211, "grad_norm": 0.34040042757987976, "learning_rate": 9.968784435865737e-05, "loss": 0.0462, "step": 2180 }, { "epoch": 2.099712368168744, "grad_norm": 0.4003884792327881, "learning_rate": 9.968070905194656e-05, "loss": 0.0434, "step": 2190 }, { "epoch": 2.109300095877277, "grad_norm": 0.4381425380706787, "learning_rate": 9.967349337531023e-05, "loss": 0.0438, "step": 2200 }, { "epoch": 2.11888782358581, "grad_norm": 0.5975500345230103, "learning_rate": 9.966619734042139e-05, "loss": 0.0441, "step": 2210 }, { "epoch": 2.1284755512943434, "grad_norm": 0.39649492502212524, "learning_rate": 9.965882095908305e-05, "loss": 0.0485, "step": 2220 }, { "epoch": 2.138063279002876, "grad_norm": 0.5102829337120056, "learning_rate": 9.96513642432282e-05, "loss": 0.0462, "step": 2230 }, { "epoch": 2.1476510067114094, "grad_norm": 0.5115483999252319, "learning_rate": 9.964382720491976e-05, "loss": 0.0539, "step": 2240 }, { "epoch": 2.1572387344199426, "grad_norm": 0.4768059551715851, "learning_rate": 9.963620985635065e-05, "loss": 0.0521, "step": 2250 }, { "epoch": 2.1668264621284754, "grad_norm": 0.4891989827156067, "learning_rate": 9.962851220984366e-05, "loss": 0.0486, "step": 2260 }, { "epoch": 2.1764141898370086, "grad_norm": 0.5893239974975586, "learning_rate": 9.962073427785149e-05, "loss": 0.053, "step": 2270 }, { "epoch": 2.186001917545542, "grad_norm": 0.640600323677063, "learning_rate": 9.961287607295673e-05, "loss": 0.0516, "step": 2280 }, { "epoch": 2.1955896452540746, "grad_norm": 0.5314393639564514, "learning_rate": 9.960493760787184e-05, "loss": 0.0552, "step": 2290 }, { "epoch": 2.205177372962608, "grad_norm": 0.4695710241794586, "learning_rate": 9.95969188954391e-05, "loss": 0.0488, "step": 2300 }, { "epoch": 2.214765100671141, "grad_norm": 0.41498687863349915, "learning_rate": 9.958881994863058e-05, "loss": 0.0554, "step": 2310 }, { "epoch": 2.224352828379674, "grad_norm": 0.3587738573551178, "learning_rate": 9.958064078054823e-05, "loss": 0.0415, "step": 2320 }, { "epoch": 2.233940556088207, "grad_norm": 0.3993861973285675, "learning_rate": 9.957238140442371e-05, "loss": 0.0529, "step": 2330 }, { "epoch": 2.2435282837967403, "grad_norm": 0.4770705997943878, "learning_rate": 9.956404183361845e-05, "loss": 0.0521, "step": 2340 }, { "epoch": 2.253116011505273, "grad_norm": 0.5887109041213989, "learning_rate": 9.955562208162362e-05, "loss": 0.0632, "step": 2350 }, { "epoch": 2.2627037392138063, "grad_norm": 0.6732892990112305, "learning_rate": 9.954712216206008e-05, "loss": 0.06, "step": 2360 }, { "epoch": 2.2722914669223395, "grad_norm": 0.37186869978904724, "learning_rate": 9.953854208867841e-05, "loss": 0.0572, "step": 2370 }, { "epoch": 2.2818791946308723, "grad_norm": 0.3546556234359741, "learning_rate": 9.952988187535886e-05, "loss": 0.0495, "step": 2380 }, { "epoch": 2.2914669223394055, "grad_norm": 0.23416608572006226, "learning_rate": 9.952114153611128e-05, "loss": 0.0463, "step": 2390 }, { "epoch": 2.3010546500479387, "grad_norm": 0.5339412689208984, "learning_rate": 9.951232108507517e-05, "loss": 0.0503, "step": 2400 }, { "epoch": 2.310642377756472, "grad_norm": 0.34483078122138977, "learning_rate": 9.950342053651967e-05, "loss": 0.0428, "step": 2410 }, { "epoch": 2.3202301054650047, "grad_norm": 0.449236124753952, "learning_rate": 9.949443990484342e-05, "loss": 0.0495, "step": 2420 }, { "epoch": 2.329817833173538, "grad_norm": 0.40906885266304016, "learning_rate": 9.948537920457466e-05, "loss": 0.0442, "step": 2430 }, { "epoch": 2.3394055608820707, "grad_norm": 0.3320155143737793, "learning_rate": 9.947623845037112e-05, "loss": 0.0469, "step": 2440 }, { "epoch": 2.348993288590604, "grad_norm": 0.3933449387550354, "learning_rate": 9.946701765702012e-05, "loss": 0.0499, "step": 2450 }, { "epoch": 2.358581016299137, "grad_norm": 0.42711353302001953, "learning_rate": 9.945771683943836e-05, "loss": 0.0465, "step": 2460 }, { "epoch": 2.3681687440076704, "grad_norm": 0.3379175364971161, "learning_rate": 9.944833601267207e-05, "loss": 0.0446, "step": 2470 }, { "epoch": 2.377756471716203, "grad_norm": 0.2655797302722931, "learning_rate": 9.943887519189685e-05, "loss": 0.0457, "step": 2480 }, { "epoch": 2.3873441994247364, "grad_norm": 0.534376859664917, "learning_rate": 9.94293343924178e-05, "loss": 0.0386, "step": 2490 }, { "epoch": 2.396931927133269, "grad_norm": 0.5116010904312134, "learning_rate": 9.941971362966929e-05, "loss": 0.0488, "step": 2500 }, { "epoch": 2.4065196548418024, "grad_norm": 0.33155950903892517, "learning_rate": 9.941001291921512e-05, "loss": 0.0561, "step": 2510 }, { "epoch": 2.4161073825503356, "grad_norm": 0.4785441756248474, "learning_rate": 9.940023227674844e-05, "loss": 0.055, "step": 2520 }, { "epoch": 2.425695110258869, "grad_norm": 0.4031260907649994, "learning_rate": 9.939037171809167e-05, "loss": 0.0489, "step": 2530 }, { "epoch": 2.4352828379674016, "grad_norm": 0.4069255590438843, "learning_rate": 9.93804312591965e-05, "loss": 0.0499, "step": 2540 }, { "epoch": 2.444870565675935, "grad_norm": 0.4854568541049957, "learning_rate": 9.937041091614392e-05, "loss": 0.0508, "step": 2550 }, { "epoch": 2.4544582933844676, "grad_norm": 0.42022451758384705, "learning_rate": 9.936031070514413e-05, "loss": 0.0533, "step": 2560 }, { "epoch": 2.464046021093001, "grad_norm": 0.3417539894580841, "learning_rate": 9.935013064253652e-05, "loss": 0.0487, "step": 2570 }, { "epoch": 2.473633748801534, "grad_norm": 0.7130690813064575, "learning_rate": 9.933987074478969e-05, "loss": 0.0482, "step": 2580 }, { "epoch": 2.4832214765100673, "grad_norm": 0.328921914100647, "learning_rate": 9.932953102850136e-05, "loss": 0.0462, "step": 2590 }, { "epoch": 2.4928092042186, "grad_norm": 0.27391597628593445, "learning_rate": 9.931911151039838e-05, "loss": 0.0543, "step": 2600 }, { "epoch": 2.5023969319271333, "grad_norm": 0.3968970775604248, "learning_rate": 9.930861220733674e-05, "loss": 0.0446, "step": 2610 }, { "epoch": 2.511984659635666, "grad_norm": 0.31161823868751526, "learning_rate": 9.929803313630145e-05, "loss": 0.0542, "step": 2620 }, { "epoch": 2.5215723873441993, "grad_norm": 0.49789026379585266, "learning_rate": 9.928737431440658e-05, "loss": 0.0496, "step": 2630 }, { "epoch": 2.5311601150527325, "grad_norm": 0.3426557779312134, "learning_rate": 9.927663575889521e-05, "loss": 0.0451, "step": 2640 }, { "epoch": 2.5407478427612658, "grad_norm": 0.35124093294143677, "learning_rate": 9.926581748713942e-05, "loss": 0.0469, "step": 2650 }, { "epoch": 2.5503355704697985, "grad_norm": 0.5212651491165161, "learning_rate": 9.925491951664023e-05, "loss": 0.0574, "step": 2660 }, { "epoch": 2.5599232981783318, "grad_norm": 0.5474659204483032, "learning_rate": 9.92439418650276e-05, "loss": 0.0592, "step": 2670 }, { "epoch": 2.569511025886865, "grad_norm": 0.36428266763687134, "learning_rate": 9.923288455006045e-05, "loss": 0.0534, "step": 2680 }, { "epoch": 2.5790987535953978, "grad_norm": 0.3940581977367401, "learning_rate": 9.922174758962645e-05, "loss": 0.0493, "step": 2690 }, { "epoch": 2.588686481303931, "grad_norm": 0.32265448570251465, "learning_rate": 9.921053100174223e-05, "loss": 0.0465, "step": 2700 }, { "epoch": 2.598274209012464, "grad_norm": 0.35290199518203735, "learning_rate": 9.919923480455317e-05, "loss": 0.048, "step": 2710 }, { "epoch": 2.607861936720997, "grad_norm": 0.4928702712059021, "learning_rate": 9.918785901633345e-05, "loss": 0.0463, "step": 2720 }, { "epoch": 2.61744966442953, "grad_norm": 0.39868831634521484, "learning_rate": 9.917640365548604e-05, "loss": 0.0478, "step": 2730 }, { "epoch": 2.6270373921380634, "grad_norm": 0.48915326595306396, "learning_rate": 9.916486874054259e-05, "loss": 0.0452, "step": 2740 }, { "epoch": 2.636625119846596, "grad_norm": 0.3415433466434479, "learning_rate": 9.915325429016345e-05, "loss": 0.0399, "step": 2750 }, { "epoch": 2.6462128475551294, "grad_norm": 0.4320572316646576, "learning_rate": 9.914156032313768e-05, "loss": 0.052, "step": 2760 }, { "epoch": 2.6558005752636626, "grad_norm": 0.5043158531188965, "learning_rate": 9.912978685838294e-05, "loss": 0.05, "step": 2770 }, { "epoch": 2.665388302972196, "grad_norm": 0.3065243363380432, "learning_rate": 9.911793391494552e-05, "loss": 0.0449, "step": 2780 }, { "epoch": 2.6749760306807286, "grad_norm": 0.42839324474334717, "learning_rate": 9.910600151200025e-05, "loss": 0.0506, "step": 2790 }, { "epoch": 2.684563758389262, "grad_norm": 0.32670149207115173, "learning_rate": 9.909398966885053e-05, "loss": 0.0482, "step": 2800 }, { "epoch": 2.6941514860977946, "grad_norm": 0.49310222268104553, "learning_rate": 9.908189840492827e-05, "loss": 0.0457, "step": 2810 }, { "epoch": 2.703739213806328, "grad_norm": 0.43462368845939636, "learning_rate": 9.906972773979388e-05, "loss": 0.0494, "step": 2820 }, { "epoch": 2.713326941514861, "grad_norm": 0.3611735701560974, "learning_rate": 9.905747769313616e-05, "loss": 0.0472, "step": 2830 }, { "epoch": 2.7229146692233943, "grad_norm": 0.3046175539493561, "learning_rate": 9.90451482847724e-05, "loss": 0.046, "step": 2840 }, { "epoch": 2.732502396931927, "grad_norm": 0.5815914869308472, "learning_rate": 9.903273953464821e-05, "loss": 0.0505, "step": 2850 }, { "epoch": 2.7420901246404603, "grad_norm": 0.4920728802680969, "learning_rate": 9.902025146283761e-05, "loss": 0.0475, "step": 2860 }, { "epoch": 2.751677852348993, "grad_norm": 0.3602769374847412, "learning_rate": 9.90076840895429e-05, "loss": 0.0425, "step": 2870 }, { "epoch": 2.7612655800575263, "grad_norm": 0.580506443977356, "learning_rate": 9.899503743509471e-05, "loss": 0.0493, "step": 2880 }, { "epoch": 2.7708533077660595, "grad_norm": 0.4402373135089874, "learning_rate": 9.898231151995187e-05, "loss": 0.0468, "step": 2890 }, { "epoch": 2.7804410354745928, "grad_norm": 0.5210007429122925, "learning_rate": 9.896950636470147e-05, "loss": 0.0461, "step": 2900 }, { "epoch": 2.7900287631831255, "grad_norm": 0.4113840162754059, "learning_rate": 9.89566219900588e-05, "loss": 0.0561, "step": 2910 }, { "epoch": 2.7996164908916588, "grad_norm": 0.4887576699256897, "learning_rate": 9.894365841686726e-05, "loss": 0.0484, "step": 2920 }, { "epoch": 2.8092042186001915, "grad_norm": 0.3261569142341614, "learning_rate": 9.893061566609843e-05, "loss": 0.0457, "step": 2930 }, { "epoch": 2.8187919463087248, "grad_norm": 0.3729310631752014, "learning_rate": 9.891749375885191e-05, "loss": 0.0459, "step": 2940 }, { "epoch": 2.828379674017258, "grad_norm": 0.4186583459377289, "learning_rate": 9.890429271635541e-05, "loss": 0.0448, "step": 2950 }, { "epoch": 2.837967401725791, "grad_norm": 0.4808233380317688, "learning_rate": 9.889101255996466e-05, "loss": 0.0513, "step": 2960 }, { "epoch": 2.847555129434324, "grad_norm": 0.24302266538143158, "learning_rate": 9.887765331116331e-05, "loss": 0.0439, "step": 2970 }, { "epoch": 2.857142857142857, "grad_norm": 0.28988000750541687, "learning_rate": 9.886421499156305e-05, "loss": 0.0448, "step": 2980 }, { "epoch": 2.86673058485139, "grad_norm": 0.408470094203949, "learning_rate": 9.88506976229034e-05, "loss": 0.0457, "step": 2990 }, { "epoch": 2.876318312559923, "grad_norm": 0.279012531042099, "learning_rate": 9.883710122705184e-05, "loss": 0.0521, "step": 3000 }, { "epoch": 2.8859060402684564, "grad_norm": 0.3104060888290405, "learning_rate": 9.882342582600361e-05, "loss": 0.0479, "step": 3010 }, { "epoch": 2.8954937679769897, "grad_norm": 0.36359190940856934, "learning_rate": 9.880967144188184e-05, "loss": 0.0545, "step": 3020 }, { "epoch": 2.9050814956855224, "grad_norm": 0.3486534059047699, "learning_rate": 9.879583809693738e-05, "loss": 0.0469, "step": 3030 }, { "epoch": 2.9146692233940557, "grad_norm": 0.35138458013534546, "learning_rate": 9.878192581354883e-05, "loss": 0.0486, "step": 3040 }, { "epoch": 2.9242569511025884, "grad_norm": 0.3004566431045532, "learning_rate": 9.87679346142225e-05, "loss": 0.048, "step": 3050 }, { "epoch": 2.9338446788111217, "grad_norm": 0.4111393988132477, "learning_rate": 9.875386452159237e-05, "loss": 0.0526, "step": 3060 }, { "epoch": 2.943432406519655, "grad_norm": 0.5051720142364502, "learning_rate": 9.873971555842e-05, "loss": 0.0521, "step": 3070 }, { "epoch": 2.953020134228188, "grad_norm": 0.3800508677959442, "learning_rate": 9.872548774759465e-05, "loss": 0.0509, "step": 3080 }, { "epoch": 2.962607861936721, "grad_norm": 0.3587114214897156, "learning_rate": 9.871118111213299e-05, "loss": 0.0463, "step": 3090 }, { "epoch": 2.972195589645254, "grad_norm": 0.3234626352787018, "learning_rate": 9.869679567517931e-05, "loss": 0.0421, "step": 3100 }, { "epoch": 2.981783317353787, "grad_norm": 0.37090590596199036, "learning_rate": 9.868233146000535e-05, "loss": 0.0497, "step": 3110 }, { "epoch": 2.99137104506232, "grad_norm": 0.274735689163208, "learning_rate": 9.86677884900103e-05, "loss": 0.0492, "step": 3120 }, { "epoch": 3.0009587727708533, "grad_norm": 0.35899800062179565, "learning_rate": 9.865316678872073e-05, "loss": 0.0436, "step": 3130 }, { "epoch": 3.0105465004793865, "grad_norm": 0.3196784555912018, "learning_rate": 9.863846637979057e-05, "loss": 0.0472, "step": 3140 }, { "epoch": 3.0201342281879193, "grad_norm": 0.3077564537525177, "learning_rate": 9.862368728700115e-05, "loss": 0.0527, "step": 3150 }, { "epoch": 3.0297219558964525, "grad_norm": 0.40086090564727783, "learning_rate": 9.860882953426099e-05, "loss": 0.0507, "step": 3160 }, { "epoch": 3.0393096836049858, "grad_norm": 0.6561570763587952, "learning_rate": 9.859389314560595e-05, "loss": 0.0545, "step": 3170 }, { "epoch": 3.0488974113135185, "grad_norm": 0.34068262577056885, "learning_rate": 9.857887814519902e-05, "loss": 0.0458, "step": 3180 }, { "epoch": 3.0584851390220518, "grad_norm": 0.31878864765167236, "learning_rate": 9.856378455733042e-05, "loss": 0.0399, "step": 3190 }, { "epoch": 3.068072866730585, "grad_norm": 0.41648054122924805, "learning_rate": 9.854861240641748e-05, "loss": 0.0452, "step": 3200 }, { "epoch": 3.0776605944391178, "grad_norm": 0.35710304975509644, "learning_rate": 9.853336171700464e-05, "loss": 0.0509, "step": 3210 }, { "epoch": 3.087248322147651, "grad_norm": 0.3782924711704254, "learning_rate": 9.851803251376336e-05, "loss": 0.0445, "step": 3220 }, { "epoch": 3.096836049856184, "grad_norm": 0.49359890818595886, "learning_rate": 9.85026248214922e-05, "loss": 0.0515, "step": 3230 }, { "epoch": 3.106423777564717, "grad_norm": 0.4491162598133087, "learning_rate": 9.848713866511655e-05, "loss": 0.0444, "step": 3240 }, { "epoch": 3.11601150527325, "grad_norm": 0.3755772113800049, "learning_rate": 9.847157406968885e-05, "loss": 0.0417, "step": 3250 }, { "epoch": 3.1255992329817834, "grad_norm": 0.32591307163238525, "learning_rate": 9.84559310603884e-05, "loss": 0.0571, "step": 3260 }, { "epoch": 3.135186960690316, "grad_norm": 0.4025377333164215, "learning_rate": 9.844020966252137e-05, "loss": 0.0479, "step": 3270 }, { "epoch": 3.1447746883988494, "grad_norm": 0.3106444478034973, "learning_rate": 9.842440990152068e-05, "loss": 0.0472, "step": 3280 }, { "epoch": 3.1543624161073827, "grad_norm": 0.3832003176212311, "learning_rate": 9.840853180294608e-05, "loss": 0.0566, "step": 3290 }, { "epoch": 3.1639501438159154, "grad_norm": 0.2815271019935608, "learning_rate": 9.839257539248403e-05, "loss": 0.0396, "step": 3300 }, { "epoch": 3.1735378715244487, "grad_norm": 0.38503342866897583, "learning_rate": 9.83765406959477e-05, "loss": 0.048, "step": 3310 }, { "epoch": 3.183125599232982, "grad_norm": 0.31450656056404114, "learning_rate": 9.836042773927685e-05, "loss": 0.0383, "step": 3320 }, { "epoch": 3.1927133269415147, "grad_norm": 0.39521682262420654, "learning_rate": 9.834423654853791e-05, "loss": 0.0449, "step": 3330 }, { "epoch": 3.202301054650048, "grad_norm": 0.4725668728351593, "learning_rate": 9.832796714992381e-05, "loss": 0.0436, "step": 3340 }, { "epoch": 3.211888782358581, "grad_norm": 0.43373286724090576, "learning_rate": 9.831161956975405e-05, "loss": 0.0502, "step": 3350 }, { "epoch": 3.221476510067114, "grad_norm": 0.30628758668899536, "learning_rate": 9.829519383447456e-05, "loss": 0.0454, "step": 3360 }, { "epoch": 3.231064237775647, "grad_norm": 0.6050196290016174, "learning_rate": 9.827868997065777e-05, "loss": 0.0528, "step": 3370 }, { "epoch": 3.2406519654841803, "grad_norm": 0.36287015676498413, "learning_rate": 9.826210800500242e-05, "loss": 0.0529, "step": 3380 }, { "epoch": 3.2502396931927136, "grad_norm": 0.41856274008750916, "learning_rate": 9.824544796433366e-05, "loss": 0.0489, "step": 3390 }, { "epoch": 3.2598274209012463, "grad_norm": 0.35269007086753845, "learning_rate": 9.82287098756029e-05, "loss": 0.049, "step": 3400 }, { "epoch": 3.2694151486097796, "grad_norm": 0.35962244868278503, "learning_rate": 9.821189376588786e-05, "loss": 0.0471, "step": 3410 }, { "epoch": 3.2790028763183123, "grad_norm": 0.5149263739585876, "learning_rate": 9.819499966239243e-05, "loss": 0.0533, "step": 3420 }, { "epoch": 3.2885906040268456, "grad_norm": 0.3651978075504303, "learning_rate": 9.81780275924467e-05, "loss": 0.0428, "step": 3430 }, { "epoch": 3.2981783317353788, "grad_norm": 0.377916157245636, "learning_rate": 9.816097758350688e-05, "loss": 0.0527, "step": 3440 }, { "epoch": 3.307766059443912, "grad_norm": 0.39240193367004395, "learning_rate": 9.814384966315526e-05, "loss": 0.0498, "step": 3450 }, { "epoch": 3.3173537871524448, "grad_norm": 0.4727850556373596, "learning_rate": 9.812664385910018e-05, "loss": 0.0519, "step": 3460 }, { "epoch": 3.326941514860978, "grad_norm": 0.3471921980381012, "learning_rate": 9.810936019917595e-05, "loss": 0.043, "step": 3470 }, { "epoch": 3.336529242569511, "grad_norm": 0.3818338215351105, "learning_rate": 9.809199871134287e-05, "loss": 0.0427, "step": 3480 }, { "epoch": 3.346116970278044, "grad_norm": 0.34183284640312195, "learning_rate": 9.807455942368711e-05, "loss": 0.0414, "step": 3490 }, { "epoch": 3.3557046979865772, "grad_norm": 0.3725120425224304, "learning_rate": 9.805704236442073e-05, "loss": 0.0493, "step": 3500 }, { "epoch": 3.3652924256951104, "grad_norm": 0.4457106590270996, "learning_rate": 9.803944756188157e-05, "loss": 0.0423, "step": 3510 }, { "epoch": 3.3748801534036432, "grad_norm": 0.3035670220851898, "learning_rate": 9.802177504453326e-05, "loss": 0.0431, "step": 3520 }, { "epoch": 3.3844678811121764, "grad_norm": 0.36193615198135376, "learning_rate": 9.800402484096513e-05, "loss": 0.0461, "step": 3530 }, { "epoch": 3.3940556088207097, "grad_norm": 0.39786848425865173, "learning_rate": 9.798619697989222e-05, "loss": 0.0558, "step": 3540 }, { "epoch": 3.4036433365292424, "grad_norm": 0.3743523061275482, "learning_rate": 9.796829149015517e-05, "loss": 0.0439, "step": 3550 }, { "epoch": 3.4132310642377757, "grad_norm": 0.36101034283638, "learning_rate": 9.79503084007202e-05, "loss": 0.0483, "step": 3560 }, { "epoch": 3.422818791946309, "grad_norm": 0.3148845434188843, "learning_rate": 9.79322477406791e-05, "loss": 0.043, "step": 3570 }, { "epoch": 3.4324065196548417, "grad_norm": 0.45851582288742065, "learning_rate": 9.79141095392491e-05, "loss": 0.0492, "step": 3580 }, { "epoch": 3.441994247363375, "grad_norm": 0.6849660277366638, "learning_rate": 9.789589382577291e-05, "loss": 0.0434, "step": 3590 }, { "epoch": 3.451581975071908, "grad_norm": 0.5036081671714783, "learning_rate": 9.787760062971861e-05, "loss": 0.0525, "step": 3600 }, { "epoch": 3.461169702780441, "grad_norm": 0.46620407700538635, "learning_rate": 9.785922998067963e-05, "loss": 0.0491, "step": 3610 }, { "epoch": 3.470757430488974, "grad_norm": 0.5096569657325745, "learning_rate": 9.784078190837472e-05, "loss": 0.0514, "step": 3620 }, { "epoch": 3.4803451581975073, "grad_norm": 0.2947571873664856, "learning_rate": 9.782225644264784e-05, "loss": 0.0457, "step": 3630 }, { "epoch": 3.48993288590604, "grad_norm": 0.4548271894454956, "learning_rate": 9.780365361346821e-05, "loss": 0.0448, "step": 3640 }, { "epoch": 3.4995206136145733, "grad_norm": 0.5976017713546753, "learning_rate": 9.778497345093013e-05, "loss": 0.0495, "step": 3650 }, { "epoch": 3.5091083413231066, "grad_norm": 0.3194081783294678, "learning_rate": 9.776621598525305e-05, "loss": 0.0473, "step": 3660 }, { "epoch": 3.5186960690316393, "grad_norm": 0.2841929793357849, "learning_rate": 9.774738124678148e-05, "loss": 0.0429, "step": 3670 }, { "epoch": 3.5282837967401726, "grad_norm": 0.2357761263847351, "learning_rate": 9.772846926598491e-05, "loss": 0.0494, "step": 3680 }, { "epoch": 3.537871524448706, "grad_norm": 0.893323540687561, "learning_rate": 9.770948007345779e-05, "loss": 0.0497, "step": 3690 }, { "epoch": 3.547459252157239, "grad_norm": 0.23153287172317505, "learning_rate": 9.769041369991953e-05, "loss": 0.0457, "step": 3700 }, { "epoch": 3.557046979865772, "grad_norm": 0.36270731687545776, "learning_rate": 9.767127017621431e-05, "loss": 0.0535, "step": 3710 }, { "epoch": 3.566634707574305, "grad_norm": 0.39080706238746643, "learning_rate": 9.76520495333112e-05, "loss": 0.0462, "step": 3720 }, { "epoch": 3.576222435282838, "grad_norm": 0.5226278901100159, "learning_rate": 9.763275180230395e-05, "loss": 0.0486, "step": 3730 }, { "epoch": 3.585810162991371, "grad_norm": 0.2358178198337555, "learning_rate": 9.761337701441111e-05, "loss": 0.0452, "step": 3740 }, { "epoch": 3.5953978906999042, "grad_norm": 0.47069254517555237, "learning_rate": 9.759392520097581e-05, "loss": 0.049, "step": 3750 }, { "epoch": 3.6049856184084375, "grad_norm": 0.327800452709198, "learning_rate": 9.75743963934658e-05, "loss": 0.0411, "step": 3760 }, { "epoch": 3.6145733461169702, "grad_norm": 0.4534970819950104, "learning_rate": 9.755479062347344e-05, "loss": 0.0472, "step": 3770 }, { "epoch": 3.6241610738255035, "grad_norm": 0.2962513566017151, "learning_rate": 9.753510792271549e-05, "loss": 0.0523, "step": 3780 }, { "epoch": 3.6337488015340362, "grad_norm": 0.46883541345596313, "learning_rate": 9.75153483230333e-05, "loss": 0.0468, "step": 3790 }, { "epoch": 3.6433365292425695, "grad_norm": 0.2845245599746704, "learning_rate": 9.749551185639249e-05, "loss": 0.0438, "step": 3800 }, { "epoch": 3.6529242569511027, "grad_norm": 0.2763413190841675, "learning_rate": 9.747559855488313e-05, "loss": 0.0472, "step": 3810 }, { "epoch": 3.662511984659636, "grad_norm": 0.27591028809547424, "learning_rate": 9.74556084507195e-05, "loss": 0.0457, "step": 3820 }, { "epoch": 3.6720997123681687, "grad_norm": 0.36455026268959045, "learning_rate": 9.743554157624023e-05, "loss": 0.0453, "step": 3830 }, { "epoch": 3.681687440076702, "grad_norm": 0.4757814407348633, "learning_rate": 9.741539796390804e-05, "loss": 0.0496, "step": 3840 }, { "epoch": 3.6912751677852347, "grad_norm": 0.3472752869129181, "learning_rate": 9.739517764630984e-05, "loss": 0.0438, "step": 3850 }, { "epoch": 3.700862895493768, "grad_norm": 0.39700034260749817, "learning_rate": 9.737488065615665e-05, "loss": 0.045, "step": 3860 }, { "epoch": 3.710450623202301, "grad_norm": 0.2766479551792145, "learning_rate": 9.735450702628348e-05, "loss": 0.0361, "step": 3870 }, { "epoch": 3.7200383509108343, "grad_norm": 0.3525460660457611, "learning_rate": 9.733405678964935e-05, "loss": 0.044, "step": 3880 }, { "epoch": 3.729626078619367, "grad_norm": 0.35298100113868713, "learning_rate": 9.731352997933718e-05, "loss": 0.0392, "step": 3890 }, { "epoch": 3.7392138063279003, "grad_norm": 0.32511138916015625, "learning_rate": 9.729292662855383e-05, "loss": 0.0463, "step": 3900 }, { "epoch": 3.748801534036433, "grad_norm": 0.33208218216896057, "learning_rate": 9.727224677062992e-05, "loss": 0.0479, "step": 3910 }, { "epoch": 3.7583892617449663, "grad_norm": 0.43648335337638855, "learning_rate": 9.725149043901985e-05, "loss": 0.0459, "step": 3920 }, { "epoch": 3.7679769894534996, "grad_norm": 0.3617904782295227, "learning_rate": 9.723065766730172e-05, "loss": 0.0545, "step": 3930 }, { "epoch": 3.777564717162033, "grad_norm": 0.34762272238731384, "learning_rate": 9.720974848917735e-05, "loss": 0.0433, "step": 3940 }, { "epoch": 3.7871524448705656, "grad_norm": 0.3334721028804779, "learning_rate": 9.71887629384721e-05, "loss": 0.0445, "step": 3950 }, { "epoch": 3.796740172579099, "grad_norm": 0.4064335823059082, "learning_rate": 9.716770104913492e-05, "loss": 0.0436, "step": 3960 }, { "epoch": 3.8063279002876316, "grad_norm": 0.4279939532279968, "learning_rate": 9.714656285523821e-05, "loss": 0.0534, "step": 3970 }, { "epoch": 3.815915627996165, "grad_norm": 0.28922349214553833, "learning_rate": 9.71253483909779e-05, "loss": 0.0488, "step": 3980 }, { "epoch": 3.825503355704698, "grad_norm": 0.701637327671051, "learning_rate": 9.710405769067317e-05, "loss": 0.0465, "step": 3990 }, { "epoch": 3.8350910834132312, "grad_norm": 0.3132900595664978, "learning_rate": 9.708269078876666e-05, "loss": 0.046, "step": 4000 }, { "epoch": 3.844678811121764, "grad_norm": 0.2793468236923218, "learning_rate": 9.706124771982421e-05, "loss": 0.0382, "step": 4010 }, { "epoch": 3.8542665388302972, "grad_norm": 0.32509496808052063, "learning_rate": 9.703972851853488e-05, "loss": 0.0419, "step": 4020 }, { "epoch": 3.8638542665388305, "grad_norm": 0.5768635869026184, "learning_rate": 9.701813321971091e-05, "loss": 0.0513, "step": 4030 }, { "epoch": 3.8734419942473632, "grad_norm": 0.37095797061920166, "learning_rate": 9.699646185828768e-05, "loss": 0.0493, "step": 4040 }, { "epoch": 3.8830297219558965, "grad_norm": 0.4116993844509125, "learning_rate": 9.697471446932353e-05, "loss": 0.0481, "step": 4050 }, { "epoch": 3.8926174496644297, "grad_norm": 0.4631316363811493, "learning_rate": 9.695289108799989e-05, "loss": 0.057, "step": 4060 }, { "epoch": 3.9022051773729625, "grad_norm": 0.5926663279533386, "learning_rate": 9.693099174962103e-05, "loss": 0.0541, "step": 4070 }, { "epoch": 3.9117929050814957, "grad_norm": 0.4884685277938843, "learning_rate": 9.690901648961418e-05, "loss": 0.0444, "step": 4080 }, { "epoch": 3.921380632790029, "grad_norm": 0.5205138921737671, "learning_rate": 9.688696534352935e-05, "loss": 0.0469, "step": 4090 }, { "epoch": 3.9309683604985617, "grad_norm": 0.3476182222366333, "learning_rate": 9.68648383470393e-05, "loss": 0.0506, "step": 4100 }, { "epoch": 3.940556088207095, "grad_norm": 0.310553640127182, "learning_rate": 9.684263553593953e-05, "loss": 0.0454, "step": 4110 }, { "epoch": 3.950143815915628, "grad_norm": 0.2743299603462219, "learning_rate": 9.682035694614817e-05, "loss": 0.0517, "step": 4120 }, { "epoch": 3.959731543624161, "grad_norm": 0.33413469791412354, "learning_rate": 9.679800261370594e-05, "loss": 0.0428, "step": 4130 }, { "epoch": 3.969319271332694, "grad_norm": 0.4639144837856293, "learning_rate": 9.677557257477609e-05, "loss": 0.0444, "step": 4140 }, { "epoch": 3.9789069990412274, "grad_norm": 0.33329275250434875, "learning_rate": 9.675306686564437e-05, "loss": 0.0472, "step": 4150 }, { "epoch": 3.98849472674976, "grad_norm": 0.4461182653903961, "learning_rate": 9.673048552271889e-05, "loss": 0.0375, "step": 4160 }, { "epoch": 3.9980824544582934, "grad_norm": 0.26508504152297974, "learning_rate": 9.670782858253015e-05, "loss": 0.0468, "step": 4170 }, { "epoch": 4.007670182166827, "grad_norm": 0.4112192690372467, "learning_rate": 9.668509608173094e-05, "loss": 0.0419, "step": 4180 }, { "epoch": 4.01725790987536, "grad_norm": 0.3724784255027771, "learning_rate": 9.66622880570963e-05, "loss": 0.0526, "step": 4190 }, { "epoch": 4.026845637583893, "grad_norm": 0.43858179450035095, "learning_rate": 9.663940454552342e-05, "loss": 0.0481, "step": 4200 }, { "epoch": 4.036433365292425, "grad_norm": 0.27318644523620605, "learning_rate": 9.661644558403162e-05, "loss": 0.0372, "step": 4210 }, { "epoch": 4.046021093000959, "grad_norm": 0.36369022727012634, "learning_rate": 9.659341120976229e-05, "loss": 0.0421, "step": 4220 }, { "epoch": 4.055608820709492, "grad_norm": 0.3167804479598999, "learning_rate": 9.657030145997878e-05, "loss": 0.0437, "step": 4230 }, { "epoch": 4.065196548418025, "grad_norm": 0.37195485830307007, "learning_rate": 9.654711637206644e-05, "loss": 0.0391, "step": 4240 }, { "epoch": 4.074784276126558, "grad_norm": 0.26798343658447266, "learning_rate": 9.652385598353244e-05, "loss": 0.0424, "step": 4250 }, { "epoch": 4.0843720038350915, "grad_norm": 0.38160890340805054, "learning_rate": 9.650052033200578e-05, "loss": 0.0473, "step": 4260 }, { "epoch": 4.093959731543624, "grad_norm": 0.3133178651332855, "learning_rate": 9.647710945523725e-05, "loss": 0.0446, "step": 4270 }, { "epoch": 4.103547459252157, "grad_norm": 0.314330130815506, "learning_rate": 9.645362339109927e-05, "loss": 0.0402, "step": 4280 }, { "epoch": 4.11313518696069, "grad_norm": 0.6541547775268555, "learning_rate": 9.643006217758594e-05, "loss": 0.0417, "step": 4290 }, { "epoch": 4.1227229146692235, "grad_norm": 0.3850661814212799, "learning_rate": 9.640642585281292e-05, "loss": 0.0483, "step": 4300 }, { "epoch": 4.132310642377757, "grad_norm": 0.4512180984020233, "learning_rate": 9.638271445501739e-05, "loss": 0.0382, "step": 4310 }, { "epoch": 4.14189837008629, "grad_norm": 0.277205228805542, "learning_rate": 9.635892802255794e-05, "loss": 0.0456, "step": 4320 }, { "epoch": 4.151486097794822, "grad_norm": 0.39535996317863464, "learning_rate": 9.63350665939146e-05, "loss": 0.0402, "step": 4330 }, { "epoch": 4.1610738255033555, "grad_norm": 0.3694916069507599, "learning_rate": 9.63111302076887e-05, "loss": 0.0414, "step": 4340 }, { "epoch": 4.170661553211889, "grad_norm": 0.4235345423221588, "learning_rate": 9.628711890260279e-05, "loss": 0.0475, "step": 4350 }, { "epoch": 4.180249280920422, "grad_norm": 0.46871331334114075, "learning_rate": 9.626303271750069e-05, "loss": 0.044, "step": 4360 }, { "epoch": 4.189837008628955, "grad_norm": 0.33372747898101807, "learning_rate": 9.623887169134731e-05, "loss": 0.0479, "step": 4370 }, { "epoch": 4.199424736337488, "grad_norm": 0.29731622338294983, "learning_rate": 9.621463586322863e-05, "loss": 0.0477, "step": 4380 }, { "epoch": 4.209012464046021, "grad_norm": 0.3897842466831207, "learning_rate": 9.619032527235168e-05, "loss": 0.0449, "step": 4390 }, { "epoch": 4.218600191754554, "grad_norm": 0.375775009393692, "learning_rate": 9.616593995804437e-05, "loss": 0.0527, "step": 4400 }, { "epoch": 4.228187919463087, "grad_norm": 0.44136837124824524, "learning_rate": 9.614147995975557e-05, "loss": 0.0465, "step": 4410 }, { "epoch": 4.23777564717162, "grad_norm": 0.49286016821861267, "learning_rate": 9.611694531705493e-05, "loss": 0.0478, "step": 4420 }, { "epoch": 4.247363374880154, "grad_norm": 0.36331725120544434, "learning_rate": 9.609233606963282e-05, "loss": 0.0453, "step": 4430 }, { "epoch": 4.256951102588687, "grad_norm": 0.4064538776874542, "learning_rate": 9.606765225730035e-05, "loss": 0.0512, "step": 4440 }, { "epoch": 4.26653883029722, "grad_norm": 0.3883167505264282, "learning_rate": 9.604289391998925e-05, "loss": 0.0416, "step": 4450 }, { "epoch": 4.276126558005752, "grad_norm": 0.364762544631958, "learning_rate": 9.601806109775179e-05, "loss": 0.0483, "step": 4460 }, { "epoch": 4.285714285714286, "grad_norm": 0.3354509472846985, "learning_rate": 9.599315383076075e-05, "loss": 0.0475, "step": 4470 }, { "epoch": 4.295302013422819, "grad_norm": 0.3480011224746704, "learning_rate": 9.596817215930934e-05, "loss": 0.0441, "step": 4480 }, { "epoch": 4.304889741131352, "grad_norm": 0.37383604049682617, "learning_rate": 9.594311612381114e-05, "loss": 0.0566, "step": 4490 }, { "epoch": 4.314477468839885, "grad_norm": 0.5128716826438904, "learning_rate": 9.591798576480001e-05, "loss": 0.0452, "step": 4500 }, { "epoch": 4.324065196548418, "grad_norm": 0.2675018310546875, "learning_rate": 9.589278112293007e-05, "loss": 0.0453, "step": 4510 }, { "epoch": 4.333652924256951, "grad_norm": 0.39544346928596497, "learning_rate": 9.586750223897562e-05, "loss": 0.0479, "step": 4520 }, { "epoch": 4.343240651965484, "grad_norm": 0.7438755631446838, "learning_rate": 9.584214915383103e-05, "loss": 0.0432, "step": 4530 }, { "epoch": 4.352828379674017, "grad_norm": 0.29193535447120667, "learning_rate": 9.58167219085107e-05, "loss": 0.0467, "step": 4540 }, { "epoch": 4.3624161073825505, "grad_norm": 0.34703853726387024, "learning_rate": 9.579122054414907e-05, "loss": 0.0435, "step": 4550 }, { "epoch": 4.372003835091084, "grad_norm": 0.408741295337677, "learning_rate": 9.576564510200038e-05, "loss": 0.0433, "step": 4560 }, { "epoch": 4.381591562799617, "grad_norm": 0.4278319478034973, "learning_rate": 9.573999562343882e-05, "loss": 0.047, "step": 4570 }, { "epoch": 4.391179290508149, "grad_norm": 0.2754301428794861, "learning_rate": 9.571427214995826e-05, "loss": 0.0423, "step": 4580 }, { "epoch": 4.4007670182166825, "grad_norm": 0.35973575711250305, "learning_rate": 9.568847472317232e-05, "loss": 0.0404, "step": 4590 }, { "epoch": 4.410354745925216, "grad_norm": 0.31195884943008423, "learning_rate": 9.566260338481425e-05, "loss": 0.0476, "step": 4600 }, { "epoch": 4.419942473633749, "grad_norm": 0.32485419511795044, "learning_rate": 9.563665817673688e-05, "loss": 0.0434, "step": 4610 }, { "epoch": 4.429530201342282, "grad_norm": 0.6262250542640686, "learning_rate": 9.56106391409125e-05, "loss": 0.0523, "step": 4620 }, { "epoch": 4.439117929050815, "grad_norm": 0.41280031204223633, "learning_rate": 9.558454631943286e-05, "loss": 0.0446, "step": 4630 }, { "epoch": 4.448705656759348, "grad_norm": 0.32726627588272095, "learning_rate": 9.55583797545091e-05, "loss": 0.0424, "step": 4640 }, { "epoch": 4.458293384467881, "grad_norm": 0.49836626648902893, "learning_rate": 9.55321394884716e-05, "loss": 0.0449, "step": 4650 }, { "epoch": 4.467881112176414, "grad_norm": 0.24192816019058228, "learning_rate": 9.550582556377003e-05, "loss": 0.0533, "step": 4660 }, { "epoch": 4.477468839884947, "grad_norm": 0.455990731716156, "learning_rate": 9.547943802297317e-05, "loss": 0.0473, "step": 4670 }, { "epoch": 4.487056567593481, "grad_norm": 0.37101179361343384, "learning_rate": 9.545297690876893e-05, "loss": 0.0362, "step": 4680 }, { "epoch": 4.496644295302014, "grad_norm": 0.5495269894599915, "learning_rate": 9.54264422639642e-05, "loss": 0.0498, "step": 4690 }, { "epoch": 4.506232023010546, "grad_norm": 0.2883033752441406, "learning_rate": 9.539983413148486e-05, "loss": 0.0487, "step": 4700 }, { "epoch": 4.515819750719079, "grad_norm": 0.36912739276885986, "learning_rate": 9.537315255437565e-05, "loss": 0.0388, "step": 4710 }, { "epoch": 4.525407478427613, "grad_norm": 0.4408855438232422, "learning_rate": 9.534639757580013e-05, "loss": 0.0439, "step": 4720 }, { "epoch": 4.534995206136146, "grad_norm": 0.4027664363384247, "learning_rate": 9.531956923904062e-05, "loss": 0.0443, "step": 4730 }, { "epoch": 4.544582933844679, "grad_norm": 0.41703498363494873, "learning_rate": 9.52926675874981e-05, "loss": 0.0405, "step": 4740 }, { "epoch": 4.554170661553212, "grad_norm": 0.5367491245269775, "learning_rate": 9.526569266469213e-05, "loss": 0.0518, "step": 4750 }, { "epoch": 4.563758389261745, "grad_norm": 0.5591031312942505, "learning_rate": 9.523864451426086e-05, "loss": 0.0436, "step": 4760 }, { "epoch": 4.573346116970278, "grad_norm": 0.6005666851997375, "learning_rate": 9.521152317996083e-05, "loss": 0.0408, "step": 4770 }, { "epoch": 4.582933844678811, "grad_norm": 0.36557164788246155, "learning_rate": 9.518432870566703e-05, "loss": 0.0441, "step": 4780 }, { "epoch": 4.592521572387344, "grad_norm": 0.3382973372936249, "learning_rate": 9.515706113537275e-05, "loss": 0.0448, "step": 4790 }, { "epoch": 4.6021093000958775, "grad_norm": 0.36412662267684937, "learning_rate": 9.512972051318952e-05, "loss": 0.0447, "step": 4800 }, { "epoch": 4.611697027804411, "grad_norm": 0.28740257024765015, "learning_rate": 9.510230688334709e-05, "loss": 0.0489, "step": 4810 }, { "epoch": 4.621284755512944, "grad_norm": 0.37100985646247864, "learning_rate": 9.507482029019324e-05, "loss": 0.0417, "step": 4820 }, { "epoch": 4.630872483221476, "grad_norm": 0.4594654142856598, "learning_rate": 9.504726077819387e-05, "loss": 0.0426, "step": 4830 }, { "epoch": 4.6404602109300095, "grad_norm": 0.37358155846595764, "learning_rate": 9.501962839193277e-05, "loss": 0.0491, "step": 4840 }, { "epoch": 4.650047938638543, "grad_norm": 0.31801337003707886, "learning_rate": 9.499192317611167e-05, "loss": 0.0444, "step": 4850 }, { "epoch": 4.659635666347076, "grad_norm": 0.4786074161529541, "learning_rate": 9.496414517555012e-05, "loss": 0.0426, "step": 4860 }, { "epoch": 4.669223394055609, "grad_norm": 0.23249605298042297, "learning_rate": 9.493629443518537e-05, "loss": 0.0495, "step": 4870 }, { "epoch": 4.6788111217641415, "grad_norm": 0.4823112487792969, "learning_rate": 9.490837100007237e-05, "loss": 0.0505, "step": 4880 }, { "epoch": 4.688398849472675, "grad_norm": 0.38305050134658813, "learning_rate": 9.488037491538369e-05, "loss": 0.0441, "step": 4890 }, { "epoch": 4.697986577181208, "grad_norm": 0.4120224714279175, "learning_rate": 9.485230622640939e-05, "loss": 0.0464, "step": 4900 }, { "epoch": 4.707574304889741, "grad_norm": 0.26718661189079285, "learning_rate": 9.482416497855705e-05, "loss": 0.0442, "step": 4910 }, { "epoch": 4.717162032598274, "grad_norm": 0.35078132152557373, "learning_rate": 9.47959512173515e-05, "loss": 0.0402, "step": 4920 }, { "epoch": 4.726749760306808, "grad_norm": 0.2796134948730469, "learning_rate": 9.476766498843504e-05, "loss": 0.0444, "step": 4930 }, { "epoch": 4.736337488015341, "grad_norm": 0.37330299615859985, "learning_rate": 9.473930633756706e-05, "loss": 0.0433, "step": 4940 }, { "epoch": 4.745925215723873, "grad_norm": 0.47067347168922424, "learning_rate": 9.471087531062424e-05, "loss": 0.0479, "step": 4950 }, { "epoch": 4.755512943432406, "grad_norm": 0.3017641603946686, "learning_rate": 9.468237195360023e-05, "loss": 0.0427, "step": 4960 }, { "epoch": 4.76510067114094, "grad_norm": 0.41320186853408813, "learning_rate": 9.465379631260574e-05, "loss": 0.052, "step": 4970 }, { "epoch": 4.774688398849473, "grad_norm": 0.4640481173992157, "learning_rate": 9.462514843386845e-05, "loss": 0.0463, "step": 4980 }, { "epoch": 4.784276126558006, "grad_norm": 0.2581227123737335, "learning_rate": 9.459642836373282e-05, "loss": 0.0371, "step": 4990 }, { "epoch": 4.793863854266538, "grad_norm": 0.3752846121788025, "learning_rate": 9.456763614866016e-05, "loss": 0.0437, "step": 5000 }, { "epoch": 4.803451581975072, "grad_norm": 0.27923133969306946, "learning_rate": 9.453877183522848e-05, "loss": 0.0442, "step": 5010 }, { "epoch": 4.813039309683605, "grad_norm": 0.31683099269866943, "learning_rate": 9.450983547013242e-05, "loss": 0.0396, "step": 5020 }, { "epoch": 4.822627037392138, "grad_norm": 0.5572241544723511, "learning_rate": 9.448082710018317e-05, "loss": 0.0464, "step": 5030 }, { "epoch": 4.832214765100671, "grad_norm": 0.4878758490085602, "learning_rate": 9.44517467723084e-05, "loss": 0.0462, "step": 5040 }, { "epoch": 4.8418024928092045, "grad_norm": 0.2646120488643646, "learning_rate": 9.442259453355222e-05, "loss": 0.0434, "step": 5050 }, { "epoch": 4.851390220517738, "grad_norm": 0.23440934717655182, "learning_rate": 9.439337043107506e-05, "loss": 0.0454, "step": 5060 }, { "epoch": 4.86097794822627, "grad_norm": 0.2339864820241928, "learning_rate": 9.436407451215356e-05, "loss": 0.0388, "step": 5070 }, { "epoch": 4.870565675934803, "grad_norm": 0.3039968013763428, "learning_rate": 9.433470682418061e-05, "loss": 0.0466, "step": 5080 }, { "epoch": 4.8801534036433365, "grad_norm": 0.29253584146499634, "learning_rate": 9.430526741466519e-05, "loss": 0.0421, "step": 5090 }, { "epoch": 4.88974113135187, "grad_norm": 0.1914910078048706, "learning_rate": 9.427575633123224e-05, "loss": 0.0476, "step": 5100 }, { "epoch": 4.899328859060403, "grad_norm": 0.2769542932510376, "learning_rate": 9.424617362162271e-05, "loss": 0.0498, "step": 5110 }, { "epoch": 4.908916586768935, "grad_norm": 0.4235975444316864, "learning_rate": 9.421651933369345e-05, "loss": 0.0479, "step": 5120 }, { "epoch": 4.9185043144774685, "grad_norm": 0.2977217733860016, "learning_rate": 9.4186793515417e-05, "loss": 0.0367, "step": 5130 }, { "epoch": 4.928092042186002, "grad_norm": 0.4000433087348938, "learning_rate": 9.415699621488172e-05, "loss": 0.0452, "step": 5140 }, { "epoch": 4.937679769894535, "grad_norm": 0.3901826739311218, "learning_rate": 9.412712748029157e-05, "loss": 0.0431, "step": 5150 }, { "epoch": 4.947267497603068, "grad_norm": 0.45422032475471497, "learning_rate": 9.409718735996605e-05, "loss": 0.0419, "step": 5160 }, { "epoch": 4.956855225311601, "grad_norm": 0.29559481143951416, "learning_rate": 9.406717590234016e-05, "loss": 0.0404, "step": 5170 }, { "epoch": 4.966442953020135, "grad_norm": 0.39736929535865784, "learning_rate": 9.403709315596431e-05, "loss": 0.0409, "step": 5180 }, { "epoch": 4.976030680728667, "grad_norm": 0.37043488025665283, "learning_rate": 9.400693916950427e-05, "loss": 0.0494, "step": 5190 }, { "epoch": 4.9856184084372, "grad_norm": 0.35436293482780457, "learning_rate": 9.397671399174096e-05, "loss": 0.0505, "step": 5200 }, { "epoch": 4.995206136145733, "grad_norm": 0.24993938207626343, "learning_rate": 9.394641767157056e-05, "loss": 0.0491, "step": 5210 }, { "epoch": 5.004793863854267, "grad_norm": 0.3652108609676361, "learning_rate": 9.391605025800431e-05, "loss": 0.0474, "step": 5220 }, { "epoch": 5.0143815915628, "grad_norm": 0.3362497389316559, "learning_rate": 9.388561180016844e-05, "loss": 0.0481, "step": 5230 }, { "epoch": 5.023969319271333, "grad_norm": 0.25596174597740173, "learning_rate": 9.385510234730415e-05, "loss": 0.0475, "step": 5240 }, { "epoch": 5.033557046979865, "grad_norm": 0.4541703760623932, "learning_rate": 9.382452194876743e-05, "loss": 0.0448, "step": 5250 }, { "epoch": 5.043144774688399, "grad_norm": 0.5844725966453552, "learning_rate": 9.379387065402911e-05, "loss": 0.0531, "step": 5260 }, { "epoch": 5.052732502396932, "grad_norm": 0.5136455297470093, "learning_rate": 9.376314851267468e-05, "loss": 0.0478, "step": 5270 }, { "epoch": 5.062320230105465, "grad_norm": 0.36073240637779236, "learning_rate": 9.373235557440423e-05, "loss": 0.0413, "step": 5280 }, { "epoch": 5.071907957813998, "grad_norm": 0.3564154803752899, "learning_rate": 9.370149188903238e-05, "loss": 0.0474, "step": 5290 }, { "epoch": 5.0814956855225315, "grad_norm": 0.269563764333725, "learning_rate": 9.367055750648823e-05, "loss": 0.05, "step": 5300 }, { "epoch": 5.091083413231064, "grad_norm": 0.32311663031578064, "learning_rate": 9.363955247681522e-05, "loss": 0.0443, "step": 5310 }, { "epoch": 5.100671140939597, "grad_norm": 0.2627108097076416, "learning_rate": 9.360847685017109e-05, "loss": 0.0442, "step": 5320 }, { "epoch": 5.11025886864813, "grad_norm": 0.34790635108947754, "learning_rate": 9.357733067682777e-05, "loss": 0.0441, "step": 5330 }, { "epoch": 5.1198465963566635, "grad_norm": 0.22408638894557953, "learning_rate": 9.354611400717135e-05, "loss": 0.0415, "step": 5340 }, { "epoch": 5.129434324065197, "grad_norm": 0.3347373306751251, "learning_rate": 9.351482689170193e-05, "loss": 0.0427, "step": 5350 }, { "epoch": 5.13902205177373, "grad_norm": 0.30321311950683594, "learning_rate": 9.348346938103359e-05, "loss": 0.0434, "step": 5360 }, { "epoch": 5.148609779482262, "grad_norm": 0.2402300387620926, "learning_rate": 9.345204152589428e-05, "loss": 0.0475, "step": 5370 }, { "epoch": 5.1581975071907955, "grad_norm": 0.5249261856079102, "learning_rate": 9.342054337712576e-05, "loss": 0.0486, "step": 5380 }, { "epoch": 5.167785234899329, "grad_norm": 0.3607705533504486, "learning_rate": 9.338897498568349e-05, "loss": 0.0417, "step": 5390 }, { "epoch": 5.177372962607862, "grad_norm": 0.38747304677963257, "learning_rate": 9.33573364026366e-05, "loss": 0.0477, "step": 5400 }, { "epoch": 5.186960690316395, "grad_norm": 0.36637309193611145, "learning_rate": 9.332562767916771e-05, "loss": 0.044, "step": 5410 }, { "epoch": 5.196548418024928, "grad_norm": 0.31087052822113037, "learning_rate": 9.329384886657296e-05, "loss": 0.0373, "step": 5420 }, { "epoch": 5.206136145733462, "grad_norm": 0.3998284935951233, "learning_rate": 9.326200001626184e-05, "loss": 0.036, "step": 5430 }, { "epoch": 5.215723873441994, "grad_norm": 0.3035097122192383, "learning_rate": 9.323008117975718e-05, "loss": 0.0429, "step": 5440 }, { "epoch": 5.225311601150527, "grad_norm": 0.3162848949432373, "learning_rate": 9.319809240869502e-05, "loss": 0.0524, "step": 5450 }, { "epoch": 5.23489932885906, "grad_norm": 0.3142375946044922, "learning_rate": 9.316603375482449e-05, "loss": 0.0479, "step": 5460 }, { "epoch": 5.244487056567594, "grad_norm": 0.3951794505119324, "learning_rate": 9.313390527000783e-05, "loss": 0.044, "step": 5470 }, { "epoch": 5.254074784276127, "grad_norm": 0.26764142513275146, "learning_rate": 9.310170700622021e-05, "loss": 0.0409, "step": 5480 }, { "epoch": 5.263662511984659, "grad_norm": 0.3293421268463135, "learning_rate": 9.306943901554972e-05, "loss": 0.0413, "step": 5490 }, { "epoch": 5.273250239693192, "grad_norm": 0.39588311314582825, "learning_rate": 9.30371013501972e-05, "loss": 0.0475, "step": 5500 }, { "epoch": 5.282837967401726, "grad_norm": 0.330180287361145, "learning_rate": 9.300469406247621e-05, "loss": 0.0409, "step": 5510 }, { "epoch": 5.292425695110259, "grad_norm": 0.37915417551994324, "learning_rate": 9.297221720481302e-05, "loss": 0.0413, "step": 5520 }, { "epoch": 5.302013422818792, "grad_norm": 0.3579411208629608, "learning_rate": 9.293967082974632e-05, "loss": 0.0476, "step": 5530 }, { "epoch": 5.311601150527325, "grad_norm": 0.30744969844818115, "learning_rate": 9.29070549899274e-05, "loss": 0.0454, "step": 5540 }, { "epoch": 5.3211888782358585, "grad_norm": 0.31515830755233765, "learning_rate": 9.287436973811978e-05, "loss": 0.0343, "step": 5550 }, { "epoch": 5.330776605944391, "grad_norm": 0.26603803038597107, "learning_rate": 9.284161512719938e-05, "loss": 0.0402, "step": 5560 }, { "epoch": 5.340364333652924, "grad_norm": 0.39798933267593384, "learning_rate": 9.280879121015428e-05, "loss": 0.0339, "step": 5570 }, { "epoch": 5.349952061361457, "grad_norm": 0.35744068026542664, "learning_rate": 9.277589804008467e-05, "loss": 0.0442, "step": 5580 }, { "epoch": 5.3595397890699905, "grad_norm": 0.426455557346344, "learning_rate": 9.27429356702028e-05, "loss": 0.0515, "step": 5590 }, { "epoch": 5.369127516778524, "grad_norm": 0.5735211372375488, "learning_rate": 9.270990415383285e-05, "loss": 0.0447, "step": 5600 }, { "epoch": 5.378715244487057, "grad_norm": 0.40888845920562744, "learning_rate": 9.267680354441087e-05, "loss": 0.0523, "step": 5610 }, { "epoch": 5.388302972195589, "grad_norm": 0.5509734153747559, "learning_rate": 9.264363389548465e-05, "loss": 0.0475, "step": 5620 }, { "epoch": 5.3978906999041225, "grad_norm": 0.4060477912425995, "learning_rate": 9.261039526071374e-05, "loss": 0.0456, "step": 5630 }, { "epoch": 5.407478427612656, "grad_norm": 0.3927951157093048, "learning_rate": 9.257708769386919e-05, "loss": 0.0588, "step": 5640 }, { "epoch": 5.417066155321189, "grad_norm": 0.2928200662136078, "learning_rate": 9.254371124883366e-05, "loss": 0.0412, "step": 5650 }, { "epoch": 5.426653883029722, "grad_norm": 0.37971609830856323, "learning_rate": 9.251026597960117e-05, "loss": 0.0473, "step": 5660 }, { "epoch": 5.436241610738255, "grad_norm": 0.3287939131259918, "learning_rate": 9.247675194027712e-05, "loss": 0.055, "step": 5670 }, { "epoch": 5.445829338446788, "grad_norm": 0.2745339870452881, "learning_rate": 9.244316918507813e-05, "loss": 0.044, "step": 5680 }, { "epoch": 5.455417066155321, "grad_norm": 0.2364960014820099, "learning_rate": 9.240951776833202e-05, "loss": 0.045, "step": 5690 }, { "epoch": 5.465004793863854, "grad_norm": 0.5563991069793701, "learning_rate": 9.237579774447765e-05, "loss": 0.042, "step": 5700 }, { "epoch": 5.474592521572387, "grad_norm": 0.3112446367740631, "learning_rate": 9.234200916806486e-05, "loss": 0.0488, "step": 5710 }, { "epoch": 5.484180249280921, "grad_norm": 0.32364800572395325, "learning_rate": 9.230815209375446e-05, "loss": 0.039, "step": 5720 }, { "epoch": 5.493767976989454, "grad_norm": 0.35172006487846375, "learning_rate": 9.227422657631796e-05, "loss": 0.0443, "step": 5730 }, { "epoch": 5.503355704697986, "grad_norm": 0.3294823169708252, "learning_rate": 9.22402326706377e-05, "loss": 0.0466, "step": 5740 }, { "epoch": 5.512943432406519, "grad_norm": 0.32464146614074707, "learning_rate": 9.220617043170661e-05, "loss": 0.0456, "step": 5750 }, { "epoch": 5.522531160115053, "grad_norm": 0.36492130160331726, "learning_rate": 9.217203991462815e-05, "loss": 0.041, "step": 5760 }, { "epoch": 5.532118887823586, "grad_norm": 0.35331547260284424, "learning_rate": 9.213784117461624e-05, "loss": 0.0377, "step": 5770 }, { "epoch": 5.541706615532119, "grad_norm": 0.2622346580028534, "learning_rate": 9.210357426699519e-05, "loss": 0.0422, "step": 5780 }, { "epoch": 5.551294343240652, "grad_norm": 0.4553088843822479, "learning_rate": 9.206923924719955e-05, "loss": 0.0467, "step": 5790 }, { "epoch": 5.5608820709491855, "grad_norm": 0.38138529658317566, "learning_rate": 9.203483617077411e-05, "loss": 0.0397, "step": 5800 }, { "epoch": 5.570469798657718, "grad_norm": 0.4665132761001587, "learning_rate": 9.200036509337369e-05, "loss": 0.0518, "step": 5810 }, { "epoch": 5.580057526366251, "grad_norm": 0.27688702940940857, "learning_rate": 9.196582607076319e-05, "loss": 0.0469, "step": 5820 }, { "epoch": 5.589645254074784, "grad_norm": 0.3505072295665741, "learning_rate": 9.193121915881737e-05, "loss": 0.0526, "step": 5830 }, { "epoch": 5.5992329817833175, "grad_norm": 0.23712855577468872, "learning_rate": 9.189654441352082e-05, "loss": 0.0431, "step": 5840 }, { "epoch": 5.608820709491851, "grad_norm": 0.41854333877563477, "learning_rate": 9.186180189096791e-05, "loss": 0.0505, "step": 5850 }, { "epoch": 5.618408437200383, "grad_norm": 0.2426682859659195, "learning_rate": 9.182699164736264e-05, "loss": 0.0444, "step": 5860 }, { "epoch": 5.627996164908916, "grad_norm": 0.6301522850990295, "learning_rate": 9.17921137390185e-05, "loss": 0.0495, "step": 5870 }, { "epoch": 5.6375838926174495, "grad_norm": 0.3933928608894348, "learning_rate": 9.175716822235854e-05, "loss": 0.0524, "step": 5880 }, { "epoch": 5.647171620325983, "grad_norm": 0.4735229015350342, "learning_rate": 9.17221551539151e-05, "loss": 0.0416, "step": 5890 }, { "epoch": 5.656759348034516, "grad_norm": 0.23624800145626068, "learning_rate": 9.168707459032988e-05, "loss": 0.0436, "step": 5900 }, { "epoch": 5.666347075743049, "grad_norm": 0.35046079754829407, "learning_rate": 9.165192658835369e-05, "loss": 0.043, "step": 5910 }, { "epoch": 5.675934803451582, "grad_norm": 0.43765562772750854, "learning_rate": 9.161671120484649e-05, "loss": 0.0449, "step": 5920 }, { "epoch": 5.685522531160115, "grad_norm": 0.2839658856391907, "learning_rate": 9.158142849677723e-05, "loss": 0.0429, "step": 5930 }, { "epoch": 5.695110258868648, "grad_norm": 0.34485873579978943, "learning_rate": 9.154607852122376e-05, "loss": 0.0444, "step": 5940 }, { "epoch": 5.704697986577181, "grad_norm": 1.9406903982162476, "learning_rate": 9.15106613353728e-05, "loss": 0.0491, "step": 5950 }, { "epoch": 5.714285714285714, "grad_norm": 0.25078949332237244, "learning_rate": 9.14751769965197e-05, "loss": 0.0455, "step": 5960 }, { "epoch": 5.723873441994248, "grad_norm": 0.342736154794693, "learning_rate": 9.143962556206853e-05, "loss": 0.0418, "step": 5970 }, { "epoch": 5.73346116970278, "grad_norm": 0.36492887139320374, "learning_rate": 9.140400708953189e-05, "loss": 0.0457, "step": 5980 }, { "epoch": 5.743048897411313, "grad_norm": 0.3328196108341217, "learning_rate": 9.136832163653083e-05, "loss": 0.0434, "step": 5990 }, { "epoch": 5.752636625119846, "grad_norm": 0.31458353996276855, "learning_rate": 9.13325692607947e-05, "loss": 0.0426, "step": 6000 }, { "epoch": 5.76222435282838, "grad_norm": 0.2388927936553955, "learning_rate": 9.129675002016119e-05, "loss": 0.0412, "step": 6010 }, { "epoch": 5.771812080536913, "grad_norm": 0.29163026809692383, "learning_rate": 9.126086397257612e-05, "loss": 0.0464, "step": 6020 }, { "epoch": 5.781399808245446, "grad_norm": 0.3460707664489746, "learning_rate": 9.122491117609336e-05, "loss": 0.0417, "step": 6030 }, { "epoch": 5.790987535953979, "grad_norm": 0.17329041659832, "learning_rate": 9.118889168887483e-05, "loss": 0.0394, "step": 6040 }, { "epoch": 5.800575263662512, "grad_norm": 0.2987213730812073, "learning_rate": 9.11528055691903e-05, "loss": 0.0399, "step": 6050 }, { "epoch": 5.810162991371045, "grad_norm": 0.4310978353023529, "learning_rate": 9.111665287541733e-05, "loss": 0.0387, "step": 6060 }, { "epoch": 5.819750719079578, "grad_norm": 0.3461402952671051, "learning_rate": 9.108043366604115e-05, "loss": 0.0388, "step": 6070 }, { "epoch": 5.829338446788111, "grad_norm": 0.3460417091846466, "learning_rate": 9.104414799965468e-05, "loss": 0.0458, "step": 6080 }, { "epoch": 5.8389261744966445, "grad_norm": 0.26389792561531067, "learning_rate": 9.100779593495825e-05, "loss": 0.0416, "step": 6090 }, { "epoch": 5.848513902205178, "grad_norm": 0.39147645235061646, "learning_rate": 9.097137753075966e-05, "loss": 0.0392, "step": 6100 }, { "epoch": 5.85810162991371, "grad_norm": 0.4331185221672058, "learning_rate": 9.093489284597404e-05, "loss": 0.0388, "step": 6110 }, { "epoch": 5.867689357622243, "grad_norm": 0.1933136284351349, "learning_rate": 9.089834193962372e-05, "loss": 0.0408, "step": 6120 }, { "epoch": 5.8772770853307765, "grad_norm": 0.29839614033699036, "learning_rate": 9.086172487083815e-05, "loss": 0.0374, "step": 6130 }, { "epoch": 5.88686481303931, "grad_norm": 0.23067611455917358, "learning_rate": 9.082504169885381e-05, "loss": 0.044, "step": 6140 }, { "epoch": 5.896452540747843, "grad_norm": 0.2817287743091583, "learning_rate": 9.078829248301417e-05, "loss": 0.036, "step": 6150 }, { "epoch": 5.906040268456376, "grad_norm": 0.2695087790489197, "learning_rate": 9.07514772827695e-05, "loss": 0.0371, "step": 6160 }, { "epoch": 5.9156279961649085, "grad_norm": 0.5207583904266357, "learning_rate": 9.071459615767679e-05, "loss": 0.0406, "step": 6170 }, { "epoch": 5.925215723873442, "grad_norm": 0.3283056914806366, "learning_rate": 9.067764916739971e-05, "loss": 0.0421, "step": 6180 }, { "epoch": 5.934803451581975, "grad_norm": 0.4326401352882385, "learning_rate": 9.06406363717085e-05, "loss": 0.0397, "step": 6190 }, { "epoch": 5.944391179290508, "grad_norm": 0.3044590651988983, "learning_rate": 9.060355783047982e-05, "loss": 0.045, "step": 6200 }, { "epoch": 5.953978906999041, "grad_norm": 0.2913448214530945, "learning_rate": 9.056641360369672e-05, "loss": 0.0364, "step": 6210 }, { "epoch": 5.963566634707575, "grad_norm": 0.4203062057495117, "learning_rate": 9.052920375144847e-05, "loss": 0.0544, "step": 6220 }, { "epoch": 5.973154362416107, "grad_norm": 0.34060561656951904, "learning_rate": 9.049192833393055e-05, "loss": 0.0428, "step": 6230 }, { "epoch": 5.98274209012464, "grad_norm": 0.28594672679901123, "learning_rate": 9.045458741144446e-05, "loss": 0.0497, "step": 6240 }, { "epoch": 5.992329817833173, "grad_norm": 0.4794290065765381, "learning_rate": 9.041718104439772e-05, "loss": 0.0462, "step": 6250 }, { "epoch": 6.001917545541707, "grad_norm": 0.47997909784317017, "learning_rate": 9.037970929330368e-05, "loss": 0.0489, "step": 6260 }, { "epoch": 6.01150527325024, "grad_norm": 0.37695473432540894, "learning_rate": 9.03421722187815e-05, "loss": 0.0409, "step": 6270 }, { "epoch": 6.021093000958773, "grad_norm": 0.4723213315010071, "learning_rate": 9.030456988155596e-05, "loss": 0.0458, "step": 6280 }, { "epoch": 6.030680728667305, "grad_norm": 0.4923066794872284, "learning_rate": 9.026690234245749e-05, "loss": 0.048, "step": 6290 }, { "epoch": 6.040268456375839, "grad_norm": 0.3513863980770111, "learning_rate": 9.022916966242192e-05, "loss": 0.0414, "step": 6300 }, { "epoch": 6.049856184084372, "grad_norm": 0.34284889698028564, "learning_rate": 9.019137190249055e-05, "loss": 0.0435, "step": 6310 }, { "epoch": 6.059443911792905, "grad_norm": 0.33619949221611023, "learning_rate": 9.015350912380989e-05, "loss": 0.0428, "step": 6320 }, { "epoch": 6.069031639501438, "grad_norm": 0.5763192176818848, "learning_rate": 9.011558138763165e-05, "loss": 0.0386, "step": 6330 }, { "epoch": 6.0786193672099715, "grad_norm": 0.5095228552818298, "learning_rate": 9.007758875531264e-05, "loss": 0.041, "step": 6340 }, { "epoch": 6.088207094918504, "grad_norm": 0.3965105414390564, "learning_rate": 9.003953128831464e-05, "loss": 0.0409, "step": 6350 }, { "epoch": 6.097794822627037, "grad_norm": 0.3434533178806305, "learning_rate": 9.000140904820432e-05, "loss": 0.0393, "step": 6360 }, { "epoch": 6.10738255033557, "grad_norm": 0.2840021252632141, "learning_rate": 8.996322209665313e-05, "loss": 0.0476, "step": 6370 }, { "epoch": 6.1169702780441035, "grad_norm": 0.3020944893360138, "learning_rate": 8.992497049543722e-05, "loss": 0.042, "step": 6380 }, { "epoch": 6.126558005752637, "grad_norm": 0.36320140957832336, "learning_rate": 8.988665430643732e-05, "loss": 0.0376, "step": 6390 }, { "epoch": 6.13614573346117, "grad_norm": 0.34080708026885986, "learning_rate": 8.984827359163863e-05, "loss": 0.0428, "step": 6400 }, { "epoch": 6.145733461169703, "grad_norm": 0.28345417976379395, "learning_rate": 8.980982841313074e-05, "loss": 0.041, "step": 6410 }, { "epoch": 6.1553211888782355, "grad_norm": 0.37377986311912537, "learning_rate": 8.977131883310757e-05, "loss": 0.0429, "step": 6420 }, { "epoch": 6.164908916586769, "grad_norm": 0.30983594059944153, "learning_rate": 8.973274491386712e-05, "loss": 0.0406, "step": 6430 }, { "epoch": 6.174496644295302, "grad_norm": 0.32864126563072205, "learning_rate": 8.96941067178116e-05, "loss": 0.0414, "step": 6440 }, { "epoch": 6.184084372003835, "grad_norm": 0.28770530223846436, "learning_rate": 8.965540430744712e-05, "loss": 0.0371, "step": 6450 }, { "epoch": 6.193672099712368, "grad_norm": 0.36449259519577026, "learning_rate": 8.961663774538367e-05, "loss": 0.0337, "step": 6460 }, { "epoch": 6.203259827420902, "grad_norm": 0.3418562412261963, "learning_rate": 8.957780709433509e-05, "loss": 0.0417, "step": 6470 }, { "epoch": 6.212847555129434, "grad_norm": 0.2991498112678528, "learning_rate": 8.95389124171188e-05, "loss": 0.0402, "step": 6480 }, { "epoch": 6.222435282837967, "grad_norm": 1.1912015676498413, "learning_rate": 8.94999537766559e-05, "loss": 0.045, "step": 6490 }, { "epoch": 6.2320230105465, "grad_norm": 0.6341769695281982, "learning_rate": 8.946093123597088e-05, "loss": 0.0531, "step": 6500 }, { "epoch": 6.241610738255034, "grad_norm": 0.34102702140808105, "learning_rate": 8.942184485819162e-05, "loss": 0.0432, "step": 6510 }, { "epoch": 6.251198465963567, "grad_norm": 0.4004610776901245, "learning_rate": 8.938269470654936e-05, "loss": 0.0476, "step": 6520 }, { "epoch": 6.2607861936721, "grad_norm": 0.4373878240585327, "learning_rate": 8.934348084437835e-05, "loss": 0.0428, "step": 6530 }, { "epoch": 6.270373921380632, "grad_norm": 0.36895817518234253, "learning_rate": 8.930420333511606e-05, "loss": 0.0503, "step": 6540 }, { "epoch": 6.279961649089166, "grad_norm": 0.4267611503601074, "learning_rate": 8.926486224230282e-05, "loss": 0.0429, "step": 6550 }, { "epoch": 6.289549376797699, "grad_norm": 0.4211304485797882, "learning_rate": 8.922545762958188e-05, "loss": 0.0428, "step": 6560 }, { "epoch": 6.299137104506232, "grad_norm": 0.41338953375816345, "learning_rate": 8.918598956069919e-05, "loss": 0.047, "step": 6570 }, { "epoch": 6.308724832214765, "grad_norm": 0.35418424010276794, "learning_rate": 8.914645809950344e-05, "loss": 0.0508, "step": 6580 }, { "epoch": 6.3183125599232985, "grad_norm": 0.5311810374259949, "learning_rate": 8.91068633099458e-05, "loss": 0.0469, "step": 6590 }, { "epoch": 6.327900287631831, "grad_norm": 0.2737090587615967, "learning_rate": 8.90672052560799e-05, "loss": 0.0438, "step": 6600 }, { "epoch": 6.337488015340364, "grad_norm": 0.2861912250518799, "learning_rate": 8.902748400206174e-05, "loss": 0.0498, "step": 6610 }, { "epoch": 6.347075743048897, "grad_norm": 0.37531477212905884, "learning_rate": 8.898769961214952e-05, "loss": 0.0456, "step": 6620 }, { "epoch": 6.3566634707574305, "grad_norm": 0.34681612253189087, "learning_rate": 8.894785215070365e-05, "loss": 0.0418, "step": 6630 }, { "epoch": 6.366251198465964, "grad_norm": 0.28546613454818726, "learning_rate": 8.890794168218649e-05, "loss": 0.0428, "step": 6640 }, { "epoch": 6.375838926174497, "grad_norm": 0.35588616132736206, "learning_rate": 8.886796827116237e-05, "loss": 0.0433, "step": 6650 }, { "epoch": 6.385426653883029, "grad_norm": 0.362427294254303, "learning_rate": 8.882793198229744e-05, "loss": 0.0421, "step": 6660 }, { "epoch": 6.3950143815915625, "grad_norm": 0.4580886960029602, "learning_rate": 8.878783288035957e-05, "loss": 0.034, "step": 6670 }, { "epoch": 6.404602109300096, "grad_norm": 0.37446141242980957, "learning_rate": 8.874767103021824e-05, "loss": 0.0485, "step": 6680 }, { "epoch": 6.414189837008629, "grad_norm": 0.2968175411224365, "learning_rate": 8.870744649684444e-05, "loss": 0.0354, "step": 6690 }, { "epoch": 6.423777564717162, "grad_norm": 0.3441408574581146, "learning_rate": 8.866715934531057e-05, "loss": 0.0427, "step": 6700 }, { "epoch": 6.433365292425695, "grad_norm": 0.3193801939487457, "learning_rate": 8.862680964079031e-05, "loss": 0.0377, "step": 6710 }, { "epoch": 6.442953020134228, "grad_norm": 0.3228664696216583, "learning_rate": 8.858639744855857e-05, "loss": 0.0451, "step": 6720 }, { "epoch": 6.452540747842761, "grad_norm": 0.4861704111099243, "learning_rate": 8.85459228339913e-05, "loss": 0.0426, "step": 6730 }, { "epoch": 6.462128475551294, "grad_norm": 0.281361848115921, "learning_rate": 8.85053858625655e-05, "loss": 0.0411, "step": 6740 }, { "epoch": 6.471716203259827, "grad_norm": 0.39643704891204834, "learning_rate": 8.846478659985895e-05, "loss": 0.0376, "step": 6750 }, { "epoch": 6.481303930968361, "grad_norm": 0.269710510969162, "learning_rate": 8.84241251115503e-05, "loss": 0.0393, "step": 6760 }, { "epoch": 6.490891658676894, "grad_norm": 0.31520572304725647, "learning_rate": 8.838340146341881e-05, "loss": 0.0486, "step": 6770 }, { "epoch": 6.500479386385427, "grad_norm": 0.3355605900287628, "learning_rate": 8.83426157213443e-05, "loss": 0.045, "step": 6780 }, { "epoch": 6.510067114093959, "grad_norm": 0.2806301712989807, "learning_rate": 8.830176795130707e-05, "loss": 0.0447, "step": 6790 }, { "epoch": 6.519654841802493, "grad_norm": 0.27659860253334045, "learning_rate": 8.82608582193877e-05, "loss": 0.0426, "step": 6800 }, { "epoch": 6.529242569511026, "grad_norm": 0.2935637831687927, "learning_rate": 8.82198865917671e-05, "loss": 0.0454, "step": 6810 }, { "epoch": 6.538830297219559, "grad_norm": 0.3571741580963135, "learning_rate": 8.817885313472623e-05, "loss": 0.0454, "step": 6820 }, { "epoch": 6.548418024928092, "grad_norm": 0.3467845022678375, "learning_rate": 8.813775791464611e-05, "loss": 0.043, "step": 6830 }, { "epoch": 6.558005752636625, "grad_norm": 0.4052905738353729, "learning_rate": 8.80966009980077e-05, "loss": 0.0449, "step": 6840 }, { "epoch": 6.567593480345158, "grad_norm": 0.3361055254936218, "learning_rate": 8.805538245139169e-05, "loss": 0.0464, "step": 6850 }, { "epoch": 6.577181208053691, "grad_norm": 0.29235902428627014, "learning_rate": 8.801410234147855e-05, "loss": 0.0432, "step": 6860 }, { "epoch": 6.586768935762224, "grad_norm": 0.4435720443725586, "learning_rate": 8.797276073504832e-05, "loss": 0.0483, "step": 6870 }, { "epoch": 6.5963566634707576, "grad_norm": 0.36006295680999756, "learning_rate": 8.793135769898048e-05, "loss": 0.0389, "step": 6880 }, { "epoch": 6.605944391179291, "grad_norm": 0.30433642864227295, "learning_rate": 8.788989330025397e-05, "loss": 0.0455, "step": 6890 }, { "epoch": 6.615532118887824, "grad_norm": 0.2952471673488617, "learning_rate": 8.784836760594692e-05, "loss": 0.0373, "step": 6900 }, { "epoch": 6.625119846596356, "grad_norm": 0.42555341124534607, "learning_rate": 8.780678068323666e-05, "loss": 0.041, "step": 6910 }, { "epoch": 6.6347075743048896, "grad_norm": 0.3166603744029999, "learning_rate": 8.776513259939957e-05, "loss": 0.0441, "step": 6920 }, { "epoch": 6.644295302013423, "grad_norm": 0.5082001686096191, "learning_rate": 8.772342342181095e-05, "loss": 0.0501, "step": 6930 }, { "epoch": 6.653883029721956, "grad_norm": 0.32811877131462097, "learning_rate": 8.768165321794496e-05, "loss": 0.0449, "step": 6940 }, { "epoch": 6.663470757430489, "grad_norm": 0.39213889837265015, "learning_rate": 8.763982205537446e-05, "loss": 0.0497, "step": 6950 }, { "epoch": 6.673058485139022, "grad_norm": 0.33301976323127747, "learning_rate": 8.759793000177094e-05, "loss": 0.0466, "step": 6960 }, { "epoch": 6.682646212847555, "grad_norm": 0.33493635058403015, "learning_rate": 8.755597712490442e-05, "loss": 0.0485, "step": 6970 }, { "epoch": 6.692233940556088, "grad_norm": 0.43134915828704834, "learning_rate": 8.751396349264324e-05, "loss": 0.051, "step": 6980 }, { "epoch": 6.701821668264621, "grad_norm": 0.3931342363357544, "learning_rate": 8.747188917295409e-05, "loss": 0.0436, "step": 6990 }, { "epoch": 6.7114093959731544, "grad_norm": 0.3660528063774109, "learning_rate": 8.742975423390183e-05, "loss": 0.0393, "step": 7000 }, { "epoch": 6.720997123681688, "grad_norm": 0.33165839314460754, "learning_rate": 8.738755874364937e-05, "loss": 0.0366, "step": 7010 }, { "epoch": 6.730584851390221, "grad_norm": 0.3469119966030121, "learning_rate": 8.734530277045759e-05, "loss": 0.0378, "step": 7020 }, { "epoch": 6.740172579098753, "grad_norm": 0.27698802947998047, "learning_rate": 8.730298638268516e-05, "loss": 0.0362, "step": 7030 }, { "epoch": 6.7497603068072864, "grad_norm": 0.4078359603881836, "learning_rate": 8.726060964878858e-05, "loss": 0.046, "step": 7040 }, { "epoch": 6.75934803451582, "grad_norm": 0.34536081552505493, "learning_rate": 8.721817263732191e-05, "loss": 0.0537, "step": 7050 }, { "epoch": 6.768935762224353, "grad_norm": 0.7122533917427063, "learning_rate": 8.717567541693673e-05, "loss": 0.0466, "step": 7060 }, { "epoch": 6.778523489932886, "grad_norm": 0.24024972319602966, "learning_rate": 8.7133118056382e-05, "loss": 0.0492, "step": 7070 }, { "epoch": 6.788111217641419, "grad_norm": 0.41367456316947937, "learning_rate": 8.709050062450403e-05, "loss": 0.0424, "step": 7080 }, { "epoch": 6.797698945349952, "grad_norm": 0.35695597529411316, "learning_rate": 8.70478231902463e-05, "loss": 0.0425, "step": 7090 }, { "epoch": 6.807286673058485, "grad_norm": 0.38064390420913696, "learning_rate": 8.700508582264928e-05, "loss": 0.0488, "step": 7100 }, { "epoch": 6.816874400767018, "grad_norm": 0.3264651894569397, "learning_rate": 8.696228859085049e-05, "loss": 0.0429, "step": 7110 }, { "epoch": 6.826462128475551, "grad_norm": 0.36960527300834656, "learning_rate": 8.691943156408425e-05, "loss": 0.0465, "step": 7120 }, { "epoch": 6.836049856184085, "grad_norm": 0.34985673427581787, "learning_rate": 8.687651481168158e-05, "loss": 0.0389, "step": 7130 }, { "epoch": 6.845637583892618, "grad_norm": 0.308672696352005, "learning_rate": 8.68335384030702e-05, "loss": 0.0426, "step": 7140 }, { "epoch": 6.855225311601151, "grad_norm": 0.3914170563220978, "learning_rate": 8.679050240777427e-05, "loss": 0.0421, "step": 7150 }, { "epoch": 6.864813039309683, "grad_norm": 0.2807207703590393, "learning_rate": 8.674740689541439e-05, "loss": 0.0484, "step": 7160 }, { "epoch": 6.874400767018217, "grad_norm": 0.31063312292099, "learning_rate": 8.670425193570739e-05, "loss": 0.0413, "step": 7170 }, { "epoch": 6.88398849472675, "grad_norm": 0.3080969452857971, "learning_rate": 8.666103759846634e-05, "loss": 0.0438, "step": 7180 }, { "epoch": 6.893576222435283, "grad_norm": 0.27219802141189575, "learning_rate": 8.661776395360029e-05, "loss": 0.045, "step": 7190 }, { "epoch": 6.903163950143816, "grad_norm": 0.44108715653419495, "learning_rate": 8.65744310711143e-05, "loss": 0.0431, "step": 7200 }, { "epoch": 6.912751677852349, "grad_norm": 0.34575361013412476, "learning_rate": 8.653103902110922e-05, "loss": 0.0419, "step": 7210 }, { "epoch": 6.922339405560882, "grad_norm": 0.29534199833869934, "learning_rate": 8.648758787378164e-05, "loss": 0.0392, "step": 7220 }, { "epoch": 6.931927133269415, "grad_norm": 0.3387232720851898, "learning_rate": 8.644407769942373e-05, "loss": 0.0354, "step": 7230 }, { "epoch": 6.941514860977948, "grad_norm": 0.27211427688598633, "learning_rate": 8.640050856842317e-05, "loss": 0.0401, "step": 7240 }, { "epoch": 6.9511025886864815, "grad_norm": 0.27033731341362, "learning_rate": 8.635688055126299e-05, "loss": 0.0389, "step": 7250 }, { "epoch": 6.960690316395015, "grad_norm": 0.3898187279701233, "learning_rate": 8.631319371852151e-05, "loss": 0.0393, "step": 7260 }, { "epoch": 6.970278044103548, "grad_norm": 0.2771322727203369, "learning_rate": 8.626944814087221e-05, "loss": 0.0463, "step": 7270 }, { "epoch": 6.97986577181208, "grad_norm": 0.28191322088241577, "learning_rate": 8.622564388908357e-05, "loss": 0.0443, "step": 7280 }, { "epoch": 6.9894534995206135, "grad_norm": 0.3647807240486145, "learning_rate": 8.618178103401897e-05, "loss": 0.044, "step": 7290 }, { "epoch": 6.999041227229147, "grad_norm": 0.2619480490684509, "learning_rate": 8.613785964663665e-05, "loss": 0.0422, "step": 7300 }, { "epoch": 7.00862895493768, "grad_norm": 0.2431744933128357, "learning_rate": 8.609387979798952e-05, "loss": 0.0458, "step": 7310 }, { "epoch": 7.018216682646213, "grad_norm": 0.31808608770370483, "learning_rate": 8.604984155922506e-05, "loss": 0.0391, "step": 7320 }, { "epoch": 7.027804410354746, "grad_norm": 0.41725489497184753, "learning_rate": 8.600574500158518e-05, "loss": 0.0395, "step": 7330 }, { "epoch": 7.037392138063279, "grad_norm": 0.23228147625923157, "learning_rate": 8.596159019640619e-05, "loss": 0.0415, "step": 7340 }, { "epoch": 7.046979865771812, "grad_norm": 0.25770825147628784, "learning_rate": 8.59173772151186e-05, "loss": 0.0428, "step": 7350 }, { "epoch": 7.056567593480345, "grad_norm": 0.2742254436016083, "learning_rate": 8.587310612924699e-05, "loss": 0.0456, "step": 7360 }, { "epoch": 7.066155321188878, "grad_norm": 0.34984004497528076, "learning_rate": 8.582877701041004e-05, "loss": 0.0304, "step": 7370 }, { "epoch": 7.075743048897412, "grad_norm": 0.34064123034477234, "learning_rate": 8.578438993032021e-05, "loss": 0.038, "step": 7380 }, { "epoch": 7.085330776605945, "grad_norm": 0.3359072506427765, "learning_rate": 8.57399449607838e-05, "loss": 0.0463, "step": 7390 }, { "epoch": 7.094918504314477, "grad_norm": 0.330243855714798, "learning_rate": 8.569544217370072e-05, "loss": 0.0469, "step": 7400 }, { "epoch": 7.10450623202301, "grad_norm": 0.23439550399780273, "learning_rate": 8.565088164106439e-05, "loss": 0.0388, "step": 7410 }, { "epoch": 7.114093959731544, "grad_norm": 0.45976459980010986, "learning_rate": 8.56062634349617e-05, "loss": 0.0454, "step": 7420 }, { "epoch": 7.123681687440077, "grad_norm": 0.310160368680954, "learning_rate": 8.556158762757282e-05, "loss": 0.0401, "step": 7430 }, { "epoch": 7.13326941514861, "grad_norm": 0.4018678665161133, "learning_rate": 8.551685429117111e-05, "loss": 0.0512, "step": 7440 }, { "epoch": 7.142857142857143, "grad_norm": 0.3131730556488037, "learning_rate": 8.547206349812298e-05, "loss": 0.0421, "step": 7450 }, { "epoch": 7.152444870565676, "grad_norm": 0.30326828360557556, "learning_rate": 8.542721532088778e-05, "loss": 0.0461, "step": 7460 }, { "epoch": 7.162032598274209, "grad_norm": 0.3814712166786194, "learning_rate": 8.538230983201771e-05, "loss": 0.0446, "step": 7470 }, { "epoch": 7.171620325982742, "grad_norm": 0.33048462867736816, "learning_rate": 8.533734710415771e-05, "loss": 0.048, "step": 7480 }, { "epoch": 7.181208053691275, "grad_norm": 0.2931906580924988, "learning_rate": 8.529232721004527e-05, "loss": 0.0405, "step": 7490 }, { "epoch": 7.1907957813998085, "grad_norm": 0.3595677614212036, "learning_rate": 8.524725022251039e-05, "loss": 0.0404, "step": 7500 }, { "epoch": 7.200383509108342, "grad_norm": 0.37149250507354736, "learning_rate": 8.520211621447541e-05, "loss": 0.0382, "step": 7510 }, { "epoch": 7.209971236816874, "grad_norm": 0.2645772099494934, "learning_rate": 8.515692525895494e-05, "loss": 0.0438, "step": 7520 }, { "epoch": 7.219558964525407, "grad_norm": 0.3602275848388672, "learning_rate": 8.511167742905569e-05, "loss": 0.0421, "step": 7530 }, { "epoch": 7.2291466922339405, "grad_norm": 0.27108579874038696, "learning_rate": 8.506637279797638e-05, "loss": 0.0406, "step": 7540 }, { "epoch": 7.238734419942474, "grad_norm": 0.329333633184433, "learning_rate": 8.502101143900764e-05, "loss": 0.0357, "step": 7550 }, { "epoch": 7.248322147651007, "grad_norm": 0.2549634873867035, "learning_rate": 8.497559342553185e-05, "loss": 0.0354, "step": 7560 }, { "epoch": 7.25790987535954, "grad_norm": 0.3205493092536926, "learning_rate": 8.493011883102307e-05, "loss": 0.0373, "step": 7570 }, { "epoch": 7.2674976030680725, "grad_norm": 0.2169693112373352, "learning_rate": 8.488458772904684e-05, "loss": 0.0394, "step": 7580 }, { "epoch": 7.277085330776606, "grad_norm": 0.37165510654449463, "learning_rate": 8.483900019326017e-05, "loss": 0.0381, "step": 7590 }, { "epoch": 7.286673058485139, "grad_norm": 0.26651856303215027, "learning_rate": 8.479335629741133e-05, "loss": 0.0422, "step": 7600 }, { "epoch": 7.296260786193672, "grad_norm": 0.32148563861846924, "learning_rate": 8.474765611533977e-05, "loss": 0.0357, "step": 7610 }, { "epoch": 7.305848513902205, "grad_norm": 0.26410454511642456, "learning_rate": 8.470189972097601e-05, "loss": 0.0362, "step": 7620 }, { "epoch": 7.315436241610739, "grad_norm": 0.43451759219169617, "learning_rate": 8.465608718834152e-05, "loss": 0.0394, "step": 7630 }, { "epoch": 7.325023969319272, "grad_norm": 0.39956948161125183, "learning_rate": 8.461021859154851e-05, "loss": 0.0467, "step": 7640 }, { "epoch": 7.334611697027804, "grad_norm": 0.36985108256340027, "learning_rate": 8.45642940048e-05, "loss": 0.0426, "step": 7650 }, { "epoch": 7.344199424736337, "grad_norm": 0.27028191089630127, "learning_rate": 8.451831350238947e-05, "loss": 0.0404, "step": 7660 }, { "epoch": 7.353787152444871, "grad_norm": 0.3216499388217926, "learning_rate": 8.447227715870097e-05, "loss": 0.0389, "step": 7670 }, { "epoch": 7.363374880153404, "grad_norm": 0.2922750413417816, "learning_rate": 8.442618504820878e-05, "loss": 0.0416, "step": 7680 }, { "epoch": 7.372962607861937, "grad_norm": 0.32347607612609863, "learning_rate": 8.438003724547747e-05, "loss": 0.0371, "step": 7690 }, { "epoch": 7.382550335570469, "grad_norm": 0.37498921155929565, "learning_rate": 8.433383382516169e-05, "loss": 0.0388, "step": 7700 }, { "epoch": 7.392138063279003, "grad_norm": 0.41235196590423584, "learning_rate": 8.428757486200603e-05, "loss": 0.0382, "step": 7710 }, { "epoch": 7.401725790987536, "grad_norm": 0.32482102513313293, "learning_rate": 8.424126043084499e-05, "loss": 0.0397, "step": 7720 }, { "epoch": 7.411313518696069, "grad_norm": 0.3329836130142212, "learning_rate": 8.419489060660272e-05, "loss": 0.0381, "step": 7730 }, { "epoch": 7.420901246404602, "grad_norm": 0.28950804471969604, "learning_rate": 8.41484654642931e-05, "loss": 0.037, "step": 7740 }, { "epoch": 7.4304889741131355, "grad_norm": 0.43603238463401794, "learning_rate": 8.410198507901936e-05, "loss": 0.0465, "step": 7750 }, { "epoch": 7.440076701821669, "grad_norm": 0.3902181386947632, "learning_rate": 8.405544952597422e-05, "loss": 0.0423, "step": 7760 }, { "epoch": 7.449664429530201, "grad_norm": 0.4409140348434448, "learning_rate": 8.400885888043956e-05, "loss": 0.0384, "step": 7770 }, { "epoch": 7.459252157238734, "grad_norm": 0.33337706327438354, "learning_rate": 8.396221321778645e-05, "loss": 0.0407, "step": 7780 }, { "epoch": 7.4688398849472675, "grad_norm": 0.29487982392311096, "learning_rate": 8.391551261347493e-05, "loss": 0.0407, "step": 7790 }, { "epoch": 7.478427612655801, "grad_norm": 0.2853257954120636, "learning_rate": 8.38687571430539e-05, "loss": 0.0412, "step": 7800 }, { "epoch": 7.488015340364334, "grad_norm": 0.24586626887321472, "learning_rate": 8.382194688216105e-05, "loss": 0.0453, "step": 7810 }, { "epoch": 7.497603068072867, "grad_norm": 0.24528749287128448, "learning_rate": 8.377508190652272e-05, "loss": 0.0435, "step": 7820 }, { "epoch": 7.5071907957813995, "grad_norm": 0.21899107098579407, "learning_rate": 8.37281622919537e-05, "loss": 0.0516, "step": 7830 }, { "epoch": 7.516778523489933, "grad_norm": 0.5243720412254333, "learning_rate": 8.368118811435726e-05, "loss": 0.0373, "step": 7840 }, { "epoch": 7.526366251198466, "grad_norm": 0.24362969398498535, "learning_rate": 8.363415944972487e-05, "loss": 0.0452, "step": 7850 }, { "epoch": 7.535953978906999, "grad_norm": 0.3614483177661896, "learning_rate": 8.358707637413615e-05, "loss": 0.0343, "step": 7860 }, { "epoch": 7.545541706615532, "grad_norm": 0.3958549201488495, "learning_rate": 8.353993896375878e-05, "loss": 0.0454, "step": 7870 }, { "epoch": 7.555129434324066, "grad_norm": 0.3544330596923828, "learning_rate": 8.349274729484832e-05, "loss": 0.0434, "step": 7880 }, { "epoch": 7.564717162032598, "grad_norm": 0.3171081244945526, "learning_rate": 8.344550144374808e-05, "loss": 0.0423, "step": 7890 }, { "epoch": 7.574304889741131, "grad_norm": 0.3729722797870636, "learning_rate": 8.339820148688907e-05, "loss": 0.0407, "step": 7900 }, { "epoch": 7.583892617449664, "grad_norm": 0.3339761197566986, "learning_rate": 8.335084750078978e-05, "loss": 0.0452, "step": 7910 }, { "epoch": 7.593480345158198, "grad_norm": 0.20363827049732208, "learning_rate": 8.330343956205615e-05, "loss": 0.0387, "step": 7920 }, { "epoch": 7.603068072866731, "grad_norm": 0.3942667245864868, "learning_rate": 8.325597774738137e-05, "loss": 0.0407, "step": 7930 }, { "epoch": 7.612655800575264, "grad_norm": 0.234974667429924, "learning_rate": 8.32084621335458e-05, "loss": 0.0417, "step": 7940 }, { "epoch": 7.622243528283796, "grad_norm": 0.4611276388168335, "learning_rate": 8.316089279741682e-05, "loss": 0.0455, "step": 7950 }, { "epoch": 7.63183125599233, "grad_norm": 0.31897857785224915, "learning_rate": 8.311326981594872e-05, "loss": 0.0489, "step": 7960 }, { "epoch": 7.641418983700863, "grad_norm": 0.34105560183525085, "learning_rate": 8.306559326618259e-05, "loss": 0.0441, "step": 7970 }, { "epoch": 7.651006711409396, "grad_norm": 0.35638663172721863, "learning_rate": 8.301786322524619e-05, "loss": 0.0443, "step": 7980 }, { "epoch": 7.660594439117929, "grad_norm": 0.4538173973560333, "learning_rate": 8.297007977035376e-05, "loss": 0.0414, "step": 7990 }, { "epoch": 7.6701821668264625, "grad_norm": 0.37664180994033813, "learning_rate": 8.292224297880598e-05, "loss": 0.0453, "step": 8000 }, { "epoch": 7.679769894534996, "grad_norm": 0.2357359379529953, "learning_rate": 8.287435292798984e-05, "loss": 0.0424, "step": 8010 }, { "epoch": 7.689357622243528, "grad_norm": 0.32804933190345764, "learning_rate": 8.282640969537848e-05, "loss": 0.0381, "step": 8020 }, { "epoch": 7.698945349952061, "grad_norm": 0.45805230736732483, "learning_rate": 8.277841335853101e-05, "loss": 0.0346, "step": 8030 }, { "epoch": 7.7085330776605945, "grad_norm": 0.2550659775733948, "learning_rate": 8.273036399509253e-05, "loss": 0.044, "step": 8040 }, { "epoch": 7.718120805369128, "grad_norm": 0.3587624728679657, "learning_rate": 8.268226168279389e-05, "loss": 0.0396, "step": 8050 }, { "epoch": 7.727708533077661, "grad_norm": 0.41126248240470886, "learning_rate": 8.263410649945159e-05, "loss": 0.0438, "step": 8060 }, { "epoch": 7.737296260786193, "grad_norm": 0.542373538017273, "learning_rate": 8.258589852296765e-05, "loss": 0.0473, "step": 8070 }, { "epoch": 7.7468839884947265, "grad_norm": 0.44072815775871277, "learning_rate": 8.253763783132955e-05, "loss": 0.0413, "step": 8080 }, { "epoch": 7.75647171620326, "grad_norm": 0.3905545771121979, "learning_rate": 8.248932450261e-05, "loss": 0.0394, "step": 8090 }, { "epoch": 7.766059443911793, "grad_norm": 0.3717019855976105, "learning_rate": 8.244095861496686e-05, "loss": 0.0391, "step": 8100 }, { "epoch": 7.775647171620326, "grad_norm": 0.28803032636642456, "learning_rate": 8.239254024664304e-05, "loss": 0.0398, "step": 8110 }, { "epoch": 7.785234899328859, "grad_norm": 0.26609280705451965, "learning_rate": 8.234406947596633e-05, "loss": 0.0399, "step": 8120 }, { "epoch": 7.794822627037393, "grad_norm": 0.28858163952827454, "learning_rate": 8.229554638134933e-05, "loss": 0.0327, "step": 8130 }, { "epoch": 7.804410354745925, "grad_norm": 0.3346012830734253, "learning_rate": 8.224697104128925e-05, "loss": 0.0372, "step": 8140 }, { "epoch": 7.813998082454458, "grad_norm": 0.3210478723049164, "learning_rate": 8.219834353436781e-05, "loss": 0.0424, "step": 8150 }, { "epoch": 7.823585810162991, "grad_norm": 0.2401236593723297, "learning_rate": 8.214966393925115e-05, "loss": 0.0366, "step": 8160 }, { "epoch": 7.833173537871525, "grad_norm": 0.29601314663887024, "learning_rate": 8.210093233468968e-05, "loss": 0.0416, "step": 8170 }, { "epoch": 7.842761265580058, "grad_norm": 0.29966652393341064, "learning_rate": 8.20521487995179e-05, "loss": 0.0349, "step": 8180 }, { "epoch": 7.85234899328859, "grad_norm": 0.3385706841945648, "learning_rate": 8.200331341265436e-05, "loss": 0.0421, "step": 8190 }, { "epoch": 7.861936720997123, "grad_norm": 0.4073570966720581, "learning_rate": 8.19544262531015e-05, "loss": 0.0416, "step": 8200 }, { "epoch": 7.871524448705657, "grad_norm": 0.30653032660484314, "learning_rate": 8.19054873999455e-05, "loss": 0.04, "step": 8210 }, { "epoch": 7.88111217641419, "grad_norm": 0.24951298534870148, "learning_rate": 8.185649693235614e-05, "loss": 0.0397, "step": 8220 }, { "epoch": 7.890699904122723, "grad_norm": 0.24890607595443726, "learning_rate": 8.180745492958674e-05, "loss": 0.0396, "step": 8230 }, { "epoch": 7.900287631831256, "grad_norm": 0.2634108066558838, "learning_rate": 8.175836147097396e-05, "loss": 0.0364, "step": 8240 }, { "epoch": 7.9098753595397895, "grad_norm": 0.29432108998298645, "learning_rate": 8.170921663593773e-05, "loss": 0.0353, "step": 8250 }, { "epoch": 7.919463087248322, "grad_norm": 0.3281777799129486, "learning_rate": 8.166002050398106e-05, "loss": 0.0429, "step": 8260 }, { "epoch": 7.929050814956855, "grad_norm": 0.24084685742855072, "learning_rate": 8.161077315468997e-05, "loss": 0.0454, "step": 8270 }, { "epoch": 7.938638542665388, "grad_norm": 0.290452241897583, "learning_rate": 8.156147466773332e-05, "loss": 0.0427, "step": 8280 }, { "epoch": 7.9482262703739215, "grad_norm": 0.3068200945854187, "learning_rate": 8.15121251228627e-05, "loss": 0.0416, "step": 8290 }, { "epoch": 7.957813998082455, "grad_norm": 0.5520877242088318, "learning_rate": 8.146272459991233e-05, "loss": 0.0369, "step": 8300 }, { "epoch": 7.967401725790987, "grad_norm": 0.268451064825058, "learning_rate": 8.141327317879884e-05, "loss": 0.0419, "step": 8310 }, { "epoch": 7.97698945349952, "grad_norm": 0.45414549112319946, "learning_rate": 8.136377093952123e-05, "loss": 0.0414, "step": 8320 }, { "epoch": 7.9865771812080535, "grad_norm": 0.2249930500984192, "learning_rate": 8.131421796216072e-05, "loss": 0.0389, "step": 8330 }, { "epoch": 7.996164908916587, "grad_norm": 0.28440603613853455, "learning_rate": 8.126461432688061e-05, "loss": 0.038, "step": 8340 }, { "epoch": 8.00575263662512, "grad_norm": 0.26801931858062744, "learning_rate": 8.121496011392613e-05, "loss": 0.0382, "step": 8350 }, { "epoch": 8.015340364333653, "grad_norm": 0.3116857409477234, "learning_rate": 8.116525540362434e-05, "loss": 0.0395, "step": 8360 }, { "epoch": 8.024928092042186, "grad_norm": 0.37847548723220825, "learning_rate": 8.1115500276384e-05, "loss": 0.0395, "step": 8370 }, { "epoch": 8.03451581975072, "grad_norm": 0.3358413279056549, "learning_rate": 8.10656948126954e-05, "loss": 0.0443, "step": 8380 }, { "epoch": 8.044103547459253, "grad_norm": 0.3593525290489197, "learning_rate": 8.101583909313033e-05, "loss": 0.0393, "step": 8390 }, { "epoch": 8.053691275167786, "grad_norm": 0.2807999551296234, "learning_rate": 8.09659331983418e-05, "loss": 0.0337, "step": 8400 }, { "epoch": 8.063279002876317, "grad_norm": 0.24256014823913574, "learning_rate": 8.091597720906403e-05, "loss": 0.0383, "step": 8410 }, { "epoch": 8.07286673058485, "grad_norm": 0.4359792172908783, "learning_rate": 8.086597120611228e-05, "loss": 0.0389, "step": 8420 }, { "epoch": 8.082454458293384, "grad_norm": 0.3423149883747101, "learning_rate": 8.081591527038271e-05, "loss": 0.0401, "step": 8430 }, { "epoch": 8.092042186001917, "grad_norm": 0.2202298790216446, "learning_rate": 8.076580948285227e-05, "loss": 0.0364, "step": 8440 }, { "epoch": 8.10162991371045, "grad_norm": 0.36670511960983276, "learning_rate": 8.071565392457852e-05, "loss": 0.0379, "step": 8450 }, { "epoch": 8.111217641418984, "grad_norm": 0.22374413907527924, "learning_rate": 8.066544867669961e-05, "loss": 0.0363, "step": 8460 }, { "epoch": 8.120805369127517, "grad_norm": 0.43999022245407104, "learning_rate": 8.061519382043399e-05, "loss": 0.0385, "step": 8470 }, { "epoch": 8.13039309683605, "grad_norm": 0.2890577018260956, "learning_rate": 8.056488943708041e-05, "loss": 0.0379, "step": 8480 }, { "epoch": 8.139980824544583, "grad_norm": 0.3366747200489044, "learning_rate": 8.051453560801772e-05, "loss": 0.0417, "step": 8490 }, { "epoch": 8.149568552253116, "grad_norm": 0.2634000778198242, "learning_rate": 8.046413241470478e-05, "loss": 0.0351, "step": 8500 }, { "epoch": 8.15915627996165, "grad_norm": 0.21788382530212402, "learning_rate": 8.041367993868031e-05, "loss": 0.0391, "step": 8510 }, { "epoch": 8.168744007670183, "grad_norm": 0.31453433632850647, "learning_rate": 8.036317826156275e-05, "loss": 0.0392, "step": 8520 }, { "epoch": 8.178331735378714, "grad_norm": 0.2942139506340027, "learning_rate": 8.031262746505012e-05, "loss": 0.0443, "step": 8530 }, { "epoch": 8.187919463087248, "grad_norm": 0.24110645055770874, "learning_rate": 8.02620276309199e-05, "loss": 0.038, "step": 8540 }, { "epoch": 8.19750719079578, "grad_norm": 0.26143452525138855, "learning_rate": 8.021137884102891e-05, "loss": 0.0349, "step": 8550 }, { "epoch": 8.207094918504314, "grad_norm": 0.23739804327487946, "learning_rate": 8.016068117731318e-05, "loss": 0.0367, "step": 8560 }, { "epoch": 8.216682646212847, "grad_norm": 0.31131234765052795, "learning_rate": 8.010993472178778e-05, "loss": 0.0383, "step": 8570 }, { "epoch": 8.22627037392138, "grad_norm": 0.301734060049057, "learning_rate": 8.005913955654675e-05, "loss": 0.0402, "step": 8580 }, { "epoch": 8.235858101629914, "grad_norm": 0.2536526620388031, "learning_rate": 8.000829576376288e-05, "loss": 0.0324, "step": 8590 }, { "epoch": 8.245445829338447, "grad_norm": 0.398578941822052, "learning_rate": 7.995740342568767e-05, "loss": 0.0382, "step": 8600 }, { "epoch": 8.25503355704698, "grad_norm": 0.2876124083995819, "learning_rate": 7.990646262465112e-05, "loss": 0.038, "step": 8610 }, { "epoch": 8.264621284755513, "grad_norm": 0.30959025025367737, "learning_rate": 7.985547344306161e-05, "loss": 0.0464, "step": 8620 }, { "epoch": 8.274209012464047, "grad_norm": 0.327210396528244, "learning_rate": 7.980443596340589e-05, "loss": 0.0426, "step": 8630 }, { "epoch": 8.28379674017258, "grad_norm": 0.23988771438598633, "learning_rate": 7.975335026824873e-05, "loss": 0.043, "step": 8640 }, { "epoch": 8.293384467881111, "grad_norm": 0.2276514321565628, "learning_rate": 7.970221644023293e-05, "loss": 0.0407, "step": 8650 }, { "epoch": 8.302972195589644, "grad_norm": 0.27630215883255005, "learning_rate": 7.965103456207919e-05, "loss": 0.0439, "step": 8660 }, { "epoch": 8.312559923298178, "grad_norm": 0.1922815442085266, "learning_rate": 7.959980471658592e-05, "loss": 0.0396, "step": 8670 }, { "epoch": 8.322147651006711, "grad_norm": 0.303406298160553, "learning_rate": 7.954852698662913e-05, "loss": 0.0363, "step": 8680 }, { "epoch": 8.331735378715244, "grad_norm": 0.3184201717376709, "learning_rate": 7.94972014551623e-05, "loss": 0.0414, "step": 8690 }, { "epoch": 8.341323106423777, "grad_norm": 0.31593239307403564, "learning_rate": 7.94458282052162e-05, "loss": 0.0431, "step": 8700 }, { "epoch": 8.35091083413231, "grad_norm": 0.2461700290441513, "learning_rate": 7.939440731989887e-05, "loss": 0.0447, "step": 8710 }, { "epoch": 8.360498561840844, "grad_norm": 0.5149932503700256, "learning_rate": 7.934293888239532e-05, "loss": 0.0377, "step": 8720 }, { "epoch": 8.370086289549377, "grad_norm": 0.273589164018631, "learning_rate": 7.929142297596756e-05, "loss": 0.0436, "step": 8730 }, { "epoch": 8.37967401725791, "grad_norm": 0.37680765986442566, "learning_rate": 7.92398596839544e-05, "loss": 0.0351, "step": 8740 }, { "epoch": 8.389261744966444, "grad_norm": 0.3258054256439209, "learning_rate": 7.918824908977123e-05, "loss": 0.0387, "step": 8750 }, { "epoch": 8.398849472674977, "grad_norm": 0.36646002531051636, "learning_rate": 7.913659127691002e-05, "loss": 0.0388, "step": 8760 }, { "epoch": 8.40843720038351, "grad_norm": 0.31907573342323303, "learning_rate": 7.908488632893913e-05, "loss": 0.043, "step": 8770 }, { "epoch": 8.418024928092041, "grad_norm": 0.3218369781970978, "learning_rate": 7.903313432950313e-05, "loss": 0.041, "step": 8780 }, { "epoch": 8.427612655800575, "grad_norm": 0.2750600576400757, "learning_rate": 7.898133536232275e-05, "loss": 0.0372, "step": 8790 }, { "epoch": 8.437200383509108, "grad_norm": 0.3370470106601715, "learning_rate": 7.892948951119467e-05, "loss": 0.0381, "step": 8800 }, { "epoch": 8.446788111217641, "grad_norm": 0.30544212460517883, "learning_rate": 7.887759685999143e-05, "loss": 0.0511, "step": 8810 }, { "epoch": 8.456375838926174, "grad_norm": 0.3022957742214203, "learning_rate": 7.88256574926613e-05, "loss": 0.0382, "step": 8820 }, { "epoch": 8.465963566634708, "grad_norm": 0.4892277121543884, "learning_rate": 7.877367149322807e-05, "loss": 0.0471, "step": 8830 }, { "epoch": 8.47555129434324, "grad_norm": 0.2292528748512268, "learning_rate": 7.872163894579103e-05, "loss": 0.0374, "step": 8840 }, { "epoch": 8.485139022051774, "grad_norm": 0.4441846013069153, "learning_rate": 7.866955993452473e-05, "loss": 0.0396, "step": 8850 }, { "epoch": 8.494726749760307, "grad_norm": 0.3326236605644226, "learning_rate": 7.86174345436789e-05, "loss": 0.0407, "step": 8860 }, { "epoch": 8.50431447746884, "grad_norm": 0.3634801506996155, "learning_rate": 7.856526285757829e-05, "loss": 0.0343, "step": 8870 }, { "epoch": 8.513902205177374, "grad_norm": 0.3255830705165863, "learning_rate": 7.851304496062254e-05, "loss": 0.0391, "step": 8880 }, { "epoch": 8.523489932885907, "grad_norm": 0.2465457022190094, "learning_rate": 7.846078093728611e-05, "loss": 0.0418, "step": 8890 }, { "epoch": 8.53307766059444, "grad_norm": 0.28741371631622314, "learning_rate": 7.840847087211799e-05, "loss": 0.0408, "step": 8900 }, { "epoch": 8.542665388302972, "grad_norm": 0.5026047825813293, "learning_rate": 7.835611484974169e-05, "loss": 0.0425, "step": 8910 }, { "epoch": 8.552253116011505, "grad_norm": 0.29450881481170654, "learning_rate": 7.830371295485506e-05, "loss": 0.0386, "step": 8920 }, { "epoch": 8.561840843720038, "grad_norm": 0.37559008598327637, "learning_rate": 7.82512652722302e-05, "loss": 0.0346, "step": 8930 }, { "epoch": 8.571428571428571, "grad_norm": 0.3274129033088684, "learning_rate": 7.819877188671322e-05, "loss": 0.0377, "step": 8940 }, { "epoch": 8.581016299137104, "grad_norm": 1.9449902772903442, "learning_rate": 7.81462328832242e-05, "loss": 0.0422, "step": 8950 }, { "epoch": 8.590604026845638, "grad_norm": 0.32859793305397034, "learning_rate": 7.809364834675703e-05, "loss": 0.0381, "step": 8960 }, { "epoch": 8.60019175455417, "grad_norm": 0.41501474380493164, "learning_rate": 7.804101836237921e-05, "loss": 0.0413, "step": 8970 }, { "epoch": 8.609779482262704, "grad_norm": 0.3548615574836731, "learning_rate": 7.798834301523182e-05, "loss": 0.0436, "step": 8980 }, { "epoch": 8.619367209971237, "grad_norm": 0.3612217903137207, "learning_rate": 7.793562239052928e-05, "loss": 0.0364, "step": 8990 }, { "epoch": 8.62895493767977, "grad_norm": 0.3534400761127472, "learning_rate": 7.78828565735593e-05, "loss": 0.0381, "step": 9000 }, { "epoch": 8.638542665388304, "grad_norm": 0.34939974546432495, "learning_rate": 7.783004564968263e-05, "loss": 0.0405, "step": 9010 }, { "epoch": 8.648130393096835, "grad_norm": 0.37234190106391907, "learning_rate": 7.777718970433309e-05, "loss": 0.0439, "step": 9020 }, { "epoch": 8.657718120805368, "grad_norm": 0.40179571509361267, "learning_rate": 7.772428882301724e-05, "loss": 0.0428, "step": 9030 }, { "epoch": 8.667305848513902, "grad_norm": 0.37865087389945984, "learning_rate": 7.767134309131437e-05, "loss": 0.0364, "step": 9040 }, { "epoch": 8.676893576222435, "grad_norm": 0.32325470447540283, "learning_rate": 7.761835259487635e-05, "loss": 0.0387, "step": 9050 }, { "epoch": 8.686481303930968, "grad_norm": 0.26749640703201294, "learning_rate": 7.756531741942743e-05, "loss": 0.048, "step": 9060 }, { "epoch": 8.696069031639501, "grad_norm": 0.381815105676651, "learning_rate": 7.751223765076418e-05, "loss": 0.0337, "step": 9070 }, { "epoch": 8.705656759348035, "grad_norm": 0.4329027533531189, "learning_rate": 7.745911337475524e-05, "loss": 0.0408, "step": 9080 }, { "epoch": 8.715244487056568, "grad_norm": 0.4740753173828125, "learning_rate": 7.740594467734131e-05, "loss": 0.0368, "step": 9090 }, { "epoch": 8.724832214765101, "grad_norm": 0.23423776030540466, "learning_rate": 7.735273164453494e-05, "loss": 0.0445, "step": 9100 }, { "epoch": 8.734419942473634, "grad_norm": 0.35593661665916443, "learning_rate": 7.72994743624204e-05, "loss": 0.0415, "step": 9110 }, { "epoch": 8.744007670182167, "grad_norm": 0.2637054920196533, "learning_rate": 7.724617291715355e-05, "loss": 0.0424, "step": 9120 }, { "epoch": 8.7535953978907, "grad_norm": 0.25044816732406616, "learning_rate": 7.719282739496167e-05, "loss": 0.0384, "step": 9130 }, { "epoch": 8.763183125599234, "grad_norm": 0.22907428443431854, "learning_rate": 7.713943788214337e-05, "loss": 0.0365, "step": 9140 }, { "epoch": 8.772770853307765, "grad_norm": 0.4074908494949341, "learning_rate": 7.70860044650684e-05, "loss": 0.0481, "step": 9150 }, { "epoch": 8.782358581016299, "grad_norm": 0.29292604327201843, "learning_rate": 7.703252723017757e-05, "loss": 0.0433, "step": 9160 }, { "epoch": 8.791946308724832, "grad_norm": 0.2879285514354706, "learning_rate": 7.697900626398255e-05, "loss": 0.0388, "step": 9170 }, { "epoch": 8.801534036433365, "grad_norm": 0.31987619400024414, "learning_rate": 7.692544165306574e-05, "loss": 0.0423, "step": 9180 }, { "epoch": 8.811121764141898, "grad_norm": 0.3260093331336975, "learning_rate": 7.687183348408018e-05, "loss": 0.0342, "step": 9190 }, { "epoch": 8.820709491850431, "grad_norm": 0.3373820185661316, "learning_rate": 7.681818184374938e-05, "loss": 0.0382, "step": 9200 }, { "epoch": 8.830297219558965, "grad_norm": 0.17047972977161407, "learning_rate": 7.676448681886715e-05, "loss": 0.0375, "step": 9210 }, { "epoch": 8.839884947267498, "grad_norm": 0.26559868454933167, "learning_rate": 7.671074849629746e-05, "loss": 0.0398, "step": 9220 }, { "epoch": 8.849472674976031, "grad_norm": 0.30938103795051575, "learning_rate": 7.665696696297439e-05, "loss": 0.0437, "step": 9230 }, { "epoch": 8.859060402684564, "grad_norm": 0.47756102681159973, "learning_rate": 7.660314230590187e-05, "loss": 0.0393, "step": 9240 }, { "epoch": 8.868648130393098, "grad_norm": 0.3115938901901245, "learning_rate": 7.654927461215362e-05, "loss": 0.0389, "step": 9250 }, { "epoch": 8.87823585810163, "grad_norm": 0.2378511130809784, "learning_rate": 7.649536396887296e-05, "loss": 0.0456, "step": 9260 }, { "epoch": 8.887823585810162, "grad_norm": 0.27728554606437683, "learning_rate": 7.644141046327271e-05, "loss": 0.0445, "step": 9270 }, { "epoch": 8.897411313518695, "grad_norm": 0.5434097051620483, "learning_rate": 7.638741418263505e-05, "loss": 0.0402, "step": 9280 }, { "epoch": 8.906999041227229, "grad_norm": 0.23838652670383453, "learning_rate": 7.633337521431127e-05, "loss": 0.038, "step": 9290 }, { "epoch": 8.916586768935762, "grad_norm": 0.2675243020057678, "learning_rate": 7.627929364572184e-05, "loss": 0.0409, "step": 9300 }, { "epoch": 8.926174496644295, "grad_norm": 0.36112427711486816, "learning_rate": 7.622516956435604e-05, "loss": 0.038, "step": 9310 }, { "epoch": 8.935762224352828, "grad_norm": 0.40189293026924133, "learning_rate": 7.617100305777199e-05, "loss": 0.0349, "step": 9320 }, { "epoch": 8.945349952061362, "grad_norm": 0.32217565178871155, "learning_rate": 7.611679421359639e-05, "loss": 0.0414, "step": 9330 }, { "epoch": 8.954937679769895, "grad_norm": 0.37468934059143066, "learning_rate": 7.60625431195245e-05, "loss": 0.0419, "step": 9340 }, { "epoch": 8.964525407478428, "grad_norm": 0.25082099437713623, "learning_rate": 7.600824986331989e-05, "loss": 0.0361, "step": 9350 }, { "epoch": 8.974113135186961, "grad_norm": 0.3598342835903168, "learning_rate": 7.595391453281431e-05, "loss": 0.034, "step": 9360 }, { "epoch": 8.983700862895494, "grad_norm": 0.3254631459712982, "learning_rate": 7.589953721590764e-05, "loss": 0.0482, "step": 9370 }, { "epoch": 8.993288590604028, "grad_norm": 0.3480936586856842, "learning_rate": 7.584511800056759e-05, "loss": 0.0359, "step": 9380 }, { "epoch": 9.002876318312559, "grad_norm": 0.321119099855423, "learning_rate": 7.579065697482974e-05, "loss": 0.0397, "step": 9390 }, { "epoch": 9.012464046021092, "grad_norm": 0.2790512144565582, "learning_rate": 7.573615422679726e-05, "loss": 0.0341, "step": 9400 }, { "epoch": 9.022051773729626, "grad_norm": 0.6163461208343506, "learning_rate": 7.568160984464083e-05, "loss": 0.0361, "step": 9410 }, { "epoch": 9.031639501438159, "grad_norm": 0.35653308033943176, "learning_rate": 7.56270239165985e-05, "loss": 0.0392, "step": 9420 }, { "epoch": 9.041227229146692, "grad_norm": 0.2938978970050812, "learning_rate": 7.55723965309755e-05, "loss": 0.0326, "step": 9430 }, { "epoch": 9.050814956855225, "grad_norm": 0.26529833674430847, "learning_rate": 7.551772777614412e-05, "loss": 0.0454, "step": 9440 }, { "epoch": 9.060402684563758, "grad_norm": 0.351085364818573, "learning_rate": 7.54630177405436e-05, "loss": 0.0467, "step": 9450 }, { "epoch": 9.069990412272292, "grad_norm": 0.23490998148918152, "learning_rate": 7.540826651267999e-05, "loss": 0.0405, "step": 9460 }, { "epoch": 9.079578139980825, "grad_norm": 0.3685658276081085, "learning_rate": 7.535347418112588e-05, "loss": 0.0372, "step": 9470 }, { "epoch": 9.089165867689358, "grad_norm": 0.36048129200935364, "learning_rate": 7.529864083452046e-05, "loss": 0.0378, "step": 9480 }, { "epoch": 9.098753595397891, "grad_norm": 0.3054652810096741, "learning_rate": 7.52437665615692e-05, "loss": 0.0447, "step": 9490 }, { "epoch": 9.108341323106425, "grad_norm": 0.2997536063194275, "learning_rate": 7.518885145104381e-05, "loss": 0.038, "step": 9500 }, { "epoch": 9.117929050814958, "grad_norm": 0.5517327189445496, "learning_rate": 7.513389559178209e-05, "loss": 0.0472, "step": 9510 }, { "epoch": 9.12751677852349, "grad_norm": 0.30378520488739014, "learning_rate": 7.507889907268769e-05, "loss": 0.0355, "step": 9520 }, { "epoch": 9.137104506232022, "grad_norm": 0.46029695868492126, "learning_rate": 7.50238619827301e-05, "loss": 0.0358, "step": 9530 }, { "epoch": 9.146692233940556, "grad_norm": 0.30406633019447327, "learning_rate": 7.496878441094439e-05, "loss": 0.0397, "step": 9540 }, { "epoch": 9.156279961649089, "grad_norm": 0.4107452929019928, "learning_rate": 7.491366644643118e-05, "loss": 0.043, "step": 9550 }, { "epoch": 9.165867689357622, "grad_norm": 0.6011910438537598, "learning_rate": 7.485850817835639e-05, "loss": 0.0459, "step": 9560 }, { "epoch": 9.175455417066155, "grad_norm": 0.42862173914909363, "learning_rate": 7.480330969595114e-05, "loss": 0.0392, "step": 9570 }, { "epoch": 9.185043144774689, "grad_norm": 0.3231380879878998, "learning_rate": 7.474807108851163e-05, "loss": 0.0379, "step": 9580 }, { "epoch": 9.194630872483222, "grad_norm": 0.31742027401924133, "learning_rate": 7.469279244539897e-05, "loss": 0.0398, "step": 9590 }, { "epoch": 9.204218600191755, "grad_norm": 0.34327855706214905, "learning_rate": 7.463747385603899e-05, "loss": 0.0365, "step": 9600 }, { "epoch": 9.213806327900288, "grad_norm": 0.40726932883262634, "learning_rate": 7.458211540992222e-05, "loss": 0.0421, "step": 9610 }, { "epoch": 9.223394055608821, "grad_norm": 0.28824758529663086, "learning_rate": 7.452671719660359e-05, "loss": 0.0392, "step": 9620 }, { "epoch": 9.232981783317355, "grad_norm": 0.4298984408378601, "learning_rate": 7.447127930570241e-05, "loss": 0.0396, "step": 9630 }, { "epoch": 9.242569511025886, "grad_norm": 0.3513946831226349, "learning_rate": 7.441580182690218e-05, "loss": 0.0344, "step": 9640 }, { "epoch": 9.25215723873442, "grad_norm": 0.31864580512046814, "learning_rate": 7.436028484995043e-05, "loss": 0.0352, "step": 9650 }, { "epoch": 9.261744966442953, "grad_norm": 0.42778101563453674, "learning_rate": 7.430472846465856e-05, "loss": 0.0345, "step": 9660 }, { "epoch": 9.271332694151486, "grad_norm": 0.25153082609176636, "learning_rate": 7.424913276090176e-05, "loss": 0.0376, "step": 9670 }, { "epoch": 9.280920421860019, "grad_norm": 0.30971595644950867, "learning_rate": 7.419349782861882e-05, "loss": 0.0402, "step": 9680 }, { "epoch": 9.290508149568552, "grad_norm": 0.5045586228370667, "learning_rate": 7.413782375781198e-05, "loss": 0.0321, "step": 9690 }, { "epoch": 9.300095877277085, "grad_norm": 0.26688501238822937, "learning_rate": 7.40821106385468e-05, "loss": 0.0405, "step": 9700 }, { "epoch": 9.309683604985619, "grad_norm": 0.3186158537864685, "learning_rate": 7.402635856095202e-05, "loss": 0.039, "step": 9710 }, { "epoch": 9.319271332694152, "grad_norm": 0.23956236243247986, "learning_rate": 7.397056761521936e-05, "loss": 0.0385, "step": 9720 }, { "epoch": 9.328859060402685, "grad_norm": 0.35403645038604736, "learning_rate": 7.391473789160352e-05, "loss": 0.037, "step": 9730 }, { "epoch": 9.338446788111218, "grad_norm": 0.30190348625183105, "learning_rate": 7.38588694804218e-05, "loss": 0.0402, "step": 9740 }, { "epoch": 9.348034515819752, "grad_norm": 0.45342007279396057, "learning_rate": 7.380296247205417e-05, "loss": 0.0385, "step": 9750 }, { "epoch": 9.357622243528283, "grad_norm": 0.355342835187912, "learning_rate": 7.374701695694304e-05, "loss": 0.0375, "step": 9760 }, { "epoch": 9.367209971236816, "grad_norm": 0.3231160640716553, "learning_rate": 7.369103302559308e-05, "loss": 0.0353, "step": 9770 }, { "epoch": 9.37679769894535, "grad_norm": 0.4163530766963959, "learning_rate": 7.363501076857112e-05, "loss": 0.0381, "step": 9780 }, { "epoch": 9.386385426653883, "grad_norm": 0.24439620971679688, "learning_rate": 7.357895027650598e-05, "loss": 0.0347, "step": 9790 }, { "epoch": 9.395973154362416, "grad_norm": 0.3847917318344116, "learning_rate": 7.352285164008838e-05, "loss": 0.0331, "step": 9800 }, { "epoch": 9.405560882070949, "grad_norm": 0.3097192049026489, "learning_rate": 7.346671495007068e-05, "loss": 0.0405, "step": 9810 }, { "epoch": 9.415148609779482, "grad_norm": 0.21436016261577606, "learning_rate": 7.341054029726685e-05, "loss": 0.0375, "step": 9820 }, { "epoch": 9.424736337488016, "grad_norm": 0.41024893522262573, "learning_rate": 7.335432777255225e-05, "loss": 0.0463, "step": 9830 }, { "epoch": 9.434324065196549, "grad_norm": 0.299177348613739, "learning_rate": 7.329807746686352e-05, "loss": 0.0418, "step": 9840 }, { "epoch": 9.443911792905082, "grad_norm": 0.3526586890220642, "learning_rate": 7.324178947119842e-05, "loss": 0.0383, "step": 9850 }, { "epoch": 9.453499520613615, "grad_norm": 0.277421772480011, "learning_rate": 7.318546387661564e-05, "loss": 0.0512, "step": 9860 }, { "epoch": 9.463087248322148, "grad_norm": 0.24628502130508423, "learning_rate": 7.312910077423477e-05, "loss": 0.0367, "step": 9870 }, { "epoch": 9.47267497603068, "grad_norm": 0.5568169951438904, "learning_rate": 7.307270025523601e-05, "loss": 0.0396, "step": 9880 }, { "epoch": 9.482262703739213, "grad_norm": 0.30765804648399353, "learning_rate": 7.301626241086012e-05, "loss": 0.043, "step": 9890 }, { "epoch": 9.491850431447746, "grad_norm": 0.32168257236480713, "learning_rate": 7.295978733240827e-05, "loss": 0.0385, "step": 9900 }, { "epoch": 9.50143815915628, "grad_norm": 0.46826574206352234, "learning_rate": 7.29032751112418e-05, "loss": 0.0375, "step": 9910 }, { "epoch": 9.511025886864813, "grad_norm": 0.19892945885658264, "learning_rate": 7.284672583878219e-05, "loss": 0.0432, "step": 9920 }, { "epoch": 9.520613614573346, "grad_norm": 0.21767093241214752, "learning_rate": 7.279013960651083e-05, "loss": 0.0331, "step": 9930 }, { "epoch": 9.53020134228188, "grad_norm": 0.32079631090164185, "learning_rate": 7.273351650596889e-05, "loss": 0.0355, "step": 9940 }, { "epoch": 9.539789069990412, "grad_norm": 0.40111902356147766, "learning_rate": 7.267685662875725e-05, "loss": 0.0412, "step": 9950 }, { "epoch": 9.549376797698946, "grad_norm": 0.58073490858078, "learning_rate": 7.26201600665362e-05, "loss": 0.0384, "step": 9960 }, { "epoch": 9.558964525407479, "grad_norm": 0.20928962528705597, "learning_rate": 7.256342691102545e-05, "loss": 0.0334, "step": 9970 }, { "epoch": 9.568552253116012, "grad_norm": 0.2809102535247803, "learning_rate": 7.250665725400385e-05, "loss": 0.0421, "step": 9980 }, { "epoch": 9.578139980824545, "grad_norm": 0.2836989164352417, "learning_rate": 7.244985118730933e-05, "loss": 0.0394, "step": 9990 }, { "epoch": 9.587727708533077, "grad_norm": 0.21493583917617798, "learning_rate": 7.239300880283869e-05, "loss": 0.0438, "step": 10000 }, { "epoch": 9.59731543624161, "grad_norm": 0.3654724955558777, "learning_rate": 7.233613019254755e-05, "loss": 0.0398, "step": 10010 }, { "epoch": 9.606903163950143, "grad_norm": 0.24901500344276428, "learning_rate": 7.227921544845003e-05, "loss": 0.0393, "step": 10020 }, { "epoch": 9.616490891658676, "grad_norm": 0.21980980038642883, "learning_rate": 7.222226466261883e-05, "loss": 0.0386, "step": 10030 }, { "epoch": 9.62607861936721, "grad_norm": 0.18104171752929688, "learning_rate": 7.216527792718484e-05, "loss": 0.0378, "step": 10040 }, { "epoch": 9.635666347075743, "grad_norm": 0.33641284704208374, "learning_rate": 7.210825533433719e-05, "loss": 0.0418, "step": 10050 }, { "epoch": 9.645254074784276, "grad_norm": 0.2590009570121765, "learning_rate": 7.205119697632297e-05, "loss": 0.0327, "step": 10060 }, { "epoch": 9.65484180249281, "grad_norm": 0.40689241886138916, "learning_rate": 7.199410294544713e-05, "loss": 0.0542, "step": 10070 }, { "epoch": 9.664429530201343, "grad_norm": 0.3199746310710907, "learning_rate": 7.193697333407234e-05, "loss": 0.0363, "step": 10080 }, { "epoch": 9.674017257909876, "grad_norm": 0.49059638381004333, "learning_rate": 7.187980823461887e-05, "loss": 0.0377, "step": 10090 }, { "epoch": 9.683604985618409, "grad_norm": 0.28129157423973083, "learning_rate": 7.182260773956433e-05, "loss": 0.0382, "step": 10100 }, { "epoch": 9.693192713326942, "grad_norm": 0.3830220401287079, "learning_rate": 7.176537194144362e-05, "loss": 0.0349, "step": 10110 }, { "epoch": 9.702780441035475, "grad_norm": 0.3658897578716278, "learning_rate": 7.170810093284876e-05, "loss": 0.0359, "step": 10120 }, { "epoch": 9.712368168744007, "grad_norm": 0.31416580080986023, "learning_rate": 7.165079480642873e-05, "loss": 0.0343, "step": 10130 }, { "epoch": 9.72195589645254, "grad_norm": 0.24944183230400085, "learning_rate": 7.159345365488929e-05, "loss": 0.0332, "step": 10140 }, { "epoch": 9.731543624161073, "grad_norm": 0.2953116297721863, "learning_rate": 7.153607757099292e-05, "loss": 0.0354, "step": 10150 }, { "epoch": 9.741131351869607, "grad_norm": 0.4103414714336395, "learning_rate": 7.147866664755856e-05, "loss": 0.036, "step": 10160 }, { "epoch": 9.75071907957814, "grad_norm": 0.28444069623947144, "learning_rate": 7.142122097746153e-05, "loss": 0.0389, "step": 10170 }, { "epoch": 9.760306807286673, "grad_norm": 0.2912525534629822, "learning_rate": 7.136374065363334e-05, "loss": 0.0345, "step": 10180 }, { "epoch": 9.769894534995206, "grad_norm": 0.25480780005455017, "learning_rate": 7.13062257690616e-05, "loss": 0.0355, "step": 10190 }, { "epoch": 9.77948226270374, "grad_norm": 0.305532306432724, "learning_rate": 7.124867641678981e-05, "loss": 0.0376, "step": 10200 }, { "epoch": 9.789069990412273, "grad_norm": 0.32806769013404846, "learning_rate": 7.119109268991723e-05, "loss": 0.0357, "step": 10210 }, { "epoch": 9.798657718120806, "grad_norm": 0.23281969130039215, "learning_rate": 7.113347468159871e-05, "loss": 0.0332, "step": 10220 }, { "epoch": 9.808245445829339, "grad_norm": 0.3487169146537781, "learning_rate": 7.107582248504458e-05, "loss": 0.0397, "step": 10230 }, { "epoch": 9.817833173537872, "grad_norm": 0.3124096989631653, "learning_rate": 7.101813619352048e-05, "loss": 0.0391, "step": 10240 }, { "epoch": 9.827420901246404, "grad_norm": 0.39542460441589355, "learning_rate": 7.09604159003472e-05, "loss": 0.0361, "step": 10250 }, { "epoch": 9.837008628954937, "grad_norm": 0.3044220209121704, "learning_rate": 7.090266169890051e-05, "loss": 0.0382, "step": 10260 }, { "epoch": 9.84659635666347, "grad_norm": 0.3320329189300537, "learning_rate": 7.08448736826111e-05, "loss": 0.043, "step": 10270 }, { "epoch": 9.856184084372003, "grad_norm": 0.25773710012435913, "learning_rate": 7.078705194496429e-05, "loss": 0.0363, "step": 10280 }, { "epoch": 9.865771812080537, "grad_norm": 0.4256868064403534, "learning_rate": 7.07291965795e-05, "loss": 0.0388, "step": 10290 }, { "epoch": 9.87535953978907, "grad_norm": 0.48361513018608093, "learning_rate": 7.067130767981252e-05, "loss": 0.0387, "step": 10300 }, { "epoch": 9.884947267497603, "grad_norm": 0.3017280697822571, "learning_rate": 7.061338533955043e-05, "loss": 0.0334, "step": 10310 }, { "epoch": 9.894534995206136, "grad_norm": 0.3394894599914551, "learning_rate": 7.055542965241634e-05, "loss": 0.0402, "step": 10320 }, { "epoch": 9.90412272291467, "grad_norm": 0.3364240527153015, "learning_rate": 7.049744071216687e-05, "loss": 0.0332, "step": 10330 }, { "epoch": 9.913710450623203, "grad_norm": 0.2847566604614258, "learning_rate": 7.043941861261242e-05, "loss": 0.0372, "step": 10340 }, { "epoch": 9.923298178331736, "grad_norm": 0.6304646730422974, "learning_rate": 7.038136344761703e-05, "loss": 0.0338, "step": 10350 }, { "epoch": 9.93288590604027, "grad_norm": 0.37469327449798584, "learning_rate": 7.03232753110982e-05, "loss": 0.0377, "step": 10360 }, { "epoch": 9.9424736337488, "grad_norm": 0.3126644790172577, "learning_rate": 7.026515429702682e-05, "loss": 0.0313, "step": 10370 }, { "epoch": 9.952061361457334, "grad_norm": 0.22097988426685333, "learning_rate": 7.020700049942694e-05, "loss": 0.037, "step": 10380 }, { "epoch": 9.961649089165867, "grad_norm": 0.2554224729537964, "learning_rate": 7.014881401237563e-05, "loss": 0.0338, "step": 10390 }, { "epoch": 9.9712368168744, "grad_norm": 0.41450753808021545, "learning_rate": 7.009059493000285e-05, "loss": 0.0373, "step": 10400 }, { "epoch": 9.980824544582934, "grad_norm": 0.2980963885784149, "learning_rate": 7.003234334649133e-05, "loss": 0.0357, "step": 10410 }, { "epoch": 9.990412272291467, "grad_norm": 0.34623420238494873, "learning_rate": 6.997405935607635e-05, "loss": 0.0393, "step": 10420 }, { "epoch": 10.0, "grad_norm": 0.31464067101478577, "learning_rate": 6.991574305304558e-05, "loss": 0.0373, "step": 10430 }, { "epoch": 10.009587727708533, "grad_norm": 0.3440396785736084, "learning_rate": 6.985739453173903e-05, "loss": 0.0352, "step": 10440 }, { "epoch": 10.019175455417066, "grad_norm": 0.3453032374382019, "learning_rate": 6.979901388654879e-05, "loss": 0.0384, "step": 10450 }, { "epoch": 10.0287631831256, "grad_norm": 0.2174844592809677, "learning_rate": 6.97406012119189e-05, "loss": 0.033, "step": 10460 }, { "epoch": 10.038350910834133, "grad_norm": 0.34027159214019775, "learning_rate": 6.968215660234527e-05, "loss": 0.0439, "step": 10470 }, { "epoch": 10.047938638542666, "grad_norm": 0.29484447836875916, "learning_rate": 6.962368015237543e-05, "loss": 0.0406, "step": 10480 }, { "epoch": 10.0575263662512, "grad_norm": 0.2926745116710663, "learning_rate": 6.956517195660842e-05, "loss": 0.0366, "step": 10490 }, { "epoch": 10.06711409395973, "grad_norm": 0.25546324253082275, "learning_rate": 6.950663210969466e-05, "loss": 0.0387, "step": 10500 }, { "epoch": 10.076701821668264, "grad_norm": 0.19871650636196136, "learning_rate": 6.944806070633578e-05, "loss": 0.0408, "step": 10510 }, { "epoch": 10.086289549376797, "grad_norm": 0.432463139295578, "learning_rate": 6.93894578412844e-05, "loss": 0.0415, "step": 10520 }, { "epoch": 10.09587727708533, "grad_norm": 0.3453048765659332, "learning_rate": 6.933082360934408e-05, "loss": 0.0359, "step": 10530 }, { "epoch": 10.105465004793864, "grad_norm": 0.28228339552879333, "learning_rate": 6.927215810536915e-05, "loss": 0.0363, "step": 10540 }, { "epoch": 10.115052732502397, "grad_norm": 0.2979227304458618, "learning_rate": 6.921346142426448e-05, "loss": 0.0349, "step": 10550 }, { "epoch": 10.12464046021093, "grad_norm": 0.23034702241420746, "learning_rate": 6.915473366098541e-05, "loss": 0.0337, "step": 10560 }, { "epoch": 10.134228187919463, "grad_norm": 0.30385303497314453, "learning_rate": 6.909597491053751e-05, "loss": 0.0358, "step": 10570 }, { "epoch": 10.143815915627997, "grad_norm": 0.34254565834999084, "learning_rate": 6.903718526797658e-05, "loss": 0.0383, "step": 10580 }, { "epoch": 10.15340364333653, "grad_norm": 0.3243492841720581, "learning_rate": 6.897836482840828e-05, "loss": 0.0388, "step": 10590 }, { "epoch": 10.162991371045063, "grad_norm": 0.24607200920581818, "learning_rate": 6.891951368698815e-05, "loss": 0.0359, "step": 10600 }, { "epoch": 10.172579098753596, "grad_norm": 0.2082456648349762, "learning_rate": 6.88606319389214e-05, "loss": 0.0347, "step": 10610 }, { "epoch": 10.182166826462128, "grad_norm": 0.23741546273231506, "learning_rate": 6.880171967946273e-05, "loss": 0.0335, "step": 10620 }, { "epoch": 10.191754554170661, "grad_norm": 0.7699126601219177, "learning_rate": 6.874277700391623e-05, "loss": 0.0402, "step": 10630 }, { "epoch": 10.201342281879194, "grad_norm": 0.23752135038375854, "learning_rate": 6.868380400763516e-05, "loss": 0.0378, "step": 10640 }, { "epoch": 10.210930009587727, "grad_norm": 0.2777273952960968, "learning_rate": 6.86248007860219e-05, "loss": 0.0341, "step": 10650 }, { "epoch": 10.22051773729626, "grad_norm": 0.33273088932037354, "learning_rate": 6.856576743452761e-05, "loss": 0.0379, "step": 10660 }, { "epoch": 10.230105465004794, "grad_norm": 0.22550059854984283, "learning_rate": 6.850670404865227e-05, "loss": 0.0323, "step": 10670 }, { "epoch": 10.239693192713327, "grad_norm": 0.22732175886631012, "learning_rate": 6.844761072394446e-05, "loss": 0.0335, "step": 10680 }, { "epoch": 10.24928092042186, "grad_norm": 0.1689731478691101, "learning_rate": 6.838848755600114e-05, "loss": 0.0368, "step": 10690 }, { "epoch": 10.258868648130393, "grad_norm": 0.20502756536006927, "learning_rate": 6.83293346404676e-05, "loss": 0.041, "step": 10700 }, { "epoch": 10.268456375838927, "grad_norm": 0.2094731330871582, "learning_rate": 6.827015207303722e-05, "loss": 0.0383, "step": 10710 }, { "epoch": 10.27804410354746, "grad_norm": 0.3424762487411499, "learning_rate": 6.821093994945135e-05, "loss": 0.0435, "step": 10720 }, { "epoch": 10.287631831255993, "grad_norm": 0.3471381366252899, "learning_rate": 6.815169836549916e-05, "loss": 0.04, "step": 10730 }, { "epoch": 10.297219558964525, "grad_norm": 0.2713249623775482, "learning_rate": 6.80924274170175e-05, "loss": 0.0313, "step": 10740 }, { "epoch": 10.306807286673058, "grad_norm": 0.24895431101322174, "learning_rate": 6.803312719989068e-05, "loss": 0.0371, "step": 10750 }, { "epoch": 10.316395014381591, "grad_norm": 0.3460264205932617, "learning_rate": 6.797379781005039e-05, "loss": 0.0312, "step": 10760 }, { "epoch": 10.325982742090124, "grad_norm": 0.36002618074417114, "learning_rate": 6.791443934347553e-05, "loss": 0.0443, "step": 10770 }, { "epoch": 10.335570469798657, "grad_norm": 0.46812546253204346, "learning_rate": 6.785505189619197e-05, "loss": 0.0417, "step": 10780 }, { "epoch": 10.34515819750719, "grad_norm": 0.3170137107372284, "learning_rate": 6.779563556427255e-05, "loss": 0.0413, "step": 10790 }, { "epoch": 10.354745925215724, "grad_norm": 0.27735644578933716, "learning_rate": 6.773619044383677e-05, "loss": 0.0411, "step": 10800 }, { "epoch": 10.364333652924257, "grad_norm": 0.2342735081911087, "learning_rate": 6.767671663105075e-05, "loss": 0.0327, "step": 10810 }, { "epoch": 10.37392138063279, "grad_norm": 0.31249138712882996, "learning_rate": 6.761721422212696e-05, "loss": 0.042, "step": 10820 }, { "epoch": 10.383509108341324, "grad_norm": 0.26663604378700256, "learning_rate": 6.755768331332424e-05, "loss": 0.0359, "step": 10830 }, { "epoch": 10.393096836049857, "grad_norm": 0.30388474464416504, "learning_rate": 6.749812400094742e-05, "loss": 0.0443, "step": 10840 }, { "epoch": 10.40268456375839, "grad_norm": 0.3067167401313782, "learning_rate": 6.743853638134734e-05, "loss": 0.0424, "step": 10850 }, { "epoch": 10.412272291466923, "grad_norm": 0.3138778805732727, "learning_rate": 6.737892055092064e-05, "loss": 0.0313, "step": 10860 }, { "epoch": 10.421860019175455, "grad_norm": 0.28191816806793213, "learning_rate": 6.731927660610954e-05, "loss": 0.0358, "step": 10870 }, { "epoch": 10.431447746883988, "grad_norm": 0.37692686915397644, "learning_rate": 6.725960464340182e-05, "loss": 0.0317, "step": 10880 }, { "epoch": 10.441035474592521, "grad_norm": 0.26821082830429077, "learning_rate": 6.719990475933053e-05, "loss": 0.0319, "step": 10890 }, { "epoch": 10.450623202301054, "grad_norm": 0.46883681416511536, "learning_rate": 6.71401770504739e-05, "loss": 0.0376, "step": 10900 }, { "epoch": 10.460210930009588, "grad_norm": 0.8076095581054688, "learning_rate": 6.708042161345521e-05, "loss": 0.0355, "step": 10910 }, { "epoch": 10.46979865771812, "grad_norm": 0.29810166358947754, "learning_rate": 6.702063854494254e-05, "loss": 0.0269, "step": 10920 }, { "epoch": 10.479386385426654, "grad_norm": 0.3273125886917114, "learning_rate": 6.696082794164868e-05, "loss": 0.0386, "step": 10930 }, { "epoch": 10.488974113135187, "grad_norm": 0.4401116371154785, "learning_rate": 6.690098990033102e-05, "loss": 0.0298, "step": 10940 }, { "epoch": 10.49856184084372, "grad_norm": 0.2832469642162323, "learning_rate": 6.684112451779127e-05, "loss": 0.0397, "step": 10950 }, { "epoch": 10.508149568552254, "grad_norm": 0.3664191961288452, "learning_rate": 6.67812318908754e-05, "loss": 0.0382, "step": 10960 }, { "epoch": 10.517737296260787, "grad_norm": 0.32039886713027954, "learning_rate": 6.672131211647344e-05, "loss": 0.0332, "step": 10970 }, { "epoch": 10.527325023969318, "grad_norm": 0.31571629643440247, "learning_rate": 6.666136529151938e-05, "loss": 0.0358, "step": 10980 }, { "epoch": 10.536912751677852, "grad_norm": 0.30983471870422363, "learning_rate": 6.660139151299093e-05, "loss": 0.0402, "step": 10990 }, { "epoch": 10.546500479386385, "grad_norm": 0.35966020822525024, "learning_rate": 6.65413908779094e-05, "loss": 0.0418, "step": 11000 }, { "epoch": 10.556088207094918, "grad_norm": 0.3868638277053833, "learning_rate": 6.648136348333954e-05, "loss": 0.0428, "step": 11010 }, { "epoch": 10.565675934803451, "grad_norm": 0.20595276355743408, "learning_rate": 6.642130942638945e-05, "loss": 0.0359, "step": 11020 }, { "epoch": 10.575263662511984, "grad_norm": 0.6492677927017212, "learning_rate": 6.636122880421032e-05, "loss": 0.0345, "step": 11030 }, { "epoch": 10.584851390220518, "grad_norm": 0.22226084768772125, "learning_rate": 6.630112171399628e-05, "loss": 0.0322, "step": 11040 }, { "epoch": 10.594439117929051, "grad_norm": 0.27300918102264404, "learning_rate": 6.624098825298436e-05, "loss": 0.0345, "step": 11050 }, { "epoch": 10.604026845637584, "grad_norm": 0.2507658898830414, "learning_rate": 6.618082851845417e-05, "loss": 0.0397, "step": 11060 }, { "epoch": 10.613614573346117, "grad_norm": 0.22898472845554352, "learning_rate": 6.612064260772788e-05, "loss": 0.0312, "step": 11070 }, { "epoch": 10.62320230105465, "grad_norm": 0.2579527199268341, "learning_rate": 6.606043061816998e-05, "loss": 0.0319, "step": 11080 }, { "epoch": 10.632790028763184, "grad_norm": 0.3027057945728302, "learning_rate": 6.600019264718713e-05, "loss": 0.0425, "step": 11090 }, { "epoch": 10.642377756471717, "grad_norm": 0.4396612048149109, "learning_rate": 6.593992879222808e-05, "loss": 0.0347, "step": 11100 }, { "epoch": 10.651965484180248, "grad_norm": 0.3383849561214447, "learning_rate": 6.587963915078342e-05, "loss": 0.0427, "step": 11110 }, { "epoch": 10.661553211888782, "grad_norm": 0.39786002039909363, "learning_rate": 6.581932382038542e-05, "loss": 0.0325, "step": 11120 }, { "epoch": 10.671140939597315, "grad_norm": 0.29470136761665344, "learning_rate": 6.575898289860798e-05, "loss": 0.0327, "step": 11130 }, { "epoch": 10.680728667305848, "grad_norm": 0.33293044567108154, "learning_rate": 6.569861648306632e-05, "loss": 0.0372, "step": 11140 }, { "epoch": 10.690316395014381, "grad_norm": 0.2922416627407074, "learning_rate": 6.563822467141697e-05, "loss": 0.0371, "step": 11150 }, { "epoch": 10.699904122722915, "grad_norm": 0.37106814980506897, "learning_rate": 6.557780756135749e-05, "loss": 0.0358, "step": 11160 }, { "epoch": 10.709491850431448, "grad_norm": 0.2364514172077179, "learning_rate": 6.551736525062645e-05, "loss": 0.038, "step": 11170 }, { "epoch": 10.719079578139981, "grad_norm": 0.327987939119339, "learning_rate": 6.545689783700307e-05, "loss": 0.0399, "step": 11180 }, { "epoch": 10.728667305848514, "grad_norm": 0.25306403636932373, "learning_rate": 6.539640541830728e-05, "loss": 0.0319, "step": 11190 }, { "epoch": 10.738255033557047, "grad_norm": 0.301178902387619, "learning_rate": 6.533588809239941e-05, "loss": 0.0408, "step": 11200 }, { "epoch": 10.74784276126558, "grad_norm": 0.2662244439125061, "learning_rate": 6.527534595718007e-05, "loss": 0.0381, "step": 11210 }, { "epoch": 10.757430488974114, "grad_norm": 0.3115426301956177, "learning_rate": 6.521477911059008e-05, "loss": 0.0368, "step": 11220 }, { "epoch": 10.767018216682647, "grad_norm": 0.4020492136478424, "learning_rate": 6.515418765061015e-05, "loss": 0.0346, "step": 11230 }, { "epoch": 10.776605944391179, "grad_norm": 0.49596187472343445, "learning_rate": 6.509357167526084e-05, "loss": 0.0376, "step": 11240 }, { "epoch": 10.786193672099712, "grad_norm": 0.33604878187179565, "learning_rate": 6.50329312826024e-05, "loss": 0.0395, "step": 11250 }, { "epoch": 10.795781399808245, "grad_norm": 0.2914005219936371, "learning_rate": 6.497226657073454e-05, "loss": 0.0371, "step": 11260 }, { "epoch": 10.805369127516778, "grad_norm": 0.34624671936035156, "learning_rate": 6.491157763779632e-05, "loss": 0.0281, "step": 11270 }, { "epoch": 10.814956855225311, "grad_norm": 0.30700233578681946, "learning_rate": 6.485086458196602e-05, "loss": 0.0331, "step": 11280 }, { "epoch": 10.824544582933845, "grad_norm": 0.3025294244289398, "learning_rate": 6.479012750146087e-05, "loss": 0.0341, "step": 11290 }, { "epoch": 10.834132310642378, "grad_norm": 0.23997899889945984, "learning_rate": 6.472936649453701e-05, "loss": 0.0383, "step": 11300 }, { "epoch": 10.843720038350911, "grad_norm": 0.24672740697860718, "learning_rate": 6.466858165948933e-05, "loss": 0.0313, "step": 11310 }, { "epoch": 10.853307766059444, "grad_norm": 0.2887534201145172, "learning_rate": 6.460777309465118e-05, "loss": 0.039, "step": 11320 }, { "epoch": 10.862895493767978, "grad_norm": 0.24179044365882874, "learning_rate": 6.454694089839436e-05, "loss": 0.032, "step": 11330 }, { "epoch": 10.87248322147651, "grad_norm": 0.47962746024131775, "learning_rate": 6.448608516912888e-05, "loss": 0.0368, "step": 11340 }, { "epoch": 10.882070949185042, "grad_norm": 0.26336967945098877, "learning_rate": 6.44252060053028e-05, "loss": 0.045, "step": 11350 }, { "epoch": 10.891658676893575, "grad_norm": 0.2424604296684265, "learning_rate": 6.436430350540215e-05, "loss": 0.0321, "step": 11360 }, { "epoch": 10.901246404602109, "grad_norm": 0.25244084000587463, "learning_rate": 6.430337776795064e-05, "loss": 0.0346, "step": 11370 }, { "epoch": 10.910834132310642, "grad_norm": 0.30204179883003235, "learning_rate": 6.42424288915096e-05, "loss": 0.0362, "step": 11380 }, { "epoch": 10.920421860019175, "grad_norm": 0.3095405697822571, "learning_rate": 6.418145697467784e-05, "loss": 0.036, "step": 11390 }, { "epoch": 10.930009587727708, "grad_norm": 0.22773784399032593, "learning_rate": 6.412046211609134e-05, "loss": 0.0399, "step": 11400 }, { "epoch": 10.939597315436242, "grad_norm": 0.3239744007587433, "learning_rate": 6.40594444144233e-05, "loss": 0.0374, "step": 11410 }, { "epoch": 10.949185043144775, "grad_norm": 0.28157058358192444, "learning_rate": 6.399840396838382e-05, "loss": 0.0352, "step": 11420 }, { "epoch": 10.958772770853308, "grad_norm": 0.31856581568717957, "learning_rate": 6.393734087671979e-05, "loss": 0.0379, "step": 11430 }, { "epoch": 10.968360498561841, "grad_norm": 0.2937244772911072, "learning_rate": 6.387625523821474e-05, "loss": 0.0322, "step": 11440 }, { "epoch": 10.977948226270374, "grad_norm": 0.2260034680366516, "learning_rate": 6.38151471516887e-05, "loss": 0.0319, "step": 11450 }, { "epoch": 10.987535953978908, "grad_norm": 0.42635470628738403, "learning_rate": 6.375401671599798e-05, "loss": 0.0383, "step": 11460 }, { "epoch": 10.997123681687441, "grad_norm": 0.288327693939209, "learning_rate": 6.369286403003509e-05, "loss": 0.0406, "step": 11470 }, { "epoch": 11.006711409395972, "grad_norm": 0.2826128900051117, "learning_rate": 6.363168919272846e-05, "loss": 0.0356, "step": 11480 }, { "epoch": 11.016299137104506, "grad_norm": 0.2275691032409668, "learning_rate": 6.357049230304244e-05, "loss": 0.0336, "step": 11490 }, { "epoch": 11.025886864813039, "grad_norm": 0.24633708596229553, "learning_rate": 6.3509273459977e-05, "loss": 0.0353, "step": 11500 }, { "epoch": 11.035474592521572, "grad_norm": 0.3283119201660156, "learning_rate": 6.344803276256764e-05, "loss": 0.0324, "step": 11510 }, { "epoch": 11.045062320230105, "grad_norm": 0.5711014270782471, "learning_rate": 6.338677030988521e-05, "loss": 0.033, "step": 11520 }, { "epoch": 11.054650047938638, "grad_norm": 0.3481939435005188, "learning_rate": 6.332548620103575e-05, "loss": 0.0398, "step": 11530 }, { "epoch": 11.064237775647172, "grad_norm": 0.24051983654499054, "learning_rate": 6.326418053516037e-05, "loss": 0.04, "step": 11540 }, { "epoch": 11.073825503355705, "grad_norm": 0.4249405264854431, "learning_rate": 6.320285341143501e-05, "loss": 0.0389, "step": 11550 }, { "epoch": 11.083413231064238, "grad_norm": 0.24299634993076324, "learning_rate": 6.314150492907034e-05, "loss": 0.0323, "step": 11560 }, { "epoch": 11.093000958772771, "grad_norm": 0.2705395817756653, "learning_rate": 6.308013518731157e-05, "loss": 0.0358, "step": 11570 }, { "epoch": 11.102588686481305, "grad_norm": 0.3055950105190277, "learning_rate": 6.301874428543833e-05, "loss": 0.0299, "step": 11580 }, { "epoch": 11.112176414189838, "grad_norm": 0.35363319516181946, "learning_rate": 6.295733232276447e-05, "loss": 0.0361, "step": 11590 }, { "epoch": 11.12176414189837, "grad_norm": 0.4558916985988617, "learning_rate": 6.28958993986379e-05, "loss": 0.0391, "step": 11600 }, { "epoch": 11.131351869606902, "grad_norm": 0.26662135124206543, "learning_rate": 6.283444561244042e-05, "loss": 0.0372, "step": 11610 }, { "epoch": 11.140939597315436, "grad_norm": 0.24726532399654388, "learning_rate": 6.27729710635876e-05, "loss": 0.0346, "step": 11620 }, { "epoch": 11.150527325023969, "grad_norm": 0.2278524488210678, "learning_rate": 6.271147585152866e-05, "loss": 0.0338, "step": 11630 }, { "epoch": 11.160115052732502, "grad_norm": 0.3538067042827606, "learning_rate": 6.264996007574615e-05, "loss": 0.0388, "step": 11640 }, { "epoch": 11.169702780441035, "grad_norm": 0.3667300045490265, "learning_rate": 6.258842383575591e-05, "loss": 0.0367, "step": 11650 }, { "epoch": 11.179290508149569, "grad_norm": 0.29877883195877075, "learning_rate": 6.252686723110696e-05, "loss": 0.0348, "step": 11660 }, { "epoch": 11.188878235858102, "grad_norm": 0.2846558392047882, "learning_rate": 6.246529036138116e-05, "loss": 0.0341, "step": 11670 }, { "epoch": 11.198465963566635, "grad_norm": 0.2631428837776184, "learning_rate": 6.24036933261932e-05, "loss": 0.0356, "step": 11680 }, { "epoch": 11.208053691275168, "grad_norm": 0.34309467673301697, "learning_rate": 6.23420762251904e-05, "loss": 0.0365, "step": 11690 }, { "epoch": 11.217641418983701, "grad_norm": 0.2427697777748108, "learning_rate": 6.228043915805254e-05, "loss": 0.0378, "step": 11700 }, { "epoch": 11.227229146692235, "grad_norm": 0.31478065252304077, "learning_rate": 6.221878222449169e-05, "loss": 0.0404, "step": 11710 }, { "epoch": 11.236816874400766, "grad_norm": 0.27574971318244934, "learning_rate": 6.215710552425206e-05, "loss": 0.0311, "step": 11720 }, { "epoch": 11.2464046021093, "grad_norm": 0.7589734792709351, "learning_rate": 6.209540915710985e-05, "loss": 0.0331, "step": 11730 }, { "epoch": 11.255992329817833, "grad_norm": 0.2826196551322937, "learning_rate": 6.203369322287306e-05, "loss": 0.04, "step": 11740 }, { "epoch": 11.265580057526366, "grad_norm": 0.6920874714851379, "learning_rate": 6.197195782138132e-05, "loss": 0.0367, "step": 11750 }, { "epoch": 11.275167785234899, "grad_norm": 0.29903581738471985, "learning_rate": 6.191020305250582e-05, "loss": 0.0385, "step": 11760 }, { "epoch": 11.284755512943432, "grad_norm": 0.2374860942363739, "learning_rate": 6.184842901614902e-05, "loss": 0.0349, "step": 11770 }, { "epoch": 11.294343240651965, "grad_norm": 0.44580623507499695, "learning_rate": 6.178663581224458e-05, "loss": 0.0333, "step": 11780 }, { "epoch": 11.303930968360499, "grad_norm": 0.2667308747768402, "learning_rate": 6.172482354075716e-05, "loss": 0.0359, "step": 11790 }, { "epoch": 11.313518696069032, "grad_norm": 0.21850627660751343, "learning_rate": 6.166299230168228e-05, "loss": 0.0381, "step": 11800 }, { "epoch": 11.323106423777565, "grad_norm": 0.27936065196990967, "learning_rate": 6.16011421950461e-05, "loss": 0.0371, "step": 11810 }, { "epoch": 11.332694151486098, "grad_norm": 0.3284420371055603, "learning_rate": 6.153927332090537e-05, "loss": 0.0373, "step": 11820 }, { "epoch": 11.342281879194632, "grad_norm": 0.2999724745750427, "learning_rate": 6.147738577934711e-05, "loss": 0.0376, "step": 11830 }, { "epoch": 11.351869606903165, "grad_norm": 0.27732089161872864, "learning_rate": 6.141547967048867e-05, "loss": 0.0281, "step": 11840 }, { "epoch": 11.361457334611696, "grad_norm": 0.22769756615161896, "learning_rate": 6.135355509447727e-05, "loss": 0.0407, "step": 11850 }, { "epoch": 11.37104506232023, "grad_norm": 0.2970350682735443, "learning_rate": 6.129161215149016e-05, "loss": 0.0355, "step": 11860 }, { "epoch": 11.380632790028763, "grad_norm": 0.319409042596817, "learning_rate": 6.122965094173424e-05, "loss": 0.0387, "step": 11870 }, { "epoch": 11.390220517737296, "grad_norm": 0.31056809425354004, "learning_rate": 6.116767156544592e-05, "loss": 0.0353, "step": 11880 }, { "epoch": 11.39980824544583, "grad_norm": 0.2925516366958618, "learning_rate": 6.110567412289106e-05, "loss": 0.0313, "step": 11890 }, { "epoch": 11.409395973154362, "grad_norm": 0.2066742330789566, "learning_rate": 6.10436587143647e-05, "loss": 0.031, "step": 11900 }, { "epoch": 11.418983700862896, "grad_norm": 0.2351049929857254, "learning_rate": 6.0981625440191e-05, "loss": 0.0384, "step": 11910 }, { "epoch": 11.428571428571429, "grad_norm": 0.2299109846353531, "learning_rate": 6.091957440072297e-05, "loss": 0.029, "step": 11920 }, { "epoch": 11.438159156279962, "grad_norm": 0.27398043870925903, "learning_rate": 6.0857505696342376e-05, "loss": 0.0334, "step": 11930 }, { "epoch": 11.447746883988495, "grad_norm": 0.2886539697647095, "learning_rate": 6.0795419427459564e-05, "loss": 0.0403, "step": 11940 }, { "epoch": 11.457334611697028, "grad_norm": 0.1952909678220749, "learning_rate": 6.0733315694513306e-05, "loss": 0.0342, "step": 11950 }, { "epoch": 11.466922339405562, "grad_norm": 0.3800734281539917, "learning_rate": 6.067119459797061e-05, "loss": 0.0345, "step": 11960 }, { "epoch": 11.476510067114093, "grad_norm": 0.2989748418331146, "learning_rate": 6.060905623832656e-05, "loss": 0.0397, "step": 11970 }, { "epoch": 11.486097794822626, "grad_norm": 0.410169392824173, "learning_rate": 6.0546900716104206e-05, "loss": 0.0343, "step": 11980 }, { "epoch": 11.49568552253116, "grad_norm": 0.3879852890968323, "learning_rate": 6.048472813185433e-05, "loss": 0.0328, "step": 11990 }, { "epoch": 11.505273250239693, "grad_norm": 0.27426809072494507, "learning_rate": 6.042253858615532e-05, "loss": 0.0334, "step": 12000 }, { "epoch": 11.514860977948226, "grad_norm": 0.3014174699783325, "learning_rate": 6.036033217961303e-05, "loss": 0.0305, "step": 12010 }, { "epoch": 11.52444870565676, "grad_norm": 0.17615869641304016, "learning_rate": 6.029810901286056e-05, "loss": 0.04, "step": 12020 }, { "epoch": 11.534036433365292, "grad_norm": 0.2742109000682831, "learning_rate": 6.0235869186558125e-05, "loss": 0.0349, "step": 12030 }, { "epoch": 11.543624161073826, "grad_norm": 0.22772598266601562, "learning_rate": 6.017361280139292e-05, "loss": 0.0322, "step": 12040 }, { "epoch": 11.553211888782359, "grad_norm": 0.2431521862745285, "learning_rate": 6.011133995807888e-05, "loss": 0.0334, "step": 12050 }, { "epoch": 11.562799616490892, "grad_norm": 0.2893143594264984, "learning_rate": 6.004905075735662e-05, "loss": 0.0354, "step": 12060 }, { "epoch": 11.572387344199425, "grad_norm": 0.26321181654930115, "learning_rate": 5.998674529999316e-05, "loss": 0.0364, "step": 12070 }, { "epoch": 11.581975071907959, "grad_norm": 0.5845431685447693, "learning_rate": 5.992442368678187e-05, "loss": 0.0341, "step": 12080 }, { "epoch": 11.59156279961649, "grad_norm": 0.23230616748332977, "learning_rate": 5.986208601854222e-05, "loss": 0.0316, "step": 12090 }, { "epoch": 11.601150527325023, "grad_norm": 0.2684799134731293, "learning_rate": 5.979973239611967e-05, "loss": 0.0399, "step": 12100 }, { "epoch": 11.610738255033556, "grad_norm": 0.19658780097961426, "learning_rate": 5.973736292038549e-05, "loss": 0.0396, "step": 12110 }, { "epoch": 11.62032598274209, "grad_norm": 0.3254534602165222, "learning_rate": 5.967497769223659e-05, "loss": 0.0366, "step": 12120 }, { "epoch": 11.629913710450623, "grad_norm": 0.573215663433075, "learning_rate": 5.961257681259535e-05, "loss": 0.0371, "step": 12130 }, { "epoch": 11.639501438159156, "grad_norm": 0.24387991428375244, "learning_rate": 5.955016038240951e-05, "loss": 0.0314, "step": 12140 }, { "epoch": 11.64908916586769, "grad_norm": 0.3126358091831207, "learning_rate": 5.948772850265193e-05, "loss": 0.0388, "step": 12150 }, { "epoch": 11.658676893576223, "grad_norm": 0.2461678385734558, "learning_rate": 5.9425281274320466e-05, "loss": 0.0389, "step": 12160 }, { "epoch": 11.668264621284756, "grad_norm": 0.2887043058872223, "learning_rate": 5.936281879843782e-05, "loss": 0.0316, "step": 12170 }, { "epoch": 11.677852348993289, "grad_norm": 0.4977504014968872, "learning_rate": 5.9300341176051364e-05, "loss": 0.0369, "step": 12180 }, { "epoch": 11.687440076701822, "grad_norm": 0.1966911256313324, "learning_rate": 5.923784850823294e-05, "loss": 0.0354, "step": 12190 }, { "epoch": 11.697027804410356, "grad_norm": 0.28435948491096497, "learning_rate": 5.917534089607877e-05, "loss": 0.0347, "step": 12200 }, { "epoch": 11.706615532118889, "grad_norm": 0.26728013157844543, "learning_rate": 5.911281844070923e-05, "loss": 0.0292, "step": 12210 }, { "epoch": 11.71620325982742, "grad_norm": 0.24896536767482758, "learning_rate": 5.905028124326869e-05, "loss": 0.04, "step": 12220 }, { "epoch": 11.725790987535953, "grad_norm": 0.394512414932251, "learning_rate": 5.8987729404925405e-05, "loss": 0.0394, "step": 12230 }, { "epoch": 11.735378715244487, "grad_norm": 0.27139657735824585, "learning_rate": 5.892516302687131e-05, "loss": 0.0367, "step": 12240 }, { "epoch": 11.74496644295302, "grad_norm": 0.30433669686317444, "learning_rate": 5.886258221032184e-05, "loss": 0.0315, "step": 12250 }, { "epoch": 11.754554170661553, "grad_norm": 0.387657105922699, "learning_rate": 5.8799987056515804e-05, "loss": 0.0367, "step": 12260 }, { "epoch": 11.764141898370086, "grad_norm": 0.7159243226051331, "learning_rate": 5.87373776667152e-05, "loss": 0.037, "step": 12270 }, { "epoch": 11.77372962607862, "grad_norm": 0.4516725540161133, "learning_rate": 5.867475414220506e-05, "loss": 0.0389, "step": 12280 }, { "epoch": 11.783317353787153, "grad_norm": 0.4054473638534546, "learning_rate": 5.8612116584293266e-05, "loss": 0.0349, "step": 12290 }, { "epoch": 11.792905081495686, "grad_norm": 0.7706658244132996, "learning_rate": 5.854946509431042e-05, "loss": 0.0304, "step": 12300 }, { "epoch": 11.80249280920422, "grad_norm": 0.2988179624080658, "learning_rate": 5.848679977360963e-05, "loss": 0.0308, "step": 12310 }, { "epoch": 11.812080536912752, "grad_norm": 0.3133019804954529, "learning_rate": 5.8424120723566453e-05, "loss": 0.0341, "step": 12320 }, { "epoch": 11.821668264621284, "grad_norm": 0.28148677945137024, "learning_rate": 5.8361428045578595e-05, "loss": 0.0365, "step": 12330 }, { "epoch": 11.831255992329817, "grad_norm": 0.2674432098865509, "learning_rate": 5.829872184106579e-05, "loss": 0.0335, "step": 12340 }, { "epoch": 11.84084372003835, "grad_norm": 0.2875913679599762, "learning_rate": 5.823600221146974e-05, "loss": 0.0324, "step": 12350 }, { "epoch": 11.850431447746884, "grad_norm": 0.39202550053596497, "learning_rate": 5.817326925825376e-05, "loss": 0.0309, "step": 12360 }, { "epoch": 11.860019175455417, "grad_norm": 0.2087734192609787, "learning_rate": 5.811052308290279e-05, "loss": 0.033, "step": 12370 }, { "epoch": 11.86960690316395, "grad_norm": 0.2347189038991928, "learning_rate": 5.804776378692313e-05, "loss": 0.0357, "step": 12380 }, { "epoch": 11.879194630872483, "grad_norm": 0.18789781630039215, "learning_rate": 5.798499147184233e-05, "loss": 0.0362, "step": 12390 }, { "epoch": 11.888782358581016, "grad_norm": 0.44185203313827515, "learning_rate": 5.792220623920898e-05, "loss": 0.0353, "step": 12400 }, { "epoch": 11.89837008628955, "grad_norm": 0.34168651700019836, "learning_rate": 5.785940819059259e-05, "loss": 0.0399, "step": 12410 }, { "epoch": 11.907957813998083, "grad_norm": 0.3143576979637146, "learning_rate": 5.779659742758336e-05, "loss": 0.042, "step": 12420 }, { "epoch": 11.917545541706616, "grad_norm": 0.2344186156988144, "learning_rate": 5.773377405179209e-05, "loss": 0.0377, "step": 12430 }, { "epoch": 11.92713326941515, "grad_norm": 0.19894208014011383, "learning_rate": 5.767093816484999e-05, "loss": 0.0335, "step": 12440 }, { "epoch": 11.936720997123683, "grad_norm": 0.332093745470047, "learning_rate": 5.7608089868408486e-05, "loss": 0.0324, "step": 12450 }, { "epoch": 11.946308724832214, "grad_norm": 0.2045692652463913, "learning_rate": 5.75452292641391e-05, "loss": 0.0348, "step": 12460 }, { "epoch": 11.955896452540747, "grad_norm": 0.2825275659561157, "learning_rate": 5.748235645373325e-05, "loss": 0.0385, "step": 12470 }, { "epoch": 11.96548418024928, "grad_norm": 0.3274647891521454, "learning_rate": 5.741947153890215e-05, "loss": 0.0338, "step": 12480 }, { "epoch": 11.975071907957814, "grad_norm": 0.29837775230407715, "learning_rate": 5.7356574621376493e-05, "loss": 0.0406, "step": 12490 }, { "epoch": 11.984659635666347, "grad_norm": 0.3342297673225403, "learning_rate": 5.729366580290646e-05, "loss": 0.0325, "step": 12500 }, { "epoch": 11.99424736337488, "grad_norm": 0.2670736014842987, "learning_rate": 5.7230745185261505e-05, "loss": 0.0361, "step": 12510 }, { "epoch": 12.003835091083413, "grad_norm": 0.24446439743041992, "learning_rate": 5.7167812870230094e-05, "loss": 0.0298, "step": 12520 }, { "epoch": 12.013422818791947, "grad_norm": 0.24025262892246246, "learning_rate": 5.710486895961971e-05, "loss": 0.0285, "step": 12530 }, { "epoch": 12.02301054650048, "grad_norm": 0.20725701749324799, "learning_rate": 5.7041913555256506e-05, "loss": 0.0319, "step": 12540 }, { "epoch": 12.032598274209013, "grad_norm": 0.24926726520061493, "learning_rate": 5.6978946758985285e-05, "loss": 0.0358, "step": 12550 }, { "epoch": 12.042186001917546, "grad_norm": 0.22566866874694824, "learning_rate": 5.691596867266925e-05, "loss": 0.0353, "step": 12560 }, { "epoch": 12.05177372962608, "grad_norm": 0.2323976457118988, "learning_rate": 5.68529793981899e-05, "loss": 0.0347, "step": 12570 }, { "epoch": 12.06136145733461, "grad_norm": 0.2751142680644989, "learning_rate": 5.6789979037446784e-05, "loss": 0.0343, "step": 12580 }, { "epoch": 12.070949185043144, "grad_norm": 0.20366577804088593, "learning_rate": 5.672696769235744e-05, "loss": 0.0346, "step": 12590 }, { "epoch": 12.080536912751677, "grad_norm": 0.30414018034935, "learning_rate": 5.666394546485714e-05, "loss": 0.0335, "step": 12600 }, { "epoch": 12.09012464046021, "grad_norm": 0.24006792902946472, "learning_rate": 5.660091245689878e-05, "loss": 0.0332, "step": 12610 }, { "epoch": 12.099712368168744, "grad_norm": 0.25928163528442383, "learning_rate": 5.653786877045266e-05, "loss": 0.0345, "step": 12620 }, { "epoch": 12.109300095877277, "grad_norm": 0.3005020320415497, "learning_rate": 5.6474814507506426e-05, "loss": 0.0279, "step": 12630 }, { "epoch": 12.11888782358581, "grad_norm": 0.2962352931499481, "learning_rate": 5.641174977006476e-05, "loss": 0.0349, "step": 12640 }, { "epoch": 12.128475551294343, "grad_norm": 0.3519500195980072, "learning_rate": 5.634867466014932e-05, "loss": 0.0322, "step": 12650 }, { "epoch": 12.138063279002877, "grad_norm": 0.3588416576385498, "learning_rate": 5.628558927979854e-05, "loss": 0.0324, "step": 12660 }, { "epoch": 12.14765100671141, "grad_norm": 0.29862353205680847, "learning_rate": 5.622249373106748e-05, "loss": 0.037, "step": 12670 }, { "epoch": 12.157238734419943, "grad_norm": 0.3698887526988983, "learning_rate": 5.6159388116027654e-05, "loss": 0.0336, "step": 12680 }, { "epoch": 12.166826462128476, "grad_norm": 0.268628865480423, "learning_rate": 5.609627253676682e-05, "loss": 0.0373, "step": 12690 }, { "epoch": 12.176414189837008, "grad_norm": 0.23115096986293793, "learning_rate": 5.603314709538891e-05, "loss": 0.0393, "step": 12700 }, { "epoch": 12.186001917545541, "grad_norm": 0.26541295647621155, "learning_rate": 5.597001189401376e-05, "loss": 0.0367, "step": 12710 }, { "epoch": 12.195589645254074, "grad_norm": 0.28933706879615784, "learning_rate": 5.5906867034777046e-05, "loss": 0.0332, "step": 12720 }, { "epoch": 12.205177372962607, "grad_norm": 0.320468008518219, "learning_rate": 5.584371261983e-05, "loss": 0.0351, "step": 12730 }, { "epoch": 12.21476510067114, "grad_norm": 0.24627713859081268, "learning_rate": 5.578054875133939e-05, "loss": 0.032, "step": 12740 }, { "epoch": 12.224352828379674, "grad_norm": 0.19859549403190613, "learning_rate": 5.571737553148723e-05, "loss": 0.0338, "step": 12750 }, { "epoch": 12.233940556088207, "grad_norm": 0.2559930086135864, "learning_rate": 5.565419306247065e-05, "loss": 0.0372, "step": 12760 }, { "epoch": 12.24352828379674, "grad_norm": 0.1816064417362213, "learning_rate": 5.559100144650179e-05, "loss": 0.0325, "step": 12770 }, { "epoch": 12.253116011505274, "grad_norm": 0.5027087330818176, "learning_rate": 5.552780078580756e-05, "loss": 0.0357, "step": 12780 }, { "epoch": 12.262703739213807, "grad_norm": 0.4723157584667206, "learning_rate": 5.54645911826295e-05, "loss": 0.0301, "step": 12790 }, { "epoch": 12.27229146692234, "grad_norm": 0.18510127067565918, "learning_rate": 5.5401372739223615e-05, "loss": 0.0393, "step": 12800 }, { "epoch": 12.281879194630873, "grad_norm": 0.19757391512393951, "learning_rate": 5.533814555786021e-05, "loss": 0.0319, "step": 12810 }, { "epoch": 12.291466922339406, "grad_norm": 0.25884294509887695, "learning_rate": 5.527490974082376e-05, "loss": 0.0319, "step": 12820 }, { "epoch": 12.301054650047938, "grad_norm": 0.29503270983695984, "learning_rate": 5.521166539041266e-05, "loss": 0.0405, "step": 12830 }, { "epoch": 12.310642377756471, "grad_norm": 0.3443543016910553, "learning_rate": 5.514841260893913e-05, "loss": 0.0345, "step": 12840 }, { "epoch": 12.320230105465004, "grad_norm": 0.3162010610103607, "learning_rate": 5.508515149872903e-05, "loss": 0.0374, "step": 12850 }, { "epoch": 12.329817833173538, "grad_norm": 0.37343630194664, "learning_rate": 5.502188216212172e-05, "loss": 0.0339, "step": 12860 }, { "epoch": 12.33940556088207, "grad_norm": 0.4099912643432617, "learning_rate": 5.4958604701469804e-05, "loss": 0.0348, "step": 12870 }, { "epoch": 12.348993288590604, "grad_norm": 0.3237497806549072, "learning_rate": 5.489531921913911e-05, "loss": 0.0277, "step": 12880 }, { "epoch": 12.358581016299137, "grad_norm": 0.2685404121875763, "learning_rate": 5.483202581750838e-05, "loss": 0.0326, "step": 12890 }, { "epoch": 12.36816874400767, "grad_norm": 0.28428301215171814, "learning_rate": 5.476872459896918e-05, "loss": 0.0372, "step": 12900 }, { "epoch": 12.377756471716204, "grad_norm": 0.34229859709739685, "learning_rate": 5.470541566592573e-05, "loss": 0.0324, "step": 12910 }, { "epoch": 12.387344199424737, "grad_norm": 0.3393026292324066, "learning_rate": 5.464209912079472e-05, "loss": 0.034, "step": 12920 }, { "epoch": 12.39693192713327, "grad_norm": 0.3479039967060089, "learning_rate": 5.4578775066005196e-05, "loss": 0.0384, "step": 12930 }, { "epoch": 12.406519654841803, "grad_norm": 0.22416572272777557, "learning_rate": 5.4515443603998304e-05, "loss": 0.0339, "step": 12940 }, { "epoch": 12.416107382550335, "grad_norm": 0.3005695343017578, "learning_rate": 5.445210483722719e-05, "loss": 0.0374, "step": 12950 }, { "epoch": 12.425695110258868, "grad_norm": 0.2770855724811554, "learning_rate": 5.438875886815682e-05, "loss": 0.0407, "step": 12960 }, { "epoch": 12.435282837967401, "grad_norm": 0.3203631043434143, "learning_rate": 5.4325405799263786e-05, "loss": 0.0381, "step": 12970 }, { "epoch": 12.444870565675934, "grad_norm": 0.32981497049331665, "learning_rate": 5.4262045733036204e-05, "loss": 0.0389, "step": 12980 }, { "epoch": 12.454458293384468, "grad_norm": 0.24350851774215698, "learning_rate": 5.4198678771973475e-05, "loss": 0.0377, "step": 12990 }, { "epoch": 12.464046021093, "grad_norm": 0.25702494382858276, "learning_rate": 5.413530501858621e-05, "loss": 0.0308, "step": 13000 }, { "epoch": 12.473633748801534, "grad_norm": 0.25904905796051025, "learning_rate": 5.407192457539594e-05, "loss": 0.0327, "step": 13010 }, { "epoch": 12.483221476510067, "grad_norm": 0.29727786779403687, "learning_rate": 5.4008537544935066e-05, "loss": 0.0376, "step": 13020 }, { "epoch": 12.4928092042186, "grad_norm": 0.21568791568279266, "learning_rate": 5.394514402974661e-05, "loss": 0.029, "step": 13030 }, { "epoch": 12.502396931927134, "grad_norm": 0.31120288372039795, "learning_rate": 5.3881744132384104e-05, "loss": 0.0289, "step": 13040 }, { "epoch": 12.511984659635667, "grad_norm": 0.3262520134449005, "learning_rate": 5.381833795541141e-05, "loss": 0.0458, "step": 13050 }, { "epoch": 12.5215723873442, "grad_norm": 0.27970728278160095, "learning_rate": 5.375492560140254e-05, "loss": 0.0411, "step": 13060 }, { "epoch": 12.531160115052732, "grad_norm": 0.25999292731285095, "learning_rate": 5.3691507172941493e-05, "loss": 0.0367, "step": 13070 }, { "epoch": 12.540747842761265, "grad_norm": 0.32972532510757446, "learning_rate": 5.362808277262211e-05, "loss": 0.0336, "step": 13080 }, { "epoch": 12.550335570469798, "grad_norm": 0.21841417253017426, "learning_rate": 5.3564652503047895e-05, "loss": 0.0383, "step": 13090 }, { "epoch": 12.559923298178331, "grad_norm": 0.5416061878204346, "learning_rate": 5.350121646683183e-05, "loss": 0.0385, "step": 13100 }, { "epoch": 12.569511025886865, "grad_norm": 0.28985804319381714, "learning_rate": 5.343777476659621e-05, "loss": 0.0334, "step": 13110 }, { "epoch": 12.579098753595398, "grad_norm": 0.7717734575271606, "learning_rate": 5.3374327504972544e-05, "loss": 0.035, "step": 13120 }, { "epoch": 12.588686481303931, "grad_norm": 0.38980838656425476, "learning_rate": 5.331087478460129e-05, "loss": 0.04, "step": 13130 }, { "epoch": 12.598274209012464, "grad_norm": 0.43601536750793457, "learning_rate": 5.324741670813178e-05, "loss": 0.034, "step": 13140 }, { "epoch": 12.607861936720997, "grad_norm": 0.27574917674064636, "learning_rate": 5.318395337822195e-05, "loss": 0.0328, "step": 13150 }, { "epoch": 12.61744966442953, "grad_norm": 0.19968970119953156, "learning_rate": 5.312048489753833e-05, "loss": 0.0311, "step": 13160 }, { "epoch": 12.627037392138064, "grad_norm": 0.4505964517593384, "learning_rate": 5.305701136875566e-05, "loss": 0.0326, "step": 13170 }, { "epoch": 12.636625119846597, "grad_norm": 0.24829363822937012, "learning_rate": 5.299353289455694e-05, "loss": 0.0344, "step": 13180 }, { "epoch": 12.64621284755513, "grad_norm": 0.2600340247154236, "learning_rate": 5.2930049577633146e-05, "loss": 0.0309, "step": 13190 }, { "epoch": 12.655800575263662, "grad_norm": 0.2981709837913513, "learning_rate": 5.2866561520683065e-05, "loss": 0.0314, "step": 13200 }, { "epoch": 12.665388302972195, "grad_norm": 0.22709107398986816, "learning_rate": 5.280306882641319e-05, "loss": 0.0323, "step": 13210 }, { "epoch": 12.674976030680728, "grad_norm": 0.31488150358200073, "learning_rate": 5.273957159753749e-05, "loss": 0.0357, "step": 13220 }, { "epoch": 12.684563758389261, "grad_norm": 0.5378819704055786, "learning_rate": 5.2676069936777264e-05, "loss": 0.0341, "step": 13230 }, { "epoch": 12.694151486097795, "grad_norm": 0.3149401843547821, "learning_rate": 5.2612563946861e-05, "loss": 0.0369, "step": 13240 }, { "epoch": 12.703739213806328, "grad_norm": 0.2183138132095337, "learning_rate": 5.254905373052419e-05, "loss": 0.0349, "step": 13250 }, { "epoch": 12.713326941514861, "grad_norm": 1.1205395460128784, "learning_rate": 5.2485539390509156e-05, "loss": 0.0368, "step": 13260 }, { "epoch": 12.722914669223394, "grad_norm": 0.21172510087490082, "learning_rate": 5.242202102956486e-05, "loss": 0.0402, "step": 13270 }, { "epoch": 12.732502396931928, "grad_norm": 0.25088265538215637, "learning_rate": 5.2358498750446835e-05, "loss": 0.0356, "step": 13280 }, { "epoch": 12.74209012464046, "grad_norm": 0.36349666118621826, "learning_rate": 5.229497265591689e-05, "loss": 0.0292, "step": 13290 }, { "epoch": 12.751677852348994, "grad_norm": 0.3626287877559662, "learning_rate": 5.2231442848743064e-05, "loss": 0.0402, "step": 13300 }, { "epoch": 12.761265580057525, "grad_norm": 0.18637891113758087, "learning_rate": 5.2167909431699344e-05, "loss": 0.0324, "step": 13310 }, { "epoch": 12.770853307766059, "grad_norm": 0.28557726740837097, "learning_rate": 5.2104372507565593e-05, "loss": 0.0289, "step": 13320 }, { "epoch": 12.780441035474592, "grad_norm": 0.3556912839412689, "learning_rate": 5.204083217912732e-05, "loss": 0.0343, "step": 13330 }, { "epoch": 12.790028763183125, "grad_norm": 0.16443754732608795, "learning_rate": 5.197728854917558e-05, "loss": 0.0337, "step": 13340 }, { "epoch": 12.799616490891658, "grad_norm": 0.19597065448760986, "learning_rate": 5.191374172050676e-05, "loss": 0.0293, "step": 13350 }, { "epoch": 12.809204218600192, "grad_norm": 0.38750675320625305, "learning_rate": 5.185019179592238e-05, "loss": 0.0404, "step": 13360 }, { "epoch": 12.818791946308725, "grad_norm": 0.25635913014411926, "learning_rate": 5.178663887822901e-05, "loss": 0.0259, "step": 13370 }, { "epoch": 12.828379674017258, "grad_norm": 0.21815137565135956, "learning_rate": 5.172308307023805e-05, "loss": 0.0296, "step": 13380 }, { "epoch": 12.837967401725791, "grad_norm": 0.3391851782798767, "learning_rate": 5.165952447476559e-05, "loss": 0.0312, "step": 13390 }, { "epoch": 12.847555129434324, "grad_norm": 0.38378575444221497, "learning_rate": 5.159596319463219e-05, "loss": 0.0301, "step": 13400 }, { "epoch": 12.857142857142858, "grad_norm": 0.29647505283355713, "learning_rate": 5.15323993326628e-05, "loss": 0.0355, "step": 13410 }, { "epoch": 12.86673058485139, "grad_norm": 0.3213365972042084, "learning_rate": 5.146883299168651e-05, "loss": 0.0309, "step": 13420 }, { "epoch": 12.876318312559924, "grad_norm": 0.21259522438049316, "learning_rate": 5.1405264274536445e-05, "loss": 0.0361, "step": 13430 }, { "epoch": 12.885906040268456, "grad_norm": 0.41032230854034424, "learning_rate": 5.134169328404956e-05, "loss": 0.0347, "step": 13440 }, { "epoch": 12.895493767976989, "grad_norm": 0.3352082371711731, "learning_rate": 5.127812012306649e-05, "loss": 0.0329, "step": 13450 }, { "epoch": 12.905081495685522, "grad_norm": 2.1955349445343018, "learning_rate": 5.1214544894431396e-05, "loss": 0.036, "step": 13460 }, { "epoch": 12.914669223394055, "grad_norm": 0.19683793187141418, "learning_rate": 5.115096770099175e-05, "loss": 0.0344, "step": 13470 }, { "epoch": 12.924256951102588, "grad_norm": 0.2288978546857834, "learning_rate": 5.1087388645598235e-05, "loss": 0.0289, "step": 13480 }, { "epoch": 12.933844678811122, "grad_norm": 0.3008512556552887, "learning_rate": 5.1023807831104544e-05, "loss": 0.0421, "step": 13490 }, { "epoch": 12.943432406519655, "grad_norm": 0.29300564527511597, "learning_rate": 5.096022536036721e-05, "loss": 0.0374, "step": 13500 }, { "epoch": 12.953020134228188, "grad_norm": 0.2803822457790375, "learning_rate": 5.089664133624541e-05, "loss": 0.0349, "step": 13510 }, { "epoch": 12.962607861936721, "grad_norm": 0.35536760091781616, "learning_rate": 5.083305586160089e-05, "loss": 0.0311, "step": 13520 }, { "epoch": 12.972195589645255, "grad_norm": 0.290683776140213, "learning_rate": 5.07694690392977e-05, "loss": 0.0315, "step": 13530 }, { "epoch": 12.981783317353788, "grad_norm": 0.19355502724647522, "learning_rate": 5.070588097220213e-05, "loss": 0.0297, "step": 13540 }, { "epoch": 12.991371045062321, "grad_norm": 0.2547348439693451, "learning_rate": 5.06422917631824e-05, "loss": 0.0325, "step": 13550 }, { "epoch": 13.000958772770852, "grad_norm": 0.1929698884487152, "learning_rate": 5.057870151510864e-05, "loss": 0.0329, "step": 13560 }, { "epoch": 13.010546500479386, "grad_norm": 0.29264265298843384, "learning_rate": 5.051511033085264e-05, "loss": 0.0319, "step": 13570 }, { "epoch": 13.020134228187919, "grad_norm": 0.28177183866500854, "learning_rate": 5.0451518313287704e-05, "loss": 0.038, "step": 13580 }, { "epoch": 13.029721955896452, "grad_norm": 0.3331814110279083, "learning_rate": 5.0387925565288485e-05, "loss": 0.0307, "step": 13590 }, { "epoch": 13.039309683604985, "grad_norm": 0.297892689704895, "learning_rate": 5.0324332189730796e-05, "loss": 0.0339, "step": 13600 }, { "epoch": 13.048897411313519, "grad_norm": 0.2248513251543045, "learning_rate": 5.0260738289491516e-05, "loss": 0.0227, "step": 13610 }, { "epoch": 13.058485139022052, "grad_norm": 0.24514958262443542, "learning_rate": 5.0197143967448335e-05, "loss": 0.0335, "step": 13620 }, { "epoch": 13.068072866730585, "grad_norm": 0.29958298802375793, "learning_rate": 5.0133549326479645e-05, "loss": 0.0305, "step": 13630 }, { "epoch": 13.077660594439118, "grad_norm": 3.086843252182007, "learning_rate": 5.006995446946433e-05, "loss": 0.0377, "step": 13640 }, { "epoch": 13.087248322147651, "grad_norm": 0.31443238258361816, "learning_rate": 5.000635949928163e-05, "loss": 0.0344, "step": 13650 }, { "epoch": 13.096836049856185, "grad_norm": 0.27507051825523376, "learning_rate": 4.994276451881098e-05, "loss": 0.034, "step": 13660 }, { "epoch": 13.106423777564718, "grad_norm": 0.2578774094581604, "learning_rate": 4.987916963093184e-05, "loss": 0.0328, "step": 13670 }, { "epoch": 13.116011505273251, "grad_norm": 0.28767842054367065, "learning_rate": 4.981557493852349e-05, "loss": 0.0332, "step": 13680 }, { "epoch": 13.125599232981783, "grad_norm": 0.17203165590763092, "learning_rate": 4.975198054446492e-05, "loss": 0.0327, "step": 13690 }, { "epoch": 13.135186960690316, "grad_norm": 0.2606458067893982, "learning_rate": 4.968838655163462e-05, "loss": 0.0321, "step": 13700 }, { "epoch": 13.144774688398849, "grad_norm": 0.3137904703617096, "learning_rate": 4.9624793062910445e-05, "loss": 0.0376, "step": 13710 }, { "epoch": 13.154362416107382, "grad_norm": 0.255403608083725, "learning_rate": 4.956120018116941e-05, "loss": 0.0304, "step": 13720 }, { "epoch": 13.163950143815915, "grad_norm": 0.32765787839889526, "learning_rate": 4.94976080092876e-05, "loss": 0.0352, "step": 13730 }, { "epoch": 13.173537871524449, "grad_norm": 0.36302298307418823, "learning_rate": 4.94340166501399e-05, "loss": 0.0287, "step": 13740 }, { "epoch": 13.183125599232982, "grad_norm": 0.1956561803817749, "learning_rate": 4.93704262065999e-05, "loss": 0.0299, "step": 13750 }, { "epoch": 13.192713326941515, "grad_norm": 0.28090646862983704, "learning_rate": 4.930683678153971e-05, "loss": 0.0294, "step": 13760 }, { "epoch": 13.202301054650048, "grad_norm": 0.3016568422317505, "learning_rate": 4.9243248477829786e-05, "loss": 0.0339, "step": 13770 }, { "epoch": 13.211888782358582, "grad_norm": 0.34404152631759644, "learning_rate": 4.9179661398338764e-05, "loss": 0.0268, "step": 13780 }, { "epoch": 13.221476510067115, "grad_norm": 0.35919350385665894, "learning_rate": 4.911607564593331e-05, "loss": 0.0308, "step": 13790 }, { "epoch": 13.231064237775648, "grad_norm": 0.23123154044151306, "learning_rate": 4.905249132347796e-05, "loss": 0.0293, "step": 13800 }, { "epoch": 13.24065196548418, "grad_norm": 0.2878974378108978, "learning_rate": 4.89889085338349e-05, "loss": 0.0366, "step": 13810 }, { "epoch": 13.250239693192713, "grad_norm": 0.1915551722049713, "learning_rate": 4.892532737986387e-05, "loss": 0.0326, "step": 13820 }, { "epoch": 13.259827420901246, "grad_norm": 0.29005202651023865, "learning_rate": 4.886174796442193e-05, "loss": 0.0332, "step": 13830 }, { "epoch": 13.269415148609779, "grad_norm": 0.335665225982666, "learning_rate": 4.879817039036336e-05, "loss": 0.0254, "step": 13840 }, { "epoch": 13.279002876318312, "grad_norm": 0.1871231645345688, "learning_rate": 4.873459476053946e-05, "loss": 0.0288, "step": 13850 }, { "epoch": 13.288590604026846, "grad_norm": 0.26077544689178467, "learning_rate": 4.867102117779834e-05, "loss": 0.031, "step": 13860 }, { "epoch": 13.298178331735379, "grad_norm": 0.46799513697624207, "learning_rate": 4.8607449744984836e-05, "loss": 0.0292, "step": 13870 }, { "epoch": 13.307766059443912, "grad_norm": 0.24073362350463867, "learning_rate": 4.8543880564940327e-05, "loss": 0.0268, "step": 13880 }, { "epoch": 13.317353787152445, "grad_norm": 0.22020606696605682, "learning_rate": 4.848031374050251e-05, "loss": 0.0339, "step": 13890 }, { "epoch": 13.326941514860978, "grad_norm": 0.20859257876873016, "learning_rate": 4.8416749374505285e-05, "loss": 0.0319, "step": 13900 }, { "epoch": 13.336529242569512, "grad_norm": 4.301571846008301, "learning_rate": 4.835318756977856e-05, "loss": 0.0382, "step": 13910 }, { "epoch": 13.346116970278045, "grad_norm": 0.33860668540000916, "learning_rate": 4.828962842914812e-05, "loss": 0.0334, "step": 13920 }, { "epoch": 13.355704697986576, "grad_norm": 0.24827070534229279, "learning_rate": 4.8226072055435425e-05, "loss": 0.0267, "step": 13930 }, { "epoch": 13.36529242569511, "grad_norm": 0.22739817202091217, "learning_rate": 4.816251855145748e-05, "loss": 0.0308, "step": 13940 }, { "epoch": 13.374880153403643, "grad_norm": 0.33846351504325867, "learning_rate": 4.809896802002662e-05, "loss": 0.0337, "step": 13950 }, { "epoch": 13.384467881112176, "grad_norm": 0.2737593352794647, "learning_rate": 4.8035420563950395e-05, "loss": 0.0358, "step": 13960 }, { "epoch": 13.39405560882071, "grad_norm": 0.3176287114620209, "learning_rate": 4.797187628603136e-05, "loss": 0.0273, "step": 13970 }, { "epoch": 13.403643336529242, "grad_norm": 0.2898380756378174, "learning_rate": 4.790833528906696e-05, "loss": 0.0324, "step": 13980 }, { "epoch": 13.413231064237776, "grad_norm": 0.48169559240341187, "learning_rate": 4.784479767584929e-05, "loss": 0.0269, "step": 13990 }, { "epoch": 13.422818791946309, "grad_norm": 0.23410825431346893, "learning_rate": 4.778126354916498e-05, "loss": 0.0307, "step": 14000 }, { "epoch": 13.432406519654842, "grad_norm": 0.39884692430496216, "learning_rate": 4.771773301179506e-05, "loss": 0.0324, "step": 14010 }, { "epoch": 13.441994247363375, "grad_norm": 0.26422742009162903, "learning_rate": 4.765420616651468e-05, "loss": 0.0318, "step": 14020 }, { "epoch": 13.451581975071909, "grad_norm": 0.261283278465271, "learning_rate": 4.7590683116093135e-05, "loss": 0.0312, "step": 14030 }, { "epoch": 13.461169702780442, "grad_norm": 0.28744202852249146, "learning_rate": 4.752716396329346e-05, "loss": 0.0349, "step": 14040 }, { "epoch": 13.470757430488973, "grad_norm": 0.2296159714460373, "learning_rate": 4.746364881087244e-05, "loss": 0.0329, "step": 14050 }, { "epoch": 13.480345158197506, "grad_norm": 0.2238318920135498, "learning_rate": 4.7400137761580376e-05, "loss": 0.0287, "step": 14060 }, { "epoch": 13.48993288590604, "grad_norm": 0.3209201395511627, "learning_rate": 4.733663091816095e-05, "loss": 0.0312, "step": 14070 }, { "epoch": 13.499520613614573, "grad_norm": 4.273186206817627, "learning_rate": 4.7273128383351015e-05, "loss": 0.0292, "step": 14080 }, { "epoch": 13.509108341323106, "grad_norm": 0.2698652446269989, "learning_rate": 4.720963025988047e-05, "loss": 0.0319, "step": 14090 }, { "epoch": 13.51869606903164, "grad_norm": 0.28722748160362244, "learning_rate": 4.714613665047207e-05, "loss": 0.0285, "step": 14100 }, { "epoch": 13.528283796740173, "grad_norm": 0.2316875010728836, "learning_rate": 4.708264765784129e-05, "loss": 0.0368, "step": 14110 }, { "epoch": 13.537871524448706, "grad_norm": 0.5195225477218628, "learning_rate": 4.701916338469608e-05, "loss": 0.031, "step": 14120 }, { "epoch": 13.547459252157239, "grad_norm": 0.40332475304603577, "learning_rate": 4.6955683933736814e-05, "loss": 0.032, "step": 14130 }, { "epoch": 13.557046979865772, "grad_norm": 0.2699570059776306, "learning_rate": 4.689220940765605e-05, "loss": 0.0334, "step": 14140 }, { "epoch": 13.566634707574305, "grad_norm": 0.26050880551338196, "learning_rate": 4.682873990913835e-05, "loss": 0.0333, "step": 14150 }, { "epoch": 13.576222435282839, "grad_norm": 0.2826980650424957, "learning_rate": 4.676527554086018e-05, "loss": 0.0282, "step": 14160 }, { "epoch": 13.585810162991372, "grad_norm": 0.17002440989017487, "learning_rate": 4.6701816405489686e-05, "loss": 0.0325, "step": 14170 }, { "epoch": 13.595397890699903, "grad_norm": 0.33742156624794006, "learning_rate": 4.6638362605686555e-05, "loss": 0.0283, "step": 14180 }, { "epoch": 13.604985618408437, "grad_norm": 0.29989632964134216, "learning_rate": 4.657491424410185e-05, "loss": 0.0327, "step": 14190 }, { "epoch": 13.61457334611697, "grad_norm": 0.2583453357219696, "learning_rate": 4.6511471423377815e-05, "loss": 0.0285, "step": 14200 }, { "epoch": 13.624161073825503, "grad_norm": 0.2405027151107788, "learning_rate": 4.6448034246147754e-05, "loss": 0.0262, "step": 14210 }, { "epoch": 13.633748801534036, "grad_norm": 0.3429577052593231, "learning_rate": 4.638460281503582e-05, "loss": 0.0429, "step": 14220 }, { "epoch": 13.64333652924257, "grad_norm": 0.30057376623153687, "learning_rate": 4.6321177232656894e-05, "loss": 0.0255, "step": 14230 }, { "epoch": 13.652924256951103, "grad_norm": 0.25279220938682556, "learning_rate": 4.6257757601616364e-05, "loss": 0.0333, "step": 14240 }, { "epoch": 13.662511984659636, "grad_norm": 1.2111369371414185, "learning_rate": 4.6194344024510036e-05, "loss": 0.0325, "step": 14250 }, { "epoch": 13.67209971236817, "grad_norm": 1.4284824132919312, "learning_rate": 4.613093660392386e-05, "loss": 0.0368, "step": 14260 }, { "epoch": 13.681687440076702, "grad_norm": 1.6276288032531738, "learning_rate": 4.6067535442433885e-05, "loss": 0.0343, "step": 14270 }, { "epoch": 13.691275167785236, "grad_norm": 1.3329591751098633, "learning_rate": 4.6004140642606e-05, "loss": 0.0322, "step": 14280 }, { "epoch": 13.700862895493769, "grad_norm": 0.2651832103729248, "learning_rate": 4.5940752306995824e-05, "loss": 0.0337, "step": 14290 }, { "epoch": 13.7104506232023, "grad_norm": 0.1592620313167572, "learning_rate": 4.58773705381485e-05, "loss": 0.0309, "step": 14300 }, { "epoch": 13.720038350910833, "grad_norm": 0.7592516541481018, "learning_rate": 4.581399543859855e-05, "loss": 0.0355, "step": 14310 }, { "epoch": 13.729626078619367, "grad_norm": 0.27996954321861267, "learning_rate": 4.5750627110869724e-05, "loss": 0.0299, "step": 14320 }, { "epoch": 13.7392138063279, "grad_norm": 0.19375735521316528, "learning_rate": 4.5687265657474797e-05, "loss": 0.0354, "step": 14330 }, { "epoch": 13.748801534036433, "grad_norm": 0.263683944940567, "learning_rate": 4.562391118091544e-05, "loss": 0.0342, "step": 14340 }, { "epoch": 13.758389261744966, "grad_norm": 0.4312153160572052, "learning_rate": 4.556056378368203e-05, "loss": 0.0349, "step": 14350 }, { "epoch": 13.7679769894535, "grad_norm": 0.3268071413040161, "learning_rate": 4.549722356825349e-05, "loss": 0.0297, "step": 14360 }, { "epoch": 13.777564717162033, "grad_norm": 0.43241703510284424, "learning_rate": 4.543389063709712e-05, "loss": 0.0333, "step": 14370 }, { "epoch": 13.787152444870566, "grad_norm": 0.2650851905345917, "learning_rate": 4.537056509266845e-05, "loss": 0.0338, "step": 14380 }, { "epoch": 13.7967401725791, "grad_norm": 0.34464454650878906, "learning_rate": 4.530724703741104e-05, "loss": 0.0334, "step": 14390 }, { "epoch": 13.806327900287632, "grad_norm": 0.2718554735183716, "learning_rate": 4.524393657375635e-05, "loss": 0.0295, "step": 14400 }, { "epoch": 13.815915627996166, "grad_norm": 0.27128416299819946, "learning_rate": 4.5180633804123555e-05, "loss": 0.0367, "step": 14410 }, { "epoch": 13.825503355704697, "grad_norm": 0.190488800406456, "learning_rate": 4.511733883091939e-05, "loss": 0.0273, "step": 14420 }, { "epoch": 13.83509108341323, "grad_norm": 0.45956146717071533, "learning_rate": 4.5054051756537965e-05, "loss": 0.0333, "step": 14430 }, { "epoch": 13.844678811121764, "grad_norm": 0.2585156559944153, "learning_rate": 4.499077268336063e-05, "loss": 0.0277, "step": 14440 }, { "epoch": 13.854266538830297, "grad_norm": 0.209930419921875, "learning_rate": 4.492750171375576e-05, "loss": 0.0317, "step": 14450 }, { "epoch": 13.86385426653883, "grad_norm": 0.25458142161369324, "learning_rate": 4.486423895007866e-05, "loss": 0.0402, "step": 14460 }, { "epoch": 13.873441994247363, "grad_norm": 0.2012961506843567, "learning_rate": 4.480098449467132e-05, "loss": 0.031, "step": 14470 }, { "epoch": 13.883029721955896, "grad_norm": 0.2313721477985382, "learning_rate": 4.473773844986229e-05, "loss": 0.0278, "step": 14480 }, { "epoch": 13.89261744966443, "grad_norm": 0.3655869960784912, "learning_rate": 4.467450091796658e-05, "loss": 0.0356, "step": 14490 }, { "epoch": 13.902205177372963, "grad_norm": 0.2222936451435089, "learning_rate": 4.461127200128536e-05, "loss": 0.0335, "step": 14500 }, { "epoch": 13.911792905081496, "grad_norm": 0.2714097797870636, "learning_rate": 4.4548051802105914e-05, "loss": 0.0289, "step": 14510 }, { "epoch": 13.92138063279003, "grad_norm": 0.28923454880714417, "learning_rate": 4.448484042270134e-05, "loss": 0.0321, "step": 14520 }, { "epoch": 13.930968360498563, "grad_norm": 0.3318518400192261, "learning_rate": 4.4421637965330554e-05, "loss": 0.0302, "step": 14530 }, { "epoch": 13.940556088207096, "grad_norm": 0.21569694578647614, "learning_rate": 4.4358444532237996e-05, "loss": 0.0347, "step": 14540 }, { "epoch": 13.950143815915627, "grad_norm": 0.24663789570331573, "learning_rate": 4.429526022565352e-05, "loss": 0.0293, "step": 14550 }, { "epoch": 13.95973154362416, "grad_norm": 0.17170065641403198, "learning_rate": 4.423208514779222e-05, "loss": 0.0383, "step": 14560 }, { "epoch": 13.969319271332694, "grad_norm": 0.2217435985803604, "learning_rate": 4.4168919400854245e-05, "loss": 0.0357, "step": 14570 }, { "epoch": 13.978906999041227, "grad_norm": 0.18699301779270172, "learning_rate": 4.4105763087024666e-05, "loss": 0.0261, "step": 14580 }, { "epoch": 13.98849472674976, "grad_norm": 0.35671454668045044, "learning_rate": 4.404261630847329e-05, "loss": 0.0356, "step": 14590 }, { "epoch": 13.998082454458293, "grad_norm": 0.33537557721138, "learning_rate": 4.3979479167354477e-05, "loss": 0.0317, "step": 14600 }, { "epoch": 14.007670182166827, "grad_norm": 0.25765296816825867, "learning_rate": 4.391635176580702e-05, "loss": 0.0314, "step": 14610 }, { "epoch": 14.01725790987536, "grad_norm": 0.18932734429836273, "learning_rate": 4.385323420595395e-05, "loss": 0.036, "step": 14620 }, { "epoch": 14.026845637583893, "grad_norm": 0.2255479097366333, "learning_rate": 4.3790126589902344e-05, "loss": 0.0329, "step": 14630 }, { "epoch": 14.036433365292426, "grad_norm": 0.19790147244930267, "learning_rate": 4.372702901974331e-05, "loss": 0.032, "step": 14640 }, { "epoch": 14.04602109300096, "grad_norm": 0.16959276795387268, "learning_rate": 4.366394159755155e-05, "loss": 0.0328, "step": 14650 }, { "epoch": 14.055608820709493, "grad_norm": 0.36921027302742004, "learning_rate": 4.3600864425385434e-05, "loss": 0.0313, "step": 14660 }, { "epoch": 14.065196548418024, "grad_norm": 0.1770399957895279, "learning_rate": 4.3537797605286736e-05, "loss": 0.0265, "step": 14670 }, { "epoch": 14.074784276126557, "grad_norm": 0.28713101148605347, "learning_rate": 4.347474123928048e-05, "loss": 0.0282, "step": 14680 }, { "epoch": 14.08437200383509, "grad_norm": 0.1728815734386444, "learning_rate": 4.3411695429374793e-05, "loss": 0.03, "step": 14690 }, { "epoch": 14.093959731543624, "grad_norm": 0.2004602998495102, "learning_rate": 4.3348660277560694e-05, "loss": 0.0301, "step": 14700 }, { "epoch": 14.103547459252157, "grad_norm": 0.24591505527496338, "learning_rate": 4.328563588581199e-05, "loss": 0.0384, "step": 14710 }, { "epoch": 14.11313518696069, "grad_norm": 0.3375163674354553, "learning_rate": 4.322262235608508e-05, "loss": 0.0339, "step": 14720 }, { "epoch": 14.122722914669223, "grad_norm": 0.22719378769397736, "learning_rate": 4.315961979031875e-05, "loss": 0.0323, "step": 14730 }, { "epoch": 14.132310642377757, "grad_norm": 0.34426233172416687, "learning_rate": 4.30966282904341e-05, "loss": 0.0335, "step": 14740 }, { "epoch": 14.14189837008629, "grad_norm": 0.30899283289909363, "learning_rate": 4.3033647958334306e-05, "loss": 0.0334, "step": 14750 }, { "epoch": 14.151486097794823, "grad_norm": 0.3567700684070587, "learning_rate": 4.2970678895904476e-05, "loss": 0.0356, "step": 14760 }, { "epoch": 14.161073825503356, "grad_norm": 0.22836564481258392, "learning_rate": 4.29077212050115e-05, "loss": 0.0321, "step": 14770 }, { "epoch": 14.17066155321189, "grad_norm": 0.17751692235469818, "learning_rate": 4.284477498750383e-05, "loss": 0.0302, "step": 14780 }, { "epoch": 14.180249280920421, "grad_norm": 0.3431791067123413, "learning_rate": 4.278184034521144e-05, "loss": 0.0332, "step": 14790 }, { "epoch": 14.189837008628954, "grad_norm": 0.26100659370422363, "learning_rate": 4.27189173799455e-05, "loss": 0.0315, "step": 14800 }, { "epoch": 14.199424736337487, "grad_norm": 0.2879122197628021, "learning_rate": 4.265600619349832e-05, "loss": 0.0277, "step": 14810 }, { "epoch": 14.20901246404602, "grad_norm": 0.26366403698921204, "learning_rate": 4.2593106887643156e-05, "loss": 0.0332, "step": 14820 }, { "epoch": 14.218600191754554, "grad_norm": 0.25366711616516113, "learning_rate": 4.2530219564134046e-05, "loss": 0.0309, "step": 14830 }, { "epoch": 14.228187919463087, "grad_norm": 0.259772926568985, "learning_rate": 4.246734432470563e-05, "loss": 0.0302, "step": 14840 }, { "epoch": 14.23777564717162, "grad_norm": 0.32079434394836426, "learning_rate": 4.240448127107301e-05, "loss": 0.0293, "step": 14850 }, { "epoch": 14.247363374880154, "grad_norm": 0.25380274653434753, "learning_rate": 4.234163050493158e-05, "loss": 0.0287, "step": 14860 }, { "epoch": 14.256951102588687, "grad_norm": 0.26985570788383484, "learning_rate": 4.2278792127956846e-05, "loss": 0.0307, "step": 14870 }, { "epoch": 14.26653883029722, "grad_norm": 0.2960470914840698, "learning_rate": 4.221596624180426e-05, "loss": 0.0313, "step": 14880 }, { "epoch": 14.276126558005753, "grad_norm": 0.41474372148513794, "learning_rate": 4.21531529481091e-05, "loss": 0.0287, "step": 14890 }, { "epoch": 14.285714285714286, "grad_norm": 0.2426476627588272, "learning_rate": 4.2090352348486256e-05, "loss": 0.0272, "step": 14900 }, { "epoch": 14.29530201342282, "grad_norm": 0.2811989486217499, "learning_rate": 4.202756454453007e-05, "loss": 0.0328, "step": 14910 }, { "epoch": 14.304889741131351, "grad_norm": 0.20871858298778534, "learning_rate": 4.196478963781421e-05, "loss": 0.028, "step": 14920 }, { "epoch": 14.314477468839884, "grad_norm": 0.1654272824525833, "learning_rate": 4.190202772989144e-05, "loss": 0.0301, "step": 14930 }, { "epoch": 14.324065196548418, "grad_norm": 0.6324641108512878, "learning_rate": 4.183927892229354e-05, "loss": 0.0284, "step": 14940 }, { "epoch": 14.33365292425695, "grad_norm": 1.3468248844146729, "learning_rate": 4.177654331653108e-05, "loss": 0.0331, "step": 14950 }, { "epoch": 14.343240651965484, "grad_norm": 0.16660985350608826, "learning_rate": 4.171382101409327e-05, "loss": 0.0262, "step": 14960 }, { "epoch": 14.352828379674017, "grad_norm": 0.32994958758354187, "learning_rate": 4.165111211644779e-05, "loss": 0.0259, "step": 14970 }, { "epoch": 14.36241610738255, "grad_norm": 0.20298174023628235, "learning_rate": 4.158841672504066e-05, "loss": 0.0298, "step": 14980 }, { "epoch": 14.372003835091084, "grad_norm": 0.23911802470684052, "learning_rate": 4.1525734941296026e-05, "loss": 0.0315, "step": 14990 }, { "epoch": 14.381591562799617, "grad_norm": 0.22921425104141235, "learning_rate": 4.146306686661602e-05, "loss": 0.0336, "step": 15000 }, { "epoch": 14.39117929050815, "grad_norm": 0.24981558322906494, "learning_rate": 4.140041260238062e-05, "loss": 0.0326, "step": 15010 }, { "epoch": 14.400767018216683, "grad_norm": 0.24186521768569946, "learning_rate": 4.1337772249947435e-05, "loss": 0.0264, "step": 15020 }, { "epoch": 14.410354745925215, "grad_norm": 0.26285290718078613, "learning_rate": 4.1275145910651603e-05, "loss": 0.0304, "step": 15030 }, { "epoch": 14.419942473633748, "grad_norm": 0.2739505469799042, "learning_rate": 4.121253368580555e-05, "loss": 0.0263, "step": 15040 }, { "epoch": 14.429530201342281, "grad_norm": 0.6612746119499207, "learning_rate": 4.1149935676698904e-05, "loss": 0.0395, "step": 15050 }, { "epoch": 14.439117929050814, "grad_norm": 0.2866060435771942, "learning_rate": 4.108735198459827e-05, "loss": 0.025, "step": 15060 }, { "epoch": 14.448705656759348, "grad_norm": 0.27634814381599426, "learning_rate": 4.102478271074712e-05, "loss": 0.0278, "step": 15070 }, { "epoch": 14.458293384467881, "grad_norm": 0.25131815671920776, "learning_rate": 4.0962227956365574e-05, "loss": 0.0271, "step": 15080 }, { "epoch": 14.467881112176414, "grad_norm": 0.3638950288295746, "learning_rate": 4.089968782265025e-05, "loss": 0.0297, "step": 15090 }, { "epoch": 14.477468839884947, "grad_norm": 0.2399180382490158, "learning_rate": 4.083716241077419e-05, "loss": 0.0284, "step": 15100 }, { "epoch": 14.48705656759348, "grad_norm": 0.27603140473365784, "learning_rate": 4.077465182188654e-05, "loss": 0.0302, "step": 15110 }, { "epoch": 14.496644295302014, "grad_norm": 0.17177820205688477, "learning_rate": 4.07121561571125e-05, "loss": 0.0382, "step": 15120 }, { "epoch": 14.506232023010547, "grad_norm": 0.26461273431777954, "learning_rate": 4.064967551755312e-05, "loss": 0.0328, "step": 15130 }, { "epoch": 14.51581975071908, "grad_norm": 0.31283822655677795, "learning_rate": 4.058721000428514e-05, "loss": 0.025, "step": 15140 }, { "epoch": 14.525407478427613, "grad_norm": 0.18203134834766388, "learning_rate": 4.052475971836083e-05, "loss": 0.0286, "step": 15150 }, { "epoch": 14.534995206136145, "grad_norm": 0.295449435710907, "learning_rate": 4.0462324760807846e-05, "loss": 0.033, "step": 15160 }, { "epoch": 14.544582933844678, "grad_norm": 0.16782015562057495, "learning_rate": 4.039990523262902e-05, "loss": 0.0278, "step": 15170 }, { "epoch": 14.554170661553211, "grad_norm": 0.23671366274356842, "learning_rate": 4.033750123480224e-05, "loss": 0.0319, "step": 15180 }, { "epoch": 14.563758389261745, "grad_norm": 0.18487486243247986, "learning_rate": 4.027511286828028e-05, "loss": 0.0297, "step": 15190 }, { "epoch": 14.573346116970278, "grad_norm": 0.19782863557338715, "learning_rate": 4.0212740233990587e-05, "loss": 0.0316, "step": 15200 }, { "epoch": 14.582933844678811, "grad_norm": 0.30595293641090393, "learning_rate": 4.0150383432835186e-05, "loss": 0.0282, "step": 15210 }, { "epoch": 14.592521572387344, "grad_norm": 0.2661206126213074, "learning_rate": 4.00880425656905e-05, "loss": 0.0286, "step": 15220 }, { "epoch": 14.602109300095877, "grad_norm": 0.2635152339935303, "learning_rate": 4.002571773340714e-05, "loss": 0.0334, "step": 15230 }, { "epoch": 14.61169702780441, "grad_norm": 0.27702832221984863, "learning_rate": 3.996340903680979e-05, "loss": 0.0304, "step": 15240 }, { "epoch": 14.621284755512944, "grad_norm": 0.21685196459293365, "learning_rate": 3.9901116576697083e-05, "loss": 0.0394, "step": 15250 }, { "epoch": 14.630872483221477, "grad_norm": 0.2177799493074417, "learning_rate": 3.983884045384131e-05, "loss": 0.0321, "step": 15260 }, { "epoch": 14.64046021093001, "grad_norm": 0.21278002858161926, "learning_rate": 3.977658076898836e-05, "loss": 0.0329, "step": 15270 }, { "epoch": 14.650047938638544, "grad_norm": 0.4188462495803833, "learning_rate": 3.971433762285754e-05, "loss": 0.0324, "step": 15280 }, { "epoch": 14.659635666347075, "grad_norm": 0.4150042235851288, "learning_rate": 3.965211111614139e-05, "loss": 0.0311, "step": 15290 }, { "epoch": 14.669223394055608, "grad_norm": 0.5566287040710449, "learning_rate": 3.958990134950555e-05, "loss": 0.028, "step": 15300 }, { "epoch": 14.678811121764141, "grad_norm": 0.2592385411262512, "learning_rate": 3.9527708423588546e-05, "loss": 0.0354, "step": 15310 }, { "epoch": 14.688398849472675, "grad_norm": 0.20564644038677216, "learning_rate": 3.946553243900169e-05, "loss": 0.0359, "step": 15320 }, { "epoch": 14.697986577181208, "grad_norm": 0.27093440294265747, "learning_rate": 3.9403373496328885e-05, "loss": 0.0377, "step": 15330 }, { "epoch": 14.707574304889741, "grad_norm": 0.35600170493125916, "learning_rate": 3.934123169612645e-05, "loss": 0.0323, "step": 15340 }, { "epoch": 14.717162032598274, "grad_norm": 0.3020756244659424, "learning_rate": 3.927910713892298e-05, "loss": 0.0313, "step": 15350 }, { "epoch": 14.726749760306808, "grad_norm": 0.26487666368484497, "learning_rate": 3.921699992521917e-05, "loss": 0.0322, "step": 15360 }, { "epoch": 14.73633748801534, "grad_norm": 0.2509137988090515, "learning_rate": 3.915491015548766e-05, "loss": 0.0249, "step": 15370 }, { "epoch": 14.745925215723874, "grad_norm": 0.2903117537498474, "learning_rate": 3.9092837930172884e-05, "loss": 0.0325, "step": 15380 }, { "epoch": 14.755512943432407, "grad_norm": 2.1292974948883057, "learning_rate": 3.903078334969087e-05, "loss": 0.0352, "step": 15390 }, { "epoch": 14.765100671140939, "grad_norm": 0.18879927694797516, "learning_rate": 3.8968746514429134e-05, "loss": 0.0348, "step": 15400 }, { "epoch": 14.774688398849472, "grad_norm": 0.27570220828056335, "learning_rate": 3.890672752474646e-05, "loss": 0.0267, "step": 15410 }, { "epoch": 14.784276126558005, "grad_norm": 0.28451746702194214, "learning_rate": 3.884472648097276e-05, "loss": 0.029, "step": 15420 }, { "epoch": 14.793863854266538, "grad_norm": 0.2464732676744461, "learning_rate": 3.878274348340892e-05, "loss": 0.027, "step": 15430 }, { "epoch": 14.803451581975072, "grad_norm": 0.1651841551065445, "learning_rate": 3.872077863232665e-05, "loss": 0.0275, "step": 15440 }, { "epoch": 14.813039309683605, "grad_norm": 0.1864641159772873, "learning_rate": 3.865883202796829e-05, "loss": 0.028, "step": 15450 }, { "epoch": 14.822627037392138, "grad_norm": 0.40212348103523254, "learning_rate": 3.8596903770546636e-05, "loss": 0.0296, "step": 15460 }, { "epoch": 14.832214765100671, "grad_norm": 0.34442323446273804, "learning_rate": 3.853499396024486e-05, "loss": 0.0279, "step": 15470 }, { "epoch": 14.841802492809204, "grad_norm": 0.21626895666122437, "learning_rate": 3.8473102697216226e-05, "loss": 0.0298, "step": 15480 }, { "epoch": 14.851390220517738, "grad_norm": 0.22285476326942444, "learning_rate": 3.841123008158405e-05, "loss": 0.0265, "step": 15490 }, { "epoch": 14.860977948226271, "grad_norm": 0.330901563167572, "learning_rate": 3.8349376213441444e-05, "loss": 0.032, "step": 15500 }, { "epoch": 14.870565675934804, "grad_norm": 0.3265020251274109, "learning_rate": 3.828754119285123e-05, "loss": 0.0291, "step": 15510 }, { "epoch": 14.880153403643337, "grad_norm": 0.2532041668891907, "learning_rate": 3.822572511984569e-05, "loss": 0.0267, "step": 15520 }, { "epoch": 14.889741131351869, "grad_norm": 0.3086365759372711, "learning_rate": 3.816392809442649e-05, "loss": 0.036, "step": 15530 }, { "epoch": 14.899328859060402, "grad_norm": 0.22954832017421722, "learning_rate": 3.8102150216564484e-05, "loss": 0.0302, "step": 15540 }, { "epoch": 14.908916586768935, "grad_norm": 0.2649918496608734, "learning_rate": 3.804039158619951e-05, "loss": 0.037, "step": 15550 }, { "epoch": 14.918504314477468, "grad_norm": 0.22433148324489594, "learning_rate": 3.797865230324033e-05, "loss": 0.0258, "step": 15560 }, { "epoch": 14.928092042186002, "grad_norm": 0.2442513406276703, "learning_rate": 3.791693246756436e-05, "loss": 0.0289, "step": 15570 }, { "epoch": 14.937679769894535, "grad_norm": 0.22684846818447113, "learning_rate": 3.785523217901757e-05, "loss": 0.032, "step": 15580 }, { "epoch": 14.947267497603068, "grad_norm": 0.27900537848472595, "learning_rate": 3.7793551537414313e-05, "loss": 0.0284, "step": 15590 }, { "epoch": 14.956855225311601, "grad_norm": 0.29420506954193115, "learning_rate": 3.7731890642537154e-05, "loss": 0.0278, "step": 15600 }, { "epoch": 14.966442953020135, "grad_norm": 0.23950040340423584, "learning_rate": 3.76702495941367e-05, "loss": 0.03, "step": 15610 }, { "epoch": 14.976030680728668, "grad_norm": 0.3971647322177887, "learning_rate": 3.760862849193148e-05, "loss": 0.0324, "step": 15620 }, { "epoch": 14.985618408437201, "grad_norm": 0.18756671249866486, "learning_rate": 3.754702743560773e-05, "loss": 0.026, "step": 15630 }, { "epoch": 14.995206136145734, "grad_norm": 0.24370504915714264, "learning_rate": 3.748544652481927e-05, "loss": 0.0353, "step": 15640 }, { "epoch": 15.004793863854266, "grad_norm": 0.26173216104507446, "learning_rate": 3.742388585918733e-05, "loss": 0.0356, "step": 15650 }, { "epoch": 15.014381591562799, "grad_norm": 0.22543974220752716, "learning_rate": 3.736234553830038e-05, "loss": 0.0314, "step": 15660 }, { "epoch": 15.023969319271332, "grad_norm": 0.1632285714149475, "learning_rate": 3.7300825661714e-05, "loss": 0.0267, "step": 15670 }, { "epoch": 15.033557046979865, "grad_norm": 0.2474079430103302, "learning_rate": 3.723932632895067e-05, "loss": 0.0289, "step": 15680 }, { "epoch": 15.043144774688399, "grad_norm": 0.21004092693328857, "learning_rate": 3.717784763949964e-05, "loss": 0.0272, "step": 15690 }, { "epoch": 15.052732502396932, "grad_norm": 0.20469725131988525, "learning_rate": 3.7116389692816754e-05, "loss": 0.0282, "step": 15700 }, { "epoch": 15.062320230105465, "grad_norm": 0.4098300337791443, "learning_rate": 3.7054952588324364e-05, "loss": 0.0318, "step": 15710 }, { "epoch": 15.071907957813998, "grad_norm": 0.1645730435848236, "learning_rate": 3.699353642541103e-05, "loss": 0.0307, "step": 15720 }, { "epoch": 15.081495685522532, "grad_norm": 0.16053102910518646, "learning_rate": 3.693214130343148e-05, "loss": 0.0263, "step": 15730 }, { "epoch": 15.091083413231065, "grad_norm": 0.2607749104499817, "learning_rate": 3.687076732170635e-05, "loss": 0.0279, "step": 15740 }, { "epoch": 15.100671140939598, "grad_norm": 0.20249375700950623, "learning_rate": 3.680941457952214e-05, "loss": 0.031, "step": 15750 }, { "epoch": 15.110258868648131, "grad_norm": 0.17298898100852966, "learning_rate": 3.6748083176130955e-05, "loss": 0.0304, "step": 15760 }, { "epoch": 15.119846596356663, "grad_norm": 0.3816901743412018, "learning_rate": 3.6686773210750385e-05, "loss": 0.0267, "step": 15770 }, { "epoch": 15.129434324065196, "grad_norm": 0.26607292890548706, "learning_rate": 3.6625484782563345e-05, "loss": 0.0285, "step": 15780 }, { "epoch": 15.139022051773729, "grad_norm": 0.24211320281028748, "learning_rate": 3.656421799071791e-05, "loss": 0.0325, "step": 15790 }, { "epoch": 15.148609779482262, "grad_norm": 0.3071950376033783, "learning_rate": 3.650297293432713e-05, "loss": 0.0344, "step": 15800 }, { "epoch": 15.158197507190796, "grad_norm": 0.3314298689365387, "learning_rate": 3.6441749712468944e-05, "loss": 0.0297, "step": 15810 }, { "epoch": 15.167785234899329, "grad_norm": 0.2220297008752823, "learning_rate": 3.6380548424185894e-05, "loss": 0.0328, "step": 15820 }, { "epoch": 15.177372962607862, "grad_norm": 0.15199415385723114, "learning_rate": 3.6319369168485104e-05, "loss": 0.025, "step": 15830 }, { "epoch": 15.186960690316395, "grad_norm": 0.2900523841381073, "learning_rate": 3.625821204433803e-05, "loss": 0.0261, "step": 15840 }, { "epoch": 15.196548418024928, "grad_norm": 0.17855972051620483, "learning_rate": 3.61970771506803e-05, "loss": 0.034, "step": 15850 }, { "epoch": 15.206136145733462, "grad_norm": 0.35416078567504883, "learning_rate": 3.613596458641167e-05, "loss": 0.0362, "step": 15860 }, { "epoch": 15.215723873441995, "grad_norm": 0.21492497622966766, "learning_rate": 3.6074874450395666e-05, "loss": 0.0259, "step": 15870 }, { "epoch": 15.225311601150528, "grad_norm": 0.2749202847480774, "learning_rate": 3.6013806841459586e-05, "loss": 0.0257, "step": 15880 }, { "epoch": 15.234899328859061, "grad_norm": 0.16736426949501038, "learning_rate": 3.595276185839426e-05, "loss": 0.0328, "step": 15890 }, { "epoch": 15.244487056567593, "grad_norm": 0.2754712998867035, "learning_rate": 3.5891739599953945e-05, "loss": 0.0276, "step": 15900 }, { "epoch": 15.254074784276126, "grad_norm": 0.29541146755218506, "learning_rate": 3.583074016485611e-05, "loss": 0.031, "step": 15910 }, { "epoch": 15.26366251198466, "grad_norm": 0.26210564374923706, "learning_rate": 3.576976365178132e-05, "loss": 0.0325, "step": 15920 }, { "epoch": 15.273250239693192, "grad_norm": 0.2595176100730896, "learning_rate": 3.5708810159373044e-05, "loss": 0.0375, "step": 15930 }, { "epoch": 15.282837967401726, "grad_norm": 0.21411257982254028, "learning_rate": 3.564787978623753e-05, "loss": 0.0277, "step": 15940 }, { "epoch": 15.292425695110259, "grad_norm": 0.2823658585548401, "learning_rate": 3.5586972630943594e-05, "loss": 0.0259, "step": 15950 }, { "epoch": 15.302013422818792, "grad_norm": 0.2719429135322571, "learning_rate": 3.552608879202252e-05, "loss": 0.0295, "step": 15960 }, { "epoch": 15.311601150527325, "grad_norm": 0.2882955074310303, "learning_rate": 3.5465228367967854e-05, "loss": 0.0297, "step": 15970 }, { "epoch": 15.321188878235859, "grad_norm": 0.22343681752681732, "learning_rate": 3.540439145723529e-05, "loss": 0.0241, "step": 15980 }, { "epoch": 15.330776605944392, "grad_norm": 0.18314386904239655, "learning_rate": 3.534357815824243e-05, "loss": 0.0345, "step": 15990 }, { "epoch": 15.340364333652925, "grad_norm": 0.22451230883598328, "learning_rate": 3.528278856936874e-05, "loss": 0.0259, "step": 16000 }, { "epoch": 15.349952061361458, "grad_norm": 0.23394083976745605, "learning_rate": 3.52220227889553e-05, "loss": 0.0235, "step": 16010 }, { "epoch": 15.35953978906999, "grad_norm": 0.21090802550315857, "learning_rate": 3.516128091530469e-05, "loss": 0.0259, "step": 16020 }, { "epoch": 15.369127516778523, "grad_norm": 0.42782530188560486, "learning_rate": 3.5100563046680764e-05, "loss": 0.0297, "step": 16030 }, { "epoch": 15.378715244487056, "grad_norm": 0.2408047765493393, "learning_rate": 3.503986928130862e-05, "loss": 0.0287, "step": 16040 }, { "epoch": 15.38830297219559, "grad_norm": 0.24126370251178741, "learning_rate": 3.49791997173743e-05, "loss": 0.0295, "step": 16050 }, { "epoch": 15.397890699904123, "grad_norm": 0.28855326771736145, "learning_rate": 3.4918554453024746e-05, "loss": 0.0272, "step": 16060 }, { "epoch": 15.407478427612656, "grad_norm": 0.2622244358062744, "learning_rate": 3.485793358636753e-05, "loss": 0.0264, "step": 16070 }, { "epoch": 15.417066155321189, "grad_norm": 0.433159202337265, "learning_rate": 3.479733721547082e-05, "loss": 0.0331, "step": 16080 }, { "epoch": 15.426653883029722, "grad_norm": 0.35671567916870117, "learning_rate": 3.47367654383631e-05, "loss": 0.0309, "step": 16090 }, { "epoch": 15.436241610738255, "grad_norm": 0.2572173476219177, "learning_rate": 3.467621835303309e-05, "loss": 0.0299, "step": 16100 }, { "epoch": 15.445829338446789, "grad_norm": 0.3275107145309448, "learning_rate": 3.461569605742958e-05, "loss": 0.0258, "step": 16110 }, { "epoch": 15.455417066155322, "grad_norm": 0.38686898350715637, "learning_rate": 3.455519864946125e-05, "loss": 0.0281, "step": 16120 }, { "epoch": 15.465004793863855, "grad_norm": 0.5980708003044128, "learning_rate": 3.449472622699651e-05, "loss": 0.0266, "step": 16130 }, { "epoch": 15.474592521572387, "grad_norm": 0.1607155054807663, "learning_rate": 3.443427888786335e-05, "loss": 0.0286, "step": 16140 }, { "epoch": 15.48418024928092, "grad_norm": 0.19821766018867493, "learning_rate": 3.437385672984918e-05, "loss": 0.0299, "step": 16150 }, { "epoch": 15.493767976989453, "grad_norm": 0.27373266220092773, "learning_rate": 3.431345985070067e-05, "loss": 0.0387, "step": 16160 }, { "epoch": 15.503355704697986, "grad_norm": 0.24755899608135223, "learning_rate": 3.425308834812364e-05, "loss": 0.0268, "step": 16170 }, { "epoch": 15.51294343240652, "grad_norm": 0.34930139780044556, "learning_rate": 3.4192742319782805e-05, "loss": 0.0358, "step": 16180 }, { "epoch": 15.522531160115053, "grad_norm": 0.21849294006824493, "learning_rate": 3.413242186330168e-05, "loss": 0.0327, "step": 16190 }, { "epoch": 15.532118887823586, "grad_norm": 0.2413625419139862, "learning_rate": 3.407212707626243e-05, "loss": 0.0283, "step": 16200 }, { "epoch": 15.541706615532119, "grad_norm": 0.27283817529678345, "learning_rate": 3.401185805620568e-05, "loss": 0.0295, "step": 16210 }, { "epoch": 15.551294343240652, "grad_norm": 0.3242924213409424, "learning_rate": 3.395161490063037e-05, "loss": 0.0328, "step": 16220 }, { "epoch": 15.560882070949186, "grad_norm": 0.2872219383716583, "learning_rate": 3.38913977069936e-05, "loss": 0.0273, "step": 16230 }, { "epoch": 15.570469798657719, "grad_norm": 0.14021213352680206, "learning_rate": 3.3831206572710464e-05, "loss": 0.0271, "step": 16240 }, { "epoch": 15.580057526366252, "grad_norm": 0.19898459315299988, "learning_rate": 3.377104159515393e-05, "loss": 0.0299, "step": 16250 }, { "epoch": 15.589645254074785, "grad_norm": 0.2079470306634903, "learning_rate": 3.371090287165462e-05, "loss": 0.031, "step": 16260 }, { "epoch": 15.599232981783317, "grad_norm": 0.2817933261394501, "learning_rate": 3.3650790499500675e-05, "loss": 0.0273, "step": 16270 }, { "epoch": 15.60882070949185, "grad_norm": 0.20972701907157898, "learning_rate": 3.3590704575937655e-05, "loss": 0.0279, "step": 16280 }, { "epoch": 15.618408437200383, "grad_norm": 0.21050924062728882, "learning_rate": 3.3530645198168295e-05, "loss": 0.0327, "step": 16290 }, { "epoch": 15.627996164908916, "grad_norm": 0.33600106835365295, "learning_rate": 3.3470612463352376e-05, "loss": 0.0314, "step": 16300 }, { "epoch": 15.63758389261745, "grad_norm": 0.33707502484321594, "learning_rate": 3.341060646860659e-05, "loss": 0.029, "step": 16310 }, { "epoch": 15.647171620325983, "grad_norm": 0.2761129140853882, "learning_rate": 3.335062731100441e-05, "loss": 0.0271, "step": 16320 }, { "epoch": 15.656759348034516, "grad_norm": 0.2787131369113922, "learning_rate": 3.3290675087575856e-05, "loss": 0.0252, "step": 16330 }, { "epoch": 15.66634707574305, "grad_norm": 0.23235364258289337, "learning_rate": 3.3230749895307375e-05, "loss": 0.0357, "step": 16340 }, { "epoch": 15.675934803451582, "grad_norm": 0.22941578924655914, "learning_rate": 3.317085183114168e-05, "loss": 0.0223, "step": 16350 }, { "epoch": 15.685522531160116, "grad_norm": 0.2411498874425888, "learning_rate": 3.311098099197761e-05, "loss": 0.0271, "step": 16360 }, { "epoch": 15.695110258868649, "grad_norm": 0.35220983624458313, "learning_rate": 3.3051137474669966e-05, "loss": 0.0262, "step": 16370 }, { "epoch": 15.70469798657718, "grad_norm": 0.28711986541748047, "learning_rate": 3.299132137602934e-05, "loss": 0.0342, "step": 16380 }, { "epoch": 15.714285714285714, "grad_norm": 0.1615312546491623, "learning_rate": 3.293153279282199e-05, "loss": 0.0334, "step": 16390 }, { "epoch": 15.723873441994247, "grad_norm": 0.17363496124744415, "learning_rate": 3.287177182176961e-05, "loss": 0.0279, "step": 16400 }, { "epoch": 15.73346116970278, "grad_norm": 0.3049766421318054, "learning_rate": 3.2812038559549275e-05, "loss": 0.032, "step": 16410 }, { "epoch": 15.743048897411313, "grad_norm": 0.3206036686897278, "learning_rate": 3.275233310279321e-05, "loss": 0.0281, "step": 16420 }, { "epoch": 15.752636625119846, "grad_norm": 0.20691925287246704, "learning_rate": 3.2692655548088704e-05, "loss": 0.026, "step": 16430 }, { "epoch": 15.76222435282838, "grad_norm": 0.2701127827167511, "learning_rate": 3.263300599197781e-05, "loss": 0.0247, "step": 16440 }, { "epoch": 15.771812080536913, "grad_norm": 0.18183131515979767, "learning_rate": 3.2573384530957384e-05, "loss": 0.0249, "step": 16450 }, { "epoch": 15.781399808245446, "grad_norm": 0.260061115026474, "learning_rate": 3.251379126147877e-05, "loss": 0.0249, "step": 16460 }, { "epoch": 15.79098753595398, "grad_norm": 0.2887513041496277, "learning_rate": 3.245422627994777e-05, "loss": 0.0333, "step": 16470 }, { "epoch": 15.800575263662513, "grad_norm": 0.3020176887512207, "learning_rate": 3.239468968272436e-05, "loss": 0.0289, "step": 16480 }, { "epoch": 15.810162991371046, "grad_norm": 0.23766952753067017, "learning_rate": 3.233518156612262e-05, "loss": 0.0302, "step": 16490 }, { "epoch": 15.819750719079579, "grad_norm": 0.31386175751686096, "learning_rate": 3.227570202641056e-05, "loss": 0.0287, "step": 16500 }, { "epoch": 15.82933844678811, "grad_norm": 0.2746824026107788, "learning_rate": 3.2216251159809955e-05, "loss": 0.0293, "step": 16510 }, { "epoch": 15.838926174496644, "grad_norm": 0.21857379376888275, "learning_rate": 3.215682906249621e-05, "loss": 0.0294, "step": 16520 }, { "epoch": 15.848513902205177, "grad_norm": 0.16576367616653442, "learning_rate": 3.209743583059817e-05, "loss": 0.0271, "step": 16530 }, { "epoch": 15.85810162991371, "grad_norm": 0.31498968601226807, "learning_rate": 3.203807156019798e-05, "loss": 0.0312, "step": 16540 }, { "epoch": 15.867689357622243, "grad_norm": 0.2268988937139511, "learning_rate": 3.197873634733096e-05, "loss": 0.0309, "step": 16550 }, { "epoch": 15.877277085330777, "grad_norm": 0.2843955159187317, "learning_rate": 3.1919430287985415e-05, "loss": 0.0271, "step": 16560 }, { "epoch": 15.88686481303931, "grad_norm": 0.270082026720047, "learning_rate": 3.186015347810245e-05, "loss": 0.0267, "step": 16570 }, { "epoch": 15.896452540747843, "grad_norm": 0.13555888831615448, "learning_rate": 3.18009060135759e-05, "loss": 0.0303, "step": 16580 }, { "epoch": 15.906040268456376, "grad_norm": 0.5174959301948547, "learning_rate": 3.17416879902521e-05, "loss": 0.0298, "step": 16590 }, { "epoch": 15.91562799616491, "grad_norm": 0.23616893589496613, "learning_rate": 3.168249950392978e-05, "loss": 0.026, "step": 16600 }, { "epoch": 15.925215723873443, "grad_norm": 0.2044319212436676, "learning_rate": 3.162334065035985e-05, "loss": 0.0294, "step": 16610 }, { "epoch": 15.934803451581976, "grad_norm": 0.2839745879173279, "learning_rate": 3.156421152524532e-05, "loss": 0.0311, "step": 16620 }, { "epoch": 15.944391179290509, "grad_norm": 0.28521618247032166, "learning_rate": 3.150511222424111e-05, "loss": 0.029, "step": 16630 }, { "epoch": 15.95397890699904, "grad_norm": 0.4045862555503845, "learning_rate": 3.1446042842953845e-05, "loss": 0.0347, "step": 16640 }, { "epoch": 15.963566634707574, "grad_norm": 0.2557837963104248, "learning_rate": 3.138700347694179e-05, "loss": 0.0211, "step": 16650 }, { "epoch": 15.973154362416107, "grad_norm": 0.23164719343185425, "learning_rate": 3.132799422171464e-05, "loss": 0.0273, "step": 16660 }, { "epoch": 15.98274209012464, "grad_norm": 0.17888516187667847, "learning_rate": 3.126901517273339e-05, "loss": 0.0252, "step": 16670 }, { "epoch": 15.992329817833173, "grad_norm": 0.2732132077217102, "learning_rate": 3.121006642541014e-05, "loss": 0.0259, "step": 16680 }, { "epoch": 16.001917545541705, "grad_norm": 0.3599238693714142, "learning_rate": 3.115114807510803e-05, "loss": 0.0292, "step": 16690 }, { "epoch": 16.01150527325024, "grad_norm": 0.18428216874599457, "learning_rate": 3.109226021714093e-05, "loss": 0.0238, "step": 16700 }, { "epoch": 16.02109300095877, "grad_norm": 0.1668870896100998, "learning_rate": 3.1033402946773474e-05, "loss": 0.0276, "step": 16710 }, { "epoch": 16.030680728667306, "grad_norm": 0.2498198300600052, "learning_rate": 3.097457635922077e-05, "loss": 0.0326, "step": 16720 }, { "epoch": 16.040268456375838, "grad_norm": 0.27348780632019043, "learning_rate": 3.09157805496483e-05, "loss": 0.0337, "step": 16730 }, { "epoch": 16.049856184084373, "grad_norm": 0.3426136076450348, "learning_rate": 3.085701561317174e-05, "loss": 0.027, "step": 16740 }, { "epoch": 16.059443911792904, "grad_norm": 0.1942438781261444, "learning_rate": 3.079828164485684e-05, "loss": 0.0231, "step": 16750 }, { "epoch": 16.06903163950144, "grad_norm": 0.3608817160129547, "learning_rate": 3.073957873971925e-05, "loss": 0.0246, "step": 16760 }, { "epoch": 16.07861936720997, "grad_norm": 0.2943773567676544, "learning_rate": 3.068090699272436e-05, "loss": 0.033, "step": 16770 }, { "epoch": 16.088207094918506, "grad_norm": 0.3121021091938019, "learning_rate": 3.062226649878717e-05, "loss": 0.0228, "step": 16780 }, { "epoch": 16.097794822627037, "grad_norm": 0.2769118547439575, "learning_rate": 3.056365735277209e-05, "loss": 0.0228, "step": 16790 }, { "epoch": 16.107382550335572, "grad_norm": 0.2802489995956421, "learning_rate": 3.0505079649492853e-05, "loss": 0.0281, "step": 16800 }, { "epoch": 16.116970278044104, "grad_norm": 0.27936017513275146, "learning_rate": 3.0446533483712304e-05, "loss": 0.0285, "step": 16810 }, { "epoch": 16.126558005752635, "grad_norm": 0.2072148621082306, "learning_rate": 3.038801895014229e-05, "loss": 0.0295, "step": 16820 }, { "epoch": 16.13614573346117, "grad_norm": 0.2498210370540619, "learning_rate": 3.0329536143443444e-05, "loss": 0.0292, "step": 16830 }, { "epoch": 16.1457334611697, "grad_norm": 0.274496853351593, "learning_rate": 3.027108515822511e-05, "loss": 0.0292, "step": 16840 }, { "epoch": 16.155321188878236, "grad_norm": 0.40636447072029114, "learning_rate": 3.0212666089045155e-05, "loss": 0.0281, "step": 16850 }, { "epoch": 16.164908916586768, "grad_norm": 0.22214102745056152, "learning_rate": 3.0154279030409794e-05, "loss": 0.0218, "step": 16860 }, { "epoch": 16.174496644295303, "grad_norm": 0.26967325806617737, "learning_rate": 3.0095924076773467e-05, "loss": 0.0255, "step": 16870 }, { "epoch": 16.184084372003834, "grad_norm": 0.23795704543590546, "learning_rate": 3.003760132253868e-05, "loss": 0.0327, "step": 16880 }, { "epoch": 16.19367209971237, "grad_norm": 0.1818399578332901, "learning_rate": 2.9979310862055842e-05, "loss": 0.0312, "step": 16890 }, { "epoch": 16.2032598274209, "grad_norm": 0.23240040242671967, "learning_rate": 2.9921052789623137e-05, "loss": 0.0294, "step": 16900 }, { "epoch": 16.212847555129436, "grad_norm": 0.20948819816112518, "learning_rate": 2.9862827199486327e-05, "loss": 0.0271, "step": 16910 }, { "epoch": 16.222435282837967, "grad_norm": 0.20456750690937042, "learning_rate": 2.9804634185838614e-05, "loss": 0.0258, "step": 16920 }, { "epoch": 16.232023010546502, "grad_norm": 0.2674747705459595, "learning_rate": 2.9746473842820578e-05, "loss": 0.0287, "step": 16930 }, { "epoch": 16.241610738255034, "grad_norm": 0.19764818251132965, "learning_rate": 2.9688346264519866e-05, "loss": 0.0284, "step": 16940 }, { "epoch": 16.251198465963565, "grad_norm": 0.3688560426235199, "learning_rate": 2.9630251544971165e-05, "loss": 0.0289, "step": 16950 }, { "epoch": 16.2607861936721, "grad_norm": 0.34308168292045593, "learning_rate": 2.957218977815598e-05, "loss": 0.0289, "step": 16960 }, { "epoch": 16.27037392138063, "grad_norm": 0.3008866608142853, "learning_rate": 2.9514161058002498e-05, "loss": 0.0307, "step": 16970 }, { "epoch": 16.279961649089167, "grad_norm": 0.12983451783657074, "learning_rate": 2.9456165478385494e-05, "loss": 0.0232, "step": 16980 }, { "epoch": 16.289549376797698, "grad_norm": 0.14965233206748962, "learning_rate": 2.9398203133126085e-05, "loss": 0.0248, "step": 16990 }, { "epoch": 16.299137104506233, "grad_norm": 0.256956547498703, "learning_rate": 2.9340274115991638e-05, "loss": 0.0348, "step": 17000 }, { "epoch": 16.308724832214764, "grad_norm": 0.18191608786582947, "learning_rate": 2.9282378520695618e-05, "loss": 0.0292, "step": 17010 }, { "epoch": 16.3183125599233, "grad_norm": 0.20375274121761322, "learning_rate": 2.922451644089741e-05, "loss": 0.0282, "step": 17020 }, { "epoch": 16.32790028763183, "grad_norm": 0.24703994393348694, "learning_rate": 2.9166687970202177e-05, "loss": 0.0335, "step": 17030 }, { "epoch": 16.337488015340366, "grad_norm": 0.266993910074234, "learning_rate": 2.9108893202160702e-05, "loss": 0.021, "step": 17040 }, { "epoch": 16.347075743048897, "grad_norm": 0.42793118953704834, "learning_rate": 2.9051132230269272e-05, "loss": 0.0257, "step": 17050 }, { "epoch": 16.35666347075743, "grad_norm": 0.36531713604927063, "learning_rate": 2.8993405147969493e-05, "loss": 0.0322, "step": 17060 }, { "epoch": 16.366251198465964, "grad_norm": 0.21013452112674713, "learning_rate": 2.8935712048648112e-05, "loss": 0.0278, "step": 17070 }, { "epoch": 16.375838926174495, "grad_norm": 0.1972169280052185, "learning_rate": 2.8878053025636975e-05, "loss": 0.025, "step": 17080 }, { "epoch": 16.38542665388303, "grad_norm": 0.2844037115573883, "learning_rate": 2.882042817221273e-05, "loss": 0.0265, "step": 17090 }, { "epoch": 16.39501438159156, "grad_norm": 0.18470896780490875, "learning_rate": 2.8762837581596792e-05, "loss": 0.0234, "step": 17100 }, { "epoch": 16.404602109300097, "grad_norm": 0.27581846714019775, "learning_rate": 2.8705281346955116e-05, "loss": 0.0303, "step": 17110 }, { "epoch": 16.414189837008628, "grad_norm": 0.27025681734085083, "learning_rate": 2.86477595613981e-05, "loss": 0.0309, "step": 17120 }, { "epoch": 16.423777564717163, "grad_norm": 0.35465800762176514, "learning_rate": 2.8590272317980437e-05, "loss": 0.0318, "step": 17130 }, { "epoch": 16.433365292425695, "grad_norm": 0.2873314917087555, "learning_rate": 2.8532819709700854e-05, "loss": 0.0335, "step": 17140 }, { "epoch": 16.44295302013423, "grad_norm": 0.3287470042705536, "learning_rate": 2.8475401829502124e-05, "loss": 0.0308, "step": 17150 }, { "epoch": 16.45254074784276, "grad_norm": 0.18719346821308136, "learning_rate": 2.841801877027083e-05, "loss": 0.0297, "step": 17160 }, { "epoch": 16.462128475551296, "grad_norm": 0.16801686584949493, "learning_rate": 2.836067062483721e-05, "loss": 0.026, "step": 17170 }, { "epoch": 16.471716203259827, "grad_norm": 0.3017866909503937, "learning_rate": 2.830335748597502e-05, "loss": 0.0298, "step": 17180 }, { "epoch": 16.48130393096836, "grad_norm": 0.16507741808891296, "learning_rate": 2.8246079446401386e-05, "loss": 0.028, "step": 17190 }, { "epoch": 16.490891658676894, "grad_norm": 0.25729814171791077, "learning_rate": 2.8188836598776662e-05, "loss": 0.0291, "step": 17200 }, { "epoch": 16.500479386385425, "grad_norm": 0.36721915006637573, "learning_rate": 2.8131629035704264e-05, "loss": 0.0324, "step": 17210 }, { "epoch": 16.51006711409396, "grad_norm": 5.430606365203857, "learning_rate": 2.8074456849730507e-05, "loss": 0.026, "step": 17220 }, { "epoch": 16.51965484180249, "grad_norm": 0.18490955233573914, "learning_rate": 2.8017320133344533e-05, "loss": 0.0265, "step": 17230 }, { "epoch": 16.529242569511027, "grad_norm": 0.17146821320056915, "learning_rate": 2.7960218978978047e-05, "loss": 0.0293, "step": 17240 }, { "epoch": 16.538830297219558, "grad_norm": 0.21457697451114655, "learning_rate": 2.7903153479005255e-05, "loss": 0.0294, "step": 17250 }, { "epoch": 16.548418024928093, "grad_norm": 0.2303658127784729, "learning_rate": 2.7846123725742678e-05, "loss": 0.0278, "step": 17260 }, { "epoch": 16.558005752636625, "grad_norm": 0.20711682736873627, "learning_rate": 2.778912981144898e-05, "loss": 0.0245, "step": 17270 }, { "epoch": 16.56759348034516, "grad_norm": 0.2282470464706421, "learning_rate": 2.7732171828324872e-05, "loss": 0.029, "step": 17280 }, { "epoch": 16.57718120805369, "grad_norm": 0.27450570464134216, "learning_rate": 2.7675249868512954e-05, "loss": 0.036, "step": 17290 }, { "epoch": 16.586768935762223, "grad_norm": 0.18990963697433472, "learning_rate": 2.761836402409752e-05, "loss": 0.0362, "step": 17300 }, { "epoch": 16.596356663470758, "grad_norm": 0.19880448281764984, "learning_rate": 2.7561514387104464e-05, "loss": 0.0283, "step": 17310 }, { "epoch": 16.60594439117929, "grad_norm": 0.2031632959842682, "learning_rate": 2.750470104950109e-05, "loss": 0.0253, "step": 17320 }, { "epoch": 16.615532118887824, "grad_norm": 0.5270239114761353, "learning_rate": 2.7447924103195976e-05, "loss": 0.0278, "step": 17330 }, { "epoch": 16.625119846596355, "grad_norm": 0.29472750425338745, "learning_rate": 2.7391183640038847e-05, "loss": 0.0284, "step": 17340 }, { "epoch": 16.63470757430489, "grad_norm": 0.21734996140003204, "learning_rate": 2.7334479751820396e-05, "loss": 0.0294, "step": 17350 }, { "epoch": 16.644295302013422, "grad_norm": 0.29278430342674255, "learning_rate": 2.7277812530272147e-05, "loss": 0.0297, "step": 17360 }, { "epoch": 16.653883029721957, "grad_norm": 0.2573314309120178, "learning_rate": 2.7221182067066307e-05, "loss": 0.0241, "step": 17370 }, { "epoch": 16.66347075743049, "grad_norm": 0.23133955895900726, "learning_rate": 2.7164588453815602e-05, "loss": 0.0258, "step": 17380 }, { "epoch": 16.673058485139023, "grad_norm": 0.20745334029197693, "learning_rate": 2.710803178207323e-05, "loss": 0.0242, "step": 17390 }, { "epoch": 16.682646212847555, "grad_norm": 0.22852954268455505, "learning_rate": 2.7051512143332492e-05, "loss": 0.027, "step": 17400 }, { "epoch": 16.69223394055609, "grad_norm": 0.25844722986221313, "learning_rate": 2.6995029629026874e-05, "loss": 0.0244, "step": 17410 }, { "epoch": 16.70182166826462, "grad_norm": 0.23631109297275543, "learning_rate": 2.6938584330529782e-05, "loss": 0.0215, "step": 17420 }, { "epoch": 16.711409395973153, "grad_norm": 0.27872714400291443, "learning_rate": 2.6882176339154404e-05, "loss": 0.0308, "step": 17430 }, { "epoch": 16.720997123681688, "grad_norm": 0.23717211186885834, "learning_rate": 2.6825805746153594e-05, "loss": 0.0266, "step": 17440 }, { "epoch": 16.73058485139022, "grad_norm": 0.281259685754776, "learning_rate": 2.6769472642719695e-05, "loss": 0.0329, "step": 17450 }, { "epoch": 16.740172579098754, "grad_norm": 0.257068932056427, "learning_rate": 2.67131771199844e-05, "loss": 0.0245, "step": 17460 }, { "epoch": 16.749760306807286, "grad_norm": 0.18098169565200806, "learning_rate": 2.665691926901862e-05, "loss": 0.0284, "step": 17470 }, { "epoch": 16.75934803451582, "grad_norm": 0.23477615416049957, "learning_rate": 2.6600699180832307e-05, "loss": 0.026, "step": 17480 }, { "epoch": 16.768935762224352, "grad_norm": 0.24687384068965912, "learning_rate": 2.654451694637433e-05, "loss": 0.0255, "step": 17490 }, { "epoch": 16.778523489932887, "grad_norm": 0.2607274651527405, "learning_rate": 2.6488372656532322e-05, "loss": 0.0294, "step": 17500 }, { "epoch": 16.78811121764142, "grad_norm": 0.4215647578239441, "learning_rate": 2.6432266402132532e-05, "loss": 0.0283, "step": 17510 }, { "epoch": 16.797698945349953, "grad_norm": 0.20454095304012299, "learning_rate": 2.637619827393968e-05, "loss": 0.0306, "step": 17520 }, { "epoch": 16.807286673058485, "grad_norm": 0.19789418578147888, "learning_rate": 2.6320168362656796e-05, "loss": 0.025, "step": 17530 }, { "epoch": 16.81687440076702, "grad_norm": 0.34662866592407227, "learning_rate": 2.6264176758925098e-05, "loss": 0.0317, "step": 17540 }, { "epoch": 16.82646212847555, "grad_norm": 0.20395246148109436, "learning_rate": 2.620822355332383e-05, "loss": 0.0306, "step": 17550 }, { "epoch": 16.836049856184083, "grad_norm": 0.39246705174446106, "learning_rate": 2.615230883637012e-05, "loss": 0.0259, "step": 17560 }, { "epoch": 16.845637583892618, "grad_norm": 0.22869329154491425, "learning_rate": 2.609643269851883e-05, "loss": 0.0285, "step": 17570 }, { "epoch": 16.85522531160115, "grad_norm": 0.3232511281967163, "learning_rate": 2.60405952301624e-05, "loss": 0.0288, "step": 17580 }, { "epoch": 16.864813039309684, "grad_norm": 0.2171912044286728, "learning_rate": 2.5984796521630737e-05, "loss": 0.0249, "step": 17590 }, { "epoch": 16.874400767018216, "grad_norm": 0.28310737013816833, "learning_rate": 2.592903666319103e-05, "loss": 0.0295, "step": 17600 }, { "epoch": 16.88398849472675, "grad_norm": 0.19829969108104706, "learning_rate": 2.587331574504761e-05, "loss": 0.025, "step": 17610 }, { "epoch": 16.893576222435282, "grad_norm": 0.1657049059867859, "learning_rate": 2.581763385734183e-05, "loss": 0.0244, "step": 17620 }, { "epoch": 16.903163950143817, "grad_norm": 0.256913959980011, "learning_rate": 2.5761991090151906e-05, "loss": 0.0306, "step": 17630 }, { "epoch": 16.91275167785235, "grad_norm": 0.2738933861255646, "learning_rate": 2.5706387533492737e-05, "loss": 0.0326, "step": 17640 }, { "epoch": 16.922339405560884, "grad_norm": 0.2700929343700409, "learning_rate": 2.5650823277315837e-05, "loss": 0.0313, "step": 17650 }, { "epoch": 16.931927133269415, "grad_norm": 0.2965131103992462, "learning_rate": 2.5595298411509094e-05, "loss": 0.0275, "step": 17660 }, { "epoch": 16.941514860977946, "grad_norm": 0.3247256278991699, "learning_rate": 2.553981302589671e-05, "loss": 0.0326, "step": 17670 }, { "epoch": 16.95110258868648, "grad_norm": 0.30926892161369324, "learning_rate": 2.5484367210239e-05, "loss": 0.0297, "step": 17680 }, { "epoch": 16.960690316395013, "grad_norm": 0.15213845670223236, "learning_rate": 2.5428961054232264e-05, "loss": 0.0271, "step": 17690 }, { "epoch": 16.970278044103548, "grad_norm": 0.20840811729431152, "learning_rate": 2.537359464750866e-05, "loss": 0.0273, "step": 17700 }, { "epoch": 16.97986577181208, "grad_norm": 0.21467389166355133, "learning_rate": 2.5318268079636022e-05, "loss": 0.0314, "step": 17710 }, { "epoch": 16.989453499520614, "grad_norm": 0.2677682638168335, "learning_rate": 2.526298144011775e-05, "loss": 0.0238, "step": 17720 }, { "epoch": 16.999041227229146, "grad_norm": 0.16417664289474487, "learning_rate": 2.5207734818392648e-05, "loss": 0.0258, "step": 17730 }, { "epoch": 17.00862895493768, "grad_norm": 0.20820669829845428, "learning_rate": 2.5152528303834777e-05, "loss": 0.0329, "step": 17740 }, { "epoch": 17.018216682646212, "grad_norm": 0.19568829238414764, "learning_rate": 2.5097361985753316e-05, "loss": 0.0269, "step": 17750 }, { "epoch": 17.027804410354747, "grad_norm": 0.1650926023721695, "learning_rate": 2.5042235953392423e-05, "loss": 0.026, "step": 17760 }, { "epoch": 17.03739213806328, "grad_norm": 0.16357482969760895, "learning_rate": 2.4987150295931082e-05, "loss": 0.0305, "step": 17770 }, { "epoch": 17.046979865771814, "grad_norm": 0.22878289222717285, "learning_rate": 2.4932105102482955e-05, "loss": 0.0276, "step": 17780 }, { "epoch": 17.056567593480345, "grad_norm": 0.2666637599468231, "learning_rate": 2.487710046209626e-05, "loss": 0.0278, "step": 17790 }, { "epoch": 17.066155321188877, "grad_norm": 0.2581173777580261, "learning_rate": 2.4822136463753594e-05, "loss": 0.0285, "step": 17800 }, { "epoch": 17.07574304889741, "grad_norm": 0.19729219377040863, "learning_rate": 2.4767213196371813e-05, "loss": 0.0251, "step": 17810 }, { "epoch": 17.085330776605943, "grad_norm": 0.21068008244037628, "learning_rate": 2.47123307488019e-05, "loss": 0.0218, "step": 17820 }, { "epoch": 17.094918504314478, "grad_norm": 0.21502196788787842, "learning_rate": 2.465748920982873e-05, "loss": 0.0244, "step": 17830 }, { "epoch": 17.10450623202301, "grad_norm": 0.20099669694900513, "learning_rate": 2.4602688668171103e-05, "loss": 0.0299, "step": 17840 }, { "epoch": 17.114093959731544, "grad_norm": 0.6751896739006042, "learning_rate": 2.4547929212481435e-05, "loss": 0.0386, "step": 17850 }, { "epoch": 17.123681687440076, "grad_norm": 0.32390302419662476, "learning_rate": 2.4493210931345684e-05, "loss": 0.029, "step": 17860 }, { "epoch": 17.13326941514861, "grad_norm": 0.31073060631752014, "learning_rate": 2.4438533913283206e-05, "loss": 0.0232, "step": 17870 }, { "epoch": 17.142857142857142, "grad_norm": 0.17248332500457764, "learning_rate": 2.4383898246746596e-05, "loss": 0.0214, "step": 17880 }, { "epoch": 17.152444870565677, "grad_norm": 0.33149340748786926, "learning_rate": 2.4329304020121558e-05, "loss": 0.0298, "step": 17890 }, { "epoch": 17.16203259827421, "grad_norm": 0.2364264875650406, "learning_rate": 2.4274751321726762e-05, "loss": 0.0333, "step": 17900 }, { "epoch": 17.171620325982744, "grad_norm": 0.15520252287387848, "learning_rate": 2.4220240239813684e-05, "loss": 0.0196, "step": 17910 }, { "epoch": 17.181208053691275, "grad_norm": 0.23256506025791168, "learning_rate": 2.4165770862566494e-05, "loss": 0.029, "step": 17920 }, { "epoch": 17.190795781399807, "grad_norm": 0.17074307799339294, "learning_rate": 2.4111343278101884e-05, "loss": 0.0302, "step": 17930 }, { "epoch": 17.20038350910834, "grad_norm": 0.24341343343257904, "learning_rate": 2.4056957574468932e-05, "loss": 0.0296, "step": 17940 }, { "epoch": 17.209971236816873, "grad_norm": 0.1940905898809433, "learning_rate": 2.4002613839648987e-05, "loss": 0.029, "step": 17950 }, { "epoch": 17.219558964525408, "grad_norm": 0.194035604596138, "learning_rate": 2.3948312161555453e-05, "loss": 0.0297, "step": 17960 }, { "epoch": 17.22914669223394, "grad_norm": 0.14753536880016327, "learning_rate": 2.389405262803375e-05, "loss": 0.0259, "step": 17970 }, { "epoch": 17.238734419942475, "grad_norm": 0.18068645894527435, "learning_rate": 2.3839835326861104e-05, "loss": 0.0284, "step": 17980 }, { "epoch": 17.248322147651006, "grad_norm": 0.33698755502700806, "learning_rate": 2.378566034574639e-05, "loss": 0.0289, "step": 17990 }, { "epoch": 17.25790987535954, "grad_norm": 0.2708437144756317, "learning_rate": 2.3731527772330098e-05, "loss": 0.0252, "step": 18000 }, { "epoch": 17.267497603068072, "grad_norm": 0.37091711163520813, "learning_rate": 2.367743769418403e-05, "loss": 0.031, "step": 18010 }, { "epoch": 17.277085330776607, "grad_norm": 0.22311721742153168, "learning_rate": 2.362339019881129e-05, "loss": 0.0356, "step": 18020 }, { "epoch": 17.28667305848514, "grad_norm": 0.3006376624107361, "learning_rate": 2.3569385373646068e-05, "loss": 0.0283, "step": 18030 }, { "epoch": 17.29626078619367, "grad_norm": 0.2278210073709488, "learning_rate": 2.351542330605355e-05, "loss": 0.0292, "step": 18040 }, { "epoch": 17.305848513902205, "grad_norm": 0.1900917887687683, "learning_rate": 2.3461504083329732e-05, "loss": 0.0293, "step": 18050 }, { "epoch": 17.315436241610737, "grad_norm": 0.36089229583740234, "learning_rate": 2.340762779270131e-05, "loss": 0.0335, "step": 18060 }, { "epoch": 17.325023969319272, "grad_norm": 0.20157793164253235, "learning_rate": 2.3353794521325516e-05, "loss": 0.0224, "step": 18070 }, { "epoch": 17.334611697027803, "grad_norm": 0.25802189111709595, "learning_rate": 2.330000435629002e-05, "loss": 0.0241, "step": 18080 }, { "epoch": 17.34419942473634, "grad_norm": 0.19763995707035065, "learning_rate": 2.32462573846127e-05, "loss": 0.0324, "step": 18090 }, { "epoch": 17.35378715244487, "grad_norm": 0.24877896904945374, "learning_rate": 2.319255369324161e-05, "loss": 0.0297, "step": 18100 }, { "epoch": 17.363374880153405, "grad_norm": 0.23094792664051056, "learning_rate": 2.3138893369054766e-05, "loss": 0.0279, "step": 18110 }, { "epoch": 17.372962607861936, "grad_norm": 0.1878676414489746, "learning_rate": 2.3085276498860032e-05, "loss": 0.0278, "step": 18120 }, { "epoch": 17.38255033557047, "grad_norm": 0.20479904115200043, "learning_rate": 2.3031703169394985e-05, "loss": 0.0263, "step": 18130 }, { "epoch": 17.392138063279003, "grad_norm": 0.3048153519630432, "learning_rate": 2.2978173467326724e-05, "loss": 0.0282, "step": 18140 }, { "epoch": 17.401725790987538, "grad_norm": 0.2260926365852356, "learning_rate": 2.292468747925185e-05, "loss": 0.0282, "step": 18150 }, { "epoch": 17.41131351869607, "grad_norm": 0.23683381080627441, "learning_rate": 2.287124529169618e-05, "loss": 0.0255, "step": 18160 }, { "epoch": 17.4209012464046, "grad_norm": 0.21933788061141968, "learning_rate": 2.2817846991114684e-05, "loss": 0.0259, "step": 18170 }, { "epoch": 17.430488974113135, "grad_norm": 0.2983873784542084, "learning_rate": 2.2764492663891353e-05, "loss": 0.0294, "step": 18180 }, { "epoch": 17.440076701821667, "grad_norm": 0.2740059792995453, "learning_rate": 2.271118239633902e-05, "loss": 0.0292, "step": 18190 }, { "epoch": 17.449664429530202, "grad_norm": 0.18633967638015747, "learning_rate": 2.2657916274699265e-05, "loss": 0.024, "step": 18200 }, { "epoch": 17.459252157238733, "grad_norm": 0.21379147469997406, "learning_rate": 2.2604694385142233e-05, "loss": 0.0245, "step": 18210 }, { "epoch": 17.46883988494727, "grad_norm": 0.2814527153968811, "learning_rate": 2.2551516813766538e-05, "loss": 0.0264, "step": 18220 }, { "epoch": 17.4784276126558, "grad_norm": 0.18947578966617584, "learning_rate": 2.2498383646599048e-05, "loss": 0.0222, "step": 18230 }, { "epoch": 17.488015340364335, "grad_norm": 0.41355225443840027, "learning_rate": 2.2445294969594844e-05, "loss": 0.0285, "step": 18240 }, { "epoch": 17.497603068072866, "grad_norm": 0.4395101070404053, "learning_rate": 2.2392250868637026e-05, "loss": 0.0301, "step": 18250 }, { "epoch": 17.5071907957814, "grad_norm": 0.1704569309949875, "learning_rate": 2.233925142953657e-05, "loss": 0.0236, "step": 18260 }, { "epoch": 17.516778523489933, "grad_norm": 6.209451198577881, "learning_rate": 2.2286296738032214e-05, "loss": 0.03, "step": 18270 }, { "epoch": 17.526366251198468, "grad_norm": 0.5336940884590149, "learning_rate": 2.223338687979029e-05, "loss": 0.024, "step": 18280 }, { "epoch": 17.535953978907, "grad_norm": 0.2711230516433716, "learning_rate": 2.2180521940404607e-05, "loss": 0.025, "step": 18290 }, { "epoch": 17.54554170661553, "grad_norm": 0.35838785767555237, "learning_rate": 2.212770200539634e-05, "loss": 0.0328, "step": 18300 }, { "epoch": 17.555129434324066, "grad_norm": 0.2138790637254715, "learning_rate": 2.207492716021381e-05, "loss": 0.0272, "step": 18310 }, { "epoch": 17.564717162032597, "grad_norm": 0.18834197521209717, "learning_rate": 2.2022197490232427e-05, "loss": 0.0266, "step": 18320 }, { "epoch": 17.574304889741132, "grad_norm": 0.28788337111473083, "learning_rate": 2.1969513080754504e-05, "loss": 0.0247, "step": 18330 }, { "epoch": 17.583892617449663, "grad_norm": 0.1590379774570465, "learning_rate": 2.1916874017009136e-05, "loss": 0.0233, "step": 18340 }, { "epoch": 17.5934803451582, "grad_norm": 0.2774651050567627, "learning_rate": 2.186428038415209e-05, "loss": 0.022, "step": 18350 }, { "epoch": 17.60306807286673, "grad_norm": 0.18108907341957092, "learning_rate": 2.1811732267265577e-05, "loss": 0.0228, "step": 18360 }, { "epoch": 17.612655800575265, "grad_norm": 0.2790849208831787, "learning_rate": 2.1759229751358217e-05, "loss": 0.0295, "step": 18370 }, { "epoch": 17.622243528283796, "grad_norm": 0.1974640190601349, "learning_rate": 2.170677292136487e-05, "loss": 0.0265, "step": 18380 }, { "epoch": 17.63183125599233, "grad_norm": 0.2952618896961212, "learning_rate": 2.1654361862146465e-05, "loss": 0.0257, "step": 18390 }, { "epoch": 17.641418983700863, "grad_norm": 0.21564097702503204, "learning_rate": 2.160199665848989e-05, "loss": 0.0286, "step": 18400 }, { "epoch": 17.651006711409394, "grad_norm": 0.2616369426250458, "learning_rate": 2.154967739510787e-05, "loss": 0.0265, "step": 18410 }, { "epoch": 17.66059443911793, "grad_norm": 0.22359015047550201, "learning_rate": 2.1497404156638784e-05, "loss": 0.0217, "step": 18420 }, { "epoch": 17.67018216682646, "grad_norm": 0.26012542843818665, "learning_rate": 2.144517702764657e-05, "loss": 0.0265, "step": 18430 }, { "epoch": 17.679769894534996, "grad_norm": 0.13236083090305328, "learning_rate": 2.1392996092620555e-05, "loss": 0.0203, "step": 18440 }, { "epoch": 17.689357622243527, "grad_norm": 0.23233279585838318, "learning_rate": 2.1340861435975384e-05, "loss": 0.0239, "step": 18450 }, { "epoch": 17.698945349952062, "grad_norm": 0.22985659539699554, "learning_rate": 2.1288773142050794e-05, "loss": 0.026, "step": 18460 }, { "epoch": 17.708533077660594, "grad_norm": 0.2680293321609497, "learning_rate": 2.123673129511152e-05, "loss": 0.0307, "step": 18470 }, { "epoch": 17.71812080536913, "grad_norm": 0.23979081213474274, "learning_rate": 2.1184735979347205e-05, "loss": 0.0251, "step": 18480 }, { "epoch": 17.72770853307766, "grad_norm": 0.2722991704940796, "learning_rate": 2.113278727887213e-05, "loss": 0.0301, "step": 18490 }, { "epoch": 17.737296260786195, "grad_norm": 0.22843940556049347, "learning_rate": 2.1080885277725236e-05, "loss": 0.0228, "step": 18500 }, { "epoch": 17.746883988494726, "grad_norm": 0.34953558444976807, "learning_rate": 2.1029030059869898e-05, "loss": 0.0296, "step": 18510 }, { "epoch": 17.75647171620326, "grad_norm": 0.12219765037298203, "learning_rate": 2.0977221709193813e-05, "loss": 0.0271, "step": 18520 }, { "epoch": 17.766059443911793, "grad_norm": 0.33025461435317993, "learning_rate": 2.0925460309508843e-05, "loss": 0.0305, "step": 18530 }, { "epoch": 17.775647171620324, "grad_norm": 0.3049762547016144, "learning_rate": 2.087374594455092e-05, "loss": 0.0316, "step": 18540 }, { "epoch": 17.78523489932886, "grad_norm": 0.3146844506263733, "learning_rate": 2.082207869797987e-05, "loss": 0.0272, "step": 18550 }, { "epoch": 17.79482262703739, "grad_norm": 0.18491698801517487, "learning_rate": 2.0770458653379286e-05, "loss": 0.0281, "step": 18560 }, { "epoch": 17.804410354745926, "grad_norm": 0.21474412083625793, "learning_rate": 2.0718885894256428e-05, "loss": 0.0238, "step": 18570 }, { "epoch": 17.813998082454457, "grad_norm": 0.1813114583492279, "learning_rate": 2.0667360504042045e-05, "loss": 0.027, "step": 18580 }, { "epoch": 17.823585810162992, "grad_norm": 0.36077314615249634, "learning_rate": 2.0615882566090243e-05, "loss": 0.0311, "step": 18590 }, { "epoch": 17.833173537871524, "grad_norm": 0.1905115246772766, "learning_rate": 2.0564452163678378e-05, "loss": 0.0254, "step": 18600 }, { "epoch": 17.84276126558006, "grad_norm": 0.22948439419269562, "learning_rate": 2.0513069380006943e-05, "loss": 0.0296, "step": 18610 }, { "epoch": 17.85234899328859, "grad_norm": 0.27490001916885376, "learning_rate": 2.046173429819931e-05, "loss": 0.0239, "step": 18620 }, { "epoch": 17.861936720997125, "grad_norm": 0.21853777766227722, "learning_rate": 2.0410447001301753e-05, "loss": 0.028, "step": 18630 }, { "epoch": 17.871524448705657, "grad_norm": 0.20548582077026367, "learning_rate": 2.0359207572283224e-05, "loss": 0.0225, "step": 18640 }, { "epoch": 17.88111217641419, "grad_norm": 0.14802424609661102, "learning_rate": 2.0308016094035226e-05, "loss": 0.0295, "step": 18650 }, { "epoch": 17.890699904122723, "grad_norm": 0.32737597823143005, "learning_rate": 2.02568726493717e-05, "loss": 0.0242, "step": 18660 }, { "epoch": 17.900287631831254, "grad_norm": 0.21833331882953644, "learning_rate": 2.020577732102889e-05, "loss": 0.0273, "step": 18670 }, { "epoch": 17.90987535953979, "grad_norm": 0.24916410446166992, "learning_rate": 2.015473019166519e-05, "loss": 0.0305, "step": 18680 }, { "epoch": 17.91946308724832, "grad_norm": 0.18901677429676056, "learning_rate": 2.0103731343861014e-05, "loss": 0.0256, "step": 18690 }, { "epoch": 17.929050814956856, "grad_norm": 0.20720627903938293, "learning_rate": 2.0052780860118692e-05, "loss": 0.0262, "step": 18700 }, { "epoch": 17.938638542665387, "grad_norm": 0.20290115475654602, "learning_rate": 2.0001878822862292e-05, "loss": 0.0302, "step": 18710 }, { "epoch": 17.948226270373922, "grad_norm": 0.28782570362091064, "learning_rate": 1.995102531443752e-05, "loss": 0.0272, "step": 18720 }, { "epoch": 17.957813998082454, "grad_norm": 0.19285361468791962, "learning_rate": 1.9900220417111577e-05, "loss": 0.0226, "step": 18730 }, { "epoch": 17.96740172579099, "grad_norm": 0.2487422674894333, "learning_rate": 1.984946421307301e-05, "loss": 0.0259, "step": 18740 }, { "epoch": 17.97698945349952, "grad_norm": 0.20847800374031067, "learning_rate": 1.9798756784431616e-05, "loss": 0.0248, "step": 18750 }, { "epoch": 17.986577181208055, "grad_norm": 0.29753822088241577, "learning_rate": 1.974809821321827e-05, "loss": 0.0307, "step": 18760 }, { "epoch": 17.996164908916587, "grad_norm": 0.2475176304578781, "learning_rate": 1.969748858138481e-05, "loss": 0.0192, "step": 18770 }, { "epoch": 18.005752636625118, "grad_norm": 0.24821995198726654, "learning_rate": 1.9646927970803913e-05, "loss": 0.0217, "step": 18780 }, { "epoch": 18.015340364333653, "grad_norm": 0.24269837141036987, "learning_rate": 1.959641646326894e-05, "loss": 0.0267, "step": 18790 }, { "epoch": 18.024928092042185, "grad_norm": 0.4261660575866699, "learning_rate": 1.9545954140493828e-05, "loss": 0.028, "step": 18800 }, { "epoch": 18.03451581975072, "grad_norm": 0.27009981870651245, "learning_rate": 1.9495541084112945e-05, "loss": 0.0261, "step": 18810 }, { "epoch": 18.04410354745925, "grad_norm": 0.4468768537044525, "learning_rate": 1.9445177375680944e-05, "loss": 0.0237, "step": 18820 }, { "epoch": 18.053691275167786, "grad_norm": 0.34373733401298523, "learning_rate": 1.939486309667267e-05, "loss": 0.0283, "step": 18830 }, { "epoch": 18.063279002876317, "grad_norm": 0.3583851456642151, "learning_rate": 1.9344598328482994e-05, "loss": 0.0239, "step": 18840 }, { "epoch": 18.072866730584852, "grad_norm": 0.2819909453392029, "learning_rate": 1.9294383152426682e-05, "loss": 0.0228, "step": 18850 }, { "epoch": 18.082454458293384, "grad_norm": 0.21321451663970947, "learning_rate": 1.924421764973829e-05, "loss": 0.0257, "step": 18860 }, { "epoch": 18.09204218600192, "grad_norm": 0.20414310693740845, "learning_rate": 1.9194101901572e-05, "loss": 0.027, "step": 18870 }, { "epoch": 18.10162991371045, "grad_norm": 0.1880536824464798, "learning_rate": 1.9144035989001518e-05, "loss": 0.0236, "step": 18880 }, { "epoch": 18.111217641418985, "grad_norm": 0.15333381295204163, "learning_rate": 1.909401999301993e-05, "loss": 0.0285, "step": 18890 }, { "epoch": 18.120805369127517, "grad_norm": 0.25423663854599, "learning_rate": 1.904405399453955e-05, "loss": 0.0253, "step": 18900 }, { "epoch": 18.13039309683605, "grad_norm": 0.16123837232589722, "learning_rate": 1.8994138074391843e-05, "loss": 0.0269, "step": 18910 }, { "epoch": 18.139980824544583, "grad_norm": 0.28160786628723145, "learning_rate": 1.8944272313327226e-05, "loss": 0.0289, "step": 18920 }, { "epoch": 18.149568552253115, "grad_norm": 0.17112663388252258, "learning_rate": 1.8894456792014996e-05, "loss": 0.0273, "step": 18930 }, { "epoch": 18.15915627996165, "grad_norm": 0.19048067927360535, "learning_rate": 1.8844691591043173e-05, "loss": 0.0225, "step": 18940 }, { "epoch": 18.16874400767018, "grad_norm": 0.22992561757564545, "learning_rate": 1.8794976790918363e-05, "loss": 0.0229, "step": 18950 }, { "epoch": 18.178331735378716, "grad_norm": 0.30747804045677185, "learning_rate": 1.8745312472065635e-05, "loss": 0.0259, "step": 18960 }, { "epoch": 18.187919463087248, "grad_norm": 0.2523973882198334, "learning_rate": 1.8695698714828406e-05, "loss": 0.0249, "step": 18970 }, { "epoch": 18.197507190795783, "grad_norm": 0.2866404056549072, "learning_rate": 1.8646135599468297e-05, "loss": 0.0252, "step": 18980 }, { "epoch": 18.207094918504314, "grad_norm": 0.1944408118724823, "learning_rate": 1.8596623206164987e-05, "loss": 0.0265, "step": 18990 }, { "epoch": 18.21668264621285, "grad_norm": 0.22918511927127838, "learning_rate": 1.8547161615016116e-05, "loss": 0.0272, "step": 19000 }, { "epoch": 18.22627037392138, "grad_norm": 0.2857123911380768, "learning_rate": 1.8497750906037148e-05, "loss": 0.0307, "step": 19010 }, { "epoch": 18.235858101629915, "grad_norm": 0.17393842339515686, "learning_rate": 1.8448391159161204e-05, "loss": 0.029, "step": 19020 }, { "epoch": 18.245445829338447, "grad_norm": 0.2042463719844818, "learning_rate": 1.839908245423899e-05, "loss": 0.0251, "step": 19030 }, { "epoch": 18.25503355704698, "grad_norm": 0.277891606092453, "learning_rate": 1.8349824871038644e-05, "loss": 0.0237, "step": 19040 }, { "epoch": 18.264621284755513, "grad_norm": 0.16384513676166534, "learning_rate": 1.8300618489245537e-05, "loss": 0.0239, "step": 19050 }, { "epoch": 18.274209012464045, "grad_norm": 0.27934807538986206, "learning_rate": 1.8251463388462315e-05, "loss": 0.0258, "step": 19060 }, { "epoch": 18.28379674017258, "grad_norm": 0.28241196274757385, "learning_rate": 1.8202359648208593e-05, "loss": 0.0244, "step": 19070 }, { "epoch": 18.29338446788111, "grad_norm": 0.22691746056079865, "learning_rate": 1.8153307347920918e-05, "loss": 0.0279, "step": 19080 }, { "epoch": 18.302972195589646, "grad_norm": 0.31362423300743103, "learning_rate": 1.8104306566952618e-05, "loss": 0.0235, "step": 19090 }, { "epoch": 18.312559923298178, "grad_norm": 0.5029933452606201, "learning_rate": 1.805535738457368e-05, "loss": 0.027, "step": 19100 }, { "epoch": 18.322147651006713, "grad_norm": 0.23722821474075317, "learning_rate": 1.8006459879970622e-05, "loss": 0.0309, "step": 19110 }, { "epoch": 18.331735378715244, "grad_norm": 0.2513883113861084, "learning_rate": 1.7957614132246347e-05, "loss": 0.0263, "step": 19120 }, { "epoch": 18.34132310642378, "grad_norm": 0.24489589035511017, "learning_rate": 1.7908820220420052e-05, "loss": 0.0251, "step": 19130 }, { "epoch": 18.35091083413231, "grad_norm": 0.2208951860666275, "learning_rate": 1.7860078223427056e-05, "loss": 0.0267, "step": 19140 }, { "epoch": 18.360498561840842, "grad_norm": 0.2466048002243042, "learning_rate": 1.7811388220118707e-05, "loss": 0.0246, "step": 19150 }, { "epoch": 18.370086289549377, "grad_norm": 0.1647568643093109, "learning_rate": 1.7762750289262238e-05, "loss": 0.0221, "step": 19160 }, { "epoch": 18.37967401725791, "grad_norm": 0.20359550416469574, "learning_rate": 1.7714164509540654e-05, "loss": 0.024, "step": 19170 }, { "epoch": 18.389261744966444, "grad_norm": 0.15871766209602356, "learning_rate": 1.7665630959552548e-05, "loss": 0.0252, "step": 19180 }, { "epoch": 18.398849472674975, "grad_norm": 0.2411220222711563, "learning_rate": 1.7617149717812076e-05, "loss": 0.0225, "step": 19190 }, { "epoch": 18.40843720038351, "grad_norm": 0.3407461643218994, "learning_rate": 1.7568720862748744e-05, "loss": 0.0312, "step": 19200 }, { "epoch": 18.41802492809204, "grad_norm": 0.21590691804885864, "learning_rate": 1.75203444727073e-05, "loss": 0.0248, "step": 19210 }, { "epoch": 18.427612655800576, "grad_norm": 0.17383931577205658, "learning_rate": 1.7472020625947678e-05, "loss": 0.0231, "step": 19220 }, { "epoch": 18.437200383509108, "grad_norm": 0.31559276580810547, "learning_rate": 1.742374940064474e-05, "loss": 0.0263, "step": 19230 }, { "epoch": 18.446788111217643, "grad_norm": 0.23316271603107452, "learning_rate": 1.737553087488825e-05, "loss": 0.0316, "step": 19240 }, { "epoch": 18.456375838926174, "grad_norm": 0.21858806908130646, "learning_rate": 1.7327365126682726e-05, "loss": 0.0272, "step": 19250 }, { "epoch": 18.46596356663471, "grad_norm": 0.2449788898229599, "learning_rate": 1.7279252233947286e-05, "loss": 0.0286, "step": 19260 }, { "epoch": 18.47555129434324, "grad_norm": 0.21250544488430023, "learning_rate": 1.7231192274515562e-05, "loss": 0.0247, "step": 19270 }, { "epoch": 18.485139022051772, "grad_norm": 0.2528996169567108, "learning_rate": 1.7183185326135543e-05, "loss": 0.0253, "step": 19280 }, { "epoch": 18.494726749760307, "grad_norm": 0.2549261748790741, "learning_rate": 1.7135231466469463e-05, "loss": 0.0294, "step": 19290 }, { "epoch": 18.50431447746884, "grad_norm": 0.352224200963974, "learning_rate": 1.7087330773093673e-05, "loss": 0.0228, "step": 19300 }, { "epoch": 18.513902205177374, "grad_norm": 0.18530428409576416, "learning_rate": 1.7039483323498534e-05, "loss": 0.0258, "step": 19310 }, { "epoch": 18.523489932885905, "grad_norm": 0.14298230409622192, "learning_rate": 1.6991689195088217e-05, "loss": 0.0236, "step": 19320 }, { "epoch": 18.53307766059444, "grad_norm": 0.2754952311515808, "learning_rate": 1.6943948465180693e-05, "loss": 0.0235, "step": 19330 }, { "epoch": 18.54266538830297, "grad_norm": 0.2274174690246582, "learning_rate": 1.6896261211007518e-05, "loss": 0.0305, "step": 19340 }, { "epoch": 18.552253116011507, "grad_norm": 0.3091070055961609, "learning_rate": 1.684862750971376e-05, "loss": 0.0227, "step": 19350 }, { "epoch": 18.561840843720038, "grad_norm": 0.15530341863632202, "learning_rate": 1.6801047438357818e-05, "loss": 0.0224, "step": 19360 }, { "epoch": 18.571428571428573, "grad_norm": 0.25515303015708923, "learning_rate": 1.675352107391139e-05, "loss": 0.0291, "step": 19370 }, { "epoch": 18.581016299137104, "grad_norm": 0.27960437536239624, "learning_rate": 1.670604849325923e-05, "loss": 0.0258, "step": 19380 }, { "epoch": 18.59060402684564, "grad_norm": 0.2250082641839981, "learning_rate": 1.6658629773199124e-05, "loss": 0.0232, "step": 19390 }, { "epoch": 18.60019175455417, "grad_norm": 0.27883338928222656, "learning_rate": 1.6611264990441706e-05, "loss": 0.0343, "step": 19400 }, { "epoch": 18.609779482262702, "grad_norm": 0.17993752658367157, "learning_rate": 1.6563954221610355e-05, "loss": 0.0278, "step": 19410 }, { "epoch": 18.619367209971237, "grad_norm": 0.26345837116241455, "learning_rate": 1.6516697543241083e-05, "loss": 0.026, "step": 19420 }, { "epoch": 18.62895493767977, "grad_norm": 0.22277230024337769, "learning_rate": 1.646949503178239e-05, "loss": 0.0213, "step": 19430 }, { "epoch": 18.638542665388304, "grad_norm": 0.2661077082157135, "learning_rate": 1.642234676359516e-05, "loss": 0.0243, "step": 19440 }, { "epoch": 18.648130393096835, "grad_norm": 0.28437718749046326, "learning_rate": 1.6375252814952487e-05, "loss": 0.0263, "step": 19450 }, { "epoch": 18.65771812080537, "grad_norm": 0.20834000408649445, "learning_rate": 1.6328213262039637e-05, "loss": 0.0225, "step": 19460 }, { "epoch": 18.6673058485139, "grad_norm": 0.24616943299770355, "learning_rate": 1.6281228180953857e-05, "loss": 0.0233, "step": 19470 }, { "epoch": 18.676893576222437, "grad_norm": 0.21522550284862518, "learning_rate": 1.623429764770428e-05, "loss": 0.0233, "step": 19480 }, { "epoch": 18.686481303930968, "grad_norm": 0.2068173587322235, "learning_rate": 1.618742173821179e-05, "loss": 0.0308, "step": 19490 }, { "epoch": 18.696069031639503, "grad_norm": 0.5226014256477356, "learning_rate": 1.614060052830891e-05, "loss": 0.031, "step": 19500 }, { "epoch": 18.705656759348035, "grad_norm": 0.18240250647068024, "learning_rate": 1.6093834093739647e-05, "loss": 0.0245, "step": 19510 }, { "epoch": 18.715244487056566, "grad_norm": 0.2039356231689453, "learning_rate": 1.6047122510159458e-05, "loss": 0.02, "step": 19520 }, { "epoch": 18.7248322147651, "grad_norm": 0.2688858211040497, "learning_rate": 1.600046585313501e-05, "loss": 0.0232, "step": 19530 }, { "epoch": 18.734419942473632, "grad_norm": 0.3605387806892395, "learning_rate": 1.5953864198144135e-05, "loss": 0.0285, "step": 19540 }, { "epoch": 18.744007670182167, "grad_norm": 0.19552724063396454, "learning_rate": 1.5907317620575686e-05, "loss": 0.0249, "step": 19550 }, { "epoch": 18.7535953978907, "grad_norm": 0.2785275876522064, "learning_rate": 1.58608261957294e-05, "loss": 0.0301, "step": 19560 }, { "epoch": 18.763183125599234, "grad_norm": 0.265622079372406, "learning_rate": 1.5814389998815836e-05, "loss": 0.0244, "step": 19570 }, { "epoch": 18.772770853307765, "grad_norm": 0.22419176995754242, "learning_rate": 1.5768009104956137e-05, "loss": 0.0197, "step": 19580 }, { "epoch": 18.7823585810163, "grad_norm": 0.22098082304000854, "learning_rate": 1.572168358918204e-05, "loss": 0.0219, "step": 19590 }, { "epoch": 18.79194630872483, "grad_norm": 0.26601535081863403, "learning_rate": 1.5675413526435677e-05, "loss": 0.0234, "step": 19600 }, { "epoch": 18.801534036433367, "grad_norm": 0.2946853041648865, "learning_rate": 1.562919899156947e-05, "loss": 0.0246, "step": 19610 }, { "epoch": 18.811121764141898, "grad_norm": 0.3101515471935272, "learning_rate": 1.558304005934602e-05, "loss": 0.0244, "step": 19620 }, { "epoch": 18.82070949185043, "grad_norm": 0.24001409113407135, "learning_rate": 1.5536936804437963e-05, "loss": 0.0254, "step": 19630 }, { "epoch": 18.830297219558965, "grad_norm": 0.1419634222984314, "learning_rate": 1.549088930142788e-05, "loss": 0.0231, "step": 19640 }, { "epoch": 18.839884947267496, "grad_norm": 0.24882347881793976, "learning_rate": 1.544489762480815e-05, "loss": 0.0201, "step": 19650 }, { "epoch": 18.84947267497603, "grad_norm": 0.22982530295848846, "learning_rate": 1.5398961848980838e-05, "loss": 0.0218, "step": 19660 }, { "epoch": 18.859060402684563, "grad_norm": 0.1851414293050766, "learning_rate": 1.5353082048257596e-05, "loss": 0.0267, "step": 19670 }, { "epoch": 18.868648130393098, "grad_norm": 0.23806796967983246, "learning_rate": 1.53072582968595e-05, "loss": 0.0273, "step": 19680 }, { "epoch": 18.87823585810163, "grad_norm": 0.2619253396987915, "learning_rate": 1.526149066891697e-05, "loss": 0.0263, "step": 19690 }, { "epoch": 18.887823585810164, "grad_norm": 0.24336743354797363, "learning_rate": 1.5215779238469641e-05, "loss": 0.0344, "step": 19700 }, { "epoch": 18.897411313518695, "grad_norm": 0.21095559000968933, "learning_rate": 1.5170124079466186e-05, "loss": 0.0239, "step": 19710 }, { "epoch": 18.90699904122723, "grad_norm": 0.3702682554721832, "learning_rate": 1.51245252657643e-05, "loss": 0.0265, "step": 19720 }, { "epoch": 18.916586768935762, "grad_norm": 0.45442819595336914, "learning_rate": 1.5078982871130504e-05, "loss": 0.0284, "step": 19730 }, { "epoch": 18.926174496644297, "grad_norm": 0.1986912339925766, "learning_rate": 1.5033496969240057e-05, "loss": 0.025, "step": 19740 }, { "epoch": 18.93576222435283, "grad_norm": 0.18418286740779877, "learning_rate": 1.4988067633676816e-05, "loss": 0.0255, "step": 19750 }, { "epoch": 18.94534995206136, "grad_norm": 0.16016803681850433, "learning_rate": 1.4942694937933144e-05, "loss": 0.0224, "step": 19760 }, { "epoch": 18.954937679769895, "grad_norm": 0.2799144387245178, "learning_rate": 1.4897378955409763e-05, "loss": 0.0279, "step": 19770 }, { "epoch": 18.964525407478426, "grad_norm": 0.17058733105659485, "learning_rate": 1.4852119759415661e-05, "loss": 0.0257, "step": 19780 }, { "epoch": 18.97411313518696, "grad_norm": 0.24392423033714294, "learning_rate": 1.4806917423167944e-05, "loss": 0.0237, "step": 19790 }, { "epoch": 18.983700862895493, "grad_norm": 0.19233231246471405, "learning_rate": 1.4761772019791748e-05, "loss": 0.0296, "step": 19800 }, { "epoch": 18.993288590604028, "grad_norm": 0.2076229453086853, "learning_rate": 1.4716683622320105e-05, "loss": 0.026, "step": 19810 }, { "epoch": 19.00287631831256, "grad_norm": 0.22467122972011566, "learning_rate": 1.4671652303693806e-05, "loss": 0.028, "step": 19820 }, { "epoch": 19.012464046021094, "grad_norm": 0.16231553256511688, "learning_rate": 1.4626678136761369e-05, "loss": 0.0248, "step": 19830 }, { "epoch": 19.022051773729626, "grad_norm": 0.24173732101917267, "learning_rate": 1.4581761194278765e-05, "loss": 0.0318, "step": 19840 }, { "epoch": 19.03163950143816, "grad_norm": 0.21880550682544708, "learning_rate": 1.4536901548909448e-05, "loss": 0.0299, "step": 19850 }, { "epoch": 19.041227229146692, "grad_norm": 0.3532547950744629, "learning_rate": 1.4492099273224174e-05, "loss": 0.0282, "step": 19860 }, { "epoch": 19.050814956855227, "grad_norm": 0.20322856307029724, "learning_rate": 1.4447354439700889e-05, "loss": 0.0235, "step": 19870 }, { "epoch": 19.06040268456376, "grad_norm": 0.2487279176712036, "learning_rate": 1.4402667120724594e-05, "loss": 0.0285, "step": 19880 }, { "epoch": 19.06999041227229, "grad_norm": 0.19251792132854462, "learning_rate": 1.4358037388587281e-05, "loss": 0.0269, "step": 19890 }, { "epoch": 19.079578139980825, "grad_norm": 0.2209775298833847, "learning_rate": 1.4313465315487745e-05, "loss": 0.0209, "step": 19900 }, { "epoch": 19.089165867689356, "grad_norm": 0.15831854939460754, "learning_rate": 1.4268950973531536e-05, "loss": 0.023, "step": 19910 }, { "epoch": 19.09875359539789, "grad_norm": 0.21416033804416656, "learning_rate": 1.4224494434730794e-05, "loss": 0.0217, "step": 19920 }, { "epoch": 19.108341323106423, "grad_norm": 0.15104466676712036, "learning_rate": 1.4180095771004154e-05, "loss": 0.0212, "step": 19930 }, { "epoch": 19.117929050814958, "grad_norm": 0.19750936329364777, "learning_rate": 1.413575505417662e-05, "loss": 0.0172, "step": 19940 }, { "epoch": 19.12751677852349, "grad_norm": 0.26296430826187134, "learning_rate": 1.4091472355979463e-05, "loss": 0.0248, "step": 19950 }, { "epoch": 19.137104506232024, "grad_norm": 0.20639511942863464, "learning_rate": 1.404724774805008e-05, "loss": 0.0255, "step": 19960 }, { "epoch": 19.146692233940556, "grad_norm": 0.14402848482131958, "learning_rate": 1.4003081301931909e-05, "loss": 0.0273, "step": 19970 }, { "epoch": 19.15627996164909, "grad_norm": 0.14169853925704956, "learning_rate": 1.395897308907429e-05, "loss": 0.0256, "step": 19980 }, { "epoch": 19.165867689357622, "grad_norm": 0.17262916266918182, "learning_rate": 1.3914923180832368e-05, "loss": 0.0264, "step": 19990 }, { "epoch": 19.175455417066157, "grad_norm": 0.13429339230060577, "learning_rate": 1.3870931648466945e-05, "loss": 0.0246, "step": 20000 }, { "epoch": 19.18504314477469, "grad_norm": 0.2229502946138382, "learning_rate": 1.3826998563144411e-05, "loss": 0.0238, "step": 20010 }, { "epoch": 19.19463087248322, "grad_norm": 0.1920672059059143, "learning_rate": 1.3783123995936587e-05, "loss": 0.0239, "step": 20020 }, { "epoch": 19.204218600191755, "grad_norm": 0.22073961794376373, "learning_rate": 1.373930801782064e-05, "loss": 0.021, "step": 20030 }, { "epoch": 19.213806327900286, "grad_norm": 0.3254948556423187, "learning_rate": 1.369555069967895e-05, "loss": 0.0253, "step": 20040 }, { "epoch": 19.22339405560882, "grad_norm": 0.14167852699756622, "learning_rate": 1.3651852112298995e-05, "loss": 0.0207, "step": 20050 }, { "epoch": 19.232981783317353, "grad_norm": 0.2205292135477066, "learning_rate": 1.3608212326373249e-05, "loss": 0.0266, "step": 20060 }, { "epoch": 19.242569511025888, "grad_norm": 0.268951416015625, "learning_rate": 1.3564631412499067e-05, "loss": 0.0187, "step": 20070 }, { "epoch": 19.25215723873442, "grad_norm": 0.18108440935611725, "learning_rate": 1.3521109441178559e-05, "loss": 0.0258, "step": 20080 }, { "epoch": 19.261744966442954, "grad_norm": 0.2803739905357361, "learning_rate": 1.3477646482818474e-05, "loss": 0.0263, "step": 20090 }, { "epoch": 19.271332694151486, "grad_norm": 0.2689793109893799, "learning_rate": 1.3434242607730108e-05, "loss": 0.0254, "step": 20100 }, { "epoch": 19.28092042186002, "grad_norm": 0.3495311439037323, "learning_rate": 1.3390897886129162e-05, "loss": 0.0277, "step": 20110 }, { "epoch": 19.290508149568552, "grad_norm": 0.17431464791297913, "learning_rate": 1.334761238813566e-05, "loss": 0.0211, "step": 20120 }, { "epoch": 19.300095877277084, "grad_norm": 0.3109664022922516, "learning_rate": 1.3304386183773809e-05, "loss": 0.0239, "step": 20130 }, { "epoch": 19.30968360498562, "grad_norm": 0.15485496819019318, "learning_rate": 1.3261219342971887e-05, "loss": 0.0283, "step": 20140 }, { "epoch": 19.31927133269415, "grad_norm": 0.18140093982219696, "learning_rate": 1.3218111935562149e-05, "loss": 0.0275, "step": 20150 }, { "epoch": 19.328859060402685, "grad_norm": 0.22020739316940308, "learning_rate": 1.3175064031280703e-05, "loss": 0.0318, "step": 20160 }, { "epoch": 19.338446788111217, "grad_norm": 0.27302905917167664, "learning_rate": 1.3132075699767393e-05, "loss": 0.0204, "step": 20170 }, { "epoch": 19.34803451581975, "grad_norm": 0.20312833786010742, "learning_rate": 1.3089147010565689e-05, "loss": 0.0243, "step": 20180 }, { "epoch": 19.357622243528283, "grad_norm": 0.2138754427433014, "learning_rate": 1.3046278033122577e-05, "loss": 0.0246, "step": 20190 }, { "epoch": 19.367209971236818, "grad_norm": 0.15521451830863953, "learning_rate": 1.3003468836788446e-05, "loss": 0.0202, "step": 20200 }, { "epoch": 19.37679769894535, "grad_norm": 0.14165331423282623, "learning_rate": 1.296071949081698e-05, "loss": 0.029, "step": 20210 }, { "epoch": 19.386385426653884, "grad_norm": 0.17273680865764618, "learning_rate": 1.2918030064365034e-05, "loss": 0.0248, "step": 20220 }, { "epoch": 19.395973154362416, "grad_norm": 0.1982639878988266, "learning_rate": 1.2875400626492534e-05, "loss": 0.0218, "step": 20230 }, { "epoch": 19.40556088207095, "grad_norm": 0.20939846336841583, "learning_rate": 1.2832831246162359e-05, "loss": 0.0221, "step": 20240 }, { "epoch": 19.415148609779482, "grad_norm": 0.2230292111635208, "learning_rate": 1.2790321992240228e-05, "loss": 0.0262, "step": 20250 }, { "epoch": 19.424736337488014, "grad_norm": 0.17387695610523224, "learning_rate": 1.2747872933494615e-05, "loss": 0.0231, "step": 20260 }, { "epoch": 19.43432406519655, "grad_norm": 0.2639104723930359, "learning_rate": 1.2705484138596552e-05, "loss": 0.0227, "step": 20270 }, { "epoch": 19.44391179290508, "grad_norm": 0.2716933488845825, "learning_rate": 1.2663155676119665e-05, "loss": 0.025, "step": 20280 }, { "epoch": 19.453499520613615, "grad_norm": 0.2254800945520401, "learning_rate": 1.2620887614539917e-05, "loss": 0.0236, "step": 20290 }, { "epoch": 19.463087248322147, "grad_norm": 0.1728450208902359, "learning_rate": 1.2578680022235585e-05, "loss": 0.0237, "step": 20300 }, { "epoch": 19.47267497603068, "grad_norm": 0.2077593207359314, "learning_rate": 1.253653296748712e-05, "loss": 0.026, "step": 20310 }, { "epoch": 19.482262703739213, "grad_norm": 0.20576708018779755, "learning_rate": 1.2494446518477022e-05, "loss": 0.021, "step": 20320 }, { "epoch": 19.491850431447748, "grad_norm": 0.2826680839061737, "learning_rate": 1.2452420743289778e-05, "loss": 0.0241, "step": 20330 }, { "epoch": 19.50143815915628, "grad_norm": 0.3109418451786041, "learning_rate": 1.2410455709911694e-05, "loss": 0.0262, "step": 20340 }, { "epoch": 19.511025886864815, "grad_norm": 0.6010233759880066, "learning_rate": 1.2368551486230828e-05, "loss": 0.0261, "step": 20350 }, { "epoch": 19.520613614573346, "grad_norm": 0.33683139085769653, "learning_rate": 1.2326708140036852e-05, "loss": 0.0268, "step": 20360 }, { "epoch": 19.530201342281877, "grad_norm": 0.1394880712032318, "learning_rate": 1.2284925739020974e-05, "loss": 0.0287, "step": 20370 }, { "epoch": 19.539789069990412, "grad_norm": 0.2836284935474396, "learning_rate": 1.2243204350775789e-05, "loss": 0.0215, "step": 20380 }, { "epoch": 19.549376797698944, "grad_norm": 0.9439190626144409, "learning_rate": 1.2201544042795198e-05, "loss": 0.0254, "step": 20390 }, { "epoch": 19.55896452540748, "grad_norm": 0.18774332106113434, "learning_rate": 1.215994488247431e-05, "loss": 0.0273, "step": 20400 }, { "epoch": 19.56855225311601, "grad_norm": 0.4038194715976715, "learning_rate": 1.211840693710926e-05, "loss": 0.0186, "step": 20410 }, { "epoch": 19.578139980824545, "grad_norm": 0.2532286047935486, "learning_rate": 1.2076930273897214e-05, "loss": 0.0303, "step": 20420 }, { "epoch": 19.587727708533077, "grad_norm": 0.23393119871616364, "learning_rate": 1.2035514959936144e-05, "loss": 0.0223, "step": 20430 }, { "epoch": 19.59731543624161, "grad_norm": 0.17693249881267548, "learning_rate": 1.199416106222484e-05, "loss": 0.0222, "step": 20440 }, { "epoch": 19.606903163950143, "grad_norm": 0.4991660714149475, "learning_rate": 1.1952868647662696e-05, "loss": 0.0255, "step": 20450 }, { "epoch": 19.616490891658678, "grad_norm": 0.24061451852321625, "learning_rate": 1.1911637783049645e-05, "loss": 0.0271, "step": 20460 }, { "epoch": 19.62607861936721, "grad_norm": 0.20236246287822723, "learning_rate": 1.1870468535086054e-05, "loss": 0.0274, "step": 20470 }, { "epoch": 19.635666347075745, "grad_norm": 0.16982276737689972, "learning_rate": 1.1829360970372604e-05, "loss": 0.0275, "step": 20480 }, { "epoch": 19.645254074784276, "grad_norm": 0.17934417724609375, "learning_rate": 1.1788315155410212e-05, "loss": 0.02, "step": 20490 }, { "epoch": 19.654841802492808, "grad_norm": 0.2388330101966858, "learning_rate": 1.1747331156599873e-05, "loss": 0.0192, "step": 20500 }, { "epoch": 19.664429530201343, "grad_norm": 0.19787994027137756, "learning_rate": 1.1706409040242588e-05, "loss": 0.0261, "step": 20510 }, { "epoch": 19.674017257909874, "grad_norm": 0.2273687720298767, "learning_rate": 1.166554887253926e-05, "loss": 0.0246, "step": 20520 }, { "epoch": 19.68360498561841, "grad_norm": 0.33494409918785095, "learning_rate": 1.1624750719590588e-05, "loss": 0.0249, "step": 20530 }, { "epoch": 19.69319271332694, "grad_norm": 0.2192111760377884, "learning_rate": 1.158401464739689e-05, "loss": 0.0239, "step": 20540 }, { "epoch": 19.702780441035475, "grad_norm": 0.2234772890806198, "learning_rate": 1.154334072185811e-05, "loss": 0.0227, "step": 20550 }, { "epoch": 19.712368168744007, "grad_norm": 0.3074262738227844, "learning_rate": 1.1502729008773639e-05, "loss": 0.027, "step": 20560 }, { "epoch": 19.721955896452542, "grad_norm": 0.22344590723514557, "learning_rate": 1.146217957384223e-05, "loss": 0.0251, "step": 20570 }, { "epoch": 19.731543624161073, "grad_norm": 0.12177485972642899, "learning_rate": 1.1421692482661856e-05, "loss": 0.0246, "step": 20580 }, { "epoch": 19.74113135186961, "grad_norm": 0.12450756132602692, "learning_rate": 1.1381267800729695e-05, "loss": 0.0247, "step": 20590 }, { "epoch": 19.75071907957814, "grad_norm": 0.26811161637306213, "learning_rate": 1.1340905593441914e-05, "loss": 0.0263, "step": 20600 }, { "epoch": 19.760306807286675, "grad_norm": 0.18584440648555756, "learning_rate": 1.1300605926093627e-05, "loss": 0.0259, "step": 20610 }, { "epoch": 19.769894534995206, "grad_norm": 0.15904641151428223, "learning_rate": 1.1260368863878778e-05, "loss": 0.0239, "step": 20620 }, { "epoch": 19.779482262703738, "grad_norm": 0.22534583508968353, "learning_rate": 1.1220194471890027e-05, "loss": 0.0234, "step": 20630 }, { "epoch": 19.789069990412273, "grad_norm": 0.22182218730449677, "learning_rate": 1.1180082815118659e-05, "loss": 0.0255, "step": 20640 }, { "epoch": 19.798657718120804, "grad_norm": 0.13675539195537567, "learning_rate": 1.114003395845446e-05, "loss": 0.0203, "step": 20650 }, { "epoch": 19.80824544582934, "grad_norm": 0.153213232755661, "learning_rate": 1.1100047966685645e-05, "loss": 0.0235, "step": 20660 }, { "epoch": 19.81783317353787, "grad_norm": 0.23550502955913544, "learning_rate": 1.1060124904498686e-05, "loss": 0.0262, "step": 20670 }, { "epoch": 19.827420901246406, "grad_norm": 0.16561271250247955, "learning_rate": 1.10202648364783e-05, "loss": 0.0252, "step": 20680 }, { "epoch": 19.837008628954937, "grad_norm": 0.21752074360847473, "learning_rate": 1.0980467827107265e-05, "loss": 0.025, "step": 20690 }, { "epoch": 19.846596356663472, "grad_norm": 0.3683970868587494, "learning_rate": 1.0940733940766367e-05, "loss": 0.0275, "step": 20700 }, { "epoch": 19.856184084372003, "grad_norm": 0.19650644063949585, "learning_rate": 1.090106324173426e-05, "loss": 0.0227, "step": 20710 }, { "epoch": 19.86577181208054, "grad_norm": 0.3195613622665405, "learning_rate": 1.0861455794187398e-05, "loss": 0.0246, "step": 20720 }, { "epoch": 19.87535953978907, "grad_norm": 0.30373549461364746, "learning_rate": 1.0821911662199874e-05, "loss": 0.0289, "step": 20730 }, { "epoch": 19.8849472674976, "grad_norm": 0.23305653035640717, "learning_rate": 1.0782430909743407e-05, "loss": 0.0317, "step": 20740 }, { "epoch": 19.894534995206136, "grad_norm": 0.19694805145263672, "learning_rate": 1.0743013600687146e-05, "loss": 0.021, "step": 20750 }, { "epoch": 19.904122722914668, "grad_norm": 0.18307265639305115, "learning_rate": 1.0703659798797616e-05, "loss": 0.0215, "step": 20760 }, { "epoch": 19.913710450623203, "grad_norm": 0.15986226499080658, "learning_rate": 1.0664369567738608e-05, "loss": 0.0254, "step": 20770 }, { "epoch": 19.923298178331734, "grad_norm": 0.22868862748146057, "learning_rate": 1.0625142971071067e-05, "loss": 0.0209, "step": 20780 }, { "epoch": 19.93288590604027, "grad_norm": 0.23605976998806, "learning_rate": 1.0585980072253005e-05, "loss": 0.0251, "step": 20790 }, { "epoch": 19.9424736337488, "grad_norm": 0.40362289547920227, "learning_rate": 1.0546880934639364e-05, "loss": 0.0291, "step": 20800 }, { "epoch": 19.952061361457336, "grad_norm": 0.23181037604808807, "learning_rate": 1.0507845621481954e-05, "loss": 0.0239, "step": 20810 }, { "epoch": 19.961649089165867, "grad_norm": 0.17109474539756775, "learning_rate": 1.046887419592935e-05, "loss": 0.0234, "step": 20820 }, { "epoch": 19.971236816874402, "grad_norm": 0.19465407729148865, "learning_rate": 1.0429966721026751e-05, "loss": 0.0215, "step": 20830 }, { "epoch": 19.980824544582934, "grad_norm": 0.22324107587337494, "learning_rate": 1.0391123259715906e-05, "loss": 0.0196, "step": 20840 }, { "epoch": 19.99041227229147, "grad_norm": 0.3217203915119171, "learning_rate": 1.0352343874835018e-05, "loss": 0.0234, "step": 20850 }, { "epoch": 20.0, "grad_norm": 0.33047834038734436, "learning_rate": 1.0313628629118616e-05, "loss": 0.0276, "step": 20860 }, { "epoch": 20.00958772770853, "grad_norm": 0.23915094137191772, "learning_rate": 1.0274977585197482e-05, "loss": 0.0225, "step": 20870 }, { "epoch": 20.019175455417066, "grad_norm": 0.19355012476444244, "learning_rate": 1.0236390805598516e-05, "loss": 0.0232, "step": 20880 }, { "epoch": 20.028763183125598, "grad_norm": 0.4942316710948944, "learning_rate": 1.01978683527447e-05, "loss": 0.0262, "step": 20890 }, { "epoch": 20.038350910834133, "grad_norm": 0.17617733776569366, "learning_rate": 1.0159410288954912e-05, "loss": 0.023, "step": 20900 }, { "epoch": 20.047938638542664, "grad_norm": 0.21215124428272247, "learning_rate": 1.0121016676443878e-05, "loss": 0.0187, "step": 20910 }, { "epoch": 20.0575263662512, "grad_norm": 0.23251253366470337, "learning_rate": 1.008268757732207e-05, "loss": 0.027, "step": 20920 }, { "epoch": 20.06711409395973, "grad_norm": 0.16768194735050201, "learning_rate": 1.0044423053595559e-05, "loss": 0.0182, "step": 20930 }, { "epoch": 20.076701821668266, "grad_norm": 0.19847920536994934, "learning_rate": 1.000622316716599e-05, "loss": 0.0284, "step": 20940 }, { "epoch": 20.086289549376797, "grad_norm": 0.13995741307735443, "learning_rate": 9.968087979830432e-06, "loss": 0.0192, "step": 20950 }, { "epoch": 20.095877277085332, "grad_norm": 0.1870267242193222, "learning_rate": 9.930017553281279e-06, "loss": 0.0259, "step": 20960 }, { "epoch": 20.105465004793864, "grad_norm": 0.22275184094905853, "learning_rate": 9.892011949106172e-06, "loss": 0.0248, "step": 20970 }, { "epoch": 20.1150527325024, "grad_norm": 0.14587286114692688, "learning_rate": 9.854071228787875e-06, "loss": 0.0234, "step": 20980 }, { "epoch": 20.12464046021093, "grad_norm": 0.32902705669403076, "learning_rate": 9.816195453704191e-06, "loss": 0.0233, "step": 20990 }, { "epoch": 20.13422818791946, "grad_norm": 0.17466923594474792, "learning_rate": 9.778384685127867e-06, "loss": 0.023, "step": 21000 }, { "epoch": 20.143815915627997, "grad_norm": 0.17678095400333405, "learning_rate": 9.740638984226481e-06, "loss": 0.0265, "step": 21010 }, { "epoch": 20.153403643336528, "grad_norm": 0.16016939282417297, "learning_rate": 9.70295841206234e-06, "loss": 0.0248, "step": 21020 }, { "epoch": 20.162991371045063, "grad_norm": 0.2382485419511795, "learning_rate": 9.665343029592417e-06, "loss": 0.0233, "step": 21030 }, { "epoch": 20.172579098753594, "grad_norm": 0.24307946860790253, "learning_rate": 9.627792897668175e-06, "loss": 0.025, "step": 21040 }, { "epoch": 20.18216682646213, "grad_norm": 0.4551367461681366, "learning_rate": 9.590308077035592e-06, "loss": 0.0211, "step": 21050 }, { "epoch": 20.19175455417066, "grad_norm": 0.2893312871456146, "learning_rate": 9.55288862833495e-06, "loss": 0.0206, "step": 21060 }, { "epoch": 20.201342281879196, "grad_norm": 0.16855131089687347, "learning_rate": 9.515534612100746e-06, "loss": 0.027, "step": 21070 }, { "epoch": 20.210930009587727, "grad_norm": 0.3295097053050995, "learning_rate": 9.478246088761671e-06, "loss": 0.0282, "step": 21080 }, { "epoch": 20.220517737296262, "grad_norm": 0.1354684680700302, "learning_rate": 9.441023118640457e-06, "loss": 0.0278, "step": 21090 }, { "epoch": 20.230105465004794, "grad_norm": 0.16148221492767334, "learning_rate": 9.403865761953779e-06, "loss": 0.0287, "step": 21100 }, { "epoch": 20.239693192713325, "grad_norm": 0.3596087098121643, "learning_rate": 9.366774078812174e-06, "loss": 0.0273, "step": 21110 }, { "epoch": 20.24928092042186, "grad_norm": 0.24658294022083282, "learning_rate": 9.329748129219934e-06, "loss": 0.0224, "step": 21120 }, { "epoch": 20.25886864813039, "grad_norm": 0.2896967828273773, "learning_rate": 9.292787973075007e-06, "loss": 0.0203, "step": 21130 }, { "epoch": 20.268456375838927, "grad_norm": 0.2359636425971985, "learning_rate": 9.255893670168919e-06, "loss": 0.0241, "step": 21140 }, { "epoch": 20.278044103547458, "grad_norm": 0.24353083968162537, "learning_rate": 9.219065280186656e-06, "loss": 0.0247, "step": 21150 }, { "epoch": 20.287631831255993, "grad_norm": 0.14789700508117676, "learning_rate": 9.182302862706566e-06, "loss": 0.0191, "step": 21160 }, { "epoch": 20.297219558964525, "grad_norm": 0.27849042415618896, "learning_rate": 9.145606477200286e-06, "loss": 0.0217, "step": 21170 }, { "epoch": 20.30680728667306, "grad_norm": 0.4151756763458252, "learning_rate": 9.108976183032613e-06, "loss": 0.0233, "step": 21180 }, { "epoch": 20.31639501438159, "grad_norm": 0.2625637948513031, "learning_rate": 9.072412039461453e-06, "loss": 0.0223, "step": 21190 }, { "epoch": 20.325982742090126, "grad_norm": 0.22075968980789185, "learning_rate": 9.035914105637678e-06, "loss": 0.0239, "step": 21200 }, { "epoch": 20.335570469798657, "grad_norm": 0.22036759555339813, "learning_rate": 8.99948244060505e-06, "loss": 0.0243, "step": 21210 }, { "epoch": 20.345158197507192, "grad_norm": 0.4981054663658142, "learning_rate": 8.963117103300134e-06, "loss": 0.0207, "step": 21220 }, { "epoch": 20.354745925215724, "grad_norm": 0.20227645337581635, "learning_rate": 8.92681815255219e-06, "loss": 0.0198, "step": 21230 }, { "epoch": 20.364333652924255, "grad_norm": 0.24407237768173218, "learning_rate": 8.890585647083088e-06, "loss": 0.0292, "step": 21240 }, { "epoch": 20.37392138063279, "grad_norm": 0.4346962869167328, "learning_rate": 8.8544196455072e-06, "loss": 0.0237, "step": 21250 }, { "epoch": 20.383509108341322, "grad_norm": 0.32540345191955566, "learning_rate": 8.818320206331327e-06, "loss": 0.0237, "step": 21260 }, { "epoch": 20.393096836049857, "grad_norm": 0.2086063176393509, "learning_rate": 8.782287387954563e-06, "loss": 0.0215, "step": 21270 }, { "epoch": 20.40268456375839, "grad_norm": 0.2799685001373291, "learning_rate": 8.74632124866826e-06, "loss": 0.0283, "step": 21280 }, { "epoch": 20.412272291466923, "grad_norm": 0.40446561574935913, "learning_rate": 8.71042184665588e-06, "loss": 0.0244, "step": 21290 }, { "epoch": 20.421860019175455, "grad_norm": 0.20995816588401794, "learning_rate": 8.674589239992931e-06, "loss": 0.0301, "step": 21300 }, { "epoch": 20.43144774688399, "grad_norm": 0.25973740220069885, "learning_rate": 8.638823486646853e-06, "loss": 0.0241, "step": 21310 }, { "epoch": 20.44103547459252, "grad_norm": 0.31719037890434265, "learning_rate": 8.603124644476945e-06, "loss": 0.0207, "step": 21320 }, { "epoch": 20.450623202301056, "grad_norm": 0.1637444943189621, "learning_rate": 8.56749277123427e-06, "loss": 0.0264, "step": 21330 }, { "epoch": 20.460210930009588, "grad_norm": 0.3017114996910095, "learning_rate": 8.531927924561538e-06, "loss": 0.0271, "step": 21340 }, { "epoch": 20.469798657718123, "grad_norm": 0.20100443065166473, "learning_rate": 8.496430161993036e-06, "loss": 0.0247, "step": 21350 }, { "epoch": 20.479386385426654, "grad_norm": 0.2818273603916168, "learning_rate": 8.460999540954517e-06, "loss": 0.0278, "step": 21360 }, { "epoch": 20.488974113135185, "grad_norm": 0.22835665941238403, "learning_rate": 8.425636118763136e-06, "loss": 0.0228, "step": 21370 }, { "epoch": 20.49856184084372, "grad_norm": 0.24139605462551117, "learning_rate": 8.390339952627324e-06, "loss": 0.0279, "step": 21380 }, { "epoch": 20.508149568552252, "grad_norm": 0.17489181458950043, "learning_rate": 8.355111099646712e-06, "loss": 0.0255, "step": 21390 }, { "epoch": 20.517737296260787, "grad_norm": 0.14566893875598907, "learning_rate": 8.319949616812039e-06, "loss": 0.0222, "step": 21400 }, { "epoch": 20.52732502396932, "grad_norm": 0.2523178160190582, "learning_rate": 8.284855561005062e-06, "loss": 0.0194, "step": 21410 }, { "epoch": 20.536912751677853, "grad_norm": 0.20255376398563385, "learning_rate": 8.249828988998448e-06, "loss": 0.0233, "step": 21420 }, { "epoch": 20.546500479386385, "grad_norm": 0.2267649918794632, "learning_rate": 8.214869957455694e-06, "loss": 0.0247, "step": 21430 }, { "epoch": 20.55608820709492, "grad_norm": 0.20469827950000763, "learning_rate": 8.179978522931058e-06, "loss": 0.0196, "step": 21440 }, { "epoch": 20.56567593480345, "grad_norm": 0.2033228874206543, "learning_rate": 8.14515474186941e-06, "loss": 0.0284, "step": 21450 }, { "epoch": 20.575263662511986, "grad_norm": 0.20115645229816437, "learning_rate": 8.1103986706062e-06, "loss": 0.0263, "step": 21460 }, { "epoch": 20.584851390220518, "grad_norm": 0.12906615436077118, "learning_rate": 8.075710365367328e-06, "loss": 0.0207, "step": 21470 }, { "epoch": 20.59443911792905, "grad_norm": 0.2021467238664627, "learning_rate": 8.041089882269082e-06, "loss": 0.0286, "step": 21480 }, { "epoch": 20.604026845637584, "grad_norm": 0.21031218767166138, "learning_rate": 8.00653727731801e-06, "loss": 0.0211, "step": 21490 }, { "epoch": 20.613614573346116, "grad_norm": 0.19011792540550232, "learning_rate": 7.972052606410873e-06, "loss": 0.024, "step": 21500 }, { "epoch": 20.62320230105465, "grad_norm": 0.18954189121723175, "learning_rate": 7.937635925334525e-06, "loss": 0.0273, "step": 21510 }, { "epoch": 20.632790028763182, "grad_norm": 0.14838428795337677, "learning_rate": 7.903287289765826e-06, "loss": 0.0218, "step": 21520 }, { "epoch": 20.642377756471717, "grad_norm": 0.5171023607254028, "learning_rate": 7.869006755271568e-06, "loss": 0.0221, "step": 21530 }, { "epoch": 20.65196548418025, "grad_norm": 0.21473269164562225, "learning_rate": 7.834794377308358e-06, "loss": 0.0247, "step": 21540 }, { "epoch": 20.661553211888783, "grad_norm": 0.37152165174484253, "learning_rate": 7.800650211222554e-06, "loss": 0.0258, "step": 21550 }, { "epoch": 20.671140939597315, "grad_norm": 0.23063114285469055, "learning_rate": 7.766574312250168e-06, "loss": 0.0208, "step": 21560 }, { "epoch": 20.68072866730585, "grad_norm": 0.12213137745857239, "learning_rate": 7.732566735516777e-06, "loss": 0.0229, "step": 21570 }, { "epoch": 20.69031639501438, "grad_norm": 0.22326141595840454, "learning_rate": 7.698627536037411e-06, "loss": 0.0248, "step": 21580 }, { "epoch": 20.699904122722916, "grad_norm": 0.21889561414718628, "learning_rate": 7.664756768716513e-06, "loss": 0.0218, "step": 21590 }, { "epoch": 20.709491850431448, "grad_norm": 0.16505682468414307, "learning_rate": 7.630954488347797e-06, "loss": 0.0247, "step": 21600 }, { "epoch": 20.71907957813998, "grad_norm": 0.18476873636245728, "learning_rate": 7.5972207496142036e-06, "loss": 0.0279, "step": 21610 }, { "epoch": 20.728667305848514, "grad_norm": 0.18805384635925293, "learning_rate": 7.56355560708778e-06, "loss": 0.0269, "step": 21620 }, { "epoch": 20.738255033557046, "grad_norm": 0.19865307211875916, "learning_rate": 7.52995911522959e-06, "loss": 0.0231, "step": 21630 }, { "epoch": 20.74784276126558, "grad_norm": 0.25113141536712646, "learning_rate": 7.496431328389658e-06, "loss": 0.02, "step": 21640 }, { "epoch": 20.757430488974112, "grad_norm": 0.2101268321275711, "learning_rate": 7.4629723008068584e-06, "loss": 0.0275, "step": 21650 }, { "epoch": 20.767018216682647, "grad_norm": 0.2648563086986542, "learning_rate": 7.429582086608849e-06, "loss": 0.0239, "step": 21660 }, { "epoch": 20.77660594439118, "grad_norm": 0.19610466063022614, "learning_rate": 7.396260739811933e-06, "loss": 0.0248, "step": 21670 }, { "epoch": 20.786193672099714, "grad_norm": 0.22526168823242188, "learning_rate": 7.363008314321024e-06, "loss": 0.0185, "step": 21680 }, { "epoch": 20.795781399808245, "grad_norm": 0.20826375484466553, "learning_rate": 7.3298248639295405e-06, "loss": 0.0215, "step": 21690 }, { "epoch": 20.80536912751678, "grad_norm": 0.2082778960466385, "learning_rate": 7.296710442319305e-06, "loss": 0.0246, "step": 21700 }, { "epoch": 20.81495685522531, "grad_norm": 0.19884945452213287, "learning_rate": 7.2636651030604855e-06, "loss": 0.0197, "step": 21710 }, { "epoch": 20.824544582933846, "grad_norm": 0.30188602209091187, "learning_rate": 7.230688899611487e-06, "loss": 0.0188, "step": 21720 }, { "epoch": 20.834132310642378, "grad_norm": 0.2548470199108124, "learning_rate": 7.197781885318866e-06, "loss": 0.0223, "step": 21730 }, { "epoch": 20.84372003835091, "grad_norm": 0.42960646748542786, "learning_rate": 7.16494411341726e-06, "loss": 0.0265, "step": 21740 }, { "epoch": 20.853307766059444, "grad_norm": 0.1879805475473404, "learning_rate": 7.132175637029293e-06, "loss": 0.0225, "step": 21750 }, { "epoch": 20.862895493767976, "grad_norm": 0.29146450757980347, "learning_rate": 7.099476509165459e-06, "loss": 0.0254, "step": 21760 }, { "epoch": 20.87248322147651, "grad_norm": 0.27178776264190674, "learning_rate": 7.066846782724107e-06, "loss": 0.0253, "step": 21770 }, { "epoch": 20.882070949185042, "grad_norm": 0.1681770384311676, "learning_rate": 7.034286510491278e-06, "loss": 0.02, "step": 21780 }, { "epoch": 20.891658676893577, "grad_norm": 0.17788025736808777, "learning_rate": 7.001795745140683e-06, "loss": 0.0265, "step": 21790 }, { "epoch": 20.90124640460211, "grad_norm": 0.29857704043388367, "learning_rate": 6.969374539233553e-06, "loss": 0.0193, "step": 21800 }, { "epoch": 20.910834132310644, "grad_norm": 0.3122943937778473, "learning_rate": 6.937022945218647e-06, "loss": 0.0252, "step": 21810 }, { "epoch": 20.920421860019175, "grad_norm": 0.22873246669769287, "learning_rate": 6.904741015432059e-06, "loss": 0.0292, "step": 21820 }, { "epoch": 20.93000958772771, "grad_norm": 0.23916493356227875, "learning_rate": 6.872528802097211e-06, "loss": 0.0224, "step": 21830 }, { "epoch": 20.93959731543624, "grad_norm": 0.16214150190353394, "learning_rate": 6.84038635732473e-06, "loss": 0.0225, "step": 21840 }, { "epoch": 20.949185043144773, "grad_norm": 0.2523308992385864, "learning_rate": 6.808313733112387e-06, "loss": 0.0237, "step": 21850 }, { "epoch": 20.958772770853308, "grad_norm": 0.1933407187461853, "learning_rate": 6.776310981344996e-06, "loss": 0.021, "step": 21860 }, { "epoch": 20.96836049856184, "grad_norm": 0.1911810338497162, "learning_rate": 6.744378153794334e-06, "loss": 0.0242, "step": 21870 }, { "epoch": 20.977948226270374, "grad_norm": 0.2139926254749298, "learning_rate": 6.712515302119077e-06, "loss": 0.021, "step": 21880 }, { "epoch": 20.987535953978906, "grad_norm": 0.3361279368400574, "learning_rate": 6.680722477864665e-06, "loss": 0.0263, "step": 21890 }, { "epoch": 20.99712368168744, "grad_norm": 0.14909407496452332, "learning_rate": 6.648999732463284e-06, "loss": 0.0214, "step": 21900 }, { "epoch": 21.006711409395972, "grad_norm": 0.318256139755249, "learning_rate": 6.617347117233735e-06, "loss": 0.0296, "step": 21910 }, { "epoch": 21.016299137104507, "grad_norm": 0.15366911888122559, "learning_rate": 6.585764683381379e-06, "loss": 0.0262, "step": 21920 }, { "epoch": 21.02588686481304, "grad_norm": 0.3859999179840088, "learning_rate": 6.554252481998035e-06, "loss": 0.0229, "step": 21930 }, { "epoch": 21.035474592521574, "grad_norm": 0.22637765109539032, "learning_rate": 6.522810564061899e-06, "loss": 0.0284, "step": 21940 }, { "epoch": 21.045062320230105, "grad_norm": 0.2992878556251526, "learning_rate": 6.491438980437475e-06, "loss": 0.0254, "step": 21950 }, { "epoch": 21.05465004793864, "grad_norm": 0.2881068289279938, "learning_rate": 6.460137781875497e-06, "loss": 0.029, "step": 21960 }, { "epoch": 21.06423777564717, "grad_norm": 0.19176606833934784, "learning_rate": 6.4289070190128196e-06, "loss": 0.0232, "step": 21970 }, { "epoch": 21.073825503355703, "grad_norm": 0.1914961189031601, "learning_rate": 6.3977467423723516e-06, "loss": 0.0245, "step": 21980 }, { "epoch": 21.083413231064238, "grad_norm": 0.31313207745552063, "learning_rate": 6.366657002362975e-06, "loss": 0.0296, "step": 21990 }, { "epoch": 21.09300095877277, "grad_norm": 0.2076486051082611, "learning_rate": 6.335637849279464e-06, "loss": 0.0236, "step": 22000 }, { "epoch": 21.102588686481305, "grad_norm": 0.27381840348243713, "learning_rate": 6.304689333302416e-06, "loss": 0.027, "step": 22010 }, { "epoch": 21.112176414189836, "grad_norm": 0.24494417011737823, "learning_rate": 6.2738115044981225e-06, "loss": 0.0248, "step": 22020 }, { "epoch": 21.12176414189837, "grad_norm": 0.18255186080932617, "learning_rate": 6.24300441281856e-06, "loss": 0.0209, "step": 22030 }, { "epoch": 21.131351869606902, "grad_norm": 0.2896704077720642, "learning_rate": 6.212268108101249e-06, "loss": 0.0254, "step": 22040 }, { "epoch": 21.140939597315437, "grad_norm": 0.19372302293777466, "learning_rate": 6.1816026400692006e-06, "loss": 0.0275, "step": 22050 }, { "epoch": 21.15052732502397, "grad_norm": 0.19685117900371552, "learning_rate": 6.151008058330832e-06, "loss": 0.0244, "step": 22060 }, { "epoch": 21.160115052732504, "grad_norm": 0.18076103925704956, "learning_rate": 6.120484412379896e-06, "loss": 0.0191, "step": 22070 }, { "epoch": 21.169702780441035, "grad_norm": 0.37527182698249817, "learning_rate": 6.090031751595371e-06, "loss": 0.029, "step": 22080 }, { "epoch": 21.179290508149567, "grad_norm": 0.1315630078315735, "learning_rate": 6.059650125241412e-06, "loss": 0.0222, "step": 22090 }, { "epoch": 21.188878235858102, "grad_norm": 0.15893638134002686, "learning_rate": 6.029339582467253e-06, "loss": 0.0204, "step": 22100 }, { "epoch": 21.198465963566633, "grad_norm": 0.27391794323921204, "learning_rate": 5.999100172307154e-06, "loss": 0.0251, "step": 22110 }, { "epoch": 21.20805369127517, "grad_norm": 0.1774057298898697, "learning_rate": 5.968931943680284e-06, "loss": 0.0214, "step": 22120 }, { "epoch": 21.2176414189837, "grad_norm": 0.18632689118385315, "learning_rate": 5.938834945390653e-06, "loss": 0.0222, "step": 22130 }, { "epoch": 21.227229146692235, "grad_norm": 0.19212083518505096, "learning_rate": 5.908809226127054e-06, "loss": 0.0233, "step": 22140 }, { "epoch": 21.236816874400766, "grad_norm": 0.1936277598142624, "learning_rate": 5.878854834462977e-06, "loss": 0.0188, "step": 22150 }, { "epoch": 21.2464046021093, "grad_norm": 0.23681025207042694, "learning_rate": 5.848971818856486e-06, "loss": 0.0231, "step": 22160 }, { "epoch": 21.255992329817833, "grad_norm": 0.13978900015354156, "learning_rate": 5.819160227650216e-06, "loss": 0.0176, "step": 22170 }, { "epoch": 21.265580057526368, "grad_norm": 0.20834662020206451, "learning_rate": 5.789420109071242e-06, "loss": 0.0256, "step": 22180 }, { "epoch": 21.2751677852349, "grad_norm": 0.15531818568706512, "learning_rate": 5.759751511231021e-06, "loss": 0.0237, "step": 22190 }, { "epoch": 21.284755512943434, "grad_norm": 0.38519012928009033, "learning_rate": 5.7301544821253054e-06, "loss": 0.0213, "step": 22200 }, { "epoch": 21.294343240651965, "grad_norm": 0.17564308643341064, "learning_rate": 5.700629069634061e-06, "loss": 0.0224, "step": 22210 }, { "epoch": 21.303930968360497, "grad_norm": 0.21635904908180237, "learning_rate": 5.67117532152141e-06, "loss": 0.0231, "step": 22220 }, { "epoch": 21.313518696069032, "grad_norm": 0.2563495934009552, "learning_rate": 5.641793285435537e-06, "loss": 0.03, "step": 22230 }, { "epoch": 21.323106423777563, "grad_norm": 0.1372513771057129, "learning_rate": 5.612483008908609e-06, "loss": 0.0205, "step": 22240 }, { "epoch": 21.3326941514861, "grad_norm": 0.4297633767127991, "learning_rate": 5.583244539356719e-06, "loss": 0.0283, "step": 22250 }, { "epoch": 21.34228187919463, "grad_norm": 0.18425339460372925, "learning_rate": 5.554077924079776e-06, "loss": 0.0254, "step": 22260 }, { "epoch": 21.351869606903165, "grad_norm": 0.24806487560272217, "learning_rate": 5.524983210261481e-06, "loss": 0.0186, "step": 22270 }, { "epoch": 21.361457334611696, "grad_norm": 0.12550103664398193, "learning_rate": 5.495960444969189e-06, "loss": 0.0221, "step": 22280 }, { "epoch": 21.37104506232023, "grad_norm": 0.40927961468696594, "learning_rate": 5.467009675153861e-06, "loss": 0.0215, "step": 22290 }, { "epoch": 21.380632790028763, "grad_norm": 0.3342265188694, "learning_rate": 5.438130947650006e-06, "loss": 0.0274, "step": 22300 }, { "epoch": 21.390220517737298, "grad_norm": 0.23332703113555908, "learning_rate": 5.409324309175573e-06, "loss": 0.0213, "step": 22310 }, { "epoch": 21.39980824544583, "grad_norm": 0.6654828786849976, "learning_rate": 5.380589806331904e-06, "loss": 0.0305, "step": 22320 }, { "epoch": 21.409395973154364, "grad_norm": 0.18974971771240234, "learning_rate": 5.3519274856036414e-06, "loss": 0.0255, "step": 22330 }, { "epoch": 21.418983700862896, "grad_norm": 0.2501462697982788, "learning_rate": 5.3233373933586405e-06, "loss": 0.0273, "step": 22340 }, { "epoch": 21.428571428571427, "grad_norm": 0.205572247505188, "learning_rate": 5.294819575847937e-06, "loss": 0.0234, "step": 22350 }, { "epoch": 21.438159156279962, "grad_norm": 0.2879716157913208, "learning_rate": 5.266374079205627e-06, "loss": 0.0218, "step": 22360 }, { "epoch": 21.447746883988493, "grad_norm": 0.2402382642030716, "learning_rate": 5.238000949448818e-06, "loss": 0.0261, "step": 22370 }, { "epoch": 21.45733461169703, "grad_norm": 0.23849613964557648, "learning_rate": 5.209700232477543e-06, "loss": 0.0244, "step": 22380 }, { "epoch": 21.46692233940556, "grad_norm": 0.21024039387702942, "learning_rate": 5.181471974074692e-06, "loss": 0.0229, "step": 22390 }, { "epoch": 21.476510067114095, "grad_norm": 0.3411503732204437, "learning_rate": 5.153316219905946e-06, "loss": 0.0259, "step": 22400 }, { "epoch": 21.486097794822626, "grad_norm": 0.22467151284217834, "learning_rate": 5.1252330155196756e-06, "loss": 0.0234, "step": 22410 }, { "epoch": 21.49568552253116, "grad_norm": 0.29987016320228577, "learning_rate": 5.097222406346908e-06, "loss": 0.0273, "step": 22420 }, { "epoch": 21.505273250239693, "grad_norm": 0.26795509457588196, "learning_rate": 5.06928443770121e-06, "loss": 0.0217, "step": 22430 }, { "epoch": 21.514860977948228, "grad_norm": 0.1902526170015335, "learning_rate": 5.041419154778648e-06, "loss": 0.0237, "step": 22440 }, { "epoch": 21.52444870565676, "grad_norm": 0.27109450101852417, "learning_rate": 5.0136266026577e-06, "loss": 0.0239, "step": 22450 }, { "epoch": 21.53403643336529, "grad_norm": 0.3726276457309723, "learning_rate": 4.9859068262991805e-06, "loss": 0.0297, "step": 22460 }, { "epoch": 21.543624161073826, "grad_norm": 0.19683849811553955, "learning_rate": 4.958259870546178e-06, "loss": 0.0246, "step": 22470 }, { "epoch": 21.553211888782357, "grad_norm": 0.2269980013370514, "learning_rate": 4.930685780123978e-06, "loss": 0.0209, "step": 22480 }, { "epoch": 21.562799616490892, "grad_norm": 0.19175530970096588, "learning_rate": 4.903184599639987e-06, "loss": 0.0231, "step": 22490 }, { "epoch": 21.572387344199424, "grad_norm": 0.14310085773468018, "learning_rate": 4.875756373583662e-06, "loss": 0.0313, "step": 22500 }, { "epoch": 21.58197507190796, "grad_norm": 0.18566519021987915, "learning_rate": 4.848401146326442e-06, "loss": 0.0204, "step": 22510 }, { "epoch": 21.59156279961649, "grad_norm": 0.1244194358587265, "learning_rate": 4.821118962121668e-06, "loss": 0.022, "step": 22520 }, { "epoch": 21.601150527325025, "grad_norm": 0.3905356228351593, "learning_rate": 4.7939098651045235e-06, "loss": 0.0243, "step": 22530 }, { "epoch": 21.610738255033556, "grad_norm": 0.19283372163772583, "learning_rate": 4.76677389929196e-06, "loss": 0.0224, "step": 22540 }, { "epoch": 21.62032598274209, "grad_norm": 0.27373161911964417, "learning_rate": 4.739711108582612e-06, "loss": 0.0234, "step": 22550 }, { "epoch": 21.629913710450623, "grad_norm": 0.23337620496749878, "learning_rate": 4.712721536756743e-06, "loss": 0.0185, "step": 22560 }, { "epoch": 21.639501438159158, "grad_norm": 0.22057722508907318, "learning_rate": 4.685805227476164e-06, "loss": 0.019, "step": 22570 }, { "epoch": 21.64908916586769, "grad_norm": 0.18951620161533356, "learning_rate": 4.65896222428418e-06, "loss": 0.0239, "step": 22580 }, { "epoch": 21.65867689357622, "grad_norm": 0.19423332810401917, "learning_rate": 4.632192570605481e-06, "loss": 0.024, "step": 22590 }, { "epoch": 21.668264621284756, "grad_norm": 0.21294209361076355, "learning_rate": 4.605496309746127e-06, "loss": 0.0248, "step": 22600 }, { "epoch": 21.677852348993287, "grad_norm": 0.18906791508197784, "learning_rate": 4.578873484893431e-06, "loss": 0.0173, "step": 22610 }, { "epoch": 21.687440076701822, "grad_norm": 0.24385139346122742, "learning_rate": 4.552324139115905e-06, "loss": 0.026, "step": 22620 }, { "epoch": 21.697027804410354, "grad_norm": 0.14667080342769623, "learning_rate": 4.525848315363196e-06, "loss": 0.0178, "step": 22630 }, { "epoch": 21.70661553211889, "grad_norm": 0.15309500694274902, "learning_rate": 4.499446056466022e-06, "loss": 0.0225, "step": 22640 }, { "epoch": 21.71620325982742, "grad_norm": 0.7658588886260986, "learning_rate": 4.473117405136073e-06, "loss": 0.025, "step": 22650 }, { "epoch": 21.725790987535955, "grad_norm": 0.27880871295928955, "learning_rate": 4.446862403965984e-06, "loss": 0.0179, "step": 22660 }, { "epoch": 21.735378715244487, "grad_norm": 0.24886669218540192, "learning_rate": 4.420681095429219e-06, "loss": 0.0261, "step": 22670 }, { "epoch": 21.74496644295302, "grad_norm": 0.21482285857200623, "learning_rate": 4.394573521880052e-06, "loss": 0.019, "step": 22680 }, { "epoch": 21.754554170661553, "grad_norm": 0.2901485860347748, "learning_rate": 4.368539725553461e-06, "loss": 0.0284, "step": 22690 }, { "epoch": 21.764141898370085, "grad_norm": 0.2559397518634796, "learning_rate": 4.342579748565068e-06, "loss": 0.0241, "step": 22700 }, { "epoch": 21.77372962607862, "grad_norm": 0.24078018963336945, "learning_rate": 4.316693632911089e-06, "loss": 0.0234, "step": 22710 }, { "epoch": 21.78331735378715, "grad_norm": 0.23349763453006744, "learning_rate": 4.2908814204682405e-06, "loss": 0.0204, "step": 22720 }, { "epoch": 21.792905081495686, "grad_norm": 0.2100505232810974, "learning_rate": 4.265143152993695e-06, "loss": 0.0271, "step": 22730 }, { "epoch": 21.802492809204217, "grad_norm": 0.41341936588287354, "learning_rate": 4.23947887212498e-06, "loss": 0.0319, "step": 22740 }, { "epoch": 21.812080536912752, "grad_norm": 0.17779189348220825, "learning_rate": 4.213888619379963e-06, "loss": 0.0163, "step": 22750 }, { "epoch": 21.821668264621284, "grad_norm": 0.15619170665740967, "learning_rate": 4.188372436156734e-06, "loss": 0.0233, "step": 22760 }, { "epoch": 21.83125599232982, "grad_norm": 0.22723935544490814, "learning_rate": 4.162930363733558e-06, "loss": 0.0236, "step": 22770 }, { "epoch": 21.84084372003835, "grad_norm": 0.16326063871383667, "learning_rate": 4.137562443268822e-06, "loss": 0.0195, "step": 22780 }, { "epoch": 21.850431447746885, "grad_norm": 0.2659025490283966, "learning_rate": 4.112268715800943e-06, "loss": 0.0242, "step": 22790 }, { "epoch": 21.860019175455417, "grad_norm": 0.22254447638988495, "learning_rate": 4.087049222248324e-06, "loss": 0.0217, "step": 22800 }, { "epoch": 21.86960690316395, "grad_norm": 0.19777144491672516, "learning_rate": 4.061904003409261e-06, "loss": 0.0195, "step": 22810 }, { "epoch": 21.879194630872483, "grad_norm": 0.251908540725708, "learning_rate": 4.036833099961912e-06, "loss": 0.0232, "step": 22820 }, { "epoch": 21.888782358581015, "grad_norm": 0.1795634925365448, "learning_rate": 4.0118365524642095e-06, "loss": 0.0274, "step": 22830 }, { "epoch": 21.89837008628955, "grad_norm": 0.2238956242799759, "learning_rate": 3.986914401353797e-06, "loss": 0.0198, "step": 22840 }, { "epoch": 21.90795781399808, "grad_norm": 0.13608454167842865, "learning_rate": 3.96206668694794e-06, "loss": 0.0206, "step": 22850 }, { "epoch": 21.917545541706616, "grad_norm": 0.32671546936035156, "learning_rate": 3.93729344944353e-06, "loss": 0.0222, "step": 22860 }, { "epoch": 21.927133269415148, "grad_norm": 0.15036197006702423, "learning_rate": 3.912594728916929e-06, "loss": 0.0259, "step": 22870 }, { "epoch": 21.936720997123683, "grad_norm": 0.2388329952955246, "learning_rate": 3.887970565324006e-06, "loss": 0.0273, "step": 22880 }, { "epoch": 21.946308724832214, "grad_norm": 0.2434564232826233, "learning_rate": 3.8634209984999615e-06, "loss": 0.018, "step": 22890 }, { "epoch": 21.95589645254075, "grad_norm": 0.16361406445503235, "learning_rate": 3.8389460681593545e-06, "loss": 0.0264, "step": 22900 }, { "epoch": 21.96548418024928, "grad_norm": 0.4144115149974823, "learning_rate": 3.8145458138959865e-06, "loss": 0.0238, "step": 22910 }, { "epoch": 21.975071907957815, "grad_norm": 0.1729598045349121, "learning_rate": 3.790220275182854e-06, "loss": 0.0276, "step": 22920 }, { "epoch": 21.984659635666347, "grad_norm": 0.2305474728345871, "learning_rate": 3.7659694913720956e-06, "loss": 0.0209, "step": 22930 }, { "epoch": 21.994247363374882, "grad_norm": 0.11657913029193878, "learning_rate": 3.741793501694901e-06, "loss": 0.0214, "step": 22940 }, { "epoch": 22.003835091083413, "grad_norm": 0.15219245851039886, "learning_rate": 3.71769234526147e-06, "loss": 0.0187, "step": 22950 }, { "epoch": 22.013422818791945, "grad_norm": 0.2325374186038971, "learning_rate": 3.6936660610609465e-06, "loss": 0.0256, "step": 22960 }, { "epoch": 22.02301054650048, "grad_norm": 0.15524373948574066, "learning_rate": 3.6697146879613564e-06, "loss": 0.0209, "step": 22970 }, { "epoch": 22.03259827420901, "grad_norm": 0.2640591263771057, "learning_rate": 3.645838264709517e-06, "loss": 0.0192, "step": 22980 }, { "epoch": 22.042186001917546, "grad_norm": 0.15865999460220337, "learning_rate": 3.6220368299310136e-06, "loss": 0.0207, "step": 22990 }, { "epoch": 22.051773729626078, "grad_norm": 0.20118165016174316, "learning_rate": 3.5983104221301244e-06, "loss": 0.0217, "step": 23000 }, { "epoch": 22.061361457334613, "grad_norm": 0.40763455629348755, "learning_rate": 3.5746590796897404e-06, "loss": 0.0239, "step": 23010 }, { "epoch": 22.070949185043144, "grad_norm": 0.18958386778831482, "learning_rate": 3.551082840871328e-06, "loss": 0.025, "step": 23020 }, { "epoch": 22.08053691275168, "grad_norm": 0.2477806657552719, "learning_rate": 3.5275817438148616e-06, "loss": 0.0189, "step": 23030 }, { "epoch": 22.09012464046021, "grad_norm": 0.1568249762058258, "learning_rate": 3.504155826538741e-06, "loss": 0.0222, "step": 23040 }, { "epoch": 22.099712368168746, "grad_norm": 0.2600797116756439, "learning_rate": 3.4808051269397512e-06, "loss": 0.0238, "step": 23050 }, { "epoch": 22.109300095877277, "grad_norm": 0.18155524134635925, "learning_rate": 3.457529682793004e-06, "loss": 0.022, "step": 23060 }, { "epoch": 22.11888782358581, "grad_norm": 0.1558566689491272, "learning_rate": 3.4343295317518565e-06, "loss": 0.0225, "step": 23070 }, { "epoch": 22.128475551294343, "grad_norm": 0.23820623755455017, "learning_rate": 3.4112047113478653e-06, "loss": 0.0242, "step": 23080 }, { "epoch": 22.138063279002875, "grad_norm": 0.1538003832101822, "learning_rate": 3.3881552589907216e-06, "loss": 0.0214, "step": 23090 }, { "epoch": 22.14765100671141, "grad_norm": 0.21073570847511292, "learning_rate": 3.36518121196821e-06, "loss": 0.0243, "step": 23100 }, { "epoch": 22.15723873441994, "grad_norm": 0.1772642582654953, "learning_rate": 3.34228260744609e-06, "loss": 0.0228, "step": 23110 }, { "epoch": 22.166826462128476, "grad_norm": 0.18608751893043518, "learning_rate": 3.3194594824681123e-06, "loss": 0.0229, "step": 23120 }, { "epoch": 22.176414189837008, "grad_norm": 0.14585159718990326, "learning_rate": 3.2967118739559045e-06, "loss": 0.0216, "step": 23130 }, { "epoch": 22.186001917545543, "grad_norm": 0.18371617794036865, "learning_rate": 3.2740398187089405e-06, "loss": 0.021, "step": 23140 }, { "epoch": 22.195589645254074, "grad_norm": 0.11511317640542984, "learning_rate": 3.2514433534044544e-06, "loss": 0.023, "step": 23150 }, { "epoch": 22.20517737296261, "grad_norm": 0.22962027788162231, "learning_rate": 3.2289225145974046e-06, "loss": 0.0187, "step": 23160 }, { "epoch": 22.21476510067114, "grad_norm": 0.1875505894422531, "learning_rate": 3.2064773387203984e-06, "loss": 0.0298, "step": 23170 }, { "epoch": 22.224352828379676, "grad_norm": 0.1442171037197113, "learning_rate": 3.1841078620836683e-06, "loss": 0.0231, "step": 23180 }, { "epoch": 22.233940556088207, "grad_norm": 0.22087924182415009, "learning_rate": 3.1618141208749617e-06, "loss": 0.0229, "step": 23190 }, { "epoch": 22.24352828379674, "grad_norm": 0.18746836483478546, "learning_rate": 3.139596151159502e-06, "loss": 0.0197, "step": 23200 }, { "epoch": 22.253116011505274, "grad_norm": 0.23875057697296143, "learning_rate": 3.1174539888799425e-06, "loss": 0.0202, "step": 23210 }, { "epoch": 22.262703739213805, "grad_norm": 0.3558003902435303, "learning_rate": 3.0953876698563144e-06, "loss": 0.0209, "step": 23220 }, { "epoch": 22.27229146692234, "grad_norm": 0.32513800263404846, "learning_rate": 3.0733972297859294e-06, "loss": 0.0306, "step": 23230 }, { "epoch": 22.28187919463087, "grad_norm": 0.14356492459774017, "learning_rate": 3.0514827042433804e-06, "loss": 0.0263, "step": 23240 }, { "epoch": 22.291466922339406, "grad_norm": 0.28215450048446655, "learning_rate": 3.029644128680409e-06, "loss": 0.0266, "step": 23250 }, { "epoch": 22.301054650047938, "grad_norm": 0.27818936109542847, "learning_rate": 3.0078815384259163e-06, "loss": 0.0242, "step": 23260 }, { "epoch": 22.310642377756473, "grad_norm": 0.21585923433303833, "learning_rate": 2.9861949686858903e-06, "loss": 0.0225, "step": 23270 }, { "epoch": 22.320230105465004, "grad_norm": 0.2902468144893646, "learning_rate": 2.964584454543312e-06, "loss": 0.0302, "step": 23280 }, { "epoch": 22.32981783317354, "grad_norm": 0.15972809493541718, "learning_rate": 2.9430500309581387e-06, "loss": 0.0265, "step": 23290 }, { "epoch": 22.33940556088207, "grad_norm": 0.29737722873687744, "learning_rate": 2.9215917327672426e-06, "loss": 0.0187, "step": 23300 }, { "epoch": 22.348993288590606, "grad_norm": 0.24347999691963196, "learning_rate": 2.9002095946843277e-06, "loss": 0.0233, "step": 23310 }, { "epoch": 22.358581016299137, "grad_norm": 0.38935643434524536, "learning_rate": 2.878903651299891e-06, "loss": 0.0225, "step": 23320 }, { "epoch": 22.36816874400767, "grad_norm": 0.32759687304496765, "learning_rate": 2.8576739370811957e-06, "loss": 0.0218, "step": 23330 }, { "epoch": 22.377756471716204, "grad_norm": 0.11117486655712128, "learning_rate": 2.8365204863721573e-06, "loss": 0.023, "step": 23340 }, { "epoch": 22.387344199424735, "grad_norm": 0.2842784523963928, "learning_rate": 2.815443333393325e-06, "loss": 0.0238, "step": 23350 }, { "epoch": 22.39693192713327, "grad_norm": 0.23901750147342682, "learning_rate": 2.794442512241824e-06, "loss": 0.0237, "step": 23360 }, { "epoch": 22.4065196548418, "grad_norm": 0.23337188363075256, "learning_rate": 2.7735180568912943e-06, "loss": 0.0188, "step": 23370 }, { "epoch": 22.416107382550337, "grad_norm": 0.2736450433731079, "learning_rate": 2.7526700011918316e-06, "loss": 0.0247, "step": 23380 }, { "epoch": 22.425695110258868, "grad_norm": 0.13441747426986694, "learning_rate": 2.731898378869935e-06, "loss": 0.0229, "step": 23390 }, { "epoch": 22.435282837967403, "grad_norm": 0.1925612986087799, "learning_rate": 2.7112032235284744e-06, "loss": 0.0268, "step": 23400 }, { "epoch": 22.444870565675934, "grad_norm": 0.2339990884065628, "learning_rate": 2.6905845686465924e-06, "loss": 0.0239, "step": 23410 }, { "epoch": 22.45445829338447, "grad_norm": 0.17596887052059174, "learning_rate": 2.6700424475796905e-06, "loss": 0.0222, "step": 23420 }, { "epoch": 22.464046021093, "grad_norm": 0.3486236035823822, "learning_rate": 2.6495768935593525e-06, "loss": 0.0222, "step": 23430 }, { "epoch": 22.473633748801532, "grad_norm": 0.290425181388855, "learning_rate": 2.6291879396933004e-06, "loss": 0.0259, "step": 23440 }, { "epoch": 22.483221476510067, "grad_norm": 0.18840055167675018, "learning_rate": 2.6088756189653397e-06, "loss": 0.0291, "step": 23450 }, { "epoch": 22.4928092042186, "grad_norm": 0.2448890209197998, "learning_rate": 2.588639964235301e-06, "loss": 0.0244, "step": 23460 }, { "epoch": 22.502396931927134, "grad_norm": 0.49676454067230225, "learning_rate": 2.568481008238982e-06, "loss": 0.0248, "step": 23470 }, { "epoch": 22.511984659635665, "grad_norm": 0.15404298901557922, "learning_rate": 2.5483987835881127e-06, "loss": 0.0215, "step": 23480 }, { "epoch": 22.5215723873442, "grad_norm": 0.13425439596176147, "learning_rate": 2.528393322770306e-06, "loss": 0.0236, "step": 23490 }, { "epoch": 22.53116011505273, "grad_norm": 0.23426268994808197, "learning_rate": 2.508464658148968e-06, "loss": 0.0218, "step": 23500 }, { "epoch": 22.540747842761267, "grad_norm": 0.21721801161766052, "learning_rate": 2.488612821963271e-06, "loss": 0.0229, "step": 23510 }, { "epoch": 22.550335570469798, "grad_norm": 0.2656913697719574, "learning_rate": 2.4688378463281146e-06, "loss": 0.0248, "step": 23520 }, { "epoch": 22.559923298178333, "grad_norm": 0.15874969959259033, "learning_rate": 2.4491397632340487e-06, "loss": 0.0244, "step": 23530 }, { "epoch": 22.569511025886865, "grad_norm": 0.26198479533195496, "learning_rate": 2.429518604547232e-06, "loss": 0.0248, "step": 23540 }, { "epoch": 22.5790987535954, "grad_norm": 0.2117781937122345, "learning_rate": 2.409974402009385e-06, "loss": 0.0177, "step": 23550 }, { "epoch": 22.58868648130393, "grad_norm": 0.3583064675331116, "learning_rate": 2.390507187237734e-06, "loss": 0.0242, "step": 23560 }, { "epoch": 22.598274209012462, "grad_norm": 0.26825496554374695, "learning_rate": 2.371116991724953e-06, "loss": 0.0242, "step": 23570 }, { "epoch": 22.607861936720997, "grad_norm": 0.2770189344882965, "learning_rate": 2.3518038468391236e-06, "loss": 0.0198, "step": 23580 }, { "epoch": 22.61744966442953, "grad_norm": 0.13024599850177765, "learning_rate": 2.332567783823686e-06, "loss": 0.0221, "step": 23590 }, { "epoch": 22.627037392138064, "grad_norm": 0.16579675674438477, "learning_rate": 2.313408833797376e-06, "loss": 0.0198, "step": 23600 }, { "epoch": 22.636625119846595, "grad_norm": 0.23456168174743652, "learning_rate": 2.294327027754184e-06, "loss": 0.0242, "step": 23610 }, { "epoch": 22.64621284755513, "grad_norm": 0.2899184226989746, "learning_rate": 2.275322396563301e-06, "loss": 0.0268, "step": 23620 }, { "epoch": 22.65580057526366, "grad_norm": 0.19349917769432068, "learning_rate": 2.2563949709690725e-06, "loss": 0.0224, "step": 23630 }, { "epoch": 22.665388302972197, "grad_norm": 0.32430675625801086, "learning_rate": 2.2375447815909388e-06, "loss": 0.0241, "step": 23640 }, { "epoch": 22.674976030680728, "grad_norm": 0.18584056198596954, "learning_rate": 2.218771858923402e-06, "loss": 0.0231, "step": 23650 }, { "epoch": 22.684563758389263, "grad_norm": 0.21064673364162445, "learning_rate": 2.2000762333359625e-06, "loss": 0.0294, "step": 23660 }, { "epoch": 22.694151486097795, "grad_norm": 0.2811007797718048, "learning_rate": 2.1814579350730835e-06, "loss": 0.023, "step": 23670 }, { "epoch": 22.70373921380633, "grad_norm": 0.17581719160079956, "learning_rate": 2.162916994254116e-06, "loss": 0.0212, "step": 23680 }, { "epoch": 22.71332694151486, "grad_norm": 0.22076162695884705, "learning_rate": 2.1444534408732898e-06, "loss": 0.026, "step": 23690 }, { "epoch": 22.722914669223393, "grad_norm": 0.17099499702453613, "learning_rate": 2.1260673047996227e-06, "loss": 0.0231, "step": 23700 }, { "epoch": 22.732502396931928, "grad_norm": 0.2862556576728821, "learning_rate": 2.1077586157769e-06, "loss": 0.0212, "step": 23710 }, { "epoch": 22.74209012464046, "grad_norm": 0.257538378238678, "learning_rate": 2.0895274034236245e-06, "loss": 0.022, "step": 23720 }, { "epoch": 22.751677852348994, "grad_norm": 0.12845216691493988, "learning_rate": 2.071373697232959e-06, "loss": 0.0332, "step": 23730 }, { "epoch": 22.761265580057525, "grad_norm": 0.2115718573331833, "learning_rate": 2.0532975265726786e-06, "loss": 0.0295, "step": 23740 }, { "epoch": 22.77085330776606, "grad_norm": 0.24508948624134064, "learning_rate": 2.0352989206851303e-06, "loss": 0.0219, "step": 23750 }, { "epoch": 22.780441035474592, "grad_norm": 0.16549085080623627, "learning_rate": 2.0173779086871735e-06, "loss": 0.0228, "step": 23760 }, { "epoch": 22.790028763183127, "grad_norm": 0.2713741362094879, "learning_rate": 1.999534519570162e-06, "loss": 0.0253, "step": 23770 }, { "epoch": 22.79961649089166, "grad_norm": 0.30385440587997437, "learning_rate": 1.981768782199861e-06, "loss": 0.0219, "step": 23780 }, { "epoch": 22.809204218600193, "grad_norm": 0.13380350172519684, "learning_rate": 1.964080725316414e-06, "loss": 0.0223, "step": 23790 }, { "epoch": 22.818791946308725, "grad_norm": 0.2257150113582611, "learning_rate": 1.9464703775343096e-06, "loss": 0.0246, "step": 23800 }, { "epoch": 22.828379674017256, "grad_norm": 0.18876703083515167, "learning_rate": 1.928937767342315e-06, "loss": 0.0216, "step": 23810 }, { "epoch": 22.83796740172579, "grad_norm": 0.21190357208251953, "learning_rate": 1.911482923103447e-06, "loss": 0.0236, "step": 23820 }, { "epoch": 22.847555129434323, "grad_norm": 0.20915569365024567, "learning_rate": 1.8941058730549132e-06, "loss": 0.0202, "step": 23830 }, { "epoch": 22.857142857142858, "grad_norm": 0.11660904437303543, "learning_rate": 1.8768066453080657e-06, "loss": 0.0227, "step": 23840 }, { "epoch": 22.86673058485139, "grad_norm": 0.3356838524341583, "learning_rate": 1.8595852678483738e-06, "loss": 0.0264, "step": 23850 }, { "epoch": 22.876318312559924, "grad_norm": 0.26690155267715454, "learning_rate": 1.8424417685353634e-06, "loss": 0.0249, "step": 23860 }, { "epoch": 22.885906040268456, "grad_norm": 0.23408538103103638, "learning_rate": 1.825376175102561e-06, "loss": 0.0249, "step": 23870 }, { "epoch": 22.89549376797699, "grad_norm": 0.40512949228286743, "learning_rate": 1.8083885151574775e-06, "loss": 0.0259, "step": 23880 }, { "epoch": 22.905081495685522, "grad_norm": 0.20047682523727417, "learning_rate": 1.7914788161815466e-06, "loss": 0.025, "step": 23890 }, { "epoch": 22.914669223394057, "grad_norm": 0.2298455536365509, "learning_rate": 1.7746471055300751e-06, "loss": 0.0208, "step": 23900 }, { "epoch": 22.92425695110259, "grad_norm": 0.13947898149490356, "learning_rate": 1.7578934104322097e-06, "loss": 0.0201, "step": 23910 }, { "epoch": 22.933844678811123, "grad_norm": 0.22104570269584656, "learning_rate": 1.741217757990893e-06, "loss": 0.0233, "step": 23920 }, { "epoch": 22.943432406519655, "grad_norm": 0.2084132432937622, "learning_rate": 1.7246201751828117e-06, "loss": 0.0269, "step": 23930 }, { "epoch": 22.953020134228186, "grad_norm": 0.2208138108253479, "learning_rate": 1.7081006888583495e-06, "loss": 0.023, "step": 23940 }, { "epoch": 22.96260786193672, "grad_norm": 0.20159326493740082, "learning_rate": 1.6916593257415735e-06, "loss": 0.0181, "step": 23950 }, { "epoch": 22.972195589645253, "grad_norm": 0.16700109839439392, "learning_rate": 1.6752961124301415e-06, "loss": 0.0225, "step": 23960 }, { "epoch": 22.981783317353788, "grad_norm": 0.16461221873760223, "learning_rate": 1.6590110753953058e-06, "loss": 0.0267, "step": 23970 }, { "epoch": 22.99137104506232, "grad_norm": 0.30045902729034424, "learning_rate": 1.6428042409818434e-06, "loss": 0.0252, "step": 23980 }, { "epoch": 23.000958772770854, "grad_norm": 0.2971097528934479, "learning_rate": 1.6266756354080148e-06, "loss": 0.021, "step": 23990 }, { "epoch": 23.010546500479386, "grad_norm": 0.18751384317874908, "learning_rate": 1.610625284765538e-06, "loss": 0.0225, "step": 24000 }, { "epoch": 23.02013422818792, "grad_norm": 0.2587001621723175, "learning_rate": 1.5946532150195315e-06, "loss": 0.024, "step": 24010 }, { "epoch": 23.029721955896452, "grad_norm": 0.2185692936182022, "learning_rate": 1.578759452008477e-06, "loss": 0.0269, "step": 24020 }, { "epoch": 23.039309683604987, "grad_norm": 0.17715659737586975, "learning_rate": 1.5629440214441737e-06, "loss": 0.0227, "step": 24030 }, { "epoch": 23.04889741131352, "grad_norm": 0.27682605385780334, "learning_rate": 1.5472069489117058e-06, "loss": 0.0261, "step": 24040 }, { "epoch": 23.058485139022054, "grad_norm": 0.20817138254642487, "learning_rate": 1.531548259869392e-06, "loss": 0.0165, "step": 24050 }, { "epoch": 23.068072866730585, "grad_norm": 0.2723507583141327, "learning_rate": 1.515967979648747e-06, "loss": 0.0228, "step": 24060 }, { "epoch": 23.077660594439116, "grad_norm": 0.1344461739063263, "learning_rate": 1.5004661334544422e-06, "loss": 0.0207, "step": 24070 }, { "epoch": 23.08724832214765, "grad_norm": 0.16644145548343658, "learning_rate": 1.4850427463642568e-06, "loss": 0.0282, "step": 24080 }, { "epoch": 23.096836049856183, "grad_norm": 0.22762452065944672, "learning_rate": 1.4696978433290653e-06, "loss": 0.0273, "step": 24090 }, { "epoch": 23.106423777564718, "grad_norm": 0.1904400885105133, "learning_rate": 1.4544314491727607e-06, "loss": 0.0216, "step": 24100 }, { "epoch": 23.11601150527325, "grad_norm": 0.20357421040534973, "learning_rate": 1.4392435885922262e-06, "loss": 0.0176, "step": 24110 }, { "epoch": 23.125599232981784, "grad_norm": 0.14489389955997467, "learning_rate": 1.4241342861573081e-06, "loss": 0.0241, "step": 24120 }, { "epoch": 23.135186960690316, "grad_norm": 0.42258408665657043, "learning_rate": 1.4091035663107599e-06, "loss": 0.0261, "step": 24130 }, { "epoch": 23.14477468839885, "grad_norm": 0.33509764075279236, "learning_rate": 1.39415145336822e-06, "loss": 0.0225, "step": 24140 }, { "epoch": 23.154362416107382, "grad_norm": 0.1689392775297165, "learning_rate": 1.3792779715181503e-06, "loss": 0.0239, "step": 24150 }, { "epoch": 23.163950143815917, "grad_norm": 0.16699060797691345, "learning_rate": 1.3644831448218154e-06, "loss": 0.018, "step": 24160 }, { "epoch": 23.17353787152445, "grad_norm": 0.18517597019672394, "learning_rate": 1.349766997213242e-06, "loss": 0.0241, "step": 24170 }, { "epoch": 23.18312559923298, "grad_norm": 0.25795888900756836, "learning_rate": 1.3351295524991592e-06, "loss": 0.019, "step": 24180 }, { "epoch": 23.192713326941515, "grad_norm": 0.14099453389644623, "learning_rate": 1.3205708343589973e-06, "loss": 0.0202, "step": 24190 }, { "epoch": 23.202301054650047, "grad_norm": 0.1665448546409607, "learning_rate": 1.3060908663448057e-06, "loss": 0.0227, "step": 24200 }, { "epoch": 23.21188878235858, "grad_norm": 0.23710502684116364, "learning_rate": 1.2916896718812577e-06, "loss": 0.0206, "step": 24210 }, { "epoch": 23.221476510067113, "grad_norm": 0.20079617202281952, "learning_rate": 1.2773672742655784e-06, "loss": 0.0254, "step": 24220 }, { "epoch": 23.231064237775648, "grad_norm": 0.19830353558063507, "learning_rate": 1.2631236966675287e-06, "loss": 0.0192, "step": 24230 }, { "epoch": 23.24065196548418, "grad_norm": 0.14918625354766846, "learning_rate": 1.2489589621293485e-06, "loss": 0.019, "step": 24240 }, { "epoch": 23.250239693192714, "grad_norm": 0.2350005954504013, "learning_rate": 1.2348730935657582e-06, "loss": 0.0234, "step": 24250 }, { "epoch": 23.259827420901246, "grad_norm": 0.25462934374809265, "learning_rate": 1.2208661137638687e-06, "loss": 0.0205, "step": 24260 }, { "epoch": 23.26941514860978, "grad_norm": 0.18844899535179138, "learning_rate": 1.2069380453831768e-06, "loss": 0.0264, "step": 24270 }, { "epoch": 23.279002876318312, "grad_norm": 0.247798353433609, "learning_rate": 1.19308891095552e-06, "loss": 0.0228, "step": 24280 }, { "epoch": 23.288590604026847, "grad_norm": 0.19767391681671143, "learning_rate": 1.1793187328850485e-06, "loss": 0.0214, "step": 24290 }, { "epoch": 23.29817833173538, "grad_norm": 0.30730581283569336, "learning_rate": 1.165627533448177e-06, "loss": 0.0214, "step": 24300 }, { "epoch": 23.30776605944391, "grad_norm": 0.3338695466518402, "learning_rate": 1.1520153347935658e-06, "loss": 0.0286, "step": 24310 }, { "epoch": 23.317353787152445, "grad_norm": 0.17891177535057068, "learning_rate": 1.1384821589420502e-06, "loss": 0.0213, "step": 24320 }, { "epoch": 23.326941514860977, "grad_norm": 0.2726079225540161, "learning_rate": 1.1250280277866509e-06, "loss": 0.0247, "step": 24330 }, { "epoch": 23.33652924256951, "grad_norm": 0.15224401652812958, "learning_rate": 1.1116529630925022e-06, "loss": 0.0221, "step": 24340 }, { "epoch": 23.346116970278043, "grad_norm": 0.3975865840911865, "learning_rate": 1.0983569864968346e-06, "loss": 0.024, "step": 24350 }, { "epoch": 23.355704697986578, "grad_norm": 0.1999419927597046, "learning_rate": 1.0851401195089316e-06, "loss": 0.0247, "step": 24360 }, { "epoch": 23.36529242569511, "grad_norm": 0.22919629514217377, "learning_rate": 1.072002383510118e-06, "loss": 0.0218, "step": 24370 }, { "epoch": 23.374880153403645, "grad_norm": 0.11979561299085617, "learning_rate": 1.05894379975367e-06, "loss": 0.0154, "step": 24380 }, { "epoch": 23.384467881112176, "grad_norm": 0.16747049987316132, "learning_rate": 1.0459643893648507e-06, "loss": 0.0228, "step": 24390 }, { "epoch": 23.39405560882071, "grad_norm": 0.2855249345302582, "learning_rate": 1.0330641733408309e-06, "loss": 0.0268, "step": 24400 }, { "epoch": 23.403643336529242, "grad_norm": 0.39327678084373474, "learning_rate": 1.0202431725506556e-06, "loss": 0.0241, "step": 24410 }, { "epoch": 23.413231064237777, "grad_norm": 0.21982307732105255, "learning_rate": 1.0075014077352396e-06, "loss": 0.0157, "step": 24420 }, { "epoch": 23.42281879194631, "grad_norm": 0.18971124291419983, "learning_rate": 9.948388995072943e-07, "loss": 0.0256, "step": 24430 }, { "epoch": 23.43240651965484, "grad_norm": 0.2546897530555725, "learning_rate": 9.822556683513395e-07, "loss": 0.02, "step": 24440 }, { "epoch": 23.441994247363375, "grad_norm": 0.23896943032741547, "learning_rate": 9.69751734623625e-07, "loss": 0.02, "step": 24450 }, { "epoch": 23.451581975071907, "grad_norm": 0.2220362275838852, "learning_rate": 9.57327118552137e-07, "loss": 0.0228, "step": 24460 }, { "epoch": 23.461169702780442, "grad_norm": 0.20060044527053833, "learning_rate": 9.449818402365251e-07, "loss": 0.0198, "step": 24470 }, { "epoch": 23.470757430488973, "grad_norm": 0.22660043835639954, "learning_rate": 9.327159196481138e-07, "loss": 0.0225, "step": 24480 }, { "epoch": 23.48034515819751, "grad_norm": 0.20978592336177826, "learning_rate": 9.205293766298307e-07, "loss": 0.0201, "step": 24490 }, { "epoch": 23.48993288590604, "grad_norm": 0.205510213971138, "learning_rate": 9.084222308962053e-07, "loss": 0.0257, "step": 24500 }, { "epoch": 23.499520613614575, "grad_norm": 0.15999889373779297, "learning_rate": 8.963945020333209e-07, "loss": 0.0242, "step": 24510 }, { "epoch": 23.509108341323106, "grad_norm": 0.16011640429496765, "learning_rate": 8.844462094987793e-07, "loss": 0.0243, "step": 24520 }, { "epoch": 23.51869606903164, "grad_norm": 0.1832507699728012, "learning_rate": 8.725773726216801e-07, "loss": 0.0199, "step": 24530 }, { "epoch": 23.528283796740173, "grad_norm": 0.1802021861076355, "learning_rate": 8.607880106025868e-07, "loss": 0.0228, "step": 24540 }, { "epoch": 23.537871524448704, "grad_norm": 0.3221112787723541, "learning_rate": 8.49078142513493e-07, "loss": 0.0253, "step": 24550 }, { "epoch": 23.54745925215724, "grad_norm": 0.16884173452854156, "learning_rate": 8.37447787297796e-07, "loss": 0.0232, "step": 24560 }, { "epoch": 23.55704697986577, "grad_norm": 0.26041117310523987, "learning_rate": 8.258969637702563e-07, "loss": 0.023, "step": 24570 }, { "epoch": 23.566634707574305, "grad_norm": 0.3258151710033417, "learning_rate": 8.144256906169767e-07, "loss": 0.0211, "step": 24580 }, { "epoch": 23.576222435282837, "grad_norm": 0.2306542694568634, "learning_rate": 8.030339863953684e-07, "loss": 0.0213, "step": 24590 }, { "epoch": 23.585810162991372, "grad_norm": 0.2112276405096054, "learning_rate": 7.917218695341178e-07, "loss": 0.0251, "step": 24600 }, { "epoch": 23.595397890699903, "grad_norm": 0.20719598233699799, "learning_rate": 7.804893583331696e-07, "loss": 0.0226, "step": 24610 }, { "epoch": 23.60498561840844, "grad_norm": 0.3165718913078308, "learning_rate": 7.693364709636886e-07, "loss": 0.0287, "step": 24620 }, { "epoch": 23.61457334611697, "grad_norm": 0.24788926541805267, "learning_rate": 7.582632254680089e-07, "loss": 0.0228, "step": 24630 }, { "epoch": 23.624161073825505, "grad_norm": 0.2913201153278351, "learning_rate": 7.472696397596568e-07, "loss": 0.0216, "step": 24640 }, { "epoch": 23.633748801534036, "grad_norm": 0.16556118428707123, "learning_rate": 7.363557316232673e-07, "loss": 0.0192, "step": 24650 }, { "epoch": 23.64333652924257, "grad_norm": 0.1945585161447525, "learning_rate": 7.255215187145892e-07, "loss": 0.023, "step": 24660 }, { "epoch": 23.652924256951103, "grad_norm": 0.26050955057144165, "learning_rate": 7.147670185604361e-07, "loss": 0.019, "step": 24670 }, { "epoch": 23.662511984659634, "grad_norm": 0.20233625173568726, "learning_rate": 7.04092248558691e-07, "loss": 0.0239, "step": 24680 }, { "epoch": 23.67209971236817, "grad_norm": 0.19280561804771423, "learning_rate": 6.93497225978218e-07, "loss": 0.0211, "step": 24690 }, { "epoch": 23.6816874400767, "grad_norm": 0.15425735712051392, "learning_rate": 6.829819679589122e-07, "loss": 0.0265, "step": 24700 }, { "epoch": 23.691275167785236, "grad_norm": 0.20832641422748566, "learning_rate": 6.725464915115997e-07, "loss": 0.0204, "step": 24710 }, { "epoch": 23.700862895493767, "grad_norm": 0.1475028246641159, "learning_rate": 6.621908135180655e-07, "loss": 0.022, "step": 24720 }, { "epoch": 23.710450623202302, "grad_norm": 0.18368731439113617, "learning_rate": 6.519149507309807e-07, "loss": 0.0222, "step": 24730 }, { "epoch": 23.720038350910833, "grad_norm": 0.2757015824317932, "learning_rate": 6.417189197739093e-07, "loss": 0.0198, "step": 24740 }, { "epoch": 23.72962607861937, "grad_norm": 0.25313273072242737, "learning_rate": 6.316027371412625e-07, "loss": 0.0287, "step": 24750 }, { "epoch": 23.7392138063279, "grad_norm": 0.30066144466400146, "learning_rate": 6.215664191982884e-07, "loss": 0.0214, "step": 24760 }, { "epoch": 23.748801534036435, "grad_norm": 0.4100160002708435, "learning_rate": 6.116099821810272e-07, "loss": 0.0223, "step": 24770 }, { "epoch": 23.758389261744966, "grad_norm": 0.231341153383255, "learning_rate": 6.017334421963006e-07, "loss": 0.0241, "step": 24780 }, { "epoch": 23.7679769894535, "grad_norm": 0.19291090965270996, "learning_rate": 5.919368152216664e-07, "loss": 0.0233, "step": 24790 }, { "epoch": 23.777564717162033, "grad_norm": 0.20064283907413483, "learning_rate": 5.822201171054197e-07, "loss": 0.0186, "step": 24800 }, { "epoch": 23.787152444870564, "grad_norm": 0.22854574024677277, "learning_rate": 5.725833635665423e-07, "loss": 0.0148, "step": 24810 }, { "epoch": 23.7967401725791, "grad_norm": 0.2627497613430023, "learning_rate": 5.630265701946912e-07, "loss": 0.0229, "step": 24820 }, { "epoch": 23.80632790028763, "grad_norm": 0.19262003898620605, "learning_rate": 5.535497524501665e-07, "loss": 0.0195, "step": 24830 }, { "epoch": 23.815915627996166, "grad_norm": 0.2796723246574402, "learning_rate": 5.441529256638933e-07, "loss": 0.0249, "step": 24840 }, { "epoch": 23.825503355704697, "grad_norm": 0.2186604142189026, "learning_rate": 5.348361050373896e-07, "loss": 0.0213, "step": 24850 }, { "epoch": 23.835091083413232, "grad_norm": 0.14878273010253906, "learning_rate": 5.255993056427433e-07, "loss": 0.0204, "step": 24860 }, { "epoch": 23.844678811121764, "grad_norm": 0.2015840858221054, "learning_rate": 5.164425424226016e-07, "loss": 0.0183, "step": 24870 }, { "epoch": 23.8542665388303, "grad_norm": 0.3008297383785248, "learning_rate": 5.073658301901207e-07, "loss": 0.0228, "step": 24880 }, { "epoch": 23.86385426653883, "grad_norm": 0.1776721477508545, "learning_rate": 4.983691836289606e-07, "loss": 0.025, "step": 24890 }, { "epoch": 23.873441994247365, "grad_norm": 0.27587175369262695, "learning_rate": 4.894526172932623e-07, "loss": 0.021, "step": 24900 }, { "epoch": 23.883029721955896, "grad_norm": 0.13460497558116913, "learning_rate": 4.806161456076097e-07, "loss": 0.0215, "step": 24910 }, { "epoch": 23.892617449664428, "grad_norm": 0.23786711692810059, "learning_rate": 4.718597828670235e-07, "loss": 0.0228, "step": 24920 }, { "epoch": 23.902205177372963, "grad_norm": 0.18216513097286224, "learning_rate": 4.6318354323692246e-07, "loss": 0.0194, "step": 24930 }, { "epoch": 23.911792905081494, "grad_norm": 0.10642199218273163, "learning_rate": 4.5458744075311253e-07, "loss": 0.0225, "step": 24940 }, { "epoch": 23.92138063279003, "grad_norm": 0.2601510286331177, "learning_rate": 4.460714893217588e-07, "loss": 0.0237, "step": 24950 }, { "epoch": 23.93096836049856, "grad_norm": 0.2652058005332947, "learning_rate": 4.376357027193634e-07, "loss": 0.0209, "step": 24960 }, { "epoch": 23.940556088207096, "grad_norm": 0.1884078085422516, "learning_rate": 4.292800945927378e-07, "loss": 0.0241, "step": 24970 }, { "epoch": 23.950143815915627, "grad_norm": 0.3231159448623657, "learning_rate": 4.210046784590027e-07, "loss": 0.0274, "step": 24980 }, { "epoch": 23.959731543624162, "grad_norm": 0.2526688575744629, "learning_rate": 4.128094677055272e-07, "loss": 0.0229, "step": 24990 }, { "epoch": 23.969319271332694, "grad_norm": 0.13271057605743408, "learning_rate": 4.0469447558995065e-07, "loss": 0.021, "step": 25000 }, { "epoch": 23.97890699904123, "grad_norm": 0.24924740195274353, "learning_rate": 3.9665971524012747e-07, "loss": 0.0283, "step": 25010 }, { "epoch": 23.98849472674976, "grad_norm": 0.29960912466049194, "learning_rate": 3.8870519965412135e-07, "loss": 0.0239, "step": 25020 }, { "epoch": 23.99808245445829, "grad_norm": 0.17576931416988373, "learning_rate": 3.8083094170018875e-07, "loss": 0.0207, "step": 25030 }, { "epoch": 24.007670182166827, "grad_norm": 0.21935302019119263, "learning_rate": 3.7303695411674e-07, "loss": 0.0222, "step": 25040 }, { "epoch": 24.017257909875358, "grad_norm": 0.2815200686454773, "learning_rate": 3.6532324951233934e-07, "loss": 0.0238, "step": 25050 }, { "epoch": 24.026845637583893, "grad_norm": 0.245747908949852, "learning_rate": 3.576898403656659e-07, "loss": 0.018, "step": 25060 }, { "epoch": 24.036433365292424, "grad_norm": 0.20894894003868103, "learning_rate": 3.501367390255139e-07, "loss": 0.0214, "step": 25070 }, { "epoch": 24.04602109300096, "grad_norm": 0.16371525824069977, "learning_rate": 3.426639577107427e-07, "loss": 0.019, "step": 25080 }, { "epoch": 24.05560882070949, "grad_norm": 0.15699537098407745, "learning_rate": 3.352715085103042e-07, "loss": 0.0221, "step": 25090 }, { "epoch": 24.065196548418026, "grad_norm": 0.29383793473243713, "learning_rate": 3.279594033831601e-07, "loss": 0.0197, "step": 25100 }, { "epoch": 24.074784276126557, "grad_norm": 0.25920745730400085, "learning_rate": 3.2072765415833153e-07, "loss": 0.0217, "step": 25110 }, { "epoch": 24.084372003835092, "grad_norm": 0.1622145175933838, "learning_rate": 3.1357627253482127e-07, "loss": 0.0242, "step": 25120 }, { "epoch": 24.093959731543624, "grad_norm": 0.26235565543174744, "learning_rate": 3.0650527008162513e-07, "loss": 0.0224, "step": 25130 }, { "epoch": 24.10354745925216, "grad_norm": 0.24994677305221558, "learning_rate": 2.9951465823771505e-07, "loss": 0.0243, "step": 25140 }, { "epoch": 24.11313518696069, "grad_norm": 0.35253608226776123, "learning_rate": 2.926044483120005e-07, "loss": 0.0271, "step": 25150 }, { "epoch": 24.12272291466922, "grad_norm": 0.12745951116085052, "learning_rate": 2.857746514833337e-07, "loss": 0.0182, "step": 25160 }, { "epoch": 24.132310642377757, "grad_norm": 0.16648143529891968, "learning_rate": 2.79025278800471e-07, "loss": 0.0211, "step": 25170 }, { "epoch": 24.141898370086288, "grad_norm": 0.15013748407363892, "learning_rate": 2.7235634118207286e-07, "loss": 0.0203, "step": 25180 }, { "epoch": 24.151486097794823, "grad_norm": 0.2093784213066101, "learning_rate": 2.6576784941667045e-07, "loss": 0.0232, "step": 25190 }, { "epoch": 24.161073825503355, "grad_norm": 0.23546428978443146, "learning_rate": 2.592598141626601e-07, "loss": 0.0243, "step": 25200 }, { "epoch": 24.17066155321189, "grad_norm": 0.20667380094528198, "learning_rate": 2.528322459482757e-07, "loss": 0.0225, "step": 25210 }, { "epoch": 24.18024928092042, "grad_norm": 0.26651787757873535, "learning_rate": 2.4648515517158297e-07, "loss": 0.0196, "step": 25220 }, { "epoch": 24.189837008628956, "grad_norm": 0.2723236083984375, "learning_rate": 2.402185521004574e-07, "loss": 0.0199, "step": 25230 }, { "epoch": 24.199424736337487, "grad_norm": 0.2087775468826294, "learning_rate": 2.3403244687256743e-07, "loss": 0.0251, "step": 25240 }, { "epoch": 24.209012464046022, "grad_norm": 0.1503581702709198, "learning_rate": 2.279268494953468e-07, "loss": 0.0172, "step": 25250 }, { "epoch": 24.218600191754554, "grad_norm": 0.1757836937904358, "learning_rate": 2.219017698460002e-07, "loss": 0.0227, "step": 25260 }, { "epoch": 24.22818791946309, "grad_norm": 0.22135482728481293, "learning_rate": 2.1595721767147526e-07, "loss": 0.0182, "step": 25270 }, { "epoch": 24.23777564717162, "grad_norm": 0.19286498427391052, "learning_rate": 2.1009320258845167e-07, "loss": 0.0265, "step": 25280 }, { "epoch": 24.247363374880152, "grad_norm": 0.12808747589588165, "learning_rate": 2.0430973408330778e-07, "loss": 0.0201, "step": 25290 }, { "epoch": 24.256951102588687, "grad_norm": 0.15946893393993378, "learning_rate": 1.9860682151212616e-07, "loss": 0.026, "step": 25300 }, { "epoch": 24.26653883029722, "grad_norm": 0.2374187558889389, "learning_rate": 1.929844741006881e-07, "loss": 0.0193, "step": 25310 }, { "epoch": 24.276126558005753, "grad_norm": 0.2157372087240219, "learning_rate": 1.8744270094441796e-07, "loss": 0.0266, "step": 25320 }, { "epoch": 24.285714285714285, "grad_norm": 0.27296164631843567, "learning_rate": 1.819815110084111e-07, "loss": 0.0211, "step": 25330 }, { "epoch": 24.29530201342282, "grad_norm": 0.16994787752628326, "learning_rate": 1.766009131273838e-07, "loss": 0.0188, "step": 25340 }, { "epoch": 24.30488974113135, "grad_norm": 0.2888137102127075, "learning_rate": 1.7130091600568443e-07, "loss": 0.0247, "step": 25350 }, { "epoch": 24.314477468839886, "grad_norm": 0.26905524730682373, "learning_rate": 1.660815282172823e-07, "loss": 0.026, "step": 25360 }, { "epoch": 24.324065196548418, "grad_norm": 0.28536051511764526, "learning_rate": 1.609427582057288e-07, "loss": 0.0221, "step": 25370 }, { "epoch": 24.333652924256953, "grad_norm": 0.26181870698928833, "learning_rate": 1.5588461428415745e-07, "loss": 0.0248, "step": 25380 }, { "epoch": 24.343240651965484, "grad_norm": 0.20964038372039795, "learning_rate": 1.5090710463527836e-07, "loss": 0.0222, "step": 25390 }, { "epoch": 24.352828379674015, "grad_norm": 0.22509586811065674, "learning_rate": 1.4601023731135034e-07, "loss": 0.0196, "step": 25400 }, { "epoch": 24.36241610738255, "grad_norm": 0.13734106719493866, "learning_rate": 1.4119402023418106e-07, "loss": 0.0249, "step": 25410 }, { "epoch": 24.372003835091082, "grad_norm": 0.2952769100666046, "learning_rate": 1.3645846119510474e-07, "loss": 0.0204, "step": 25420 }, { "epoch": 24.381591562799617, "grad_norm": 0.33259129524230957, "learning_rate": 1.3180356785496562e-07, "loss": 0.0267, "step": 25430 }, { "epoch": 24.39117929050815, "grad_norm": 0.1688985675573349, "learning_rate": 1.2722934774412887e-07, "loss": 0.0208, "step": 25440 }, { "epoch": 24.400767018216683, "grad_norm": 0.13669002056121826, "learning_rate": 1.2273580826244192e-07, "loss": 0.0238, "step": 25450 }, { "epoch": 24.410354745925215, "grad_norm": 0.14696350693702698, "learning_rate": 1.1832295667922876e-07, "loss": 0.0219, "step": 25460 }, { "epoch": 24.41994247363375, "grad_norm": 0.20755600929260254, "learning_rate": 1.139908001332901e-07, "loss": 0.0186, "step": 25470 }, { "epoch": 24.42953020134228, "grad_norm": 0.19683778285980225, "learning_rate": 1.0973934563288658e-07, "loss": 0.0211, "step": 25480 }, { "epoch": 24.439117929050816, "grad_norm": 0.2026386559009552, "learning_rate": 1.0556860005571101e-07, "loss": 0.0238, "step": 25490 }, { "epoch": 24.448705656759348, "grad_norm": 0.4480651617050171, "learning_rate": 1.0147857014890516e-07, "loss": 0.021, "step": 25500 }, { "epoch": 24.458293384467883, "grad_norm": 0.31666049361228943, "learning_rate": 9.746926252902633e-08, "loss": 0.032, "step": 25510 }, { "epoch": 24.467881112176414, "grad_norm": 0.5467284321784973, "learning_rate": 9.354068368204739e-08, "loss": 0.0209, "step": 25520 }, { "epoch": 24.477468839884946, "grad_norm": 0.15496346354484558, "learning_rate": 8.969283996335121e-08, "loss": 0.0224, "step": 25530 }, { "epoch": 24.48705656759348, "grad_norm": 0.210786372423172, "learning_rate": 8.59257375976974e-08, "loss": 0.025, "step": 25540 }, { "epoch": 24.496644295302012, "grad_norm": 0.12938974797725677, "learning_rate": 8.223938267924446e-08, "loss": 0.0176, "step": 25550 }, { "epoch": 24.506232023010547, "grad_norm": 0.22987248003482819, "learning_rate": 7.863378117151099e-08, "loss": 0.0231, "step": 25560 }, { "epoch": 24.51581975071908, "grad_norm": 0.3242381811141968, "learning_rate": 7.510893890738113e-08, "loss": 0.023, "step": 25570 }, { "epoch": 24.525407478427613, "grad_norm": 0.2817991375923157, "learning_rate": 7.166486158909913e-08, "loss": 0.0231, "step": 25580 }, { "epoch": 24.534995206136145, "grad_norm": 0.20501790940761566, "learning_rate": 6.830155478824707e-08, "loss": 0.0191, "step": 25590 }, { "epoch": 24.54458293384468, "grad_norm": 0.1096939668059349, "learning_rate": 6.501902394574488e-08, "loss": 0.0273, "step": 25600 }, { "epoch": 24.55417066155321, "grad_norm": 0.1630508154630661, "learning_rate": 6.181727437183372e-08, "loss": 0.0209, "step": 25610 }, { "epoch": 24.563758389261746, "grad_norm": 0.28238698840141296, "learning_rate": 5.8696311246081436e-08, "loss": 0.0251, "step": 25620 }, { "epoch": 24.573346116970278, "grad_norm": 0.11937420070171356, "learning_rate": 5.5656139617366045e-08, "loss": 0.0185, "step": 25630 }, { "epoch": 24.582933844678813, "grad_norm": 0.17204758524894714, "learning_rate": 5.2696764403847855e-08, "loss": 0.0229, "step": 25640 }, { "epoch": 24.592521572387344, "grad_norm": 0.17664316296577454, "learning_rate": 4.981819039300284e-08, "loss": 0.019, "step": 25650 }, { "epoch": 24.602109300095876, "grad_norm": 0.14691434800624847, "learning_rate": 4.702042224158931e-08, "loss": 0.0272, "step": 25660 }, { "epoch": 24.61169702780441, "grad_norm": 0.21293459832668304, "learning_rate": 4.430346447562572e-08, "loss": 0.0174, "step": 25670 }, { "epoch": 24.621284755512942, "grad_norm": 0.17576336860656738, "learning_rate": 4.166732149041841e-08, "loss": 0.0257, "step": 25680 }, { "epoch": 24.630872483221477, "grad_norm": 0.19463558495044708, "learning_rate": 3.911199755053385e-08, "loss": 0.0212, "step": 25690 }, { "epoch": 24.64046021093001, "grad_norm": 0.17403477430343628, "learning_rate": 3.663749678979311e-08, "loss": 0.0202, "step": 25700 }, { "epoch": 24.650047938638544, "grad_norm": 0.3777727782726288, "learning_rate": 3.424382321126629e-08, "loss": 0.024, "step": 25710 }, { "epoch": 24.659635666347075, "grad_norm": 0.14289294183254242, "learning_rate": 3.193098068727252e-08, "loss": 0.0244, "step": 25720 }, { "epoch": 24.66922339405561, "grad_norm": 0.17767243087291718, "learning_rate": 2.9698972959357753e-08, "loss": 0.0241, "step": 25730 }, { "epoch": 24.67881112176414, "grad_norm": 0.2469603717327118, "learning_rate": 2.7547803638311442e-08, "loss": 0.0244, "step": 25740 }, { "epoch": 24.688398849472676, "grad_norm": 0.1393066793680191, "learning_rate": 2.5477476204144314e-08, "loss": 0.0237, "step": 25750 }, { "epoch": 24.697986577181208, "grad_norm": 0.2745441794395447, "learning_rate": 2.3487994006077263e-08, "loss": 0.0192, "step": 25760 }, { "epoch": 24.70757430488974, "grad_norm": 0.19631850719451904, "learning_rate": 2.1579360262558025e-08, "loss": 0.0228, "step": 25770 }, { "epoch": 24.717162032598274, "grad_norm": 0.4640311300754547, "learning_rate": 1.9751578061244504e-08, "loss": 0.0216, "step": 25780 }, { "epoch": 24.726749760306806, "grad_norm": 0.262236088514328, "learning_rate": 1.8004650358982578e-08, "loss": 0.0243, "step": 25790 }, { "epoch": 24.73633748801534, "grad_norm": 0.1786222904920578, "learning_rate": 1.6338579981833856e-08, "loss": 0.0165, "step": 25800 }, { "epoch": 24.745925215723872, "grad_norm": 0.2555926442146301, "learning_rate": 1.475336962504792e-08, "loss": 0.0201, "step": 25810 }, { "epoch": 24.755512943432407, "grad_norm": 0.16927938163280487, "learning_rate": 1.3249021853062315e-08, "loss": 0.0225, "step": 25820 }, { "epoch": 24.76510067114094, "grad_norm": 0.17081952095031738, "learning_rate": 1.182553909950812e-08, "loss": 0.0206, "step": 25830 }, { "epoch": 24.774688398849474, "grad_norm": 0.2548944056034088, "learning_rate": 1.048292366719883e-08, "loss": 0.0238, "step": 25840 }, { "epoch": 24.784276126558005, "grad_norm": 0.14747904241085052, "learning_rate": 9.221177728108154e-09, "loss": 0.0218, "step": 25850 }, { "epoch": 24.79386385426654, "grad_norm": 0.2064131796360016, "learning_rate": 8.040303323414433e-09, "loss": 0.0275, "step": 25860 }, { "epoch": 24.80345158197507, "grad_norm": 0.1762009561061859, "learning_rate": 6.940302363445117e-09, "loss": 0.0183, "step": 25870 }, { "epoch": 24.813039309683607, "grad_norm": 0.36469346284866333, "learning_rate": 5.9211766277045276e-09, "loss": 0.0226, "step": 25880 }, { "epoch": 24.822627037392138, "grad_norm": 0.23766785860061646, "learning_rate": 4.982927764862755e-09, "loss": 0.0225, "step": 25890 }, { "epoch": 24.83221476510067, "grad_norm": 0.277342826128006, "learning_rate": 4.125557292750104e-09, "loss": 0.0245, "step": 25900 }, { "epoch": 24.841802492809204, "grad_norm": 0.2073160707950592, "learning_rate": 3.349066598362649e-09, "loss": 0.0213, "step": 25910 }, { "epoch": 24.851390220517736, "grad_norm": 0.24508048593997955, "learning_rate": 2.6534569378455776e-09, "loss": 0.0217, "step": 25920 }, { "epoch": 24.86097794822627, "grad_norm": 0.11171819269657135, "learning_rate": 2.0387294365209475e-09, "loss": 0.0213, "step": 25930 }, { "epoch": 24.870565675934802, "grad_norm": 0.24452242255210876, "learning_rate": 1.5048850888377265e-09, "loss": 0.0172, "step": 25940 }, { "epoch": 24.880153403643337, "grad_norm": 0.2535116374492645, "learning_rate": 1.0519247584106495e-09, "loss": 0.0252, "step": 25950 }, { "epoch": 24.88974113135187, "grad_norm": 0.2804677486419678, "learning_rate": 6.798491780202199e-10, "loss": 0.0258, "step": 25960 }, { "epoch": 24.899328859060404, "grad_norm": 0.21948733925819397, "learning_rate": 3.8865894956829905e-10, "loss": 0.0209, "step": 25970 }, { "epoch": 24.908916586768935, "grad_norm": 0.21812966465950012, "learning_rate": 1.7835454413361875e-10, "loss": 0.025, "step": 25980 }, { "epoch": 24.91850431447747, "grad_norm": 0.14540976285934448, "learning_rate": 4.893630192737142e-11, "loss": 0.0231, "step": 25990 }, { "epoch": 24.928092042186, "grad_norm": 0.1897832155227661, "learning_rate": 4.0443231541509307e-13, "loss": 0.025, "step": 26000 }, { "epoch": 24.928092042186, "step": 26000, "total_flos": 0.0, "train_loss": 0.037590215687568374, "train_runtime": 11164.0416, "train_samples_per_second": 74.525, "train_steps_per_second": 2.329 } ], "logging_steps": 10, "max_steps": 26000, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }