{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 515, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009708737864077669, "grad_norm": 6.002007543902212, "learning_rate": 1.5384615384615387e-06, "loss": 1.0598, "step": 1 }, { "epoch": 0.019417475728155338, "grad_norm": 5.991740133700607, "learning_rate": 3.0769230769230774e-06, "loss": 1.0579, "step": 2 }, { "epoch": 0.02912621359223301, "grad_norm": 5.825979646199088, "learning_rate": 4.615384615384616e-06, "loss": 1.0544, "step": 3 }, { "epoch": 0.038834951456310676, "grad_norm": 4.364941447544986, "learning_rate": 6.153846153846155e-06, "loss": 1.0106, "step": 4 }, { "epoch": 0.04854368932038835, "grad_norm": 2.7678687055700784, "learning_rate": 7.692307692307694e-06, "loss": 0.9739, "step": 5 }, { "epoch": 0.05825242718446602, "grad_norm": 2.5422026912598827, "learning_rate": 9.230769230769232e-06, "loss": 0.948, "step": 6 }, { "epoch": 0.06796116504854369, "grad_norm": 3.961951944224408, "learning_rate": 1.076923076923077e-05, "loss": 0.9572, "step": 7 }, { "epoch": 0.07766990291262135, "grad_norm": 3.5883110693606577, "learning_rate": 1.230769230769231e-05, "loss": 0.9112, "step": 8 }, { "epoch": 0.08737864077669903, "grad_norm": 3.860767785745949, "learning_rate": 1.3846153846153847e-05, "loss": 0.9007, "step": 9 }, { "epoch": 0.0970873786407767, "grad_norm": 2.7156118144668633, "learning_rate": 1.5384615384615387e-05, "loss": 0.8734, "step": 10 }, { "epoch": 0.10679611650485436, "grad_norm": 1.9289111077886563, "learning_rate": 1.6923076923076924e-05, "loss": 0.8391, "step": 11 }, { "epoch": 0.11650485436893204, "grad_norm": 1.8123247995331513, "learning_rate": 1.8461538461538465e-05, "loss": 0.8171, "step": 12 }, { "epoch": 0.1262135922330097, "grad_norm": 1.428428989967321, "learning_rate": 2e-05, "loss": 0.7956, "step": 13 }, { "epoch": 0.13592233009708737, "grad_norm": 1.3680143343907647, "learning_rate": 2.153846153846154e-05, "loss": 0.7813, "step": 14 }, { "epoch": 0.14563106796116504, "grad_norm": 1.3879441764342295, "learning_rate": 2.3076923076923076e-05, "loss": 0.7681, "step": 15 }, { "epoch": 0.1553398058252427, "grad_norm": 1.0796522703025953, "learning_rate": 2.461538461538462e-05, "loss": 0.7574, "step": 16 }, { "epoch": 0.1650485436893204, "grad_norm": 1.371942931930126, "learning_rate": 2.6153846153846157e-05, "loss": 0.748, "step": 17 }, { "epoch": 0.17475728155339806, "grad_norm": 1.410433870930946, "learning_rate": 2.7692307692307694e-05, "loss": 0.7432, "step": 18 }, { "epoch": 0.18446601941747573, "grad_norm": 1.0096190041268163, "learning_rate": 2.923076923076923e-05, "loss": 0.7329, "step": 19 }, { "epoch": 0.1941747572815534, "grad_norm": 1.7265065580167445, "learning_rate": 3.0769230769230774e-05, "loss": 0.7295, "step": 20 }, { "epoch": 0.20388349514563106, "grad_norm": 1.1032179992161404, "learning_rate": 3.230769230769231e-05, "loss": 0.7245, "step": 21 }, { "epoch": 0.21359223300970873, "grad_norm": 1.0901184558146035, "learning_rate": 3.384615384615385e-05, "loss": 0.7183, "step": 22 }, { "epoch": 0.22330097087378642, "grad_norm": 1.7732487418217808, "learning_rate": 3.538461538461539e-05, "loss": 0.7136, "step": 23 }, { "epoch": 0.23300970873786409, "grad_norm": 1.430366935608713, "learning_rate": 3.692307692307693e-05, "loss": 0.7105, "step": 24 }, { "epoch": 0.24271844660194175, "grad_norm": 1.682581513255261, "learning_rate": 3.846153846153846e-05, "loss": 0.6988, "step": 25 }, { "epoch": 0.2524271844660194, "grad_norm": 1.706823703289166, "learning_rate": 4e-05, "loss": 0.6991, "step": 26 }, { "epoch": 0.2621359223300971, "grad_norm": 2.014798562695819, "learning_rate": 4.1538461538461544e-05, "loss": 0.6955, "step": 27 }, { "epoch": 0.27184466019417475, "grad_norm": 1.4584117807771175, "learning_rate": 4.307692307692308e-05, "loss": 0.6901, "step": 28 }, { "epoch": 0.2815533980582524, "grad_norm": 2.5240991865834883, "learning_rate": 4.461538461538462e-05, "loss": 0.6901, "step": 29 }, { "epoch": 0.2912621359223301, "grad_norm": 1.7349683843216974, "learning_rate": 4.615384615384615e-05, "loss": 0.6912, "step": 30 }, { "epoch": 0.30097087378640774, "grad_norm": 2.453207741393019, "learning_rate": 4.76923076923077e-05, "loss": 0.6837, "step": 31 }, { "epoch": 0.3106796116504854, "grad_norm": 2.1891597642473517, "learning_rate": 4.923076923076924e-05, "loss": 0.6834, "step": 32 }, { "epoch": 0.32038834951456313, "grad_norm": 2.0789856106085867, "learning_rate": 5.076923076923077e-05, "loss": 0.6854, "step": 33 }, { "epoch": 0.3300970873786408, "grad_norm": 1.7347914028228881, "learning_rate": 5.230769230769231e-05, "loss": 0.6734, "step": 34 }, { "epoch": 0.33980582524271846, "grad_norm": 2.1284302056196744, "learning_rate": 5.3846153846153853e-05, "loss": 0.6741, "step": 35 }, { "epoch": 0.34951456310679613, "grad_norm": 1.9299657090664841, "learning_rate": 5.538461538461539e-05, "loss": 0.6737, "step": 36 }, { "epoch": 0.3592233009708738, "grad_norm": 2.1216828159935135, "learning_rate": 5.692307692307693e-05, "loss": 0.6701, "step": 37 }, { "epoch": 0.36893203883495146, "grad_norm": 1.2362157926807915, "learning_rate": 5.846153846153846e-05, "loss": 0.6639, "step": 38 }, { "epoch": 0.3786407766990291, "grad_norm": 2.2344300721881525, "learning_rate": 6.000000000000001e-05, "loss": 0.666, "step": 39 }, { "epoch": 0.3883495145631068, "grad_norm": 1.544248418606736, "learning_rate": 6.153846153846155e-05, "loss": 0.6656, "step": 40 }, { "epoch": 0.39805825242718446, "grad_norm": 2.774120239479565, "learning_rate": 6.307692307692308e-05, "loss": 0.6683, "step": 41 }, { "epoch": 0.4077669902912621, "grad_norm": 2.1993216366589663, "learning_rate": 6.461538461538462e-05, "loss": 0.6649, "step": 42 }, { "epoch": 0.4174757281553398, "grad_norm": 1.8997195417294006, "learning_rate": 6.615384615384616e-05, "loss": 0.6501, "step": 43 }, { "epoch": 0.42718446601941745, "grad_norm": 1.9017422216012896, "learning_rate": 6.76923076923077e-05, "loss": 0.6556, "step": 44 }, { "epoch": 0.4368932038834951, "grad_norm": 2.2115485405167603, "learning_rate": 6.923076923076924e-05, "loss": 0.6582, "step": 45 }, { "epoch": 0.44660194174757284, "grad_norm": 2.2558473718778282, "learning_rate": 7.076923076923078e-05, "loss": 0.6599, "step": 46 }, { "epoch": 0.4563106796116505, "grad_norm": 1.8236441414000624, "learning_rate": 7.230769230769232e-05, "loss": 0.6502, "step": 47 }, { "epoch": 0.46601941747572817, "grad_norm": 1.668233835252892, "learning_rate": 7.384615384615386e-05, "loss": 0.6525, "step": 48 }, { "epoch": 0.47572815533980584, "grad_norm": 3.1753217004320975, "learning_rate": 7.538461538461539e-05, "loss": 0.6627, "step": 49 }, { "epoch": 0.4854368932038835, "grad_norm": 1.621218570570416, "learning_rate": 7.692307692307693e-05, "loss": 0.6484, "step": 50 }, { "epoch": 0.49514563106796117, "grad_norm": 4.339926226583821, "learning_rate": 7.846153846153847e-05, "loss": 0.6752, "step": 51 }, { "epoch": 0.5048543689320388, "grad_norm": 3.113087384499717, "learning_rate": 8e-05, "loss": 0.6722, "step": 52 }, { "epoch": 0.5145631067961165, "grad_norm": 2.714482442820161, "learning_rate": 7.999907919834168e-05, "loss": 0.6628, "step": 53 }, { "epoch": 0.5242718446601942, "grad_norm": 2.914244429607087, "learning_rate": 7.999631683576055e-05, "loss": 0.6581, "step": 54 }, { "epoch": 0.5339805825242718, "grad_norm": 1.884493343669805, "learning_rate": 7.999171303943594e-05, "loss": 0.6497, "step": 55 }, { "epoch": 0.5436893203883495, "grad_norm": 3.1324813256148873, "learning_rate": 7.998526802132707e-05, "loss": 0.6539, "step": 56 }, { "epoch": 0.5533980582524272, "grad_norm": 1.9123959562537083, "learning_rate": 7.997698207816309e-05, "loss": 0.6491, "step": 57 }, { "epoch": 0.5631067961165048, "grad_norm": 2.223063366245145, "learning_rate": 7.99668555914295e-05, "loss": 0.6515, "step": 58 }, { "epoch": 0.5728155339805825, "grad_norm": 2.268558088911201, "learning_rate": 7.995488902735063e-05, "loss": 0.6573, "step": 59 }, { "epoch": 0.5825242718446602, "grad_norm": 1.5722495721856204, "learning_rate": 7.994108293686804e-05, "loss": 0.6441, "step": 60 }, { "epoch": 0.5922330097087378, "grad_norm": 1.666894724940502, "learning_rate": 7.992543795561527e-05, "loss": 0.6408, "step": 61 }, { "epoch": 0.6019417475728155, "grad_norm": 1.0031568832099003, "learning_rate": 7.990795480388861e-05, "loss": 0.6342, "step": 62 }, { "epoch": 0.6116504854368932, "grad_norm": 2.2162360491704067, "learning_rate": 7.988863428661377e-05, "loss": 0.6589, "step": 63 }, { "epoch": 0.6213592233009708, "grad_norm": 1.7496212177283834, "learning_rate": 7.9867477293309e-05, "loss": 0.6484, "step": 64 }, { "epoch": 0.6310679611650486, "grad_norm": 1.5169651851287749, "learning_rate": 7.984448479804398e-05, "loss": 0.6394, "step": 65 }, { "epoch": 0.6407766990291263, "grad_norm": 1.9120046573933336, "learning_rate": 7.981965785939515e-05, "loss": 0.635, "step": 66 }, { "epoch": 0.6504854368932039, "grad_norm": 1.7557422251639143, "learning_rate": 7.97929976203968e-05, "loss": 0.647, "step": 67 }, { "epoch": 0.6601941747572816, "grad_norm": 1.3556049828157026, "learning_rate": 7.976450530848851e-05, "loss": 0.6374, "step": 68 }, { "epoch": 0.6699029126213593, "grad_norm": 2.244626845581171, "learning_rate": 7.973418223545874e-05, "loss": 0.6334, "step": 69 }, { "epoch": 0.6796116504854369, "grad_norm": 1.3591820052552963, "learning_rate": 7.970202979738426e-05, "loss": 0.6376, "step": 70 }, { "epoch": 0.6893203883495146, "grad_norm": 1.5805931826326196, "learning_rate": 7.966804947456599e-05, "loss": 0.632, "step": 71 }, { "epoch": 0.6990291262135923, "grad_norm": 1.334661947485613, "learning_rate": 7.96322428314608e-05, "loss": 0.6269, "step": 72 }, { "epoch": 0.7087378640776699, "grad_norm": 1.8903308561526113, "learning_rate": 7.959461151660952e-05, "loss": 0.6342, "step": 73 }, { "epoch": 0.7184466019417476, "grad_norm": 1.1038829068258371, "learning_rate": 7.955515726256101e-05, "loss": 0.6275, "step": 74 }, { "epoch": 0.7281553398058253, "grad_norm": 1.5525916361050445, "learning_rate": 7.951388188579237e-05, "loss": 0.6343, "step": 75 }, { "epoch": 0.7378640776699029, "grad_norm": 1.5390664333547033, "learning_rate": 7.94707872866254e-05, "loss": 0.6315, "step": 76 }, { "epoch": 0.7475728155339806, "grad_norm": 1.769589829850563, "learning_rate": 7.942587544913901e-05, "loss": 0.6329, "step": 77 }, { "epoch": 0.7572815533980582, "grad_norm": 1.4148162374660378, "learning_rate": 7.937914844107791e-05, "loss": 0.6297, "step": 78 }, { "epoch": 0.7669902912621359, "grad_norm": 1.7973463567105696, "learning_rate": 7.933060841375745e-05, "loss": 0.627, "step": 79 }, { "epoch": 0.7766990291262136, "grad_norm": 1.184326171503996, "learning_rate": 7.928025760196447e-05, "loss": 0.6234, "step": 80 }, { "epoch": 0.7864077669902912, "grad_norm": 1.6486867376699348, "learning_rate": 7.922809832385456e-05, "loss": 0.6224, "step": 81 }, { "epoch": 0.7961165048543689, "grad_norm": 1.2312763534767475, "learning_rate": 7.917413298084519e-05, "loss": 0.6207, "step": 82 }, { "epoch": 0.8058252427184466, "grad_norm": 1.319948768871293, "learning_rate": 7.911836405750525e-05, "loss": 0.618, "step": 83 }, { "epoch": 0.8155339805825242, "grad_norm": 1.2459374631380746, "learning_rate": 7.906079412144055e-05, "loss": 0.6215, "step": 84 }, { "epoch": 0.8252427184466019, "grad_norm": 1.2798874980653692, "learning_rate": 7.900142582317576e-05, "loss": 0.6172, "step": 85 }, { "epoch": 0.8349514563106796, "grad_norm": 2.093389549775017, "learning_rate": 7.894026189603225e-05, "loss": 0.618, "step": 86 }, { "epoch": 0.8446601941747572, "grad_norm": 1.1300093559740532, "learning_rate": 7.887730515600227e-05, "loss": 0.6173, "step": 87 }, { "epoch": 0.8543689320388349, "grad_norm": 2.3541947389099094, "learning_rate": 7.881255850161939e-05, "loss": 0.6243, "step": 88 }, { "epoch": 0.8640776699029126, "grad_norm": 1.5613220102186438, "learning_rate": 7.87460249138249e-05, "loss": 0.623, "step": 89 }, { "epoch": 0.8737864077669902, "grad_norm": 1.9104766159006328, "learning_rate": 7.867770745583074e-05, "loss": 0.6241, "step": 90 }, { "epoch": 0.883495145631068, "grad_norm": 1.419392875826999, "learning_rate": 7.860760927297833e-05, "loss": 0.62, "step": 91 }, { "epoch": 0.8932038834951457, "grad_norm": 1.3282640676407322, "learning_rate": 7.853573359259381e-05, "loss": 0.6166, "step": 92 }, { "epoch": 0.9029126213592233, "grad_norm": 1.3250464130522686, "learning_rate": 7.846208372383947e-05, "loss": 0.6179, "step": 93 }, { "epoch": 0.912621359223301, "grad_norm": 1.1377495249178353, "learning_rate": 7.838666305756138e-05, "loss": 0.6122, "step": 94 }, { "epoch": 0.9223300970873787, "grad_norm": 1.4646244528960923, "learning_rate": 7.830947506613324e-05, "loss": 0.6105, "step": 95 }, { "epoch": 0.9320388349514563, "grad_norm": 1.142509028160319, "learning_rate": 7.823052330329663e-05, "loss": 0.611, "step": 96 }, { "epoch": 0.941747572815534, "grad_norm": 2.0274152032009494, "learning_rate": 7.81498114039972e-05, "loss": 0.616, "step": 97 }, { "epoch": 0.9514563106796117, "grad_norm": 1.0686288368525192, "learning_rate": 7.806734308421753e-05, "loss": 0.6101, "step": 98 }, { "epoch": 0.9611650485436893, "grad_norm": 2.0849531670929626, "learning_rate": 7.798312214080588e-05, "loss": 0.6128, "step": 99 }, { "epoch": 0.970873786407767, "grad_norm": 1.723568607059033, "learning_rate": 7.789715245130148e-05, "loss": 0.6156, "step": 100 }, { "epoch": 0.9805825242718447, "grad_norm": 1.0991520002736015, "learning_rate": 7.780943797375594e-05, "loss": 0.6028, "step": 101 }, { "epoch": 0.9902912621359223, "grad_norm": 1.81475046862089, "learning_rate": 7.77199827465511e-05, "loss": 0.6176, "step": 102 }, { "epoch": 1.0, "grad_norm": 1.0995121126222103, "learning_rate": 7.762879088821302e-05, "loss": 0.612, "step": 103 }, { "epoch": 1.0097087378640777, "grad_norm": 1.1739346884276352, "learning_rate": 7.753586659722243e-05, "loss": 0.5928, "step": 104 }, { "epoch": 1.0194174757281553, "grad_norm": 1.475480407854494, "learning_rate": 7.74412141518214e-05, "loss": 0.5989, "step": 105 }, { "epoch": 1.029126213592233, "grad_norm": 1.3719226999649305, "learning_rate": 7.734483790981636e-05, "loss": 0.5969, "step": 106 }, { "epoch": 1.0388349514563107, "grad_norm": 1.0726737236370019, "learning_rate": 7.724674230837747e-05, "loss": 0.5825, "step": 107 }, { "epoch": 1.0485436893203883, "grad_norm": 1.3362590355075374, "learning_rate": 7.714693186383437e-05, "loss": 0.594, "step": 108 }, { "epoch": 1.058252427184466, "grad_norm": 0.820555469002108, "learning_rate": 7.704541117146819e-05, "loss": 0.5874, "step": 109 }, { "epoch": 1.0679611650485437, "grad_norm": 1.3323408029024517, "learning_rate": 7.694218490530004e-05, "loss": 0.5886, "step": 110 }, { "epoch": 1.0776699029126213, "grad_norm": 1.066797450314409, "learning_rate": 7.683725781787574e-05, "loss": 0.5876, "step": 111 }, { "epoch": 1.087378640776699, "grad_norm": 1.245072674676843, "learning_rate": 7.673063474004715e-05, "loss": 0.5879, "step": 112 }, { "epoch": 1.0970873786407767, "grad_norm": 1.2819802358790717, "learning_rate": 7.662232058074957e-05, "loss": 0.5864, "step": 113 }, { "epoch": 1.1067961165048543, "grad_norm": 1.436129834544921, "learning_rate": 7.651232032677588e-05, "loss": 0.5919, "step": 114 }, { "epoch": 1.116504854368932, "grad_norm": 0.8175594211587869, "learning_rate": 7.640063904254691e-05, "loss": 0.5817, "step": 115 }, { "epoch": 1.1262135922330097, "grad_norm": 1.687072132305568, "learning_rate": 7.628728186987824e-05, "loss": 0.5848, "step": 116 }, { "epoch": 1.1359223300970873, "grad_norm": 1.1326446741047016, "learning_rate": 7.617225402774348e-05, "loss": 0.5858, "step": 117 }, { "epoch": 1.145631067961165, "grad_norm": 0.82496802522063, "learning_rate": 7.605556081203405e-05, "loss": 0.5812, "step": 118 }, { "epoch": 1.1553398058252426, "grad_norm": 1.250641283128446, "learning_rate": 7.593720759531526e-05, "loss": 0.5876, "step": 119 }, { "epoch": 1.1650485436893203, "grad_norm": 1.1921582278913663, "learning_rate": 7.581719982657903e-05, "loss": 0.5842, "step": 120 }, { "epoch": 1.174757281553398, "grad_norm": 1.2734225443412055, "learning_rate": 7.569554303099296e-05, "loss": 0.579, "step": 121 }, { "epoch": 1.1844660194174756, "grad_norm": 1.1853318091968652, "learning_rate": 7.557224280964603e-05, "loss": 0.582, "step": 122 }, { "epoch": 1.1941747572815533, "grad_norm": 1.152376249051554, "learning_rate": 7.544730483929065e-05, "loss": 0.5836, "step": 123 }, { "epoch": 1.203883495145631, "grad_norm": 1.3041878863702414, "learning_rate": 7.532073487208132e-05, "loss": 0.5761, "step": 124 }, { "epoch": 1.2135922330097086, "grad_norm": 1.3112880078778733, "learning_rate": 7.519253873530986e-05, "loss": 0.5836, "step": 125 }, { "epoch": 1.2233009708737863, "grad_norm": 1.3474986040772934, "learning_rate": 7.5062722331137e-05, "loss": 0.5801, "step": 126 }, { "epoch": 1.233009708737864, "grad_norm": 0.8129513286807843, "learning_rate": 7.493129163632076e-05, "loss": 0.5748, "step": 127 }, { "epoch": 1.2427184466019416, "grad_norm": 1.1010576487307613, "learning_rate": 7.479825270194124e-05, "loss": 0.5842, "step": 128 }, { "epoch": 1.2524271844660193, "grad_norm": 1.398672981702769, "learning_rate": 7.466361165312199e-05, "loss": 0.5877, "step": 129 }, { "epoch": 1.262135922330097, "grad_norm": 1.2164255288325172, "learning_rate": 7.452737468874809e-05, "loss": 0.5826, "step": 130 }, { "epoch": 1.2718446601941746, "grad_norm": 0.7866460429939296, "learning_rate": 7.438954808118064e-05, "loss": 0.5778, "step": 131 }, { "epoch": 1.2815533980582523, "grad_norm": 1.2501923242149588, "learning_rate": 7.425013817596812e-05, "loss": 0.5797, "step": 132 }, { "epoch": 1.29126213592233, "grad_norm": 1.3225455051065684, "learning_rate": 7.41091513915541e-05, "loss": 0.578, "step": 133 }, { "epoch": 1.3009708737864076, "grad_norm": 1.0105911545481185, "learning_rate": 7.396659421898183e-05, "loss": 0.5754, "step": 134 }, { "epoch": 1.3106796116504853, "grad_norm": 1.2271731398121772, "learning_rate": 7.382247322159534e-05, "loss": 0.5808, "step": 135 }, { "epoch": 1.3203883495145632, "grad_norm": 1.1215866709600937, "learning_rate": 7.367679503473732e-05, "loss": 0.5738, "step": 136 }, { "epoch": 1.3300970873786409, "grad_norm": 1.0715712124069652, "learning_rate": 7.352956636544358e-05, "loss": 0.5754, "step": 137 }, { "epoch": 1.3398058252427185, "grad_norm": 0.9077358541837017, "learning_rate": 7.338079399213424e-05, "loss": 0.5711, "step": 138 }, { "epoch": 1.3495145631067962, "grad_norm": 1.064355488383383, "learning_rate": 7.32304847643017e-05, "loss": 0.5698, "step": 139 }, { "epoch": 1.3592233009708738, "grad_norm": 0.9384952345701946, "learning_rate": 7.30786456021953e-05, "loss": 0.5739, "step": 140 }, { "epoch": 1.3689320388349515, "grad_norm": 1.1821144606047003, "learning_rate": 7.292528349650262e-05, "loss": 0.5701, "step": 141 }, { "epoch": 1.3786407766990292, "grad_norm": 0.97575327357092, "learning_rate": 7.277040550802776e-05, "loss": 0.573, "step": 142 }, { "epoch": 1.3883495145631068, "grad_norm": 1.1580935526594842, "learning_rate": 7.261401876736611e-05, "loss": 0.5751, "step": 143 }, { "epoch": 1.3980582524271845, "grad_norm": 0.9292779139246362, "learning_rate": 7.245613047457621e-05, "loss": 0.5718, "step": 144 }, { "epoch": 1.4077669902912622, "grad_norm": 1.2215840715546447, "learning_rate": 7.229674789884813e-05, "loss": 0.5749, "step": 145 }, { "epoch": 1.4174757281553398, "grad_norm": 0.9138289618139581, "learning_rate": 7.213587837816889e-05, "loss": 0.5696, "step": 146 }, { "epoch": 1.4271844660194175, "grad_norm": 1.0533170144837214, "learning_rate": 7.197352931898454e-05, "loss": 0.5726, "step": 147 }, { "epoch": 1.4368932038834952, "grad_norm": 1.1454322229077762, "learning_rate": 7.180970819585923e-05, "loss": 0.5712, "step": 148 }, { "epoch": 1.4466019417475728, "grad_norm": 1.0300828181718542, "learning_rate": 7.164442255113107e-05, "loss": 0.5717, "step": 149 }, { "epoch": 1.4563106796116505, "grad_norm": 1.077162592235673, "learning_rate": 7.147767999456484e-05, "loss": 0.5785, "step": 150 }, { "epoch": 1.4660194174757282, "grad_norm": 1.2410959167210567, "learning_rate": 7.130948820300166e-05, "loss": 0.5771, "step": 151 }, { "epoch": 1.4757281553398058, "grad_norm": 1.2329275408989433, "learning_rate": 7.113985492000558e-05, "loss": 0.5683, "step": 152 }, { "epoch": 1.4854368932038835, "grad_norm": 1.0776714897294795, "learning_rate": 7.0968787955507e-05, "loss": 0.5716, "step": 153 }, { "epoch": 1.4951456310679612, "grad_norm": 0.8984266405483924, "learning_rate": 7.079629518544312e-05, "loss": 0.5726, "step": 154 }, { "epoch": 1.5048543689320388, "grad_norm": 1.1047143317546186, "learning_rate": 7.062238455139544e-05, "loss": 0.5754, "step": 155 }, { "epoch": 1.5145631067961165, "grad_norm": 1.3200995574966732, "learning_rate": 7.044706406022393e-05, "loss": 0.565, "step": 156 }, { "epoch": 1.5242718446601942, "grad_norm": 0.795738070159204, "learning_rate": 7.027034178369853e-05, "loss": 0.57, "step": 157 }, { "epoch": 1.5339805825242718, "grad_norm": 0.9946136430312941, "learning_rate": 7.009222585812755e-05, "loss": 0.5733, "step": 158 }, { "epoch": 1.5436893203883495, "grad_norm": 1.376070028063213, "learning_rate": 6.991272448398291e-05, "loss": 0.5756, "step": 159 }, { "epoch": 1.5533980582524272, "grad_norm": 0.7388882390316432, "learning_rate": 6.973184592552283e-05, "loss": 0.5654, "step": 160 }, { "epoch": 1.5631067961165048, "grad_norm": 1.1905180684222885, "learning_rate": 6.95495985104111e-05, "loss": 0.5711, "step": 161 }, { "epoch": 1.5728155339805825, "grad_norm": 0.8020799108353651, "learning_rate": 6.93659906293338e-05, "loss": 0.5623, "step": 162 }, { "epoch": 1.5825242718446602, "grad_norm": 1.0231881194650578, "learning_rate": 6.918103073561304e-05, "loss": 0.57, "step": 163 }, { "epoch": 1.5922330097087378, "grad_norm": 0.7935997277801292, "learning_rate": 6.899472734481765e-05, "loss": 0.5697, "step": 164 }, { "epoch": 1.6019417475728155, "grad_norm": 1.0863918666939771, "learning_rate": 6.880708903437116e-05, "loss": 0.5649, "step": 165 }, { "epoch": 1.6116504854368932, "grad_norm": 0.7946977064700003, "learning_rate": 6.8618124443157e-05, "loss": 0.5629, "step": 166 }, { "epoch": 1.6213592233009708, "grad_norm": 0.7600587207085567, "learning_rate": 6.842784227112057e-05, "loss": 0.5652, "step": 167 }, { "epoch": 1.6310679611650487, "grad_norm": 0.9222151118870435, "learning_rate": 6.823625127886888e-05, "loss": 0.5602, "step": 168 }, { "epoch": 1.6407766990291264, "grad_norm": 0.8500423865673382, "learning_rate": 6.804336028726706e-05, "loss": 0.5638, "step": 169 }, { "epoch": 1.650485436893204, "grad_norm": 1.009548584428779, "learning_rate": 6.78491781770324e-05, "loss": 0.5609, "step": 170 }, { "epoch": 1.6601941747572817, "grad_norm": 0.8596245784399692, "learning_rate": 6.765371388832531e-05, "loss": 0.5606, "step": 171 }, { "epoch": 1.6699029126213594, "grad_norm": 1.0991682854870917, "learning_rate": 6.745697642033791e-05, "loss": 0.5645, "step": 172 }, { "epoch": 1.679611650485437, "grad_norm": 0.9305228321364903, "learning_rate": 6.725897483087948e-05, "loss": 0.5572, "step": 173 }, { "epoch": 1.6893203883495147, "grad_norm": 0.9614358976051515, "learning_rate": 6.705971823595964e-05, "loss": 0.5616, "step": 174 }, { "epoch": 1.6990291262135924, "grad_norm": 1.0715923701630468, "learning_rate": 6.685921580936855e-05, "loss": 0.5629, "step": 175 }, { "epoch": 1.70873786407767, "grad_norm": 1.1630310321701869, "learning_rate": 6.665747678225454e-05, "loss": 0.565, "step": 176 }, { "epoch": 1.7184466019417477, "grad_norm": 0.8590469522925508, "learning_rate": 6.645451044269916e-05, "loss": 0.5585, "step": 177 }, { "epoch": 1.7281553398058254, "grad_norm": 0.7340387980261763, "learning_rate": 6.62503261352895e-05, "loss": 0.5613, "step": 178 }, { "epoch": 1.737864077669903, "grad_norm": 0.710845727118427, "learning_rate": 6.6044933260688e-05, "loss": 0.5562, "step": 179 }, { "epoch": 1.7475728155339807, "grad_norm": 0.8027866048243354, "learning_rate": 6.583834127519966e-05, "loss": 0.5616, "step": 180 }, { "epoch": 1.7572815533980584, "grad_norm": 1.0941839867674634, "learning_rate": 6.563055969033659e-05, "loss": 0.5591, "step": 181 }, { "epoch": 1.766990291262136, "grad_norm": 1.066378946036087, "learning_rate": 6.54215980723802e-05, "loss": 0.5612, "step": 182 }, { "epoch": 1.7766990291262137, "grad_norm": 0.7626588888095535, "learning_rate": 6.521146604194073e-05, "loss": 0.5528, "step": 183 }, { "epoch": 1.7864077669902914, "grad_norm": 0.8152238302302431, "learning_rate": 6.500017327351425e-05, "loss": 0.559, "step": 184 }, { "epoch": 1.796116504854369, "grad_norm": 1.0106535990003993, "learning_rate": 6.478772949503735e-05, "loss": 0.5586, "step": 185 }, { "epoch": 1.8058252427184467, "grad_norm": 1.3373530495717711, "learning_rate": 6.457414448743922e-05, "loss": 0.5602, "step": 186 }, { "epoch": 1.8155339805825244, "grad_norm": 0.6183996673247697, "learning_rate": 6.435942808419129e-05, "loss": 0.5563, "step": 187 }, { "epoch": 1.825242718446602, "grad_norm": 0.4513128772050125, "learning_rate": 6.41435901708546e-05, "loss": 0.5558, "step": 188 }, { "epoch": 1.8349514563106797, "grad_norm": 0.8068976523956712, "learning_rate": 6.392664068462455e-05, "loss": 0.5535, "step": 189 }, { "epoch": 1.8446601941747574, "grad_norm": 1.1435206608085113, "learning_rate": 6.370858961387348e-05, "loss": 0.5589, "step": 190 }, { "epoch": 1.854368932038835, "grad_norm": 0.9238122145618127, "learning_rate": 6.348944699769078e-05, "loss": 0.5521, "step": 191 }, { "epoch": 1.8640776699029127, "grad_norm": 0.7049348477803952, "learning_rate": 6.326922292542067e-05, "loss": 0.5533, "step": 192 }, { "epoch": 1.8737864077669903, "grad_norm": 0.4705716067710833, "learning_rate": 6.304792753619768e-05, "loss": 0.555, "step": 193 }, { "epoch": 1.883495145631068, "grad_norm": 0.5562958157730256, "learning_rate": 6.282557101847989e-05, "loss": 0.554, "step": 194 }, { "epoch": 1.8932038834951457, "grad_norm": 0.7776677254914264, "learning_rate": 6.260216360957982e-05, "loss": 0.5554, "step": 195 }, { "epoch": 1.9029126213592233, "grad_norm": 0.9626928256285646, "learning_rate": 6.237771559519309e-05, "loss": 0.557, "step": 196 }, { "epoch": 1.912621359223301, "grad_norm": 1.0125011253597305, "learning_rate": 6.215223730892488e-05, "loss": 0.5576, "step": 197 }, { "epoch": 1.9223300970873787, "grad_norm": 0.8791857222410507, "learning_rate": 6.192573913181423e-05, "loss": 0.5601, "step": 198 }, { "epoch": 1.9320388349514563, "grad_norm": 0.7995737202735174, "learning_rate": 6.169823149185594e-05, "loss": 0.558, "step": 199 }, { "epoch": 1.941747572815534, "grad_norm": 1.1194920590127924, "learning_rate": 6.146972486352062e-05, "loss": 0.5547, "step": 200 }, { "epoch": 1.9514563106796117, "grad_norm": 0.8287113779997599, "learning_rate": 6.124022976727246e-05, "loss": 0.5562, "step": 201 }, { "epoch": 1.9611650485436893, "grad_norm": 0.5720384230610551, "learning_rate": 6.1009756769084625e-05, "loss": 0.5549, "step": 202 }, { "epoch": 1.970873786407767, "grad_norm": 0.9412252549729389, "learning_rate": 6.077831647995312e-05, "loss": 0.5561, "step": 203 }, { "epoch": 1.9805825242718447, "grad_norm": 1.1651868503900755, "learning_rate": 6.0545919555408026e-05, "loss": 0.5549, "step": 204 }, { "epoch": 1.9902912621359223, "grad_norm": 0.6059833274896135, "learning_rate": 6.0312576695023015e-05, "loss": 0.5532, "step": 205 }, { "epoch": 2.0, "grad_norm": 0.7944875296787923, "learning_rate": 6.007829864192274e-05, "loss": 0.5471, "step": 206 }, { "epoch": 2.0097087378640777, "grad_norm": 0.9861185591611118, "learning_rate": 5.9843096182288184e-05, "loss": 0.5254, "step": 207 }, { "epoch": 2.0194174757281553, "grad_norm": 0.9193682360720784, "learning_rate": 5.960698014486009e-05, "loss": 0.5228, "step": 208 }, { "epoch": 2.029126213592233, "grad_norm": 0.8942942482878242, "learning_rate": 5.936996140044041e-05, "loss": 0.5213, "step": 209 }, { "epoch": 2.0388349514563107, "grad_norm": 0.9820673732498754, "learning_rate": 5.9132050861391774e-05, "loss": 0.5281, "step": 210 }, { "epoch": 2.0485436893203883, "grad_norm": 1.124245573357021, "learning_rate": 5.889325948113513e-05, "loss": 0.5252, "step": 211 }, { "epoch": 2.058252427184466, "grad_norm": 0.9569857392646174, "learning_rate": 5.865359825364543e-05, "loss": 0.5295, "step": 212 }, { "epoch": 2.0679611650485437, "grad_norm": 0.6979418510578903, "learning_rate": 5.841307821294546e-05, "loss": 0.5212, "step": 213 }, { "epoch": 2.0776699029126213, "grad_norm": 0.8030912411204089, "learning_rate": 5.8171710432597824e-05, "loss": 0.525, "step": 214 }, { "epoch": 2.087378640776699, "grad_norm": 1.0921718953136728, "learning_rate": 5.792950602519516e-05, "loss": 0.5249, "step": 215 }, { "epoch": 2.0970873786407767, "grad_norm": 0.6815361141499702, "learning_rate": 5.768647614184846e-05, "loss": 0.5256, "step": 216 }, { "epoch": 2.1067961165048543, "grad_norm": 0.5765577764382596, "learning_rate": 5.744263197167369e-05, "loss": 0.5196, "step": 217 }, { "epoch": 2.116504854368932, "grad_norm": 0.8497374889157474, "learning_rate": 5.719798474127668e-05, "loss": 0.5245, "step": 218 }, { "epoch": 2.1262135922330097, "grad_norm": 0.8473330942862705, "learning_rate": 5.69525457142362e-05, "loss": 0.5266, "step": 219 }, { "epoch": 2.1359223300970873, "grad_norm": 0.7383851235625609, "learning_rate": 5.6706326190585416e-05, "loss": 0.5241, "step": 220 }, { "epoch": 2.145631067961165, "grad_norm": 0.731625229205727, "learning_rate": 5.6459337506291594e-05, "loss": 0.5214, "step": 221 }, { "epoch": 2.1553398058252426, "grad_norm": 0.6257327662890664, "learning_rate": 5.621159103273424e-05, "loss": 0.521, "step": 222 }, { "epoch": 2.1650485436893203, "grad_norm": 0.5394134102082522, "learning_rate": 5.596309817618156e-05, "loss": 0.5202, "step": 223 }, { "epoch": 2.174757281553398, "grad_norm": 0.5108441929525305, "learning_rate": 5.571387037726524e-05, "loss": 0.5243, "step": 224 }, { "epoch": 2.1844660194174756, "grad_norm": 0.6078187548666663, "learning_rate": 5.5463919110453836e-05, "loss": 0.5196, "step": 225 }, { "epoch": 2.1941747572815533, "grad_norm": 0.663473200149725, "learning_rate": 5.521325588352437e-05, "loss": 0.5194, "step": 226 }, { "epoch": 2.203883495145631, "grad_norm": 0.6429831831672274, "learning_rate": 5.496189223703262e-05, "loss": 0.5173, "step": 227 }, { "epoch": 2.2135922330097086, "grad_norm": 0.6220750410182839, "learning_rate": 5.47098397437817e-05, "loss": 0.5209, "step": 228 }, { "epoch": 2.2233009708737863, "grad_norm": 0.5886484774766658, "learning_rate": 5.4457110008289306e-05, "loss": 0.5155, "step": 229 }, { "epoch": 2.233009708737864, "grad_norm": 0.4437653008171185, "learning_rate": 5.420371466625339e-05, "loss": 0.5185, "step": 230 }, { "epoch": 2.2427184466019416, "grad_norm": 0.39870906238532716, "learning_rate": 5.3949665384016556e-05, "loss": 0.5217, "step": 231 }, { "epoch": 2.2524271844660193, "grad_norm": 0.5717388200227635, "learning_rate": 5.369497385802877e-05, "loss": 0.5196, "step": 232 }, { "epoch": 2.262135922330097, "grad_norm": 0.6577624762016366, "learning_rate": 5.3439651814309044e-05, "loss": 0.5182, "step": 233 }, { "epoch": 2.2718446601941746, "grad_norm": 0.6946628514135419, "learning_rate": 5.3183711007905434e-05, "loss": 0.5188, "step": 234 }, { "epoch": 2.2815533980582523, "grad_norm": 0.6759552411316988, "learning_rate": 5.2927163222353876e-05, "loss": 0.5171, "step": 235 }, { "epoch": 2.29126213592233, "grad_norm": 0.6300902669161529, "learning_rate": 5.2670020269135703e-05, "loss": 0.5224, "step": 236 }, { "epoch": 2.3009708737864076, "grad_norm": 0.5185045096628276, "learning_rate": 5.241229398713379e-05, "loss": 0.5268, "step": 237 }, { "epoch": 2.3106796116504853, "grad_norm": 0.4449400609212431, "learning_rate": 5.2153996242087544e-05, "loss": 0.5207, "step": 238 }, { "epoch": 2.320388349514563, "grad_norm": 0.4056127362971999, "learning_rate": 5.1895138926046553e-05, "loss": 0.5207, "step": 239 }, { "epoch": 2.3300970873786406, "grad_norm": 0.31876595932444957, "learning_rate": 5.16357339568231e-05, "loss": 0.5233, "step": 240 }, { "epoch": 2.3398058252427183, "grad_norm": 0.3826214088490694, "learning_rate": 5.13757932774435e-05, "loss": 0.5229, "step": 241 }, { "epoch": 2.349514563106796, "grad_norm": 0.39293509694649387, "learning_rate": 5.111532885559816e-05, "loss": 0.517, "step": 242 }, { "epoch": 2.3592233009708736, "grad_norm": 0.3322451238923787, "learning_rate": 5.08543526830907e-05, "loss": 0.5183, "step": 243 }, { "epoch": 2.3689320388349513, "grad_norm": 0.35774381456905935, "learning_rate": 5.05928767752857e-05, "loss": 0.5207, "step": 244 }, { "epoch": 2.378640776699029, "grad_norm": 0.3490143073371343, "learning_rate": 5.033091317055565e-05, "loss": 0.5185, "step": 245 }, { "epoch": 2.3883495145631066, "grad_norm": 0.3488449914720745, "learning_rate": 5.006847392972664e-05, "loss": 0.5233, "step": 246 }, { "epoch": 2.3980582524271843, "grad_norm": 0.4080875571104722, "learning_rate": 4.9805571135523066e-05, "loss": 0.5196, "step": 247 }, { "epoch": 2.407766990291262, "grad_norm": 0.3959917551246165, "learning_rate": 4.954221689201138e-05, "loss": 0.5194, "step": 248 }, { "epoch": 2.4174757281553396, "grad_norm": 0.331330529722134, "learning_rate": 4.9278423324042776e-05, "loss": 0.5175, "step": 249 }, { "epoch": 2.4271844660194173, "grad_norm": 0.3809362865456529, "learning_rate": 4.901420257669501e-05, "loss": 0.5195, "step": 250 }, { "epoch": 2.436893203883495, "grad_norm": 0.3961884838888781, "learning_rate": 4.8749566814713204e-05, "loss": 0.5203, "step": 251 }, { "epoch": 2.4466019417475726, "grad_norm": 0.28927135799547965, "learning_rate": 4.848452822194977e-05, "loss": 0.522, "step": 252 }, { "epoch": 2.4563106796116507, "grad_norm": 0.32321223873962246, "learning_rate": 4.821909900080348e-05, "loss": 0.5192, "step": 253 }, { "epoch": 2.466019417475728, "grad_norm": 0.3034931174843224, "learning_rate": 4.7953291371657724e-05, "loss": 0.5214, "step": 254 }, { "epoch": 2.475728155339806, "grad_norm": 0.2955075395432886, "learning_rate": 4.768711757231775e-05, "loss": 0.5197, "step": 255 }, { "epoch": 2.4854368932038833, "grad_norm": 0.33319864095909213, "learning_rate": 4.742058985744738e-05, "loss": 0.5199, "step": 256 }, { "epoch": 2.4951456310679614, "grad_norm": 0.3018636044546277, "learning_rate": 4.715372049800467e-05, "loss": 0.5234, "step": 257 }, { "epoch": 2.5048543689320386, "grad_norm": 0.28068625121120355, "learning_rate": 4.688652178067708e-05, "loss": 0.5223, "step": 258 }, { "epoch": 2.5145631067961167, "grad_norm": 0.3229351457016303, "learning_rate": 4.661900600731571e-05, "loss": 0.5233, "step": 259 }, { "epoch": 2.524271844660194, "grad_norm": 0.3762838763816941, "learning_rate": 4.635118549436895e-05, "loss": 0.5238, "step": 260 }, { "epoch": 2.533980582524272, "grad_norm": 0.36897391891079456, "learning_rate": 4.608307257231541e-05, "loss": 0.5184, "step": 261 }, { "epoch": 2.5436893203883493, "grad_norm": 0.3195793897829173, "learning_rate": 4.5814679585096265e-05, "loss": 0.5169, "step": 262 }, { "epoch": 2.5533980582524274, "grad_norm": 0.2773810615279961, "learning_rate": 4.5546018889546876e-05, "loss": 0.5168, "step": 263 }, { "epoch": 2.5631067961165046, "grad_norm": 0.3441559401937381, "learning_rate": 4.527710285482799e-05, "loss": 0.517, "step": 264 }, { "epoch": 2.5728155339805827, "grad_norm": 0.37273218540100866, "learning_rate": 4.500794386185609e-05, "loss": 0.5185, "step": 265 }, { "epoch": 2.58252427184466, "grad_norm": 0.26503820958744123, "learning_rate": 4.473855430273355e-05, "loss": 0.5164, "step": 266 }, { "epoch": 2.592233009708738, "grad_norm": 0.2934088823294493, "learning_rate": 4.4468946580178026e-05, "loss": 0.5127, "step": 267 }, { "epoch": 2.6019417475728153, "grad_norm": 0.3186295434544236, "learning_rate": 4.4199133106951407e-05, "loss": 0.5173, "step": 268 }, { "epoch": 2.6116504854368934, "grad_norm": 0.3309244613515348, "learning_rate": 4.3929126305288364e-05, "loss": 0.5229, "step": 269 }, { "epoch": 2.6213592233009706, "grad_norm": 0.26814510063287106, "learning_rate": 4.365893860632444e-05, "loss": 0.5167, "step": 270 }, { "epoch": 2.6310679611650487, "grad_norm": 0.3074091286659034, "learning_rate": 4.338858244952369e-05, "loss": 0.5156, "step": 271 }, { "epoch": 2.6407766990291264, "grad_norm": 0.3823340679989687, "learning_rate": 4.3118070282106e-05, "loss": 0.5168, "step": 272 }, { "epoch": 2.650485436893204, "grad_norm": 0.47904147679754805, "learning_rate": 4.2847414558473987e-05, "loss": 0.5184, "step": 273 }, { "epoch": 2.6601941747572817, "grad_norm": 0.4269268816899063, "learning_rate": 4.257662773963961e-05, "loss": 0.5173, "step": 274 }, { "epoch": 2.6699029126213594, "grad_norm": 0.3094464875254195, "learning_rate": 4.230572229265045e-05, "loss": 0.5142, "step": 275 }, { "epoch": 2.679611650485437, "grad_norm": 0.31791889845655724, "learning_rate": 4.2034710690015766e-05, "loss": 0.517, "step": 276 }, { "epoch": 2.6893203883495147, "grad_norm": 0.4064644387432894, "learning_rate": 4.17636054091322e-05, "loss": 0.516, "step": 277 }, { "epoch": 2.6990291262135924, "grad_norm": 0.3053310406953197, "learning_rate": 4.1492418931709366e-05, "loss": 0.5175, "step": 278 }, { "epoch": 2.70873786407767, "grad_norm": 0.2765928946593284, "learning_rate": 4.1221163743195175e-05, "loss": 0.5185, "step": 279 }, { "epoch": 2.7184466019417477, "grad_norm": 0.3378318482897848, "learning_rate": 4.094985233220098e-05, "loss": 0.5211, "step": 280 }, { "epoch": 2.7281553398058254, "grad_norm": 0.3143734127353884, "learning_rate": 4.067849718992665e-05, "loss": 0.5197, "step": 281 }, { "epoch": 2.737864077669903, "grad_norm": 0.259820524121846, "learning_rate": 4.040711080958547e-05, "loss": 0.5259, "step": 282 }, { "epoch": 2.7475728155339807, "grad_norm": 0.32611605738060934, "learning_rate": 4.013570568582883e-05, "loss": 0.5174, "step": 283 }, { "epoch": 2.7572815533980584, "grad_norm": 0.29760137014246574, "learning_rate": 3.986429431417118e-05, "loss": 0.5124, "step": 284 }, { "epoch": 2.766990291262136, "grad_norm": 0.26849482707048117, "learning_rate": 3.959288919041455e-05, "loss": 0.5116, "step": 285 }, { "epoch": 2.7766990291262137, "grad_norm": 0.28358089368587186, "learning_rate": 3.9321502810073354e-05, "loss": 0.5179, "step": 286 }, { "epoch": 2.7864077669902914, "grad_norm": 0.3098687865760963, "learning_rate": 3.905014766779904e-05, "loss": 0.5148, "step": 287 }, { "epoch": 2.796116504854369, "grad_norm": 0.3018433729775123, "learning_rate": 3.8778836256804845e-05, "loss": 0.5165, "step": 288 }, { "epoch": 2.8058252427184467, "grad_norm": 0.23893414660568268, "learning_rate": 3.850758106829065e-05, "loss": 0.5161, "step": 289 }, { "epoch": 2.8155339805825244, "grad_norm": 0.2524448827230724, "learning_rate": 3.823639459086781e-05, "loss": 0.5152, "step": 290 }, { "epoch": 2.825242718446602, "grad_norm": 0.2770033187618851, "learning_rate": 3.796528930998425e-05, "loss": 0.5145, "step": 291 }, { "epoch": 2.8349514563106797, "grad_norm": 0.25224488768181713, "learning_rate": 3.769427770734955e-05, "loss": 0.5195, "step": 292 }, { "epoch": 2.8446601941747574, "grad_norm": 0.27486298363736394, "learning_rate": 3.742337226036041e-05, "loss": 0.5157, "step": 293 }, { "epoch": 2.854368932038835, "grad_norm": 0.23960082819762807, "learning_rate": 3.715258544152603e-05, "loss": 0.5154, "step": 294 }, { "epoch": 2.8640776699029127, "grad_norm": 0.2710779329439077, "learning_rate": 3.688192971789401e-05, "loss": 0.5199, "step": 295 }, { "epoch": 2.8737864077669903, "grad_norm": 0.256469645800299, "learning_rate": 3.6611417550476324e-05, "loss": 0.5149, "step": 296 }, { "epoch": 2.883495145631068, "grad_norm": 0.3010842522468919, "learning_rate": 3.6341061393675574e-05, "loss": 0.5188, "step": 297 }, { "epoch": 2.8932038834951457, "grad_norm": 0.24615248139015927, "learning_rate": 3.607087369471164e-05, "loss": 0.5142, "step": 298 }, { "epoch": 2.9029126213592233, "grad_norm": 0.3025647213003904, "learning_rate": 3.580086689304861e-05, "loss": 0.5188, "step": 299 }, { "epoch": 2.912621359223301, "grad_norm": 0.2506413395712758, "learning_rate": 3.553105341982198e-05, "loss": 0.5127, "step": 300 }, { "epoch": 2.9223300970873787, "grad_norm": 0.26488744144075266, "learning_rate": 3.526144569726647e-05, "loss": 0.5148, "step": 301 }, { "epoch": 2.9320388349514563, "grad_norm": 0.23180961021643326, "learning_rate": 3.499205613814393e-05, "loss": 0.518, "step": 302 }, { "epoch": 2.941747572815534, "grad_norm": 0.24614944764274665, "learning_rate": 3.472289714517203e-05, "loss": 0.5119, "step": 303 }, { "epoch": 2.9514563106796117, "grad_norm": 0.22977263011082621, "learning_rate": 3.445398111045313e-05, "loss": 0.5184, "step": 304 }, { "epoch": 2.9611650485436893, "grad_norm": 0.21286238167178875, "learning_rate": 3.418532041490375e-05, "loss": 0.517, "step": 305 }, { "epoch": 2.970873786407767, "grad_norm": 0.2548130433665424, "learning_rate": 3.3916927427684595e-05, "loss": 0.5186, "step": 306 }, { "epoch": 2.9805825242718447, "grad_norm": 0.31989149824666413, "learning_rate": 3.364881450563106e-05, "loss": 0.5153, "step": 307 }, { "epoch": 2.9902912621359223, "grad_norm": 0.2762238599236643, "learning_rate": 3.338099399268429e-05, "loss": 0.5167, "step": 308 }, { "epoch": 3.0, "grad_norm": 0.20481446931191682, "learning_rate": 3.311347821932292e-05, "loss": 0.5092, "step": 309 }, { "epoch": 3.0097087378640777, "grad_norm": 0.22631177358316332, "learning_rate": 3.284627950199535e-05, "loss": 0.4909, "step": 310 }, { "epoch": 3.0194174757281553, "grad_norm": 0.21905855651481868, "learning_rate": 3.2579410142552646e-05, "loss": 0.4889, "step": 311 }, { "epoch": 3.029126213592233, "grad_norm": 0.268856232171971, "learning_rate": 3.231288242768226e-05, "loss": 0.4882, "step": 312 }, { "epoch": 3.0388349514563107, "grad_norm": 0.2631480748788351, "learning_rate": 3.204670862834228e-05, "loss": 0.4822, "step": 313 }, { "epoch": 3.0485436893203883, "grad_norm": 0.28121213579534965, "learning_rate": 3.178090099919653e-05, "loss": 0.4848, "step": 314 }, { "epoch": 3.058252427184466, "grad_norm": 0.2553829402200111, "learning_rate": 3.1515471778050246e-05, "loss": 0.4853, "step": 315 }, { "epoch": 3.0679611650485437, "grad_norm": 0.29584988541778207, "learning_rate": 3.12504331852868e-05, "loss": 0.4835, "step": 316 }, { "epoch": 3.0776699029126213, "grad_norm": 0.21912251520340542, "learning_rate": 3.098579742330499e-05, "loss": 0.4866, "step": 317 }, { "epoch": 3.087378640776699, "grad_norm": 0.2816186665047795, "learning_rate": 3.0721576675957224e-05, "loss": 0.4856, "step": 318 }, { "epoch": 3.0970873786407767, "grad_norm": 0.24175134230414194, "learning_rate": 3.0457783107988642e-05, "loss": 0.4886, "step": 319 }, { "epoch": 3.1067961165048543, "grad_norm": 0.22350452741221052, "learning_rate": 3.0194428864476947e-05, "loss": 0.4836, "step": 320 }, { "epoch": 3.116504854368932, "grad_norm": 0.25073180546811125, "learning_rate": 2.9931526070273374e-05, "loss": 0.4884, "step": 321 }, { "epoch": 3.1262135922330097, "grad_norm": 0.19079613465758094, "learning_rate": 2.9669086829444364e-05, "loss": 0.485, "step": 322 }, { "epoch": 3.1359223300970873, "grad_norm": 0.23960288437553956, "learning_rate": 2.9407123224714312e-05, "loss": 0.4856, "step": 323 }, { "epoch": 3.145631067961165, "grad_norm": 0.22627618578886288, "learning_rate": 2.9145647316909306e-05, "loss": 0.4862, "step": 324 }, { "epoch": 3.1553398058252426, "grad_norm": 0.1960335183955326, "learning_rate": 2.8884671144401833e-05, "loss": 0.4869, "step": 325 }, { "epoch": 3.1650485436893203, "grad_norm": 0.2079951634137142, "learning_rate": 2.8624206722556508e-05, "loss": 0.491, "step": 326 }, { "epoch": 3.174757281553398, "grad_norm": 0.18132268629371445, "learning_rate": 2.8364266043176897e-05, "loss": 0.4876, "step": 327 }, { "epoch": 3.1844660194174756, "grad_norm": 0.18068511996455502, "learning_rate": 2.810486107395347e-05, "loss": 0.4858, "step": 328 }, { "epoch": 3.1941747572815533, "grad_norm": 0.19264695463261375, "learning_rate": 2.7846003757912473e-05, "loss": 0.4891, "step": 329 }, { "epoch": 3.203883495145631, "grad_norm": 0.16897146599425877, "learning_rate": 2.7587706012866227e-05, "loss": 0.4868, "step": 330 }, { "epoch": 3.2135922330097086, "grad_norm": 0.2264234031229311, "learning_rate": 2.7329979730864313e-05, "loss": 0.4844, "step": 331 }, { "epoch": 3.2233009708737863, "grad_norm": 0.2433703396011944, "learning_rate": 2.707283677764613e-05, "loss": 0.4855, "step": 332 }, { "epoch": 3.233009708737864, "grad_norm": 0.1780724623798919, "learning_rate": 2.6816288992094573e-05, "loss": 0.4853, "step": 333 }, { "epoch": 3.2427184466019416, "grad_norm": 0.2550763857085153, "learning_rate": 2.6560348185690956e-05, "loss": 0.4865, "step": 334 }, { "epoch": 3.2524271844660193, "grad_norm": 0.18943111426329542, "learning_rate": 2.6305026141971227e-05, "loss": 0.4869, "step": 335 }, { "epoch": 3.262135922330097, "grad_norm": 0.21284295474198855, "learning_rate": 2.6050334615983467e-05, "loss": 0.4872, "step": 336 }, { "epoch": 3.2718446601941746, "grad_norm": 0.22131826031463572, "learning_rate": 2.5796285333746615e-05, "loss": 0.4816, "step": 337 }, { "epoch": 3.2815533980582523, "grad_norm": 0.182135484368646, "learning_rate": 2.554288999171072e-05, "loss": 0.4915, "step": 338 }, { "epoch": 3.29126213592233, "grad_norm": 0.23761469952772257, "learning_rate": 2.5290160256218313e-05, "loss": 0.4853, "step": 339 }, { "epoch": 3.3009708737864076, "grad_norm": 0.1840898568254873, "learning_rate": 2.5038107762967393e-05, "loss": 0.4883, "step": 340 }, { "epoch": 3.3106796116504853, "grad_norm": 0.21359795567551282, "learning_rate": 2.4786744116475638e-05, "loss": 0.4871, "step": 341 }, { "epoch": 3.320388349514563, "grad_norm": 0.24890926664546134, "learning_rate": 2.4536080889546177e-05, "loss": 0.489, "step": 342 }, { "epoch": 3.3300970873786406, "grad_norm": 0.20782018056580667, "learning_rate": 2.4286129622734764e-05, "loss": 0.4844, "step": 343 }, { "epoch": 3.3398058252427183, "grad_norm": 0.21924373313456091, "learning_rate": 2.4036901823818454e-05, "loss": 0.4863, "step": 344 }, { "epoch": 3.349514563106796, "grad_norm": 0.19867052689304365, "learning_rate": 2.378840896726577e-05, "loss": 0.4908, "step": 345 }, { "epoch": 3.3592233009708736, "grad_norm": 0.2327076797489768, "learning_rate": 2.3540662493708423e-05, "loss": 0.4869, "step": 346 }, { "epoch": 3.3689320388349513, "grad_norm": 0.17819481518798666, "learning_rate": 2.3293673809414598e-05, "loss": 0.4816, "step": 347 }, { "epoch": 3.378640776699029, "grad_norm": 0.17940622231016476, "learning_rate": 2.3047454285763793e-05, "loss": 0.4874, "step": 348 }, { "epoch": 3.3883495145631066, "grad_norm": 0.16039118497959268, "learning_rate": 2.2802015258723324e-05, "loss": 0.4869, "step": 349 }, { "epoch": 3.3980582524271843, "grad_norm": 0.17049865419413107, "learning_rate": 2.2557368028326324e-05, "loss": 0.4845, "step": 350 }, { "epoch": 3.407766990291262, "grad_norm": 0.18902552117069968, "learning_rate": 2.2313523858151554e-05, "loss": 0.4872, "step": 351 }, { "epoch": 3.4174757281553396, "grad_norm": 0.15937546411881842, "learning_rate": 2.207049397480485e-05, "loss": 0.4845, "step": 352 }, { "epoch": 3.4271844660194173, "grad_norm": 0.19899647930250075, "learning_rate": 2.1828289567402173e-05, "loss": 0.4874, "step": 353 }, { "epoch": 3.436893203883495, "grad_norm": 0.17339320655418355, "learning_rate": 2.1586921787054564e-05, "loss": 0.4871, "step": 354 }, { "epoch": 3.4466019417475726, "grad_norm": 0.17543747218166306, "learning_rate": 2.1346401746354576e-05, "loss": 0.4828, "step": 355 }, { "epoch": 3.4563106796116507, "grad_norm": 0.17798441146417476, "learning_rate": 2.110674051886488e-05, "loss": 0.4838, "step": 356 }, { "epoch": 3.466019417475728, "grad_norm": 0.16029275511201987, "learning_rate": 2.0867949138608242e-05, "loss": 0.4841, "step": 357 }, { "epoch": 3.475728155339806, "grad_norm": 0.1663439154413797, "learning_rate": 2.06300385995596e-05, "loss": 0.4865, "step": 358 }, { "epoch": 3.4854368932038833, "grad_norm": 0.16712634012536365, "learning_rate": 2.0393019855139915e-05, "loss": 0.4833, "step": 359 }, { "epoch": 3.4951456310679614, "grad_norm": 0.17251097817233577, "learning_rate": 2.0156903817711812e-05, "loss": 0.485, "step": 360 }, { "epoch": 3.5048543689320386, "grad_norm": 0.17262950471482985, "learning_rate": 1.9921701358077265e-05, "loss": 0.4846, "step": 361 }, { "epoch": 3.5145631067961167, "grad_norm": 0.17329043714528009, "learning_rate": 1.9687423304976994e-05, "loss": 0.4835, "step": 362 }, { "epoch": 3.524271844660194, "grad_norm": 0.16219535331159693, "learning_rate": 1.9454080444591998e-05, "loss": 0.4849, "step": 363 }, { "epoch": 3.533980582524272, "grad_norm": 0.16515501782595504, "learning_rate": 1.9221683520046892e-05, "loss": 0.4857, "step": 364 }, { "epoch": 3.5436893203883493, "grad_norm": 0.19734035971441835, "learning_rate": 1.899024323091539e-05, "loss": 0.4836, "step": 365 }, { "epoch": 3.5533980582524274, "grad_norm": 0.16333328213007223, "learning_rate": 1.875977023272757e-05, "loss": 0.485, "step": 366 }, { "epoch": 3.5631067961165046, "grad_norm": 0.18228849361538985, "learning_rate": 1.853027513647937e-05, "loss": 0.4869, "step": 367 }, { "epoch": 3.5728155339805827, "grad_norm": 0.14382298808613353, "learning_rate": 1.8301768508144078e-05, "loss": 0.4837, "step": 368 }, { "epoch": 3.58252427184466, "grad_norm": 0.1436393142572651, "learning_rate": 1.8074260868185784e-05, "loss": 0.4828, "step": 369 }, { "epoch": 3.592233009708738, "grad_norm": 0.16780032690913915, "learning_rate": 1.7847762691075115e-05, "loss": 0.487, "step": 370 }, { "epoch": 3.6019417475728153, "grad_norm": 0.14947935740806928, "learning_rate": 1.762228440480692e-05, "loss": 0.4845, "step": 371 }, { "epoch": 3.6116504854368934, "grad_norm": 0.14559731703552367, "learning_rate": 1.7397836390420192e-05, "loss": 0.4836, "step": 372 }, { "epoch": 3.6213592233009706, "grad_norm": 0.1506762657862038, "learning_rate": 1.717442898152012e-05, "loss": 0.4855, "step": 373 }, { "epoch": 3.6310679611650487, "grad_norm": 0.13248764059764206, "learning_rate": 1.6952072463802326e-05, "loss": 0.4848, "step": 374 }, { "epoch": 3.6407766990291264, "grad_norm": 0.14524262090031537, "learning_rate": 1.6730777074579346e-05, "loss": 0.4841, "step": 375 }, { "epoch": 3.650485436893204, "grad_norm": 0.13279946211767724, "learning_rate": 1.651055300230922e-05, "loss": 0.4847, "step": 376 }, { "epoch": 3.6601941747572817, "grad_norm": 0.14766932876767538, "learning_rate": 1.6291410386126524e-05, "loss": 0.4854, "step": 377 }, { "epoch": 3.6699029126213594, "grad_norm": 0.14421279285412036, "learning_rate": 1.607335931537547e-05, "loss": 0.4834, "step": 378 }, { "epoch": 3.679611650485437, "grad_norm": 0.1332433537811279, "learning_rate": 1.585640982914541e-05, "loss": 0.4881, "step": 379 }, { "epoch": 3.6893203883495147, "grad_norm": 0.13964291558360487, "learning_rate": 1.564057191580873e-05, "loss": 0.4812, "step": 380 }, { "epoch": 3.6990291262135924, "grad_norm": 0.13379924755179787, "learning_rate": 1.54258555125608e-05, "loss": 0.4865, "step": 381 }, { "epoch": 3.70873786407767, "grad_norm": 0.13942795631282168, "learning_rate": 1.521227050496266e-05, "loss": 0.4862, "step": 382 }, { "epoch": 3.7184466019417477, "grad_norm": 0.14829905133246946, "learning_rate": 1.4999826726485754e-05, "loss": 0.4841, "step": 383 }, { "epoch": 3.7281553398058254, "grad_norm": 0.13774937243212151, "learning_rate": 1.4788533958059281e-05, "loss": 0.4873, "step": 384 }, { "epoch": 3.737864077669903, "grad_norm": 0.14632219044552267, "learning_rate": 1.457840192761979e-05, "loss": 0.4854, "step": 385 }, { "epoch": 3.7475728155339807, "grad_norm": 0.15302510684664683, "learning_rate": 1.4369440309663412e-05, "loss": 0.4833, "step": 386 }, { "epoch": 3.7572815533980584, "grad_norm": 0.14352691900463502, "learning_rate": 1.4161658724800357e-05, "loss": 0.4846, "step": 387 }, { "epoch": 3.766990291262136, "grad_norm": 0.1517023916720968, "learning_rate": 1.3955066739312e-05, "loss": 0.4867, "step": 388 }, { "epoch": 3.7766990291262137, "grad_norm": 0.15194309330245784, "learning_rate": 1.3749673864710524e-05, "loss": 0.4865, "step": 389 }, { "epoch": 3.7864077669902914, "grad_norm": 0.1468613357237534, "learning_rate": 1.3545489557300853e-05, "loss": 0.4846, "step": 390 }, { "epoch": 3.796116504854369, "grad_norm": 0.16395511168934657, "learning_rate": 1.3342523217745473e-05, "loss": 0.4869, "step": 391 }, { "epoch": 3.8058252427184467, "grad_norm": 0.15078193434327586, "learning_rate": 1.3140784190631459e-05, "loss": 0.4825, "step": 392 }, { "epoch": 3.8155339805825244, "grad_norm": 0.14510817920359698, "learning_rate": 1.2940281764040368e-05, "loss": 0.4825, "step": 393 }, { "epoch": 3.825242718446602, "grad_norm": 0.17265976668387825, "learning_rate": 1.2741025169120539e-05, "loss": 0.4872, "step": 394 }, { "epoch": 3.8349514563106797, "grad_norm": 0.14620030187493488, "learning_rate": 1.2543023579662106e-05, "loss": 0.4845, "step": 395 }, { "epoch": 3.8446601941747574, "grad_norm": 0.1418045201790065, "learning_rate": 1.234628611167469e-05, "loss": 0.4845, "step": 396 }, { "epoch": 3.854368932038835, "grad_norm": 0.14833568549734966, "learning_rate": 1.2150821822967611e-05, "loss": 0.4882, "step": 397 }, { "epoch": 3.8640776699029127, "grad_norm": 0.147355117137231, "learning_rate": 1.1956639712732958e-05, "loss": 0.4845, "step": 398 }, { "epoch": 3.8737864077669903, "grad_norm": 0.13630141790437844, "learning_rate": 1.1763748721131142e-05, "loss": 0.4819, "step": 399 }, { "epoch": 3.883495145631068, "grad_norm": 0.13485086713221084, "learning_rate": 1.1572157728879444e-05, "loss": 0.485, "step": 400 }, { "epoch": 3.8932038834951457, "grad_norm": 0.14115351641144186, "learning_rate": 1.1381875556843007e-05, "loss": 0.4874, "step": 401 }, { "epoch": 3.9029126213592233, "grad_norm": 0.13251154708545737, "learning_rate": 1.119291096562884e-05, "loss": 0.4879, "step": 402 }, { "epoch": 3.912621359223301, "grad_norm": 0.14538526251672798, "learning_rate": 1.1005272655182378e-05, "loss": 0.487, "step": 403 }, { "epoch": 3.9223300970873787, "grad_norm": 0.14640871835491487, "learning_rate": 1.0818969264386973e-05, "loss": 0.4843, "step": 404 }, { "epoch": 3.9320388349514563, "grad_norm": 0.13392456360817495, "learning_rate": 1.0634009370666214e-05, "loss": 0.4826, "step": 405 }, { "epoch": 3.941747572815534, "grad_norm": 0.15291675444361674, "learning_rate": 1.045040148958893e-05, "loss": 0.488, "step": 406 }, { "epoch": 3.9514563106796117, "grad_norm": 0.13901859144613157, "learning_rate": 1.0268154074477188e-05, "loss": 0.4853, "step": 407 }, { "epoch": 3.9611650485436893, "grad_norm": 0.13797907262373485, "learning_rate": 1.0087275516017083e-05, "loss": 0.4834, "step": 408 }, { "epoch": 3.970873786407767, "grad_norm": 0.13762327771479307, "learning_rate": 9.907774141872468e-06, "loss": 0.488, "step": 409 }, { "epoch": 3.9805825242718447, "grad_norm": 0.14986633803246888, "learning_rate": 9.729658216301479e-06, "loss": 0.4828, "step": 410 }, { "epoch": 3.9902912621359223, "grad_norm": 0.13435313597563736, "learning_rate": 9.552935939776083e-06, "loss": 0.4871, "step": 411 }, { "epoch": 4.0, "grad_norm": 0.14759467236298449, "learning_rate": 9.377615448604574e-06, "loss": 0.4831, "step": 412 }, { "epoch": 4.009708737864078, "grad_norm": 0.17306287750975244, "learning_rate": 9.203704814556871e-06, "loss": 0.4669, "step": 413 }, { "epoch": 4.019417475728155, "grad_norm": 0.14023047375285927, "learning_rate": 9.031212044493016e-06, "loss": 0.4659, "step": 414 }, { "epoch": 4.029126213592233, "grad_norm": 0.14153603129051684, "learning_rate": 8.860145079994433e-06, "loss": 0.4656, "step": 415 }, { "epoch": 4.038834951456311, "grad_norm": 0.14948734290428067, "learning_rate": 8.690511796998344e-06, "loss": 0.4653, "step": 416 }, { "epoch": 4.048543689320389, "grad_norm": 0.16197544725648444, "learning_rate": 8.522320005435162e-06, "loss": 0.4686, "step": 417 }, { "epoch": 4.058252427184466, "grad_norm": 0.16013381839658847, "learning_rate": 8.355577448868933e-06, "loss": 0.4709, "step": 418 }, { "epoch": 4.067961165048544, "grad_norm": 0.13999136843500865, "learning_rate": 8.190291804140775e-06, "loss": 0.4648, "step": 419 }, { "epoch": 4.077669902912621, "grad_norm": 0.1470240140496139, "learning_rate": 8.02647068101547e-06, "loss": 0.4681, "step": 420 }, { "epoch": 4.087378640776699, "grad_norm": 0.1355700665658678, "learning_rate": 7.864121621831126e-06, "loss": 0.467, "step": 421 }, { "epoch": 4.097087378640777, "grad_norm": 0.14144514644659487, "learning_rate": 7.703252101151873e-06, "loss": 0.4613, "step": 422 }, { "epoch": 4.106796116504855, "grad_norm": 0.129930341121314, "learning_rate": 7.5438695254238e-06, "loss": 0.4638, "step": 423 }, { "epoch": 4.116504854368932, "grad_norm": 0.14218321035952994, "learning_rate": 7.385981232633894e-06, "loss": 0.465, "step": 424 }, { "epoch": 4.12621359223301, "grad_norm": 0.1420896044955223, "learning_rate": 7.229594491972256e-06, "loss": 0.4659, "step": 425 }, { "epoch": 4.135922330097087, "grad_norm": 0.13076263084557904, "learning_rate": 7.07471650349739e-06, "loss": 0.4648, "step": 426 }, { "epoch": 4.145631067961165, "grad_norm": 0.13079701173247169, "learning_rate": 6.921354397804712e-06, "loss": 0.4651, "step": 427 }, { "epoch": 4.155339805825243, "grad_norm": 0.1299754901865967, "learning_rate": 6.7695152356983054e-06, "loss": 0.4647, "step": 428 }, { "epoch": 4.165048543689321, "grad_norm": 0.12764332296015807, "learning_rate": 6.619206007865768e-06, "loss": 0.4607, "step": 429 }, { "epoch": 4.174757281553398, "grad_norm": 0.12376699505437842, "learning_rate": 6.47043363455643e-06, "loss": 0.4674, "step": 430 }, { "epoch": 4.184466019417476, "grad_norm": 0.11650682736889516, "learning_rate": 6.323204965262686e-06, "loss": 0.463, "step": 431 }, { "epoch": 4.194174757281553, "grad_norm": 0.1217502825230698, "learning_rate": 6.177526778404663e-06, "loss": 0.4679, "step": 432 }, { "epoch": 4.203883495145631, "grad_norm": 0.11887732797821776, "learning_rate": 6.033405781018195e-06, "loss": 0.4655, "step": 433 }, { "epoch": 4.213592233009709, "grad_norm": 0.12496115598562435, "learning_rate": 5.8908486084459134e-06, "loss": 0.4635, "step": 434 }, { "epoch": 4.223300970873787, "grad_norm": 0.12470991178018197, "learning_rate": 5.74986182403189e-06, "loss": 0.4646, "step": 435 }, { "epoch": 4.233009708737864, "grad_norm": 0.1292310303978258, "learning_rate": 5.610451918819357e-06, "loss": 0.4651, "step": 436 }, { "epoch": 4.242718446601942, "grad_norm": 0.13694613972829348, "learning_rate": 5.472625311251918e-06, "loss": 0.4667, "step": 437 }, { "epoch": 4.252427184466019, "grad_norm": 0.12260186181424092, "learning_rate": 5.336388346878006e-06, "loss": 0.4654, "step": 438 }, { "epoch": 4.262135922330097, "grad_norm": 0.12523666490053748, "learning_rate": 5.201747298058765e-06, "loss": 0.4604, "step": 439 }, { "epoch": 4.271844660194175, "grad_norm": 0.12451383039630047, "learning_rate": 5.068708363679249e-06, "loss": 0.4645, "step": 440 }, { "epoch": 4.281553398058253, "grad_norm": 0.10990810773913537, "learning_rate": 4.937277668863014e-06, "loss": 0.4647, "step": 441 }, { "epoch": 4.29126213592233, "grad_norm": 0.10982607372347657, "learning_rate": 4.807461264690157e-06, "loss": 0.465, "step": 442 }, { "epoch": 4.300970873786408, "grad_norm": 0.12022614154318102, "learning_rate": 4.67926512791868e-06, "loss": 0.4654, "step": 443 }, { "epoch": 4.310679611650485, "grad_norm": 0.10923349467562726, "learning_rate": 4.552695160709362e-06, "loss": 0.466, "step": 444 }, { "epoch": 4.320388349514563, "grad_norm": 0.10653612984033727, "learning_rate": 4.427757190353976e-06, "loss": 0.4684, "step": 445 }, { "epoch": 4.330097087378641, "grad_norm": 0.10730397458255132, "learning_rate": 4.304456969007049e-06, "loss": 0.4657, "step": 446 }, { "epoch": 4.339805825242719, "grad_norm": 0.1003612678568525, "learning_rate": 4.182800173420991e-06, "loss": 0.4649, "step": 447 }, { "epoch": 4.349514563106796, "grad_norm": 0.11090894330673261, "learning_rate": 4.06279240468475e-06, "loss": 0.4631, "step": 448 }, { "epoch": 4.359223300970874, "grad_norm": 0.1167058523870307, "learning_rate": 3.9444391879659604e-06, "loss": 0.4665, "step": 449 }, { "epoch": 4.368932038834951, "grad_norm": 0.10433976260435962, "learning_rate": 3.827745972256529e-06, "loss": 0.4659, "step": 450 }, { "epoch": 4.378640776699029, "grad_norm": 0.10011905909384386, "learning_rate": 3.7127181301217817e-06, "loss": 0.4641, "step": 451 }, { "epoch": 4.388349514563107, "grad_norm": 0.10655509962726566, "learning_rate": 3.599360957453102e-06, "loss": 0.467, "step": 452 }, { "epoch": 4.398058252427185, "grad_norm": 0.10938664062060237, "learning_rate": 3.487679673224129e-06, "loss": 0.4607, "step": 453 }, { "epoch": 4.407766990291262, "grad_norm": 0.10191086413678936, "learning_rate": 3.3776794192504412e-06, "loss": 0.4623, "step": 454 }, { "epoch": 4.41747572815534, "grad_norm": 0.09548255692426758, "learning_rate": 3.269365259952859e-06, "loss": 0.4626, "step": 455 }, { "epoch": 4.427184466019417, "grad_norm": 0.10109872749133958, "learning_rate": 3.1627421821242586e-06, "loss": 0.4627, "step": 456 }, { "epoch": 4.436893203883495, "grad_norm": 0.1003445240191667, "learning_rate": 3.0578150946999695e-06, "loss": 0.4637, "step": 457 }, { "epoch": 4.446601941747573, "grad_norm": 0.10094079472060287, "learning_rate": 2.954588828531817e-06, "loss": 0.4671, "step": 458 }, { "epoch": 4.456310679611651, "grad_norm": 0.09877743367390623, "learning_rate": 2.8530681361656422e-06, "loss": 0.4649, "step": 459 }, { "epoch": 4.466019417475728, "grad_norm": 0.09360681988866616, "learning_rate": 2.7532576916225395e-06, "loss": 0.4678, "step": 460 }, { "epoch": 4.475728155339806, "grad_norm": 0.09976388673239875, "learning_rate": 2.6551620901836515e-06, "loss": 0.4645, "step": 461 }, { "epoch": 4.485436893203883, "grad_norm": 0.0965607126962889, "learning_rate": 2.5587858481786086e-06, "loss": 0.4664, "step": 462 }, { "epoch": 4.495145631067961, "grad_norm": 0.09897406466321508, "learning_rate": 2.4641334027775755e-06, "loss": 0.467, "step": 463 }, { "epoch": 4.504854368932039, "grad_norm": 0.10396092442681994, "learning_rate": 2.371209111786987e-06, "loss": 0.468, "step": 464 }, { "epoch": 4.514563106796117, "grad_norm": 0.0948373657066504, "learning_rate": 2.280017253448916e-06, "loss": 0.4642, "step": 465 }, { "epoch": 4.524271844660194, "grad_norm": 0.09507655846542853, "learning_rate": 2.190562026244072e-06, "loss": 0.4669, "step": 466 }, { "epoch": 4.533980582524272, "grad_norm": 0.09659880423736872, "learning_rate": 2.102847548698539e-06, "loss": 0.4642, "step": 467 }, { "epoch": 4.543689320388349, "grad_norm": 0.09162921938645756, "learning_rate": 2.0168778591941242e-06, "loss": 0.4694, "step": 468 }, { "epoch": 4.553398058252427, "grad_norm": 0.08943623447949108, "learning_rate": 1.9326569157824736e-06, "loss": 0.4654, "step": 469 }, { "epoch": 4.563106796116505, "grad_norm": 0.0958809139961805, "learning_rate": 1.850188596002802e-06, "loss": 0.4638, "step": 470 }, { "epoch": 4.572815533980583, "grad_norm": 0.0928325035695772, "learning_rate": 1.7694766967033805e-06, "loss": 0.4699, "step": 471 }, { "epoch": 4.58252427184466, "grad_norm": 0.09326809079774372, "learning_rate": 1.6905249338667617e-06, "loss": 0.4654, "step": 472 }, { "epoch": 4.592233009708738, "grad_norm": 0.08894714059682778, "learning_rate": 1.613336942438637e-06, "loss": 0.4678, "step": 473 }, { "epoch": 4.601941747572815, "grad_norm": 0.09702666150761258, "learning_rate": 1.5379162761605427e-06, "loss": 0.4642, "step": 474 }, { "epoch": 4.611650485436893, "grad_norm": 0.0925613181306804, "learning_rate": 1.4642664074061962e-06, "loss": 0.463, "step": 475 }, { "epoch": 4.621359223300971, "grad_norm": 0.0934935785749229, "learning_rate": 1.3923907270216819e-06, "loss": 0.4636, "step": 476 }, { "epoch": 4.631067961165049, "grad_norm": 0.08692636915664595, "learning_rate": 1.3222925441692635e-06, "loss": 0.4656, "step": 477 }, { "epoch": 4.640776699029126, "grad_norm": 0.08592627332074977, "learning_rate": 1.2539750861751031e-06, "loss": 0.464, "step": 478 }, { "epoch": 4.650485436893204, "grad_norm": 0.08826705670035949, "learning_rate": 1.1874414983806283e-06, "loss": 0.4669, "step": 479 }, { "epoch": 4.660194174757281, "grad_norm": 0.08820160855473339, "learning_rate": 1.1226948439977314e-06, "loss": 0.4631, "step": 480 }, { "epoch": 4.669902912621359, "grad_norm": 0.09622872398963238, "learning_rate": 1.0597381039677646e-06, "loss": 0.4686, "step": 481 }, { "epoch": 4.679611650485437, "grad_norm": 0.08913032971687475, "learning_rate": 9.985741768242429e-07, "loss": 0.4647, "step": 482 }, { "epoch": 4.689320388349515, "grad_norm": 0.09028397610583812, "learning_rate": 9.392058785594504e-07, "loss": 0.4623, "step": 483 }, { "epoch": 4.699029126213592, "grad_norm": 0.09375615377365946, "learning_rate": 8.816359424947652e-07, "loss": 0.4629, "step": 484 }, { "epoch": 4.70873786407767, "grad_norm": 0.09058552987790736, "learning_rate": 8.258670191548135e-07, "loss": 0.4633, "step": 485 }, { "epoch": 4.718446601941747, "grad_norm": 0.08822401104082032, "learning_rate": 7.719016761454479e-07, "loss": 0.4668, "step": 486 }, { "epoch": 4.728155339805825, "grad_norm": 0.0922329791045878, "learning_rate": 7.197423980355344e-07, "loss": 0.4665, "step": 487 }, { "epoch": 4.737864077669903, "grad_norm": 0.08744767491607641, "learning_rate": 6.693915862425692e-07, "loss": 0.4657, "step": 488 }, { "epoch": 4.747572815533981, "grad_norm": 0.09683409497038403, "learning_rate": 6.20851558922091e-07, "loss": 0.4615, "step": 489 }, { "epoch": 4.757281553398058, "grad_norm": 0.08456750174183916, "learning_rate": 5.741245508609972e-07, "loss": 0.4649, "step": 490 }, { "epoch": 4.766990291262136, "grad_norm": 0.08773860548864587, "learning_rate": 5.292127133746005e-07, "loss": 0.4653, "step": 491 }, { "epoch": 4.776699029126213, "grad_norm": 0.0917555007099176, "learning_rate": 4.861181142076276e-07, "loss": 0.4656, "step": 492 }, { "epoch": 4.786407766990291, "grad_norm": 0.08845762170540458, "learning_rate": 4.448427374389974e-07, "loss": 0.4692, "step": 493 }, { "epoch": 4.796116504854369, "grad_norm": 0.08827155953592115, "learning_rate": 4.053884833904809e-07, "loss": 0.4662, "step": 494 }, { "epoch": 4.805825242718447, "grad_norm": 0.08804630863083254, "learning_rate": 3.677571685392023e-07, "loss": 0.4653, "step": 495 }, { "epoch": 4.815533980582524, "grad_norm": 0.09273872362298237, "learning_rate": 3.319505254340172e-07, "loss": 0.4674, "step": 496 }, { "epoch": 4.825242718446602, "grad_norm": 0.08937217299785047, "learning_rate": 2.9797020261574494e-07, "loss": 0.4675, "step": 497 }, { "epoch": 4.834951456310679, "grad_norm": 0.08269997531774376, "learning_rate": 2.6581776454126075e-07, "loss": 0.4683, "step": 498 }, { "epoch": 4.844660194174757, "grad_norm": 0.08810498069517812, "learning_rate": 2.3549469151149085e-07, "loss": 0.4658, "step": 499 }, { "epoch": 4.854368932038835, "grad_norm": 0.08729625064317513, "learning_rate": 2.0700237960322279e-07, "loss": 0.4696, "step": 500 }, { "epoch": 4.864077669902913, "grad_norm": 0.08416226889517374, "learning_rate": 1.803421406048589e-07, "loss": 0.4636, "step": 501 }, { "epoch": 4.87378640776699, "grad_norm": 0.08684248344197969, "learning_rate": 1.5551520195601577e-07, "loss": 0.4647, "step": 502 }, { "epoch": 4.883495145631068, "grad_norm": 0.08935580567045806, "learning_rate": 1.3252270669100953e-07, "loss": 0.4642, "step": 503 }, { "epoch": 4.893203883495145, "grad_norm": 0.0846095709740149, "learning_rate": 1.113657133862267e-07, "loss": 0.4626, "step": 504 }, { "epoch": 4.902912621359223, "grad_norm": 0.08317972288259207, "learning_rate": 9.204519611138995e-08, "loss": 0.4628, "step": 505 }, { "epoch": 4.9126213592233015, "grad_norm": 0.0845289066820202, "learning_rate": 7.45620443847228e-08, "loss": 0.4657, "step": 506 }, { "epoch": 4.922330097087379, "grad_norm": 0.08611674149409979, "learning_rate": 5.891706313197354e-08, "loss": 0.4643, "step": 507 }, { "epoch": 4.932038834951456, "grad_norm": 0.08707702652246953, "learning_rate": 4.511097264938258e-08, "loss": 0.4653, "step": 508 }, { "epoch": 4.941747572815534, "grad_norm": 0.08653800248757759, "learning_rate": 3.314440857049572e-08, "loss": 0.4679, "step": 509 }, { "epoch": 4.951456310679612, "grad_norm": 0.08635221643773852, "learning_rate": 2.3017921836916425e-08, "loss": 0.4627, "step": 510 }, { "epoch": 4.961165048543689, "grad_norm": 0.08744848791920268, "learning_rate": 1.4731978672939407e-08, "loss": 0.4664, "step": 511 }, { "epoch": 4.970873786407767, "grad_norm": 0.08802432305414154, "learning_rate": 8.286960564065639e-09, "loss": 0.4628, "step": 512 }, { "epoch": 4.980582524271845, "grad_norm": 0.08116167073862661, "learning_rate": 3.683164239469683e-09, "loss": 0.4637, "step": 513 }, { "epoch": 4.990291262135923, "grad_norm": 0.08686945147014129, "learning_rate": 9.208016583128754e-10, "loss": 0.4606, "step": 514 }, { "epoch": 5.0, "grad_norm": 0.08703184321831914, "learning_rate": 0.0, "loss": 0.4639, "step": 515 }, { "epoch": 5.0, "step": 515, "total_flos": 8639713262960640.0, "train_loss": 0.0, "train_runtime": 13.1123, "train_samples_per_second": 20070.162, "train_steps_per_second": 39.276 } ], "logging_steps": 1, "max_steps": 515, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8639713262960640.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }