{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.994492525570417, "eval_steps": 500, "global_step": 1585, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003147128245476003, "grad_norm": 6.929598040614756, "learning_rate": 2.5157232704402517e-07, "loss": 1.0552, "step": 1 }, { "epoch": 0.006294256490952006, "grad_norm": 6.905796769662321, "learning_rate": 5.031446540880503e-07, "loss": 1.0526, "step": 2 }, { "epoch": 0.00944138473642801, "grad_norm": 6.968574034942788, "learning_rate": 7.547169811320755e-07, "loss": 1.0558, "step": 3 }, { "epoch": 0.012588512981904013, "grad_norm": 6.851623182903484, "learning_rate": 1.0062893081761007e-06, "loss": 1.0581, "step": 4 }, { "epoch": 0.015735641227380016, "grad_norm": 6.816097196762909, "learning_rate": 1.257861635220126e-06, "loss": 1.0691, "step": 5 }, { "epoch": 0.01888276947285602, "grad_norm": 6.269657845555971, "learning_rate": 1.509433962264151e-06, "loss": 1.0442, "step": 6 }, { "epoch": 0.022029897718332022, "grad_norm": 6.136829210521579, "learning_rate": 1.7610062893081762e-06, "loss": 1.0331, "step": 7 }, { "epoch": 0.025177025963808025, "grad_norm": 4.635434838113676, "learning_rate": 2.0125786163522013e-06, "loss": 0.9988, "step": 8 }, { "epoch": 0.02832415420928403, "grad_norm": 4.321448680290915, "learning_rate": 2.2641509433962266e-06, "loss": 0.9861, "step": 9 }, { "epoch": 0.03147128245476003, "grad_norm": 2.927976062577963, "learning_rate": 2.515723270440252e-06, "loss": 0.9606, "step": 10 }, { "epoch": 0.03461841070023604, "grad_norm": 2.853095690681028, "learning_rate": 2.767295597484277e-06, "loss": 0.9645, "step": 11 }, { "epoch": 0.03776553894571204, "grad_norm": 2.7462594823658737, "learning_rate": 3.018867924528302e-06, "loss": 0.9743, "step": 12 }, { "epoch": 0.040912667191188044, "grad_norm": 3.562428829453312, "learning_rate": 3.270440251572327e-06, "loss": 0.9278, "step": 13 }, { "epoch": 0.044059795436664044, "grad_norm": 4.263574542340476, "learning_rate": 3.5220125786163524e-06, "loss": 0.9396, "step": 14 }, { "epoch": 0.04720692368214005, "grad_norm": 3.985495537747823, "learning_rate": 3.7735849056603777e-06, "loss": 0.9377, "step": 15 }, { "epoch": 0.05035405192761605, "grad_norm": 3.739310444555421, "learning_rate": 4.025157232704403e-06, "loss": 0.9249, "step": 16 }, { "epoch": 0.05350118017309206, "grad_norm": 2.9474915656983605, "learning_rate": 4.276729559748428e-06, "loss": 0.9276, "step": 17 }, { "epoch": 0.05664830841856806, "grad_norm": 2.270408122661505, "learning_rate": 4.528301886792453e-06, "loss": 0.8794, "step": 18 }, { "epoch": 0.05979543666404406, "grad_norm": 2.019432371584577, "learning_rate": 4.779874213836478e-06, "loss": 0.8753, "step": 19 }, { "epoch": 0.06294256490952006, "grad_norm": 1.647937215366361, "learning_rate": 5.031446540880504e-06, "loss": 0.8624, "step": 20 }, { "epoch": 0.06608969315499606, "grad_norm": 1.2550304135134105, "learning_rate": 5.283018867924529e-06, "loss": 0.8442, "step": 21 }, { "epoch": 0.06923682140047208, "grad_norm": 1.1653152615859128, "learning_rate": 5.534591194968554e-06, "loss": 0.8329, "step": 22 }, { "epoch": 0.07238394964594808, "grad_norm": 0.9858900450552275, "learning_rate": 5.786163522012579e-06, "loss": 0.8175, "step": 23 }, { "epoch": 0.07553107789142408, "grad_norm": 1.022081335979346, "learning_rate": 6.037735849056604e-06, "loss": 0.8111, "step": 24 }, { "epoch": 0.07867820613690008, "grad_norm": 0.8753797612953995, "learning_rate": 6.289308176100629e-06, "loss": 0.8, "step": 25 }, { "epoch": 0.08182533438237609, "grad_norm": 0.9128953216524504, "learning_rate": 6.540880503144654e-06, "loss": 0.7959, "step": 26 }, { "epoch": 0.08497246262785209, "grad_norm": 0.977168784939036, "learning_rate": 6.792452830188679e-06, "loss": 0.794, "step": 27 }, { "epoch": 0.08811959087332809, "grad_norm": 0.8642439163002466, "learning_rate": 7.044025157232705e-06, "loss": 0.7739, "step": 28 }, { "epoch": 0.09126671911880409, "grad_norm": 0.7734428298213224, "learning_rate": 7.29559748427673e-06, "loss": 0.7886, "step": 29 }, { "epoch": 0.0944138473642801, "grad_norm": 0.9003447759767896, "learning_rate": 7.5471698113207555e-06, "loss": 0.7766, "step": 30 }, { "epoch": 0.0975609756097561, "grad_norm": 0.9359339373099113, "learning_rate": 7.79874213836478e-06, "loss": 0.7634, "step": 31 }, { "epoch": 0.1007081038552321, "grad_norm": 0.7278099390374022, "learning_rate": 8.050314465408805e-06, "loss": 0.7609, "step": 32 }, { "epoch": 0.1038552321007081, "grad_norm": 0.7509292280032696, "learning_rate": 8.301886792452832e-06, "loss": 0.7633, "step": 33 }, { "epoch": 0.10700236034618411, "grad_norm": 0.7689335379022276, "learning_rate": 8.553459119496857e-06, "loss": 0.7442, "step": 34 }, { "epoch": 0.11014948859166011, "grad_norm": 0.7122656766942688, "learning_rate": 8.805031446540882e-06, "loss": 0.7424, "step": 35 }, { "epoch": 0.11329661683713611, "grad_norm": 0.6581326216708504, "learning_rate": 9.056603773584907e-06, "loss": 0.7356, "step": 36 }, { "epoch": 0.11644374508261211, "grad_norm": 0.6078493790197033, "learning_rate": 9.308176100628931e-06, "loss": 0.7439, "step": 37 }, { "epoch": 0.11959087332808813, "grad_norm": 0.5534379930110489, "learning_rate": 9.559748427672956e-06, "loss": 0.7392, "step": 38 }, { "epoch": 0.12273800157356413, "grad_norm": 0.6054365759524477, "learning_rate": 9.811320754716981e-06, "loss": 0.7362, "step": 39 }, { "epoch": 0.12588512981904013, "grad_norm": 0.6171188975167723, "learning_rate": 1.0062893081761008e-05, "loss": 0.7282, "step": 40 }, { "epoch": 0.12903225806451613, "grad_norm": 0.5954823809501688, "learning_rate": 1.0314465408805033e-05, "loss": 0.7209, "step": 41 }, { "epoch": 0.13217938630999213, "grad_norm": 0.5100657067756379, "learning_rate": 1.0566037735849058e-05, "loss": 0.7276, "step": 42 }, { "epoch": 0.13532651455546812, "grad_norm": 0.5616309144822732, "learning_rate": 1.0817610062893083e-05, "loss": 0.7381, "step": 43 }, { "epoch": 0.13847364280094415, "grad_norm": 0.5909101450008207, "learning_rate": 1.1069182389937107e-05, "loss": 0.7269, "step": 44 }, { "epoch": 0.14162077104642015, "grad_norm": 0.5890949559740806, "learning_rate": 1.1320754716981132e-05, "loss": 0.7096, "step": 45 }, { "epoch": 0.14476789929189615, "grad_norm": 0.5975890175035441, "learning_rate": 1.1572327044025157e-05, "loss": 0.7218, "step": 46 }, { "epoch": 0.14791502753737215, "grad_norm": 0.5619034090767466, "learning_rate": 1.1823899371069182e-05, "loss": 0.7251, "step": 47 }, { "epoch": 0.15106215578284815, "grad_norm": 0.5774632138045557, "learning_rate": 1.2075471698113209e-05, "loss": 0.7137, "step": 48 }, { "epoch": 0.15420928402832415, "grad_norm": 0.5961867782994129, "learning_rate": 1.2327044025157234e-05, "loss": 0.7081, "step": 49 }, { "epoch": 0.15735641227380015, "grad_norm": 0.6521015732677928, "learning_rate": 1.2578616352201259e-05, "loss": 0.6987, "step": 50 }, { "epoch": 0.16050354051927615, "grad_norm": 0.506284287234856, "learning_rate": 1.2830188679245283e-05, "loss": 0.7063, "step": 51 }, { "epoch": 0.16365066876475218, "grad_norm": 0.5516498749754162, "learning_rate": 1.3081761006289308e-05, "loss": 0.6963, "step": 52 }, { "epoch": 0.16679779701022818, "grad_norm": 0.6024465952086706, "learning_rate": 1.3333333333333333e-05, "loss": 0.7056, "step": 53 }, { "epoch": 0.16994492525570418, "grad_norm": 0.5160023824672929, "learning_rate": 1.3584905660377358e-05, "loss": 0.7008, "step": 54 }, { "epoch": 0.17309205350118018, "grad_norm": 0.5791286636248881, "learning_rate": 1.3836477987421383e-05, "loss": 0.6996, "step": 55 }, { "epoch": 0.17623918174665618, "grad_norm": 0.5576591803332213, "learning_rate": 1.408805031446541e-05, "loss": 0.7102, "step": 56 }, { "epoch": 0.17938630999213218, "grad_norm": 0.6330972767133404, "learning_rate": 1.4339622641509435e-05, "loss": 0.6938, "step": 57 }, { "epoch": 0.18253343823760818, "grad_norm": 0.5239745977140408, "learning_rate": 1.459119496855346e-05, "loss": 0.6961, "step": 58 }, { "epoch": 0.18568056648308418, "grad_norm": 0.628316605306883, "learning_rate": 1.4842767295597484e-05, "loss": 0.6902, "step": 59 }, { "epoch": 0.1888276947285602, "grad_norm": 0.5368099681662118, "learning_rate": 1.5094339622641511e-05, "loss": 0.6829, "step": 60 }, { "epoch": 0.1919748229740362, "grad_norm": 0.5734668071542257, "learning_rate": 1.5345911949685536e-05, "loss": 0.6968, "step": 61 }, { "epoch": 0.1951219512195122, "grad_norm": 0.6868079128761005, "learning_rate": 1.559748427672956e-05, "loss": 0.6858, "step": 62 }, { "epoch": 0.1982690794649882, "grad_norm": 0.5683575072486896, "learning_rate": 1.5849056603773586e-05, "loss": 0.6932, "step": 63 }, { "epoch": 0.2014162077104642, "grad_norm": 0.5001123586647835, "learning_rate": 1.610062893081761e-05, "loss": 0.6897, "step": 64 }, { "epoch": 0.2045633359559402, "grad_norm": 0.5621386700847243, "learning_rate": 1.635220125786164e-05, "loss": 0.6874, "step": 65 }, { "epoch": 0.2077104642014162, "grad_norm": 0.5174313413850266, "learning_rate": 1.6603773584905664e-05, "loss": 0.6831, "step": 66 }, { "epoch": 0.2108575924468922, "grad_norm": 0.5217210506959701, "learning_rate": 1.685534591194969e-05, "loss": 0.6687, "step": 67 }, { "epoch": 0.21400472069236823, "grad_norm": 0.5035280266030444, "learning_rate": 1.7106918238993714e-05, "loss": 0.6834, "step": 68 }, { "epoch": 0.21715184893784423, "grad_norm": 0.5678994919125128, "learning_rate": 1.735849056603774e-05, "loss": 0.6951, "step": 69 }, { "epoch": 0.22029897718332023, "grad_norm": 0.5707567615688384, "learning_rate": 1.7610062893081763e-05, "loss": 0.6847, "step": 70 }, { "epoch": 0.22344610542879623, "grad_norm": 0.7809985354857447, "learning_rate": 1.7861635220125788e-05, "loss": 0.6774, "step": 71 }, { "epoch": 0.22659323367427223, "grad_norm": 1.046093110923802, "learning_rate": 1.8113207547169813e-05, "loss": 0.6708, "step": 72 }, { "epoch": 0.22974036191974823, "grad_norm": 1.1135255501368588, "learning_rate": 1.8364779874213838e-05, "loss": 0.6706, "step": 73 }, { "epoch": 0.23288749016522423, "grad_norm": 0.7442706325757177, "learning_rate": 1.8616352201257863e-05, "loss": 0.688, "step": 74 }, { "epoch": 0.23603461841070023, "grad_norm": 0.7511109436645367, "learning_rate": 1.8867924528301888e-05, "loss": 0.6856, "step": 75 }, { "epoch": 0.23918174665617625, "grad_norm": 0.9996568960492868, "learning_rate": 1.9119496855345913e-05, "loss": 0.6743, "step": 76 }, { "epoch": 0.24232887490165225, "grad_norm": 1.0404763602720255, "learning_rate": 1.9371069182389938e-05, "loss": 0.6769, "step": 77 }, { "epoch": 0.24547600314712825, "grad_norm": 0.9916721164476111, "learning_rate": 1.9622641509433963e-05, "loss": 0.6673, "step": 78 }, { "epoch": 0.24862313139260425, "grad_norm": 1.1612341326633233, "learning_rate": 1.9874213836477987e-05, "loss": 0.6669, "step": 79 }, { "epoch": 0.25177025963808025, "grad_norm": 0.759493553538397, "learning_rate": 2.0125786163522016e-05, "loss": 0.6585, "step": 80 }, { "epoch": 0.2549173878835563, "grad_norm": 1.1730448836839678, "learning_rate": 2.037735849056604e-05, "loss": 0.6689, "step": 81 }, { "epoch": 0.25806451612903225, "grad_norm": 1.3145750241994978, "learning_rate": 2.0628930817610066e-05, "loss": 0.6666, "step": 82 }, { "epoch": 0.2612116443745083, "grad_norm": 0.8560831722543999, "learning_rate": 2.088050314465409e-05, "loss": 0.6605, "step": 83 }, { "epoch": 0.26435877261998425, "grad_norm": 1.307515350251066, "learning_rate": 2.1132075471698115e-05, "loss": 0.6777, "step": 84 }, { "epoch": 0.2675059008654603, "grad_norm": 0.7959450309653999, "learning_rate": 2.138364779874214e-05, "loss": 0.6628, "step": 85 }, { "epoch": 0.27065302911093625, "grad_norm": 1.3601493926178994, "learning_rate": 2.1635220125786165e-05, "loss": 0.6684, "step": 86 }, { "epoch": 0.2738001573564123, "grad_norm": 1.0335217441808127, "learning_rate": 2.188679245283019e-05, "loss": 0.6644, "step": 87 }, { "epoch": 0.2769472856018883, "grad_norm": 1.1840289122844756, "learning_rate": 2.2138364779874215e-05, "loss": 0.6621, "step": 88 }, { "epoch": 0.2800944138473643, "grad_norm": 0.8482919292822308, "learning_rate": 2.238993710691824e-05, "loss": 0.6546, "step": 89 }, { "epoch": 0.2832415420928403, "grad_norm": 1.219466252432804, "learning_rate": 2.2641509433962265e-05, "loss": 0.66, "step": 90 }, { "epoch": 0.2863886703383163, "grad_norm": 1.0676120235119377, "learning_rate": 2.289308176100629e-05, "loss": 0.6592, "step": 91 }, { "epoch": 0.2895357985837923, "grad_norm": 1.2108955484478103, "learning_rate": 2.3144654088050315e-05, "loss": 0.6721, "step": 92 }, { "epoch": 0.2926829268292683, "grad_norm": 0.9120063102836191, "learning_rate": 2.339622641509434e-05, "loss": 0.6659, "step": 93 }, { "epoch": 0.2958300550747443, "grad_norm": 1.2734105963895854, "learning_rate": 2.3647798742138364e-05, "loss": 0.6707, "step": 94 }, { "epoch": 0.2989771833202203, "grad_norm": 1.0314294710331138, "learning_rate": 2.3899371069182393e-05, "loss": 0.6717, "step": 95 }, { "epoch": 0.3021243115656963, "grad_norm": 1.1455964773414549, "learning_rate": 2.4150943396226418e-05, "loss": 0.6665, "step": 96 }, { "epoch": 0.30527143981117233, "grad_norm": 1.285838785053339, "learning_rate": 2.4402515723270442e-05, "loss": 0.6639, "step": 97 }, { "epoch": 0.3084185680566483, "grad_norm": 1.263209287431304, "learning_rate": 2.4654088050314467e-05, "loss": 0.6601, "step": 98 }, { "epoch": 0.31156569630212433, "grad_norm": 0.8978757077780872, "learning_rate": 2.4905660377358492e-05, "loss": 0.6545, "step": 99 }, { "epoch": 0.3147128245476003, "grad_norm": 0.9825339416853001, "learning_rate": 2.5157232704402517e-05, "loss": 0.6476, "step": 100 }, { "epoch": 0.31785995279307633, "grad_norm": 1.308507693155568, "learning_rate": 2.5408805031446542e-05, "loss": 0.6689, "step": 101 }, { "epoch": 0.3210070810385523, "grad_norm": 1.1553828783407727, "learning_rate": 2.5660377358490567e-05, "loss": 0.6667, "step": 102 }, { "epoch": 0.3241542092840283, "grad_norm": 1.099078324578191, "learning_rate": 2.5911949685534592e-05, "loss": 0.6485, "step": 103 }, { "epoch": 0.32730133752950435, "grad_norm": 1.0681608956982722, "learning_rate": 2.6163522012578617e-05, "loss": 0.6492, "step": 104 }, { "epoch": 0.3304484657749803, "grad_norm": 1.0413270700584025, "learning_rate": 2.641509433962264e-05, "loss": 0.6565, "step": 105 }, { "epoch": 0.33359559402045635, "grad_norm": 1.1879394567955261, "learning_rate": 2.6666666666666667e-05, "loss": 0.6523, "step": 106 }, { "epoch": 0.3367427222659323, "grad_norm": 1.4725003254144884, "learning_rate": 2.691823899371069e-05, "loss": 0.6556, "step": 107 }, { "epoch": 0.33988985051140835, "grad_norm": 0.6126879058489698, "learning_rate": 2.7169811320754716e-05, "loss": 0.6572, "step": 108 }, { "epoch": 0.3430369787568843, "grad_norm": 1.180095012071221, "learning_rate": 2.742138364779874e-05, "loss": 0.6451, "step": 109 }, { "epoch": 0.34618410700236035, "grad_norm": 1.5775784222686664, "learning_rate": 2.7672955974842766e-05, "loss": 0.6479, "step": 110 }, { "epoch": 0.3493312352478363, "grad_norm": 1.0102816587441463, "learning_rate": 2.7924528301886794e-05, "loss": 0.6537, "step": 111 }, { "epoch": 0.35247836349331235, "grad_norm": 1.844674890611079, "learning_rate": 2.817610062893082e-05, "loss": 0.6632, "step": 112 }, { "epoch": 0.3556254917387884, "grad_norm": 1.1133503786547416, "learning_rate": 2.8427672955974844e-05, "loss": 0.6588, "step": 113 }, { "epoch": 0.35877261998426435, "grad_norm": 2.0384135105494825, "learning_rate": 2.867924528301887e-05, "loss": 0.6566, "step": 114 }, { "epoch": 0.3619197482297404, "grad_norm": 1.6427197756524008, "learning_rate": 2.8930817610062894e-05, "loss": 0.6536, "step": 115 }, { "epoch": 0.36506687647521635, "grad_norm": 1.7204143290151555, "learning_rate": 2.918238993710692e-05, "loss": 0.6516, "step": 116 }, { "epoch": 0.3682140047206924, "grad_norm": 1.6120937604495096, "learning_rate": 2.9433962264150944e-05, "loss": 0.643, "step": 117 }, { "epoch": 0.37136113296616835, "grad_norm": 1.4095110771525443, "learning_rate": 2.968553459119497e-05, "loss": 0.6621, "step": 118 }, { "epoch": 0.3745082612116444, "grad_norm": 1.459313179796121, "learning_rate": 2.9937106918238994e-05, "loss": 0.6594, "step": 119 }, { "epoch": 0.3776553894571204, "grad_norm": 1.0827125230585957, "learning_rate": 3.0188679245283022e-05, "loss": 0.6614, "step": 120 }, { "epoch": 0.3808025177025964, "grad_norm": 1.935573484990545, "learning_rate": 3.044025157232705e-05, "loss": 0.6586, "step": 121 }, { "epoch": 0.3839496459480724, "grad_norm": 1.38748901967822, "learning_rate": 3.069182389937107e-05, "loss": 0.6587, "step": 122 }, { "epoch": 0.3870967741935484, "grad_norm": 1.949825044536692, "learning_rate": 3.09433962264151e-05, "loss": 0.6541, "step": 123 }, { "epoch": 0.3902439024390244, "grad_norm": 1.7165782756639372, "learning_rate": 3.119496855345912e-05, "loss": 0.649, "step": 124 }, { "epoch": 0.3933910306845004, "grad_norm": 1.4753090682690373, "learning_rate": 3.144654088050315e-05, "loss": 0.6472, "step": 125 }, { "epoch": 0.3965381589299764, "grad_norm": 1.7890266274768591, "learning_rate": 3.169811320754717e-05, "loss": 0.6649, "step": 126 }, { "epoch": 0.3996852871754524, "grad_norm": 1.335629902067876, "learning_rate": 3.19496855345912e-05, "loss": 0.6544, "step": 127 }, { "epoch": 0.4028324154209284, "grad_norm": 1.5980486069135038, "learning_rate": 3.220125786163522e-05, "loss": 0.6511, "step": 128 }, { "epoch": 0.40597954366640443, "grad_norm": 1.3401486224120511, "learning_rate": 3.245283018867925e-05, "loss": 0.6435, "step": 129 }, { "epoch": 0.4091266719118804, "grad_norm": 1.5147750007237795, "learning_rate": 3.270440251572328e-05, "loss": 0.6493, "step": 130 }, { "epoch": 0.41227380015735643, "grad_norm": 1.5267838685199988, "learning_rate": 3.29559748427673e-05, "loss": 0.6535, "step": 131 }, { "epoch": 0.4154209284028324, "grad_norm": 1.2875034331430568, "learning_rate": 3.320754716981133e-05, "loss": 0.6434, "step": 132 }, { "epoch": 0.41856805664830843, "grad_norm": 1.6371277196579286, "learning_rate": 3.345911949685535e-05, "loss": 0.6584, "step": 133 }, { "epoch": 0.4217151848937844, "grad_norm": 1.5486461984110311, "learning_rate": 3.371069182389938e-05, "loss": 0.6458, "step": 134 }, { "epoch": 0.42486231313926043, "grad_norm": 1.3892293338547828, "learning_rate": 3.39622641509434e-05, "loss": 0.6447, "step": 135 }, { "epoch": 0.42800944138473646, "grad_norm": 1.6407567446245384, "learning_rate": 3.421383647798743e-05, "loss": 0.651, "step": 136 }, { "epoch": 0.4311565696302124, "grad_norm": 1.1366311200838441, "learning_rate": 3.446540880503145e-05, "loss": 0.6406, "step": 137 }, { "epoch": 0.43430369787568845, "grad_norm": 1.764056894253217, "learning_rate": 3.471698113207548e-05, "loss": 0.649, "step": 138 }, { "epoch": 0.4374508261211644, "grad_norm": 1.2131870613834437, "learning_rate": 3.49685534591195e-05, "loss": 0.6433, "step": 139 }, { "epoch": 0.44059795436664045, "grad_norm": 1.617902481723902, "learning_rate": 3.522012578616353e-05, "loss": 0.6442, "step": 140 }, { "epoch": 0.4437450826121164, "grad_norm": 1.2609898385086604, "learning_rate": 3.547169811320755e-05, "loss": 0.6329, "step": 141 }, { "epoch": 0.44689221085759245, "grad_norm": 1.7132073427376908, "learning_rate": 3.5723270440251577e-05, "loss": 0.6503, "step": 142 }, { "epoch": 0.4500393391030684, "grad_norm": 1.2177364936643433, "learning_rate": 3.59748427672956e-05, "loss": 0.6319, "step": 143 }, { "epoch": 0.45318646734854445, "grad_norm": 1.4831866244496776, "learning_rate": 3.6226415094339626e-05, "loss": 0.6447, "step": 144 }, { "epoch": 0.4563335955940205, "grad_norm": 1.9962471519744966, "learning_rate": 3.6477987421383655e-05, "loss": 0.6449, "step": 145 }, { "epoch": 0.45948072383949645, "grad_norm": 1.1203633745786987, "learning_rate": 3.6729559748427676e-05, "loss": 0.6447, "step": 146 }, { "epoch": 0.4626278520849725, "grad_norm": 1.7428487048510457, "learning_rate": 3.6981132075471704e-05, "loss": 0.6473, "step": 147 }, { "epoch": 0.46577498033044845, "grad_norm": 1.6557025727591974, "learning_rate": 3.7232704402515726e-05, "loss": 0.6521, "step": 148 }, { "epoch": 0.4689221085759245, "grad_norm": 1.633226108310503, "learning_rate": 3.7484276729559754e-05, "loss": 0.6424, "step": 149 }, { "epoch": 0.47206923682140045, "grad_norm": 1.500343658328648, "learning_rate": 3.7735849056603776e-05, "loss": 0.6599, "step": 150 }, { "epoch": 0.4752163650668765, "grad_norm": 1.7811230140847056, "learning_rate": 3.7987421383647804e-05, "loss": 0.6365, "step": 151 }, { "epoch": 0.4783634933123525, "grad_norm": 1.4623016887202551, "learning_rate": 3.8238993710691826e-05, "loss": 0.6484, "step": 152 }, { "epoch": 0.4815106215578285, "grad_norm": 1.4962594002979546, "learning_rate": 3.8490566037735854e-05, "loss": 0.6529, "step": 153 }, { "epoch": 0.4846577498033045, "grad_norm": 1.2968251690411272, "learning_rate": 3.8742138364779875e-05, "loss": 0.6405, "step": 154 }, { "epoch": 0.4878048780487805, "grad_norm": 1.6038094180226874, "learning_rate": 3.8993710691823904e-05, "loss": 0.6465, "step": 155 }, { "epoch": 0.4909520062942565, "grad_norm": 1.349439118443086, "learning_rate": 3.9245283018867925e-05, "loss": 0.6296, "step": 156 }, { "epoch": 0.4940991345397325, "grad_norm": 1.5954765047689583, "learning_rate": 3.9496855345911953e-05, "loss": 0.6464, "step": 157 }, { "epoch": 0.4972462627852085, "grad_norm": 1.2357067055631126, "learning_rate": 3.9748427672955975e-05, "loss": 0.6476, "step": 158 }, { "epoch": 0.5003933910306845, "grad_norm": 2.005111041334352, "learning_rate": 4e-05, "loss": 0.6451, "step": 159 }, { "epoch": 0.5035405192761605, "grad_norm": 1.6640763044427809, "learning_rate": 3.999995146438705e-05, "loss": 0.6457, "step": 160 }, { "epoch": 0.5066876475216365, "grad_norm": 1.6470772155507238, "learning_rate": 3.999980585778375e-05, "loss": 0.6307, "step": 161 }, { "epoch": 0.5098347757671126, "grad_norm": 1.5584873476563954, "learning_rate": 3.999956318089682e-05, "loss": 0.6365, "step": 162 }, { "epoch": 0.5129819040125885, "grad_norm": 0.9821030562408402, "learning_rate": 3.9999223434904104e-05, "loss": 0.6618, "step": 163 }, { "epoch": 0.5161290322580645, "grad_norm": 2.065476843178622, "learning_rate": 3.9998786621454584e-05, "loss": 0.6486, "step": 164 }, { "epoch": 0.5192761605035405, "grad_norm": 0.9162285189594601, "learning_rate": 3.999825274266836e-05, "loss": 0.6344, "step": 165 }, { "epoch": 0.5224232887490166, "grad_norm": 1.817254046632479, "learning_rate": 3.9997621801136645e-05, "loss": 0.6578, "step": 166 }, { "epoch": 0.5255704169944925, "grad_norm": 1.6213131489050836, "learning_rate": 3.999689379992174e-05, "loss": 0.6478, "step": 167 }, { "epoch": 0.5287175452399685, "grad_norm": 1.5191822069477583, "learning_rate": 3.9996068742557065e-05, "loss": 0.634, "step": 168 }, { "epoch": 0.5318646734854445, "grad_norm": 1.3625533453285046, "learning_rate": 3.999514663304708e-05, "loss": 0.6416, "step": 169 }, { "epoch": 0.5350118017309206, "grad_norm": 1.7443314517509239, "learning_rate": 3.999412747586729e-05, "loss": 0.6323, "step": 170 }, { "epoch": 0.5381589299763966, "grad_norm": 1.2764075205223595, "learning_rate": 3.999301127596425e-05, "loss": 0.6389, "step": 171 }, { "epoch": 0.5413060582218725, "grad_norm": 1.752440457864074, "learning_rate": 3.9991798038755484e-05, "loss": 0.6419, "step": 172 }, { "epoch": 0.5444531864673485, "grad_norm": 1.435326730571326, "learning_rate": 3.999048777012953e-05, "loss": 0.6285, "step": 173 }, { "epoch": 0.5476003147128246, "grad_norm": 1.9146516602093107, "learning_rate": 3.998908047644587e-05, "loss": 0.6471, "step": 174 }, { "epoch": 0.5507474429583006, "grad_norm": 1.6167083359019538, "learning_rate": 3.998757616453486e-05, "loss": 0.6403, "step": 175 }, { "epoch": 0.5538945712037766, "grad_norm": 1.4023315753110592, "learning_rate": 3.998597484169779e-05, "loss": 0.6396, "step": 176 }, { "epoch": 0.5570416994492525, "grad_norm": 1.3268570022647785, "learning_rate": 3.9984276515706764e-05, "loss": 0.6381, "step": 177 }, { "epoch": 0.5601888276947286, "grad_norm": 1.3545177342692951, "learning_rate": 3.998248119480473e-05, "loss": 0.6498, "step": 178 }, { "epoch": 0.5633359559402046, "grad_norm": 1.0685787928581592, "learning_rate": 3.998058888770537e-05, "loss": 0.6474, "step": 179 }, { "epoch": 0.5664830841856806, "grad_norm": 1.5577581689973945, "learning_rate": 3.997859960359313e-05, "loss": 0.6294, "step": 180 }, { "epoch": 0.5696302124311565, "grad_norm": 1.5970031198965817, "learning_rate": 3.997651335212311e-05, "loss": 0.6228, "step": 181 }, { "epoch": 0.5727773406766326, "grad_norm": 1.0686347825691989, "learning_rate": 3.997433014342106e-05, "loss": 0.6153, "step": 182 }, { "epoch": 0.5759244689221086, "grad_norm": 1.1929803859988213, "learning_rate": 3.9972049988083323e-05, "loss": 0.65, "step": 183 }, { "epoch": 0.5790715971675846, "grad_norm": 1.5562626483329678, "learning_rate": 3.9969672897176764e-05, "loss": 0.6256, "step": 184 }, { "epoch": 0.5822187254130606, "grad_norm": 1.05136149985961, "learning_rate": 3.996719888223875e-05, "loss": 0.6324, "step": 185 }, { "epoch": 0.5853658536585366, "grad_norm": 1.0205341272287771, "learning_rate": 3.996462795527706e-05, "loss": 0.6452, "step": 186 }, { "epoch": 0.5885129819040126, "grad_norm": 1.3028420216806813, "learning_rate": 3.996196012876984e-05, "loss": 0.6371, "step": 187 }, { "epoch": 0.5916601101494886, "grad_norm": 1.4324067360751134, "learning_rate": 3.995919541566555e-05, "loss": 0.6432, "step": 188 }, { "epoch": 0.5948072383949646, "grad_norm": 1.2819517544027286, "learning_rate": 3.995633382938291e-05, "loss": 0.6261, "step": 189 }, { "epoch": 0.5979543666404405, "grad_norm": 0.9819032579492132, "learning_rate": 3.995337538381079e-05, "loss": 0.6347, "step": 190 }, { "epoch": 0.6011014948859166, "grad_norm": 1.2953786226264292, "learning_rate": 3.9950320093308185e-05, "loss": 0.6358, "step": 191 }, { "epoch": 0.6042486231313926, "grad_norm": 1.2056447304755342, "learning_rate": 3.994716797270414e-05, "loss": 0.6316, "step": 192 }, { "epoch": 0.6073957513768686, "grad_norm": 1.508482407225396, "learning_rate": 3.9943919037297674e-05, "loss": 0.6333, "step": 193 }, { "epoch": 0.6105428796223447, "grad_norm": 1.2574626256613217, "learning_rate": 3.9940573302857675e-05, "loss": 0.6382, "step": 194 }, { "epoch": 0.6136900078678206, "grad_norm": 1.3371901111268312, "learning_rate": 3.993713078562288e-05, "loss": 0.6326, "step": 195 }, { "epoch": 0.6168371361132966, "grad_norm": 1.8420742844545008, "learning_rate": 3.993359150230177e-05, "loss": 0.6378, "step": 196 }, { "epoch": 0.6199842643587726, "grad_norm": 1.0329114512418138, "learning_rate": 3.992995547007245e-05, "loss": 0.6264, "step": 197 }, { "epoch": 0.6231313926042487, "grad_norm": 2.5550436716710894, "learning_rate": 3.992622270658264e-05, "loss": 0.6284, "step": 198 }, { "epoch": 0.6262785208497246, "grad_norm": 1.6496939828823534, "learning_rate": 3.992239322994953e-05, "loss": 0.6328, "step": 199 }, { "epoch": 0.6294256490952006, "grad_norm": 2.1055055233410798, "learning_rate": 3.991846705875973e-05, "loss": 0.6387, "step": 200 }, { "epoch": 0.6325727773406766, "grad_norm": 1.8595829111365227, "learning_rate": 3.9914444212069144e-05, "loss": 0.6352, "step": 201 }, { "epoch": 0.6357199055861527, "grad_norm": 1.8668994107558041, "learning_rate": 3.99103247094029e-05, "loss": 0.637, "step": 202 }, { "epoch": 0.6388670338316287, "grad_norm": 1.7037836647511615, "learning_rate": 3.990610857075527e-05, "loss": 0.6343, "step": 203 }, { "epoch": 0.6420141620771046, "grad_norm": 1.38923058159769, "learning_rate": 3.990179581658953e-05, "loss": 0.6325, "step": 204 }, { "epoch": 0.6451612903225806, "grad_norm": 1.7563051190358938, "learning_rate": 3.98973864678379e-05, "loss": 0.6249, "step": 205 }, { "epoch": 0.6483084185680567, "grad_norm": 1.005439976834508, "learning_rate": 3.9892880545901436e-05, "loss": 0.6218, "step": 206 }, { "epoch": 0.6514555468135327, "grad_norm": 2.054893019034046, "learning_rate": 3.988827807264989e-05, "loss": 0.6367, "step": 207 }, { "epoch": 0.6546026750590087, "grad_norm": 1.2789629172418557, "learning_rate": 3.988357907042165e-05, "loss": 0.6426, "step": 208 }, { "epoch": 0.6577498033044846, "grad_norm": 2.517606786751749, "learning_rate": 3.9878783562023615e-05, "loss": 0.644, "step": 209 }, { "epoch": 0.6608969315499607, "grad_norm": 2.1032212837598365, "learning_rate": 3.987389157073108e-05, "loss": 0.6498, "step": 210 }, { "epoch": 0.6640440597954367, "grad_norm": 1.6077441283045066, "learning_rate": 3.986890312028763e-05, "loss": 0.6318, "step": 211 }, { "epoch": 0.6671911880409127, "grad_norm": 1.3352828378955408, "learning_rate": 3.9863818234904996e-05, "loss": 0.6394, "step": 212 }, { "epoch": 0.6703383162863886, "grad_norm": 1.6466476954422824, "learning_rate": 3.985863693926301e-05, "loss": 0.6412, "step": 213 }, { "epoch": 0.6734854445318647, "grad_norm": 1.2055185581585135, "learning_rate": 3.9853359258509375e-05, "loss": 0.6377, "step": 214 }, { "epoch": 0.6766325727773407, "grad_norm": 1.8223991680747094, "learning_rate": 3.984798521825966e-05, "loss": 0.6323, "step": 215 }, { "epoch": 0.6797797010228167, "grad_norm": 1.4244972280249446, "learning_rate": 3.9842514844597106e-05, "loss": 0.6411, "step": 216 }, { "epoch": 0.6829268292682927, "grad_norm": 1.810262815503657, "learning_rate": 3.983694816407248e-05, "loss": 0.6287, "step": 217 }, { "epoch": 0.6860739575137687, "grad_norm": 1.584327074252924, "learning_rate": 3.983128520370403e-05, "loss": 0.6226, "step": 218 }, { "epoch": 0.6892210857592447, "grad_norm": 1.4679159722861652, "learning_rate": 3.982552599097727e-05, "loss": 0.629, "step": 219 }, { "epoch": 0.6923682140047207, "grad_norm": 1.4505081690005162, "learning_rate": 3.9819670553844885e-05, "loss": 0.6337, "step": 220 }, { "epoch": 0.6955153422501967, "grad_norm": 1.3729339166204286, "learning_rate": 3.981371892072661e-05, "loss": 0.6241, "step": 221 }, { "epoch": 0.6986624704956726, "grad_norm": 1.2886236495973369, "learning_rate": 3.9807671120509074e-05, "loss": 0.6354, "step": 222 }, { "epoch": 0.7018095987411487, "grad_norm": 1.2379553349269816, "learning_rate": 3.9801527182545624e-05, "loss": 0.625, "step": 223 }, { "epoch": 0.7049567269866247, "grad_norm": 1.2249283736735637, "learning_rate": 3.979528713665624e-05, "loss": 0.6301, "step": 224 }, { "epoch": 0.7081038552321007, "grad_norm": 1.336253285005203, "learning_rate": 3.978895101312738e-05, "loss": 0.6292, "step": 225 }, { "epoch": 0.7112509834775768, "grad_norm": 1.1205953980444752, "learning_rate": 3.9782518842711795e-05, "loss": 0.626, "step": 226 }, { "epoch": 0.7143981117230527, "grad_norm": 1.7027386427471345, "learning_rate": 3.977599065662843e-05, "loss": 0.6246, "step": 227 }, { "epoch": 0.7175452399685287, "grad_norm": 1.1291882936174378, "learning_rate": 3.976936648656223e-05, "loss": 0.6282, "step": 228 }, { "epoch": 0.7206923682140047, "grad_norm": 1.2813572025478552, "learning_rate": 3.976264636466401e-05, "loss": 0.6271, "step": 229 }, { "epoch": 0.7238394964594808, "grad_norm": 1.310136883140352, "learning_rate": 3.97558303235503e-05, "loss": 0.6299, "step": 230 }, { "epoch": 0.7269866247049567, "grad_norm": 1.234929455560365, "learning_rate": 3.9748918396303166e-05, "loss": 0.6273, "step": 231 }, { "epoch": 0.7301337529504327, "grad_norm": 1.152399890155409, "learning_rate": 3.974191061647007e-05, "loss": 0.6364, "step": 232 }, { "epoch": 0.7332808811959087, "grad_norm": 0.9820218267166709, "learning_rate": 3.973480701806371e-05, "loss": 0.6081, "step": 233 }, { "epoch": 0.7364280094413848, "grad_norm": 1.437527267365468, "learning_rate": 3.972760763556183e-05, "loss": 0.6335, "step": 234 }, { "epoch": 0.7395751376868608, "grad_norm": 1.2471735910370214, "learning_rate": 3.972031250390707e-05, "loss": 0.6245, "step": 235 }, { "epoch": 0.7427222659323367, "grad_norm": 0.7783741133192488, "learning_rate": 3.97129216585068e-05, "loss": 0.6174, "step": 236 }, { "epoch": 0.7458693941778127, "grad_norm": 1.422537426798629, "learning_rate": 3.9705435135232954e-05, "loss": 0.6259, "step": 237 }, { "epoch": 0.7490165224232888, "grad_norm": 1.1344073454119308, "learning_rate": 3.9697852970421816e-05, "loss": 0.6156, "step": 238 }, { "epoch": 0.7521636506687648, "grad_norm": 0.8130085642709579, "learning_rate": 3.96901752008739e-05, "loss": 0.6235, "step": 239 }, { "epoch": 0.7553107789142408, "grad_norm": 1.506021663227266, "learning_rate": 3.968240186385372e-05, "loss": 0.6308, "step": 240 }, { "epoch": 0.7584579071597167, "grad_norm": 0.8828709445930414, "learning_rate": 3.967453299708965e-05, "loss": 0.6249, "step": 241 }, { "epoch": 0.7616050354051928, "grad_norm": 1.1195490114701467, "learning_rate": 3.966656863877371e-05, "loss": 0.6222, "step": 242 }, { "epoch": 0.7647521636506688, "grad_norm": 1.159507235093096, "learning_rate": 3.965850882756141e-05, "loss": 0.6117, "step": 243 }, { "epoch": 0.7678992918961448, "grad_norm": 1.3618433823111082, "learning_rate": 3.9650353602571535e-05, "loss": 0.6164, "step": 244 }, { "epoch": 0.7710464201416207, "grad_norm": 1.4260909924046845, "learning_rate": 3.9642103003385976e-05, "loss": 0.6322, "step": 245 }, { "epoch": 0.7741935483870968, "grad_norm": 0.8960267340901048, "learning_rate": 3.963375707004951e-05, "loss": 0.6212, "step": 246 }, { "epoch": 0.7773406766325728, "grad_norm": 1.9505783063886386, "learning_rate": 3.9625315843069635e-05, "loss": 0.6272, "step": 247 }, { "epoch": 0.7804878048780488, "grad_norm": 1.060698991962841, "learning_rate": 3.9616779363416375e-05, "loss": 0.627, "step": 248 }, { "epoch": 0.7836349331235248, "grad_norm": 2.651158418118449, "learning_rate": 3.9608147672522056e-05, "loss": 0.632, "step": 249 }, { "epoch": 0.7867820613690008, "grad_norm": 1.9829172683055232, "learning_rate": 3.959942081228111e-05, "loss": 0.6334, "step": 250 }, { "epoch": 0.7899291896144768, "grad_norm": 2.0043712787473855, "learning_rate": 3.9590598825049896e-05, "loss": 0.6347, "step": 251 }, { "epoch": 0.7930763178599528, "grad_norm": 1.5914049220753375, "learning_rate": 3.958168175364646e-05, "loss": 0.6291, "step": 252 }, { "epoch": 0.7962234461054288, "grad_norm": 2.044617279526593, "learning_rate": 3.9572669641350366e-05, "loss": 0.6227, "step": 253 }, { "epoch": 0.7993705743509048, "grad_norm": 1.4921666169771846, "learning_rate": 3.956356253190245e-05, "loss": 0.6287, "step": 254 }, { "epoch": 0.8025177025963808, "grad_norm": 2.1859673527222374, "learning_rate": 3.9554360469504616e-05, "loss": 0.6349, "step": 255 }, { "epoch": 0.8056648308418568, "grad_norm": 1.8780839152913937, "learning_rate": 3.9545063498819655e-05, "loss": 0.6322, "step": 256 }, { "epoch": 0.8088119590873328, "grad_norm": 1.8855386967686778, "learning_rate": 3.9535671664970976e-05, "loss": 0.6236, "step": 257 }, { "epoch": 0.8119590873328089, "grad_norm": 1.57211297687127, "learning_rate": 3.952618501354241e-05, "loss": 0.6266, "step": 258 }, { "epoch": 0.8151062155782848, "grad_norm": 1.6840002131557807, "learning_rate": 3.951660359057802e-05, "loss": 0.6348, "step": 259 }, { "epoch": 0.8182533438237608, "grad_norm": 1.4505249387670598, "learning_rate": 3.9506927442581816e-05, "loss": 0.6185, "step": 260 }, { "epoch": 0.8214004720692368, "grad_norm": 1.6387407896735724, "learning_rate": 3.9497156616517584e-05, "loss": 0.6275, "step": 261 }, { "epoch": 0.8245476003147129, "grad_norm": 1.4191671751611528, "learning_rate": 3.948729115980862e-05, "loss": 0.618, "step": 262 }, { "epoch": 0.8276947285601888, "grad_norm": 1.6165413779568125, "learning_rate": 3.947733112033753e-05, "loss": 0.6261, "step": 263 }, { "epoch": 0.8308418568056648, "grad_norm": 1.3413248663670272, "learning_rate": 3.946727654644597e-05, "loss": 0.6129, "step": 264 }, { "epoch": 0.8339889850511408, "grad_norm": 1.4534436081435673, "learning_rate": 3.945712748693443e-05, "loss": 0.6156, "step": 265 }, { "epoch": 0.8371361132966169, "grad_norm": 1.2334383145263632, "learning_rate": 3.9446883991062e-05, "loss": 0.6197, "step": 266 }, { "epoch": 0.8402832415420929, "grad_norm": 1.750270105689965, "learning_rate": 3.94365461085461e-05, "loss": 0.6061, "step": 267 }, { "epoch": 0.8434303697875688, "grad_norm": 1.5161125269771183, "learning_rate": 3.94261138895623e-05, "loss": 0.6227, "step": 268 }, { "epoch": 0.8465774980330448, "grad_norm": 1.5221806720588753, "learning_rate": 3.9415587384744e-05, "loss": 0.6323, "step": 269 }, { "epoch": 0.8497246262785209, "grad_norm": 1.3897427486795066, "learning_rate": 3.940496664518223e-05, "loss": 0.6207, "step": 270 }, { "epoch": 0.8528717545239969, "grad_norm": 1.4191581228390167, "learning_rate": 3.939425172242541e-05, "loss": 0.6068, "step": 271 }, { "epoch": 0.8560188827694729, "grad_norm": 1.0898432509550877, "learning_rate": 3.9383442668479074e-05, "loss": 0.6194, "step": 272 }, { "epoch": 0.8591660110149488, "grad_norm": 1.729898493997823, "learning_rate": 3.937253953580562e-05, "loss": 0.621, "step": 273 }, { "epoch": 0.8623131392604249, "grad_norm": 1.3978303034299537, "learning_rate": 3.936154237732409e-05, "loss": 0.6143, "step": 274 }, { "epoch": 0.8654602675059009, "grad_norm": 1.6560127678824332, "learning_rate": 3.935045124640985e-05, "loss": 0.6128, "step": 275 }, { "epoch": 0.8686073957513769, "grad_norm": 1.5405477384904165, "learning_rate": 3.933926619689438e-05, "loss": 0.6227, "step": 276 }, { "epoch": 0.8717545239968528, "grad_norm": 1.232950563952649, "learning_rate": 3.932798728306502e-05, "loss": 0.6166, "step": 277 }, { "epoch": 0.8749016522423289, "grad_norm": 1.3226181835548478, "learning_rate": 3.931661455966465e-05, "loss": 0.6093, "step": 278 }, { "epoch": 0.8780487804878049, "grad_norm": 1.217634421516471, "learning_rate": 3.930514808189149e-05, "loss": 0.6155, "step": 279 }, { "epoch": 0.8811959087332809, "grad_norm": 1.0114325069677168, "learning_rate": 3.929358790539881e-05, "loss": 0.6253, "step": 280 }, { "epoch": 0.8843430369787569, "grad_norm": 1.1140247331568867, "learning_rate": 3.92819340862946e-05, "loss": 0.6239, "step": 281 }, { "epoch": 0.8874901652242329, "grad_norm": 1.0564363406645432, "learning_rate": 3.927018668114141e-05, "loss": 0.6211, "step": 282 }, { "epoch": 0.8906372934697089, "grad_norm": 1.4079391384169502, "learning_rate": 3.925834574695599e-05, "loss": 0.6182, "step": 283 }, { "epoch": 0.8937844217151849, "grad_norm": 1.039379605395923, "learning_rate": 3.924641134120903e-05, "loss": 0.6221, "step": 284 }, { "epoch": 0.8969315499606609, "grad_norm": 1.2213803291786665, "learning_rate": 3.9234383521824905e-05, "loss": 0.6161, "step": 285 }, { "epoch": 0.9000786782061369, "grad_norm": 1.0145611815593831, "learning_rate": 3.922226234718137e-05, "loss": 0.6148, "step": 286 }, { "epoch": 0.9032258064516129, "grad_norm": 1.437050058531951, "learning_rate": 3.92100478761093e-05, "loss": 0.6138, "step": 287 }, { "epoch": 0.9063729346970889, "grad_norm": 0.9288489831721957, "learning_rate": 3.919774016789237e-05, "loss": 0.621, "step": 288 }, { "epoch": 0.9095200629425649, "grad_norm": 1.2266763831925402, "learning_rate": 3.918533928226679e-05, "loss": 0.6156, "step": 289 }, { "epoch": 0.912667191188041, "grad_norm": 0.9671793379615254, "learning_rate": 3.917284527942103e-05, "loss": 0.6167, "step": 290 }, { "epoch": 0.9158143194335169, "grad_norm": 1.244082215609862, "learning_rate": 3.91602582199955e-05, "loss": 0.6161, "step": 291 }, { "epoch": 0.9189614476789929, "grad_norm": 1.1463688074695213, "learning_rate": 3.914757816508225e-05, "loss": 0.614, "step": 292 }, { "epoch": 0.9221085759244689, "grad_norm": 1.2261559130019553, "learning_rate": 3.913480517622472e-05, "loss": 0.6208, "step": 293 }, { "epoch": 0.925255704169945, "grad_norm": 1.0646827207279455, "learning_rate": 3.9121939315417386e-05, "loss": 0.6286, "step": 294 }, { "epoch": 0.9284028324154209, "grad_norm": 1.0539629025664061, "learning_rate": 3.910898064510549e-05, "loss": 0.6218, "step": 295 }, { "epoch": 0.9315499606608969, "grad_norm": 0.9722762493322944, "learning_rate": 3.909592922818474e-05, "loss": 0.621, "step": 296 }, { "epoch": 0.9346970889063729, "grad_norm": 1.0701286553105696, "learning_rate": 3.908278512800098e-05, "loss": 0.6215, "step": 297 }, { "epoch": 0.937844217151849, "grad_norm": 1.118122321373602, "learning_rate": 3.906954840834991e-05, "loss": 0.6214, "step": 298 }, { "epoch": 0.940991345397325, "grad_norm": 0.8652483960951545, "learning_rate": 3.9056219133476766e-05, "loss": 0.6199, "step": 299 }, { "epoch": 0.9441384736428009, "grad_norm": 1.0343112033921007, "learning_rate": 3.904279736807599e-05, "loss": 0.6104, "step": 300 }, { "epoch": 0.9472856018882769, "grad_norm": 0.9496922262871718, "learning_rate": 3.9029283177290944e-05, "loss": 0.6081, "step": 301 }, { "epoch": 0.950432730133753, "grad_norm": 0.9371417223191364, "learning_rate": 3.901567662671359e-05, "loss": 0.6182, "step": 302 }, { "epoch": 0.953579858379229, "grad_norm": 1.1256246896638376, "learning_rate": 3.9001977782384154e-05, "loss": 0.6272, "step": 303 }, { "epoch": 0.956726986624705, "grad_norm": 1.1988543784517776, "learning_rate": 3.898818671079081e-05, "loss": 0.6193, "step": 304 }, { "epoch": 0.9598741148701809, "grad_norm": 0.6640888343950058, "learning_rate": 3.897430347886937e-05, "loss": 0.623, "step": 305 }, { "epoch": 0.963021243115657, "grad_norm": 1.0259693265720489, "learning_rate": 3.896032815400295e-05, "loss": 0.6061, "step": 306 }, { "epoch": 0.966168371361133, "grad_norm": 1.6687489855665556, "learning_rate": 3.894626080402166e-05, "loss": 0.6158, "step": 307 }, { "epoch": 0.969315499606609, "grad_norm": 0.7655240494240164, "learning_rate": 3.893210149720222e-05, "loss": 0.6135, "step": 308 }, { "epoch": 0.9724626278520849, "grad_norm": 1.6527270765731679, "learning_rate": 3.8917850302267724e-05, "loss": 0.6214, "step": 309 }, { "epoch": 0.975609756097561, "grad_norm": 0.9742997284638653, "learning_rate": 3.890350728838719e-05, "loss": 0.6098, "step": 310 }, { "epoch": 0.978756884343037, "grad_norm": 1.2188875317692218, "learning_rate": 3.888907252517534e-05, "loss": 0.6078, "step": 311 }, { "epoch": 0.981904012588513, "grad_norm": 1.4090784716524412, "learning_rate": 3.887454608269217e-05, "loss": 0.6148, "step": 312 }, { "epoch": 0.985051140833989, "grad_norm": 0.8677610975018318, "learning_rate": 3.885992803144266e-05, "loss": 0.6283, "step": 313 }, { "epoch": 0.988198269079465, "grad_norm": 1.2008659411405975, "learning_rate": 3.8845218442376416e-05, "loss": 0.6179, "step": 314 }, { "epoch": 0.991345397324941, "grad_norm": 1.181118863273738, "learning_rate": 3.883041738688733e-05, "loss": 0.6138, "step": 315 }, { "epoch": 0.994492525570417, "grad_norm": 0.8124102237663046, "learning_rate": 3.8815524936813236e-05, "loss": 0.6024, "step": 316 }, { "epoch": 0.997639653815893, "grad_norm": 1.0702432156884507, "learning_rate": 3.880054116443556e-05, "loss": 0.6101, "step": 317 }, { "epoch": 1.002360346184107, "grad_norm": 2.065753171430859, "learning_rate": 3.878546614247894e-05, "loss": 1.027, "step": 318 }, { "epoch": 1.005507474429583, "grad_norm": 0.8860268190442901, "learning_rate": 3.8770299944110934e-05, "loss": 0.5977, "step": 319 }, { "epoch": 1.008654602675059, "grad_norm": 0.8640393060499529, "learning_rate": 3.875504264294161e-05, "loss": 0.5814, "step": 320 }, { "epoch": 1.011801730920535, "grad_norm": 1.0567074386948476, "learning_rate": 3.873969431302322e-05, "loss": 0.5838, "step": 321 }, { "epoch": 1.014948859166011, "grad_norm": 1.2201467147928156, "learning_rate": 3.872425502884982e-05, "loss": 0.5616, "step": 322 }, { "epoch": 1.018095987411487, "grad_norm": 1.2298258787473502, "learning_rate": 3.870872486535692e-05, "loss": 0.605, "step": 323 }, { "epoch": 1.021243115656963, "grad_norm": 0.9570547295059392, "learning_rate": 3.869310389792113e-05, "loss": 0.5571, "step": 324 }, { "epoch": 1.024390243902439, "grad_norm": 0.910399791203508, "learning_rate": 3.867739220235976e-05, "loss": 0.5803, "step": 325 }, { "epoch": 1.027537372147915, "grad_norm": 1.1255287832975964, "learning_rate": 3.866158985493049e-05, "loss": 0.5735, "step": 326 }, { "epoch": 1.030684500393391, "grad_norm": 1.2218937018260718, "learning_rate": 3.864569693233099e-05, "loss": 0.5839, "step": 327 }, { "epoch": 1.0338316286388671, "grad_norm": 0.879516220387298, "learning_rate": 3.862971351169853e-05, "loss": 0.5897, "step": 328 }, { "epoch": 1.036978756884343, "grad_norm": 0.9597145092489864, "learning_rate": 3.8613639670609624e-05, "loss": 0.5539, "step": 329 }, { "epoch": 1.040125885129819, "grad_norm": 1.6528619139400769, "learning_rate": 3.859747548707963e-05, "loss": 0.6214, "step": 330 }, { "epoch": 1.043273013375295, "grad_norm": 0.8058486040606079, "learning_rate": 3.858122103956242e-05, "loss": 0.5468, "step": 331 }, { "epoch": 1.046420141620771, "grad_norm": 1.623522624231283, "learning_rate": 3.8564876406949954e-05, "loss": 0.6004, "step": 332 }, { "epoch": 1.049567269866247, "grad_norm": 1.088449667069518, "learning_rate": 3.85484416685719e-05, "loss": 0.6047, "step": 333 }, { "epoch": 1.052714398111723, "grad_norm": 1.2546950767006624, "learning_rate": 3.8531916904195264e-05, "loss": 0.5485, "step": 334 }, { "epoch": 1.055861526357199, "grad_norm": 1.1169301229814428, "learning_rate": 3.851530219402401e-05, "loss": 0.6023, "step": 335 }, { "epoch": 1.0590086546026751, "grad_norm": 1.5613063395853726, "learning_rate": 3.8498597618698644e-05, "loss": 0.5753, "step": 336 }, { "epoch": 1.0621557828481512, "grad_norm": 1.0603407240619633, "learning_rate": 3.848180325929586e-05, "loss": 0.6027, "step": 337 }, { "epoch": 1.0653029110936272, "grad_norm": 1.9784565816301807, "learning_rate": 3.8464919197328106e-05, "loss": 0.563, "step": 338 }, { "epoch": 1.068450039339103, "grad_norm": 1.6560420050349316, "learning_rate": 3.84479455147432e-05, "loss": 0.6156, "step": 339 }, { "epoch": 1.071597167584579, "grad_norm": 1.3178042643700982, "learning_rate": 3.8430882293923966e-05, "loss": 0.5593, "step": 340 }, { "epoch": 1.074744295830055, "grad_norm": 1.3767031608324611, "learning_rate": 3.8413729617687786e-05, "loss": 0.6085, "step": 341 }, { "epoch": 1.077891424075531, "grad_norm": 1.0336068597716115, "learning_rate": 3.839648756928622e-05, "loss": 0.5806, "step": 342 }, { "epoch": 1.081038552321007, "grad_norm": 1.3254526922251793, "learning_rate": 3.837915623240462e-05, "loss": 0.5496, "step": 343 }, { "epoch": 1.0841856805664831, "grad_norm": 0.8088669835406253, "learning_rate": 3.836173569116167e-05, "loss": 0.6009, "step": 344 }, { "epoch": 1.0873328088119592, "grad_norm": 1.2752754683882992, "learning_rate": 3.834422603010906e-05, "loss": 0.5966, "step": 345 }, { "epoch": 1.0904799370574352, "grad_norm": 0.9879821313465822, "learning_rate": 3.8326627334230995e-05, "loss": 0.5685, "step": 346 }, { "epoch": 1.0936270653029112, "grad_norm": 0.957023686339743, "learning_rate": 3.830893968894382e-05, "loss": 0.5855, "step": 347 }, { "epoch": 1.096774193548387, "grad_norm": 0.8904028341609422, "learning_rate": 3.82911631800956e-05, "loss": 0.582, "step": 348 }, { "epoch": 1.099921321793863, "grad_norm": 0.9097097133530387, "learning_rate": 3.8273297893965725e-05, "loss": 0.5885, "step": 349 }, { "epoch": 1.103068450039339, "grad_norm": 0.8599788576525904, "learning_rate": 3.825534391726444e-05, "loss": 0.5842, "step": 350 }, { "epoch": 1.106215578284815, "grad_norm": 0.9167821894761861, "learning_rate": 3.823730133713248e-05, "loss": 0.5778, "step": 351 }, { "epoch": 1.1093627065302911, "grad_norm": 1.1530087162130414, "learning_rate": 3.821917024114061e-05, "loss": 0.5544, "step": 352 }, { "epoch": 1.1125098347757671, "grad_norm": 0.8086357525381911, "learning_rate": 3.820095071728923e-05, "loss": 0.5976, "step": 353 }, { "epoch": 1.1156569630212432, "grad_norm": 0.7747888713394967, "learning_rate": 3.81826428540079e-05, "loss": 0.6041, "step": 354 }, { "epoch": 1.1188040912667192, "grad_norm": 0.7508946410062841, "learning_rate": 3.816424674015497e-05, "loss": 0.5864, "step": 355 }, { "epoch": 1.1219512195121952, "grad_norm": 0.7959382362736754, "learning_rate": 3.814576246501708e-05, "loss": 0.5587, "step": 356 }, { "epoch": 1.125098347757671, "grad_norm": 0.685910628438955, "learning_rate": 3.812719011830882e-05, "loss": 0.5713, "step": 357 }, { "epoch": 1.128245476003147, "grad_norm": 0.7392010420761511, "learning_rate": 3.8108529790172204e-05, "loss": 0.5905, "step": 358 }, { "epoch": 1.131392604248623, "grad_norm": 0.6668167046568578, "learning_rate": 3.8089781571176275e-05, "loss": 0.5751, "step": 359 }, { "epoch": 1.1345397324940991, "grad_norm": 0.8000258627134674, "learning_rate": 3.8070945552316674e-05, "loss": 0.5621, "step": 360 }, { "epoch": 1.1376868607395751, "grad_norm": 0.7389373221950687, "learning_rate": 3.805202182501516e-05, "loss": 0.5529, "step": 361 }, { "epoch": 1.1408339889850512, "grad_norm": 0.769018684280187, "learning_rate": 3.80330104811192e-05, "loss": 0.6046, "step": 362 }, { "epoch": 1.1439811172305272, "grad_norm": 0.7301516878023783, "learning_rate": 3.801391161290153e-05, "loss": 0.5652, "step": 363 }, { "epoch": 1.1471282454760032, "grad_norm": 0.8229419243343139, "learning_rate": 3.7994725313059676e-05, "loss": 0.5792, "step": 364 }, { "epoch": 1.1502753737214793, "grad_norm": 0.6680976797053321, "learning_rate": 3.797545167471552e-05, "loss": 0.5741, "step": 365 }, { "epoch": 1.153422501966955, "grad_norm": 0.938538926694488, "learning_rate": 3.795609079141484e-05, "loss": 0.568, "step": 366 }, { "epoch": 1.156569630212431, "grad_norm": 1.2798546823891832, "learning_rate": 3.793664275712687e-05, "loss": 0.5779, "step": 367 }, { "epoch": 1.1597167584579071, "grad_norm": 0.5005920128387191, "learning_rate": 3.791710766624384e-05, "loss": 0.5567, "step": 368 }, { "epoch": 1.1628638867033831, "grad_norm": 1.3328977261494417, "learning_rate": 3.7897485613580516e-05, "loss": 0.584, "step": 369 }, { "epoch": 1.1660110149488592, "grad_norm": 0.9543328894815445, "learning_rate": 3.787777669437373e-05, "loss": 0.5818, "step": 370 }, { "epoch": 1.1691581431943352, "grad_norm": 0.6789082634257576, "learning_rate": 3.785798100428193e-05, "loss": 0.5907, "step": 371 }, { "epoch": 1.1723052714398112, "grad_norm": 0.6597293914224007, "learning_rate": 3.7838098639384705e-05, "loss": 0.5325, "step": 372 }, { "epoch": 1.1754523996852873, "grad_norm": 1.06089797604037, "learning_rate": 3.781812969618234e-05, "loss": 0.5935, "step": 373 }, { "epoch": 1.1785995279307633, "grad_norm": 1.1481361384627329, "learning_rate": 3.779807427159531e-05, "loss": 0.5596, "step": 374 }, { "epoch": 1.181746656176239, "grad_norm": 0.7407771811040466, "learning_rate": 3.777793246296387e-05, "loss": 0.5916, "step": 375 }, { "epoch": 1.1848937844217151, "grad_norm": 1.2671109018758089, "learning_rate": 3.775770436804751e-05, "loss": 0.5651, "step": 376 }, { "epoch": 1.1880409126671911, "grad_norm": 0.8857172105459614, "learning_rate": 3.7737390085024525e-05, "loss": 0.6174, "step": 377 }, { "epoch": 1.1911880409126672, "grad_norm": 0.9521903376602633, "learning_rate": 3.771698971249154e-05, "loss": 0.5565, "step": 378 }, { "epoch": 1.1943351691581432, "grad_norm": 1.1244996198067643, "learning_rate": 3.7696503349463014e-05, "loss": 0.6109, "step": 379 }, { "epoch": 1.1974822974036192, "grad_norm": 0.9098284775149258, "learning_rate": 3.767593109537076e-05, "loss": 0.5728, "step": 380 }, { "epoch": 1.2006294256490952, "grad_norm": 1.0765208853493442, "learning_rate": 3.765527305006348e-05, "loss": 0.5669, "step": 381 }, { "epoch": 1.2037765538945713, "grad_norm": 0.7147386911125436, "learning_rate": 3.763452931380626e-05, "loss": 0.5776, "step": 382 }, { "epoch": 1.2069236821400473, "grad_norm": 0.7742812162089677, "learning_rate": 3.7613699987280095e-05, "loss": 0.5561, "step": 383 }, { "epoch": 1.210070810385523, "grad_norm": 0.9406068378416047, "learning_rate": 3.75927851715814e-05, "loss": 0.5965, "step": 384 }, { "epoch": 1.2132179386309991, "grad_norm": 0.9172548442849622, "learning_rate": 3.7571784968221517e-05, "loss": 0.553, "step": 385 }, { "epoch": 1.2163650668764752, "grad_norm": 1.0846359511575892, "learning_rate": 3.75506994791262e-05, "loss": 0.5735, "step": 386 }, { "epoch": 1.2195121951219512, "grad_norm": 0.64281457660973, "learning_rate": 3.75295288066352e-05, "loss": 0.6026, "step": 387 }, { "epoch": 1.2226593233674272, "grad_norm": 1.2918057591143557, "learning_rate": 3.7508273053501644e-05, "loss": 0.5803, "step": 388 }, { "epoch": 1.2258064516129032, "grad_norm": 0.8285810992075407, "learning_rate": 3.7486932322891646e-05, "loss": 0.559, "step": 389 }, { "epoch": 1.2289535798583793, "grad_norm": 0.7903501432162211, "learning_rate": 3.7465506718383746e-05, "loss": 0.6019, "step": 390 }, { "epoch": 1.2321007081038553, "grad_norm": 1.0053744792111636, "learning_rate": 3.744399634396842e-05, "loss": 0.5609, "step": 391 }, { "epoch": 1.2352478363493313, "grad_norm": 0.8379114151897634, "learning_rate": 3.742240130404761e-05, "loss": 0.5899, "step": 392 }, { "epoch": 1.2383949645948071, "grad_norm": 0.9519548865776732, "learning_rate": 3.740072170343415e-05, "loss": 0.5973, "step": 393 }, { "epoch": 1.2415420928402832, "grad_norm": 0.7539686484958599, "learning_rate": 3.737895764735132e-05, "loss": 0.5714, "step": 394 }, { "epoch": 1.2446892210857592, "grad_norm": 1.0257885860344795, "learning_rate": 3.735710924143228e-05, "loss": 0.5829, "step": 395 }, { "epoch": 1.2478363493312352, "grad_norm": 0.7629141726668706, "learning_rate": 3.733517659171963e-05, "loss": 0.5801, "step": 396 }, { "epoch": 1.2509834775767112, "grad_norm": 0.7754556508930379, "learning_rate": 3.731315980466482e-05, "loss": 0.5472, "step": 397 }, { "epoch": 1.2541306058221873, "grad_norm": 0.7774464891625856, "learning_rate": 3.729105898712768e-05, "loss": 0.5962, "step": 398 }, { "epoch": 1.2572777340676633, "grad_norm": 1.174513316510275, "learning_rate": 3.726887424637588e-05, "loss": 0.6022, "step": 399 }, { "epoch": 1.2604248623131393, "grad_norm": 0.7079511116657756, "learning_rate": 3.7246605690084414e-05, "loss": 0.5344, "step": 400 }, { "epoch": 1.2635719905586154, "grad_norm": 0.9885418604513543, "learning_rate": 3.722425342633509e-05, "loss": 0.5927, "step": 401 }, { "epoch": 1.2667191188040912, "grad_norm": 1.4481157564971159, "learning_rate": 3.7201817563615994e-05, "loss": 0.5769, "step": 402 }, { "epoch": 1.2698662470495672, "grad_norm": 0.6732395737301099, "learning_rate": 3.717929821082095e-05, "loss": 0.5592, "step": 403 }, { "epoch": 1.2730133752950432, "grad_norm": 1.402245331031854, "learning_rate": 3.7156695477249034e-05, "loss": 0.583, "step": 404 }, { "epoch": 1.2761605035405192, "grad_norm": 0.7912573192484376, "learning_rate": 3.713400947260398e-05, "loss": 0.5765, "step": 405 }, { "epoch": 1.2793076317859953, "grad_norm": 0.7882115805671188, "learning_rate": 3.711124030699371e-05, "loss": 0.5971, "step": 406 }, { "epoch": 1.2824547600314713, "grad_norm": 1.0141187171254817, "learning_rate": 3.7088388090929776e-05, "loss": 0.5848, "step": 407 }, { "epoch": 1.2856018882769473, "grad_norm": 0.9391592675093976, "learning_rate": 3.706545293532679e-05, "loss": 0.5444, "step": 408 }, { "epoch": 1.2887490165224234, "grad_norm": 0.9095509483468426, "learning_rate": 3.704243495150195e-05, "loss": 0.5616, "step": 409 }, { "epoch": 1.2918961447678994, "grad_norm": 0.8507529149564299, "learning_rate": 3.701933425117444e-05, "loss": 0.5911, "step": 410 }, { "epoch": 1.2950432730133752, "grad_norm": 0.8232008726505865, "learning_rate": 3.699615094646494e-05, "loss": 0.5801, "step": 411 }, { "epoch": 1.2981904012588514, "grad_norm": 0.6912446644196883, "learning_rate": 3.697288514989502e-05, "loss": 0.558, "step": 412 }, { "epoch": 1.3013375295043272, "grad_norm": 0.6797610400444395, "learning_rate": 3.694953697438667e-05, "loss": 0.5922, "step": 413 }, { "epoch": 1.3044846577498033, "grad_norm": 0.9402207805888517, "learning_rate": 3.6926106533261676e-05, "loss": 0.5829, "step": 414 }, { "epoch": 1.3076317859952793, "grad_norm": 0.7820671057814051, "learning_rate": 3.690259394024113e-05, "loss": 0.5791, "step": 415 }, { "epoch": 1.3107789142407553, "grad_norm": 0.6624377804484535, "learning_rate": 3.687899930944484e-05, "loss": 0.5685, "step": 416 }, { "epoch": 1.3139260424862313, "grad_norm": 0.6610022512491404, "learning_rate": 3.68553227553908e-05, "loss": 0.5578, "step": 417 }, { "epoch": 1.3170731707317074, "grad_norm": 0.9462547727748756, "learning_rate": 3.683156439299459e-05, "loss": 0.5814, "step": 418 }, { "epoch": 1.3202202989771834, "grad_norm": 0.9348204674000715, "learning_rate": 3.680772433756892e-05, "loss": 0.5831, "step": 419 }, { "epoch": 1.3233674272226592, "grad_norm": 0.7963248960523923, "learning_rate": 3.678380270482292e-05, "loss": 0.5783, "step": 420 }, { "epoch": 1.3265145554681355, "grad_norm": 0.7393400925757423, "learning_rate": 3.675979961086173e-05, "loss": 0.5839, "step": 421 }, { "epoch": 1.3296616837136113, "grad_norm": 0.7023192863980032, "learning_rate": 3.673571517218582e-05, "loss": 0.5868, "step": 422 }, { "epoch": 1.3328088119590873, "grad_norm": 0.7626883730805661, "learning_rate": 3.67115495056905e-05, "loss": 0.5515, "step": 423 }, { "epoch": 1.3359559402045633, "grad_norm": 0.7190658979185762, "learning_rate": 3.66873027286653e-05, "loss": 0.5928, "step": 424 }, { "epoch": 1.3391030684500393, "grad_norm": 0.8697111635772896, "learning_rate": 3.6662974958793456e-05, "loss": 0.5825, "step": 425 }, { "epoch": 1.3422501966955154, "grad_norm": 1.048689473107099, "learning_rate": 3.663856631415128e-05, "loss": 0.5755, "step": 426 }, { "epoch": 1.3453973249409914, "grad_norm": 0.6268689833713994, "learning_rate": 3.661407691320763e-05, "loss": 0.5995, "step": 427 }, { "epoch": 1.3485444531864674, "grad_norm": 0.8771144942564589, "learning_rate": 3.65895068748233e-05, "loss": 0.529, "step": 428 }, { "epoch": 1.3516915814319432, "grad_norm": 0.9330631648070546, "learning_rate": 3.65648563182505e-05, "loss": 0.5698, "step": 429 }, { "epoch": 1.3548387096774195, "grad_norm": 0.6691205901538837, "learning_rate": 3.65401253631322e-05, "loss": 0.6007, "step": 430 }, { "epoch": 1.3579858379228953, "grad_norm": 0.8797201937311443, "learning_rate": 3.651531412950162e-05, "loss": 0.5688, "step": 431 }, { "epoch": 1.3611329661683713, "grad_norm": 0.812875798385014, "learning_rate": 3.6490422737781586e-05, "loss": 0.5599, "step": 432 }, { "epoch": 1.3642800944138473, "grad_norm": 0.739605090858723, "learning_rate": 3.646545130878401e-05, "loss": 0.5717, "step": 433 }, { "epoch": 1.3674272226593234, "grad_norm": 0.7027271799286325, "learning_rate": 3.6440399963709244e-05, "loss": 0.5919, "step": 434 }, { "epoch": 1.3705743509047994, "grad_norm": 0.6758853392810176, "learning_rate": 3.641526882414553e-05, "loss": 0.5847, "step": 435 }, { "epoch": 1.3737214791502754, "grad_norm": 0.6412837261448728, "learning_rate": 3.639005801206839e-05, "loss": 0.5786, "step": 436 }, { "epoch": 1.3768686073957515, "grad_norm": 0.7485694482103437, "learning_rate": 3.6364767649840056e-05, "loss": 0.5792, "step": 437 }, { "epoch": 1.3800157356412273, "grad_norm": 0.6846051950269184, "learning_rate": 3.633939786020884e-05, "loss": 0.5541, "step": 438 }, { "epoch": 1.3831628638867035, "grad_norm": 0.7297707216903999, "learning_rate": 3.631394876630858e-05, "loss": 0.5854, "step": 439 }, { "epoch": 1.3863099921321793, "grad_norm": 0.6692029491831896, "learning_rate": 3.628842049165801e-05, "loss": 0.5812, "step": 440 }, { "epoch": 1.3894571203776553, "grad_norm": 0.7299333511772628, "learning_rate": 3.626281316016017e-05, "loss": 0.568, "step": 441 }, { "epoch": 1.3926042486231314, "grad_norm": 0.6329118009441329, "learning_rate": 3.623712689610182e-05, "loss": 0.5833, "step": 442 }, { "epoch": 1.3957513768686074, "grad_norm": 0.7411202253172915, "learning_rate": 3.6211361824152816e-05, "loss": 0.5772, "step": 443 }, { "epoch": 1.3988985051140834, "grad_norm": 0.7471937332509346, "learning_rate": 3.618551806936551e-05, "loss": 0.5584, "step": 444 }, { "epoch": 1.4020456333595595, "grad_norm": 0.6697271784752193, "learning_rate": 3.6159595757174154e-05, "loss": 0.5565, "step": 445 }, { "epoch": 1.4051927616050355, "grad_norm": 0.7260444858526285, "learning_rate": 3.6133595013394276e-05, "loss": 0.6017, "step": 446 }, { "epoch": 1.4083398898505113, "grad_norm": 0.6938476082430989, "learning_rate": 3.6107515964222084e-05, "loss": 0.5702, "step": 447 }, { "epoch": 1.4114870180959875, "grad_norm": 0.6902716895190657, "learning_rate": 3.6081358736233836e-05, "loss": 0.6014, "step": 448 }, { "epoch": 1.4146341463414633, "grad_norm": 0.5887971251866968, "learning_rate": 3.605512345638525e-05, "loss": 0.545, "step": 449 }, { "epoch": 1.4177812745869394, "grad_norm": 0.6468738560521924, "learning_rate": 3.602881025201086e-05, "loss": 0.6023, "step": 450 }, { "epoch": 1.4209284028324154, "grad_norm": 0.6460851268856366, "learning_rate": 3.6002419250823415e-05, "loss": 0.5788, "step": 451 }, { "epoch": 1.4240755310778914, "grad_norm": 0.6553053911353882, "learning_rate": 3.597595058091325e-05, "loss": 0.5705, "step": 452 }, { "epoch": 1.4272226593233674, "grad_norm": 0.7688726651106929, "learning_rate": 3.594940437074769e-05, "loss": 0.6054, "step": 453 }, { "epoch": 1.4303697875688435, "grad_norm": 0.8193385892036661, "learning_rate": 3.592278074917039e-05, "loss": 0.5821, "step": 454 }, { "epoch": 1.4335169158143195, "grad_norm": 0.5937922521766558, "learning_rate": 3.589607984540072e-05, "loss": 0.55, "step": 455 }, { "epoch": 1.4366640440597953, "grad_norm": 0.9231773985131014, "learning_rate": 3.586930178903317e-05, "loss": 0.5766, "step": 456 }, { "epoch": 1.4398111723052716, "grad_norm": 1.0322774589585058, "learning_rate": 3.5842446710036664e-05, "loss": 0.5943, "step": 457 }, { "epoch": 1.4429583005507474, "grad_norm": 0.5933782616518375, "learning_rate": 3.581551473875397e-05, "loss": 0.5535, "step": 458 }, { "epoch": 1.4461054287962234, "grad_norm": 0.7009813777712108, "learning_rate": 3.578850600590108e-05, "loss": 0.5783, "step": 459 }, { "epoch": 1.4492525570416994, "grad_norm": 0.515906560455954, "learning_rate": 3.576142064256652e-05, "loss": 0.5431, "step": 460 }, { "epoch": 1.4523996852871754, "grad_norm": 0.7183973662906478, "learning_rate": 3.573425878021077e-05, "loss": 0.5902, "step": 461 }, { "epoch": 1.4555468135326515, "grad_norm": 0.7722657026633934, "learning_rate": 3.570702055066558e-05, "loss": 0.5808, "step": 462 }, { "epoch": 1.4586939417781275, "grad_norm": 0.6269618529944755, "learning_rate": 3.5679706086133384e-05, "loss": 0.5895, "step": 463 }, { "epoch": 1.4618410700236035, "grad_norm": 0.5326061244869541, "learning_rate": 3.5652315519186594e-05, "loss": 0.5394, "step": 464 }, { "epoch": 1.4649881982690793, "grad_norm": 0.7071744531566102, "learning_rate": 3.562484898276702e-05, "loss": 0.5843, "step": 465 }, { "epoch": 1.4681353265145556, "grad_norm": 0.4720130096048939, "learning_rate": 3.559730661018517e-05, "loss": 0.5768, "step": 466 }, { "epoch": 1.4712824547600314, "grad_norm": 0.7510254194400818, "learning_rate": 3.556968853511964e-05, "loss": 0.5587, "step": 467 }, { "epoch": 1.4744295830055074, "grad_norm": 0.7533116530402437, "learning_rate": 3.5541994891616455e-05, "loss": 0.5924, "step": 468 }, { "epoch": 1.4775767112509834, "grad_norm": 0.648313383238362, "learning_rate": 3.55142258140884e-05, "loss": 0.5672, "step": 469 }, { "epoch": 1.4807238394964595, "grad_norm": 0.6066187042141705, "learning_rate": 3.548638143731441e-05, "loss": 0.5393, "step": 470 }, { "epoch": 1.4838709677419355, "grad_norm": 0.5114501688468103, "learning_rate": 3.545846189643886e-05, "loss": 0.5775, "step": 471 }, { "epoch": 1.4870180959874115, "grad_norm": 0.6433306121899048, "learning_rate": 3.5430467326970956e-05, "loss": 0.5719, "step": 472 }, { "epoch": 1.4901652242328876, "grad_norm": 0.6315145421662225, "learning_rate": 3.540239786478407e-05, "loss": 0.5641, "step": 473 }, { "epoch": 1.4933123524783634, "grad_norm": 0.7411652973460111, "learning_rate": 3.537425364611504e-05, "loss": 0.5859, "step": 474 }, { "epoch": 1.4964594807238396, "grad_norm": 0.6753402761493095, "learning_rate": 3.5346034807563566e-05, "loss": 0.5664, "step": 475 }, { "epoch": 1.4996066089693154, "grad_norm": 0.6593613942752435, "learning_rate": 3.5317741486091515e-05, "loss": 0.5777, "step": 476 }, { "epoch": 1.5027537372147917, "grad_norm": 0.6224692930708144, "learning_rate": 3.5289373819022246e-05, "loss": 0.5511, "step": 477 }, { "epoch": 1.5059008654602675, "grad_norm": 0.5435829543716864, "learning_rate": 3.5260931944039976e-05, "loss": 0.5951, "step": 478 }, { "epoch": 1.5090479937057435, "grad_norm": 0.7727972521914884, "learning_rate": 3.523241599918908e-05, "loss": 0.5809, "step": 479 }, { "epoch": 1.5121951219512195, "grad_norm": 0.8135435570540013, "learning_rate": 3.520382612287346e-05, "loss": 0.5508, "step": 480 }, { "epoch": 1.5153422501966956, "grad_norm": 0.6445142709480808, "learning_rate": 3.517516245385582e-05, "loss": 0.5711, "step": 481 }, { "epoch": 1.5184893784421716, "grad_norm": 0.6571221471891615, "learning_rate": 3.514642513125704e-05, "loss": 0.5816, "step": 482 }, { "epoch": 1.5216365066876474, "grad_norm": 0.778181315537639, "learning_rate": 3.511761429455548e-05, "loss": 0.5487, "step": 483 }, { "epoch": 1.5247836349331236, "grad_norm": 0.6612223689047932, "learning_rate": 3.508873008358629e-05, "loss": 0.5693, "step": 484 }, { "epoch": 1.5279307631785994, "grad_norm": 0.8677284824093338, "learning_rate": 3.505977263854077e-05, "loss": 0.5777, "step": 485 }, { "epoch": 1.5310778914240757, "grad_norm": 0.7946151572521599, "learning_rate": 3.5030742099965654e-05, "loss": 0.6006, "step": 486 }, { "epoch": 1.5342250196695515, "grad_norm": 0.6423836766306755, "learning_rate": 3.5001638608762435e-05, "loss": 0.5902, "step": 487 }, { "epoch": 1.5373721479150275, "grad_norm": 0.9319785025741317, "learning_rate": 3.497246230618669e-05, "loss": 0.5391, "step": 488 }, { "epoch": 1.5405192761605035, "grad_norm": 0.646420731333806, "learning_rate": 3.49432133338474e-05, "loss": 0.601, "step": 489 }, { "epoch": 1.5436664044059796, "grad_norm": 0.6190175701198456, "learning_rate": 3.491389183370624e-05, "loss": 0.5381, "step": 490 }, { "epoch": 1.5468135326514556, "grad_norm": 0.7395463266615069, "learning_rate": 3.48844979480769e-05, "loss": 0.6024, "step": 491 }, { "epoch": 1.5499606608969314, "grad_norm": 0.6246796480797395, "learning_rate": 3.485503181962442e-05, "loss": 0.558, "step": 492 }, { "epoch": 1.5531077891424077, "grad_norm": 0.6101829668871169, "learning_rate": 3.4825493591364456e-05, "loss": 0.5797, "step": 493 }, { "epoch": 1.5562549173878835, "grad_norm": 0.7173522923458697, "learning_rate": 3.479588340666261e-05, "loss": 0.5769, "step": 494 }, { "epoch": 1.5594020456333597, "grad_norm": 0.7526388551403992, "learning_rate": 3.4766201409233725e-05, "loss": 0.5415, "step": 495 }, { "epoch": 1.5625491738788355, "grad_norm": 0.8776736439637768, "learning_rate": 3.4736447743141195e-05, "loss": 0.597, "step": 496 }, { "epoch": 1.5656963021243115, "grad_norm": 0.7955322141679404, "learning_rate": 3.470662255279627e-05, "loss": 0.566, "step": 497 }, { "epoch": 1.5688434303697876, "grad_norm": 0.693146892331598, "learning_rate": 3.467672598295733e-05, "loss": 0.5666, "step": 498 }, { "epoch": 1.5719905586152636, "grad_norm": 0.5243219068148551, "learning_rate": 3.4646758178729204e-05, "loss": 0.5772, "step": 499 }, { "epoch": 1.5751376868607396, "grad_norm": 0.6658543703419947, "learning_rate": 3.461671928556248e-05, "loss": 0.5677, "step": 500 }, { "epoch": 1.5782848151062154, "grad_norm": 0.70750696164214, "learning_rate": 3.458660944925277e-05, "loss": 0.5694, "step": 501 }, { "epoch": 1.5814319433516917, "grad_norm": 0.8301587050106674, "learning_rate": 3.455642881594e-05, "loss": 0.573, "step": 502 }, { "epoch": 1.5845790715971675, "grad_norm": 0.5989742871105483, "learning_rate": 3.452617753210772e-05, "loss": 0.5796, "step": 503 }, { "epoch": 1.5877261998426437, "grad_norm": 0.7607728281640884, "learning_rate": 3.44958557445824e-05, "loss": 0.5368, "step": 504 }, { "epoch": 1.5908733280881195, "grad_norm": 0.8923738785753914, "learning_rate": 3.44654636005327e-05, "loss": 0.5931, "step": 505 }, { "epoch": 1.5940204563335956, "grad_norm": 0.8841841122243062, "learning_rate": 3.443500124746873e-05, "loss": 0.5758, "step": 506 }, { "epoch": 1.5971675845790716, "grad_norm": 0.9060495002085946, "learning_rate": 3.4404468833241406e-05, "loss": 0.5762, "step": 507 }, { "epoch": 1.6003147128245476, "grad_norm": 0.7266975845748416, "learning_rate": 3.437386650604168e-05, "loss": 0.5887, "step": 508 }, { "epoch": 1.6034618410700237, "grad_norm": 0.7727096713217674, "learning_rate": 3.43431944143998e-05, "loss": 0.5511, "step": 509 }, { "epoch": 1.6066089693154995, "grad_norm": 0.9700581732284279, "learning_rate": 3.431245270718466e-05, "loss": 0.5864, "step": 510 }, { "epoch": 1.6097560975609757, "grad_norm": 1.0421876454341092, "learning_rate": 3.4281641533603014e-05, "loss": 0.5623, "step": 511 }, { "epoch": 1.6129032258064515, "grad_norm": 0.6522589351192359, "learning_rate": 3.425076104319878e-05, "loss": 0.5834, "step": 512 }, { "epoch": 1.6160503540519278, "grad_norm": 0.7995769943176967, "learning_rate": 3.4219811385852324e-05, "loss": 0.5484, "step": 513 }, { "epoch": 1.6191974822974036, "grad_norm": 0.8339896343295545, "learning_rate": 3.418879271177968e-05, "loss": 0.5417, "step": 514 }, { "epoch": 1.6223446105428796, "grad_norm": 0.6298557505945559, "learning_rate": 3.4157705171531904e-05, "loss": 0.6214, "step": 515 }, { "epoch": 1.6254917387883556, "grad_norm": 0.6627216090718417, "learning_rate": 3.4126548915994274e-05, "loss": 0.5306, "step": 516 }, { "epoch": 1.6286388670338316, "grad_norm": 0.8522248942562364, "learning_rate": 3.4095324096385584e-05, "loss": 0.5832, "step": 517 }, { "epoch": 1.6317859952793077, "grad_norm": 0.8071162098711794, "learning_rate": 3.406403086425741e-05, "loss": 0.5686, "step": 518 }, { "epoch": 1.6349331235247835, "grad_norm": 0.7731287731654394, "learning_rate": 3.403266937149337e-05, "loss": 0.5815, "step": 519 }, { "epoch": 1.6380802517702597, "grad_norm": 0.6533031600568543, "learning_rate": 3.4001239770308394e-05, "loss": 0.5915, "step": 520 }, { "epoch": 1.6412273800157355, "grad_norm": 0.569436746864039, "learning_rate": 3.396974221324798e-05, "loss": 0.5498, "step": 521 }, { "epoch": 1.6443745082612118, "grad_norm": 0.6632852776648928, "learning_rate": 3.3938176853187456e-05, "loss": 0.5456, "step": 522 }, { "epoch": 1.6475216365066876, "grad_norm": 0.5695584986960003, "learning_rate": 3.390654384333122e-05, "loss": 0.6047, "step": 523 }, { "epoch": 1.6506687647521636, "grad_norm": 0.5940386873921953, "learning_rate": 3.387484333721203e-05, "loss": 0.5619, "step": 524 }, { "epoch": 1.6538158929976396, "grad_norm": 0.6418545904839442, "learning_rate": 3.384307548869023e-05, "loss": 0.5559, "step": 525 }, { "epoch": 1.6569630212431157, "grad_norm": 0.7206004903104387, "learning_rate": 3.381124045195304e-05, "loss": 0.5975, "step": 526 }, { "epoch": 1.6601101494885917, "grad_norm": 0.7182344615178672, "learning_rate": 3.377933838151374e-05, "loss": 0.5509, "step": 527 }, { "epoch": 1.6632572777340675, "grad_norm": 0.49685467022501884, "learning_rate": 3.3747369432210986e-05, "loss": 0.5579, "step": 528 }, { "epoch": 1.6664044059795438, "grad_norm": 0.7462370895937678, "learning_rate": 3.371533375920805e-05, "loss": 0.584, "step": 529 }, { "epoch": 1.6695515342250196, "grad_norm": 0.5844502731729302, "learning_rate": 3.368323151799201e-05, "loss": 0.5502, "step": 530 }, { "epoch": 1.6726986624704958, "grad_norm": 0.7995900924270242, "learning_rate": 3.365106286437309e-05, "loss": 0.577, "step": 531 }, { "epoch": 1.6758457907159716, "grad_norm": 0.6980897656965266, "learning_rate": 3.36188279544838e-05, "loss": 0.5667, "step": 532 }, { "epoch": 1.6789929189614476, "grad_norm": 0.5272573380584673, "learning_rate": 3.358652694477825e-05, "loss": 0.5682, "step": 533 }, { "epoch": 1.6821400472069237, "grad_norm": 0.7386619419487044, "learning_rate": 3.355415999203139e-05, "loss": 0.5676, "step": 534 }, { "epoch": 1.6852871754523997, "grad_norm": 0.6234902692887895, "learning_rate": 3.352172725333819e-05, "loss": 0.5767, "step": 535 }, { "epoch": 1.6884343036978757, "grad_norm": 0.6168550151283233, "learning_rate": 3.3489228886112955e-05, "loss": 0.5414, "step": 536 }, { "epoch": 1.6915814319433515, "grad_norm": 0.6616970915400288, "learning_rate": 3.345666504808848e-05, "loss": 0.5958, "step": 537 }, { "epoch": 1.6947285601888278, "grad_norm": 0.5250838225987879, "learning_rate": 3.342403589731537e-05, "loss": 0.5574, "step": 538 }, { "epoch": 1.6978756884343036, "grad_norm": 0.4902293386353501, "learning_rate": 3.3391341592161186e-05, "loss": 0.5668, "step": 539 }, { "epoch": 1.7010228166797798, "grad_norm": 0.6680470248999262, "learning_rate": 3.3358582291309755e-05, "loss": 0.576, "step": 540 }, { "epoch": 1.7041699449252556, "grad_norm": 0.5361498475271551, "learning_rate": 3.3325758153760357e-05, "loss": 0.5699, "step": 541 }, { "epoch": 1.7073170731707317, "grad_norm": 0.7242179466038916, "learning_rate": 3.3292869338826944e-05, "loss": 0.5651, "step": 542 }, { "epoch": 1.7104642014162077, "grad_norm": 0.5773147674606367, "learning_rate": 3.3259916006137404e-05, "loss": 0.5399, "step": 543 }, { "epoch": 1.7136113296616837, "grad_norm": 0.7046072332761888, "learning_rate": 3.3226898315632747e-05, "loss": 0.5997, "step": 544 }, { "epoch": 1.7167584579071598, "grad_norm": 0.7085851971891551, "learning_rate": 3.319381642756637e-05, "loss": 0.5691, "step": 545 }, { "epoch": 1.7199055861526356, "grad_norm": 0.6071053828245467, "learning_rate": 3.3160670502503245e-05, "loss": 0.5605, "step": 546 }, { "epoch": 1.7230527143981118, "grad_norm": 0.6496056037574068, "learning_rate": 3.312746070131913e-05, "loss": 0.568, "step": 547 }, { "epoch": 1.7261998426435876, "grad_norm": 0.5889215814684938, "learning_rate": 3.309418718519985e-05, "loss": 0.5569, "step": 548 }, { "epoch": 1.7293469708890639, "grad_norm": 0.6871169447975174, "learning_rate": 3.306085011564045e-05, "loss": 0.5974, "step": 549 }, { "epoch": 1.7324940991345397, "grad_norm": 0.4861712620248983, "learning_rate": 3.302744965444445e-05, "loss": 0.5547, "step": 550 }, { "epoch": 1.7356412273800157, "grad_norm": 0.6213321890297462, "learning_rate": 3.299398596372301e-05, "loss": 0.5504, "step": 551 }, { "epoch": 1.7387883556254917, "grad_norm": 0.7451909016073313, "learning_rate": 3.296045920589423e-05, "loss": 0.5743, "step": 552 }, { "epoch": 1.7419354838709677, "grad_norm": 0.5451433585609038, "learning_rate": 3.292686954368228e-05, "loss": 0.5828, "step": 553 }, { "epoch": 1.7450826121164438, "grad_norm": 0.7140894697077395, "learning_rate": 3.2893217140116636e-05, "loss": 0.5451, "step": 554 }, { "epoch": 1.7482297403619196, "grad_norm": 0.7917204702318612, "learning_rate": 3.28595021585313e-05, "loss": 0.5749, "step": 555 }, { "epoch": 1.7513768686073958, "grad_norm": 0.736299315372961, "learning_rate": 3.282572476256401e-05, "loss": 0.5642, "step": 556 }, { "epoch": 1.7545239968528716, "grad_norm": 0.896182709362604, "learning_rate": 3.2791885116155416e-05, "loss": 0.5846, "step": 557 }, { "epoch": 1.7576711250983479, "grad_norm": 0.7630553132584026, "learning_rate": 3.275798338354833e-05, "loss": 0.5427, "step": 558 }, { "epoch": 1.7608182533438237, "grad_norm": 0.8175895208469556, "learning_rate": 3.272401972928688e-05, "loss": 0.5865, "step": 559 }, { "epoch": 1.7639653815892997, "grad_norm": 1.0859209320779282, "learning_rate": 3.2689994318215754e-05, "loss": 0.5655, "step": 560 }, { "epoch": 1.7671125098347757, "grad_norm": 0.5813459553206478, "learning_rate": 3.265590731547936e-05, "loss": 0.5778, "step": 561 }, { "epoch": 1.7702596380802518, "grad_norm": 0.932110967880023, "learning_rate": 3.262175888652106e-05, "loss": 0.5582, "step": 562 }, { "epoch": 1.7734067663257278, "grad_norm": 1.141526320853197, "learning_rate": 3.258754919708234e-05, "loss": 0.5511, "step": 563 }, { "epoch": 1.7765538945712038, "grad_norm": 0.7023132464915953, "learning_rate": 3.255327841320204e-05, "loss": 0.5876, "step": 564 }, { "epoch": 1.7797010228166799, "grad_norm": 1.273494297512138, "learning_rate": 3.251894670121549e-05, "loss": 0.5589, "step": 565 }, { "epoch": 1.7828481510621557, "grad_norm": 0.7051769266957719, "learning_rate": 3.2484554227753775e-05, "loss": 0.5926, "step": 566 }, { "epoch": 1.785995279307632, "grad_norm": 0.7937404532482675, "learning_rate": 3.2450101159742864e-05, "loss": 0.552, "step": 567 }, { "epoch": 1.7891424075531077, "grad_norm": 0.6969548206163279, "learning_rate": 3.241558766440284e-05, "loss": 0.5778, "step": 568 }, { "epoch": 1.7922895357985837, "grad_norm": 0.5249950117268797, "learning_rate": 3.238101390924706e-05, "loss": 0.5522, "step": 569 }, { "epoch": 1.7954366640440598, "grad_norm": 0.5989099837768208, "learning_rate": 3.234638006208138e-05, "loss": 0.571, "step": 570 }, { "epoch": 1.7985837922895358, "grad_norm": 0.648877478471487, "learning_rate": 3.231168629100328e-05, "loss": 0.5605, "step": 571 }, { "epoch": 1.8017309205350118, "grad_norm": 0.7019920843145531, "learning_rate": 3.227693276440111e-05, "loss": 0.5837, "step": 572 }, { "epoch": 1.8048780487804879, "grad_norm": 0.7208863035517205, "learning_rate": 3.224211965095326e-05, "loss": 0.5771, "step": 573 }, { "epoch": 1.8080251770259639, "grad_norm": 0.582448115703797, "learning_rate": 3.220724711962729e-05, "loss": 0.5769, "step": 574 }, { "epoch": 1.8111723052714397, "grad_norm": 0.6093974952092187, "learning_rate": 3.217231533967917e-05, "loss": 0.5375, "step": 575 }, { "epoch": 1.814319433516916, "grad_norm": 0.6885252290210996, "learning_rate": 3.213732448065244e-05, "loss": 0.5807, "step": 576 }, { "epoch": 1.8174665617623917, "grad_norm": 0.6107097439844204, "learning_rate": 3.2102274712377384e-05, "loss": 0.5778, "step": 577 }, { "epoch": 1.8206136900078678, "grad_norm": 0.581905246193802, "learning_rate": 3.206716620497019e-05, "loss": 0.5342, "step": 578 }, { "epoch": 1.8237608182533438, "grad_norm": 0.5218197149224699, "learning_rate": 3.203199912883216e-05, "loss": 0.5698, "step": 579 }, { "epoch": 1.8269079464988198, "grad_norm": 0.6188418099817343, "learning_rate": 3.199677365464884e-05, "loss": 0.5697, "step": 580 }, { "epoch": 1.8300550747442959, "grad_norm": 0.6904870334281441, "learning_rate": 3.196148995338924e-05, "loss": 0.5696, "step": 581 }, { "epoch": 1.8332022029897719, "grad_norm": 0.6072285553149802, "learning_rate": 3.192614819630497e-05, "loss": 0.5827, "step": 582 }, { "epoch": 1.836349331235248, "grad_norm": 0.5737961625680729, "learning_rate": 3.189074855492941e-05, "loss": 0.5633, "step": 583 }, { "epoch": 1.8394964594807237, "grad_norm": 0.5224516792912552, "learning_rate": 3.185529120107688e-05, "loss": 0.5523, "step": 584 }, { "epoch": 1.8426435877262, "grad_norm": 0.7333440316338549, "learning_rate": 3.181977630684183e-05, "loss": 0.5873, "step": 585 }, { "epoch": 1.8457907159716758, "grad_norm": 0.6648112330877247, "learning_rate": 3.1784204044597976e-05, "loss": 0.5526, "step": 586 }, { "epoch": 1.8489378442171518, "grad_norm": 0.5927824601123387, "learning_rate": 3.174857458699747e-05, "loss": 0.5667, "step": 587 }, { "epoch": 1.8520849724626278, "grad_norm": 0.8622116267692795, "learning_rate": 3.171288810697007e-05, "loss": 0.5867, "step": 588 }, { "epoch": 1.8552321007081038, "grad_norm": 0.5692350152296256, "learning_rate": 3.167714477772229e-05, "loss": 0.5831, "step": 589 }, { "epoch": 1.8583792289535799, "grad_norm": 0.755370477014244, "learning_rate": 3.164134477273658e-05, "loss": 0.5744, "step": 590 }, { "epoch": 1.861526357199056, "grad_norm": 0.6666470710363162, "learning_rate": 3.160548826577046e-05, "loss": 0.5646, "step": 591 }, { "epoch": 1.864673485444532, "grad_norm": 0.6685593450468315, "learning_rate": 3.156957543085566e-05, "loss": 0.546, "step": 592 }, { "epoch": 1.8678206136900077, "grad_norm": 0.8519423765856126, "learning_rate": 3.153360644229735e-05, "loss": 0.5773, "step": 593 }, { "epoch": 1.870967741935484, "grad_norm": 0.6858413196623089, "learning_rate": 3.149758147467321e-05, "loss": 0.5611, "step": 594 }, { "epoch": 1.8741148701809598, "grad_norm": 0.6424906456379045, "learning_rate": 3.146150070283263e-05, "loss": 0.5741, "step": 595 }, { "epoch": 1.877261998426436, "grad_norm": 0.7226105609045684, "learning_rate": 3.142536430189585e-05, "loss": 0.5635, "step": 596 }, { "epoch": 1.8804091266719118, "grad_norm": 0.5556670020965504, "learning_rate": 3.1389172447253106e-05, "loss": 0.5644, "step": 597 }, { "epoch": 1.8835562549173879, "grad_norm": 0.6676742327342214, "learning_rate": 3.135292531456378e-05, "loss": 0.5686, "step": 598 }, { "epoch": 1.886703383162864, "grad_norm": 0.5117507094897004, "learning_rate": 3.131662307975556e-05, "loss": 0.569, "step": 599 }, { "epoch": 1.88985051140834, "grad_norm": 0.8463097941158746, "learning_rate": 3.128026591902356e-05, "loss": 0.5623, "step": 600 }, { "epoch": 1.892997639653816, "grad_norm": 0.5791294005826809, "learning_rate": 3.1243854008829493e-05, "loss": 0.5502, "step": 601 }, { "epoch": 1.8961447678992918, "grad_norm": 0.5549775959821279, "learning_rate": 3.1207387525900795e-05, "loss": 0.5428, "step": 602 }, { "epoch": 1.899291896144768, "grad_norm": 0.6916786541488753, "learning_rate": 3.117086664722977e-05, "loss": 0.5735, "step": 603 }, { "epoch": 1.9024390243902438, "grad_norm": 0.5153262400722942, "learning_rate": 3.113429155007276e-05, "loss": 0.5574, "step": 604 }, { "epoch": 1.90558615263572, "grad_norm": 0.6922452984565507, "learning_rate": 3.109766241194922e-05, "loss": 0.5672, "step": 605 }, { "epoch": 1.9087332808811959, "grad_norm": 0.6162773584248936, "learning_rate": 3.106097941064093e-05, "loss": 0.5616, "step": 606 }, { "epoch": 1.911880409126672, "grad_norm": 0.6022937951308683, "learning_rate": 3.1024242724191074e-05, "loss": 0.5729, "step": 607 }, { "epoch": 1.915027537372148, "grad_norm": 0.5971187647434458, "learning_rate": 3.098745253090342e-05, "loss": 0.5575, "step": 608 }, { "epoch": 1.918174665617624, "grad_norm": 0.6007052376113771, "learning_rate": 3.0950609009341424e-05, "loss": 0.5853, "step": 609 }, { "epoch": 1.9213217938631, "grad_norm": 0.6798286394330892, "learning_rate": 3.0913712338327376e-05, "loss": 0.5652, "step": 610 }, { "epoch": 1.9244689221085758, "grad_norm": 0.4998017682786324, "learning_rate": 3.087676269694153e-05, "loss": 0.5437, "step": 611 }, { "epoch": 1.927616050354052, "grad_norm": 0.5900141909915693, "learning_rate": 3.083976026452125e-05, "loss": 0.5683, "step": 612 }, { "epoch": 1.9307631785995278, "grad_norm": 0.6801189891795948, "learning_rate": 3.080270522066008e-05, "loss": 0.5613, "step": 613 }, { "epoch": 1.933910306845004, "grad_norm": 0.6473158183105817, "learning_rate": 3.076559774520697e-05, "loss": 0.6134, "step": 614 }, { "epoch": 1.93705743509048, "grad_norm": 0.6272385353530213, "learning_rate": 3.072843801826532e-05, "loss": 0.5474, "step": 615 }, { "epoch": 1.940204563335956, "grad_norm": 0.5345902788380883, "learning_rate": 3.069122622019214e-05, "loss": 0.5526, "step": 616 }, { "epoch": 1.943351691581432, "grad_norm": 0.5673813938868798, "learning_rate": 3.065396253159717e-05, "loss": 0.555, "step": 617 }, { "epoch": 1.946498819826908, "grad_norm": 0.6523154486055993, "learning_rate": 3.061664713334202e-05, "loss": 0.5879, "step": 618 }, { "epoch": 1.949645948072384, "grad_norm": 0.5478136247550162, "learning_rate": 3.057928020653925e-05, "loss": 0.5626, "step": 619 }, { "epoch": 1.9527930763178598, "grad_norm": 0.6276196177093876, "learning_rate": 3.054186193255154e-05, "loss": 0.5753, "step": 620 }, { "epoch": 1.955940204563336, "grad_norm": 0.5863504244120591, "learning_rate": 3.0504392492990763e-05, "loss": 0.5411, "step": 621 }, { "epoch": 1.9590873328088119, "grad_norm": 0.6126483770064565, "learning_rate": 3.0466872069717152e-05, "loss": 0.57, "step": 622 }, { "epoch": 1.9622344610542881, "grad_norm": 0.7429812948474108, "learning_rate": 3.0429300844838373e-05, "loss": 0.5775, "step": 623 }, { "epoch": 1.965381589299764, "grad_norm": 0.5191951431840917, "learning_rate": 3.0391679000708673e-05, "loss": 0.5443, "step": 624 }, { "epoch": 1.96852871754524, "grad_norm": 0.518779230546286, "learning_rate": 3.0354006719927987e-05, "loss": 0.5356, "step": 625 }, { "epoch": 1.971675845790716, "grad_norm": 0.6399044762326972, "learning_rate": 3.0316284185341023e-05, "loss": 0.6069, "step": 626 }, { "epoch": 1.974822974036192, "grad_norm": 0.6811166693200108, "learning_rate": 3.0278511580036417e-05, "loss": 0.5327, "step": 627 }, { "epoch": 1.977970102281668, "grad_norm": 0.5946157024716091, "learning_rate": 3.0240689087345838e-05, "loss": 0.5843, "step": 628 }, { "epoch": 1.9811172305271438, "grad_norm": 0.6365089414451961, "learning_rate": 3.0202816890843062e-05, "loss": 0.5892, "step": 629 }, { "epoch": 1.98426435877262, "grad_norm": 0.7183274496457972, "learning_rate": 3.016489517434312e-05, "loss": 0.5145, "step": 630 }, { "epoch": 1.987411487018096, "grad_norm": 0.6164005756653712, "learning_rate": 3.012692412190138e-05, "loss": 0.5784, "step": 631 }, { "epoch": 1.9905586152635721, "grad_norm": 0.4388269705199884, "learning_rate": 3.008890391781268e-05, "loss": 0.5773, "step": 632 }, { "epoch": 1.993705743509048, "grad_norm": 0.6502984666030054, "learning_rate": 3.0050834746610412e-05, "loss": 0.5544, "step": 633 }, { "epoch": 1.996852871754524, "grad_norm": 0.6922648777230136, "learning_rate": 3.0012716793065634e-05, "loss": 0.555, "step": 634 }, { "epoch": 2.001573564122738, "grad_norm": 0.8869807905073702, "learning_rate": 2.9974550242186156e-05, "loss": 0.9625, "step": 635 }, { "epoch": 2.004720692368214, "grad_norm": 1.0358494933593285, "learning_rate": 2.993633527921569e-05, "loss": 0.5616, "step": 636 }, { "epoch": 2.00786782061369, "grad_norm": 0.9346566129032547, "learning_rate": 2.9898072089632882e-05, "loss": 0.476, "step": 637 }, { "epoch": 2.011014948859166, "grad_norm": 0.7853808249371318, "learning_rate": 2.9859760859150488e-05, "loss": 0.5242, "step": 638 }, { "epoch": 2.014162077104642, "grad_norm": 0.6808785603976919, "learning_rate": 2.9821401773714394e-05, "loss": 0.4915, "step": 639 }, { "epoch": 2.017309205350118, "grad_norm": 1.079169965146025, "learning_rate": 2.9782995019502787e-05, "loss": 0.5268, "step": 640 }, { "epoch": 2.020456333595594, "grad_norm": 0.6687486233311583, "learning_rate": 2.97445407829252e-05, "loss": 0.4946, "step": 641 }, { "epoch": 2.02360346184107, "grad_norm": 0.7161583063428312, "learning_rate": 2.9706039250621626e-05, "loss": 0.5282, "step": 642 }, { "epoch": 2.026750590086546, "grad_norm": 0.8180598647734951, "learning_rate": 2.9667490609461604e-05, "loss": 0.5101, "step": 643 }, { "epoch": 2.029897718332022, "grad_norm": 0.7917644525656923, "learning_rate": 2.9628895046543342e-05, "loss": 0.5029, "step": 644 }, { "epoch": 2.033044846577498, "grad_norm": 0.6513990862103043, "learning_rate": 2.9590252749192757e-05, "loss": 0.5118, "step": 645 }, { "epoch": 2.036191974822974, "grad_norm": 0.7808164985343838, "learning_rate": 2.955156390496262e-05, "loss": 0.5195, "step": 646 }, { "epoch": 2.0393391030684502, "grad_norm": 0.7912837002277737, "learning_rate": 2.9512828701631596e-05, "loss": 0.5265, "step": 647 }, { "epoch": 2.042486231313926, "grad_norm": 0.5929853901605757, "learning_rate": 2.9474047327203377e-05, "loss": 0.4743, "step": 648 }, { "epoch": 2.045633359559402, "grad_norm": 0.5776575088322736, "learning_rate": 2.943521996990573e-05, "loss": 0.5317, "step": 649 }, { "epoch": 2.048780487804878, "grad_norm": 0.5820287876548046, "learning_rate": 2.9396346818189633e-05, "loss": 0.4907, "step": 650 }, { "epoch": 2.051927616050354, "grad_norm": 0.5401403467336713, "learning_rate": 2.9357428060728286e-05, "loss": 0.5108, "step": 651 }, { "epoch": 2.05507474429583, "grad_norm": 0.5681209497944092, "learning_rate": 2.9318463886416286e-05, "loss": 0.5115, "step": 652 }, { "epoch": 2.058221872541306, "grad_norm": 0.7457016880475935, "learning_rate": 2.9279454484368622e-05, "loss": 0.5245, "step": 653 }, { "epoch": 2.061369000786782, "grad_norm": 0.7100559492936808, "learning_rate": 2.9240400043919846e-05, "loss": 0.5066, "step": 654 }, { "epoch": 2.064516129032258, "grad_norm": 0.5562962703615024, "learning_rate": 2.9201300754623046e-05, "loss": 0.5197, "step": 655 }, { "epoch": 2.0676632572777343, "grad_norm": 0.7228119547511469, "learning_rate": 2.916215680624904e-05, "loss": 0.4895, "step": 656 }, { "epoch": 2.07081038552321, "grad_norm": 0.6104757189458597, "learning_rate": 2.912296838878537e-05, "loss": 0.5248, "step": 657 }, { "epoch": 2.073957513768686, "grad_norm": 0.6489476520349108, "learning_rate": 2.9083735692435428e-05, "loss": 0.5091, "step": 658 }, { "epoch": 2.077104642014162, "grad_norm": 0.5407920259634255, "learning_rate": 2.9044458907617515e-05, "loss": 0.5057, "step": 659 }, { "epoch": 2.080251770259638, "grad_norm": 0.5264195062329879, "learning_rate": 2.9005138224963906e-05, "loss": 0.5459, "step": 660 }, { "epoch": 2.083398898505114, "grad_norm": 0.5997049734314334, "learning_rate": 2.896577383531994e-05, "loss": 0.5098, "step": 661 }, { "epoch": 2.08654602675059, "grad_norm": 0.6428632064411095, "learning_rate": 2.8926365929743108e-05, "loss": 0.4887, "step": 662 }, { "epoch": 2.0896931549960662, "grad_norm": 0.45721246844892666, "learning_rate": 2.888691469950209e-05, "loss": 0.5194, "step": 663 }, { "epoch": 2.092840283241542, "grad_norm": 0.6356505848651062, "learning_rate": 2.8847420336075842e-05, "loss": 0.4969, "step": 664 }, { "epoch": 2.0959874114870183, "grad_norm": 0.5203943278435145, "learning_rate": 2.880788303115269e-05, "loss": 0.512, "step": 665 }, { "epoch": 2.099134539732494, "grad_norm": 0.5416771289936778, "learning_rate": 2.876830297662936e-05, "loss": 0.5221, "step": 666 }, { "epoch": 2.10228166797797, "grad_norm": 0.48612729179676883, "learning_rate": 2.8728680364610072e-05, "loss": 0.4813, "step": 667 }, { "epoch": 2.105428796223446, "grad_norm": 0.5280735471764217, "learning_rate": 2.8689015387405606e-05, "loss": 0.5197, "step": 668 }, { "epoch": 2.108575924468922, "grad_norm": 0.6468916366598001, "learning_rate": 2.8649308237532357e-05, "loss": 0.5204, "step": 669 }, { "epoch": 2.111723052714398, "grad_norm": 0.4384358357805341, "learning_rate": 2.860955910771142e-05, "loss": 0.4934, "step": 670 }, { "epoch": 2.114870180959874, "grad_norm": 0.5293046040256487, "learning_rate": 2.856976819086762e-05, "loss": 0.4993, "step": 671 }, { "epoch": 2.1180173092053503, "grad_norm": 0.5255077696255191, "learning_rate": 2.8529935680128617e-05, "loss": 0.5212, "step": 672 }, { "epoch": 2.121164437450826, "grad_norm": 0.5342842320364793, "learning_rate": 2.849006176882394e-05, "loss": 0.4941, "step": 673 }, { "epoch": 2.1243115656963023, "grad_norm": 0.4952609066161591, "learning_rate": 2.8450146650484068e-05, "loss": 0.5259, "step": 674 }, { "epoch": 2.127458693941778, "grad_norm": 0.5889133681118838, "learning_rate": 2.8410190518839468e-05, "loss": 0.5226, "step": 675 }, { "epoch": 2.1306058221872544, "grad_norm": 0.5930345041899265, "learning_rate": 2.8370193567819675e-05, "loss": 0.5162, "step": 676 }, { "epoch": 2.13375295043273, "grad_norm": 0.4066191621033764, "learning_rate": 2.833015599155235e-05, "loss": 0.5079, "step": 677 }, { "epoch": 2.136900078678206, "grad_norm": 0.5174227837268748, "learning_rate": 2.829007798436232e-05, "loss": 0.4783, "step": 678 }, { "epoch": 2.140047206923682, "grad_norm": 0.4906472007347332, "learning_rate": 2.824995974077064e-05, "loss": 0.5275, "step": 679 }, { "epoch": 2.143194335169158, "grad_norm": 0.46230506366167495, "learning_rate": 2.820980145549367e-05, "loss": 0.503, "step": 680 }, { "epoch": 2.1463414634146343, "grad_norm": 0.5467185655256718, "learning_rate": 2.816960332344212e-05, "loss": 0.5192, "step": 681 }, { "epoch": 2.14948859166011, "grad_norm": 0.45235629652152315, "learning_rate": 2.8129365539720073e-05, "loss": 0.5113, "step": 682 }, { "epoch": 2.1526357199055863, "grad_norm": 0.5083240071575459, "learning_rate": 2.8089088299624084e-05, "loss": 0.4992, "step": 683 }, { "epoch": 2.155782848151062, "grad_norm": 0.5603968704569885, "learning_rate": 2.8048771798642208e-05, "loss": 0.5324, "step": 684 }, { "epoch": 2.1589299763965384, "grad_norm": 0.4794631589703544, "learning_rate": 2.8008416232453056e-05, "loss": 0.482, "step": 685 }, { "epoch": 2.162077104642014, "grad_norm": 0.4324878828428329, "learning_rate": 2.7968021796924834e-05, "loss": 0.505, "step": 686 }, { "epoch": 2.16522423288749, "grad_norm": 0.5405954748624254, "learning_rate": 2.792758868811442e-05, "loss": 0.5035, "step": 687 }, { "epoch": 2.1683713611329662, "grad_norm": 0.46018663007159943, "learning_rate": 2.7887117102266373e-05, "loss": 0.5513, "step": 688 }, { "epoch": 2.171518489378442, "grad_norm": 0.42047087691548024, "learning_rate": 2.7846607235812032e-05, "loss": 0.4948, "step": 689 }, { "epoch": 2.1746656176239183, "grad_norm": 0.5427417159577586, "learning_rate": 2.7806059285368506e-05, "loss": 0.514, "step": 690 }, { "epoch": 2.177812745869394, "grad_norm": 0.43986897145724735, "learning_rate": 2.7765473447737767e-05, "loss": 0.4673, "step": 691 }, { "epoch": 2.1809598741148704, "grad_norm": 0.4754880772061975, "learning_rate": 2.772484991990565e-05, "loss": 0.5417, "step": 692 }, { "epoch": 2.184107002360346, "grad_norm": 0.47502258387933016, "learning_rate": 2.7684188899040955e-05, "loss": 0.4796, "step": 693 }, { "epoch": 2.1872541306058224, "grad_norm": 0.40705818520163617, "learning_rate": 2.764349058249443e-05, "loss": 0.5157, "step": 694 }, { "epoch": 2.190401258851298, "grad_norm": 0.6538511475739581, "learning_rate": 2.7602755167797853e-05, "loss": 0.4961, "step": 695 }, { "epoch": 2.193548387096774, "grad_norm": 0.44699881941017616, "learning_rate": 2.7561982852663045e-05, "loss": 0.5207, "step": 696 }, { "epoch": 2.1966955153422503, "grad_norm": 0.6212501071680818, "learning_rate": 2.752117383498095e-05, "loss": 0.5439, "step": 697 }, { "epoch": 2.199842643587726, "grad_norm": 0.5759129065235197, "learning_rate": 2.7480328312820633e-05, "loss": 0.4595, "step": 698 }, { "epoch": 2.2029897718332023, "grad_norm": 0.6097237129221698, "learning_rate": 2.7439446484428337e-05, "loss": 0.5334, "step": 699 }, { "epoch": 2.206136900078678, "grad_norm": 0.6817032082261668, "learning_rate": 2.7398528548226526e-05, "loss": 0.5309, "step": 700 }, { "epoch": 2.2092840283241544, "grad_norm": 0.5924111305234294, "learning_rate": 2.7357574702812902e-05, "loss": 0.484, "step": 701 }, { "epoch": 2.21243115656963, "grad_norm": 0.5523261349165254, "learning_rate": 2.731658514695948e-05, "loss": 0.5364, "step": 702 }, { "epoch": 2.2155782848151064, "grad_norm": 0.5572106085621287, "learning_rate": 2.7275560079611564e-05, "loss": 0.53, "step": 703 }, { "epoch": 2.2187254130605822, "grad_norm": 0.5065834566763677, "learning_rate": 2.7234499699886843e-05, "loss": 0.4973, "step": 704 }, { "epoch": 2.221872541306058, "grad_norm": 0.5003500662242514, "learning_rate": 2.719340420707439e-05, "loss": 0.5014, "step": 705 }, { "epoch": 2.2250196695515343, "grad_norm": 0.3741659698585933, "learning_rate": 2.715227380063369e-05, "loss": 0.4975, "step": 706 }, { "epoch": 2.22816679779701, "grad_norm": 0.444161319841057, "learning_rate": 2.711110868019369e-05, "loss": 0.5203, "step": 707 }, { "epoch": 2.2313139260424864, "grad_norm": 0.4160861591456116, "learning_rate": 2.706990904555184e-05, "loss": 0.489, "step": 708 }, { "epoch": 2.234461054287962, "grad_norm": 0.4599418626187508, "learning_rate": 2.702867509667308e-05, "loss": 0.5134, "step": 709 }, { "epoch": 2.2376081825334384, "grad_norm": 0.41866439278740125, "learning_rate": 2.6987407033688914e-05, "loss": 0.5248, "step": 710 }, { "epoch": 2.240755310778914, "grad_norm": 0.39041212627496713, "learning_rate": 2.6946105056896406e-05, "loss": 0.4978, "step": 711 }, { "epoch": 2.2439024390243905, "grad_norm": 0.417852325813291, "learning_rate": 2.6904769366757245e-05, "loss": 0.4977, "step": 712 }, { "epoch": 2.2470495672698663, "grad_norm": 0.4808675566465674, "learning_rate": 2.6863400163896726e-05, "loss": 0.5183, "step": 713 }, { "epoch": 2.250196695515342, "grad_norm": 0.4161660973559079, "learning_rate": 2.6821997649102818e-05, "loss": 0.5278, "step": 714 }, { "epoch": 2.2533438237608183, "grad_norm": 0.45962709409500624, "learning_rate": 2.678056202332516e-05, "loss": 0.52, "step": 715 }, { "epoch": 2.256490952006294, "grad_norm": 0.41242936769273836, "learning_rate": 2.6739093487674104e-05, "loss": 0.4999, "step": 716 }, { "epoch": 2.2596380802517704, "grad_norm": 0.5058761449658284, "learning_rate": 2.6697592243419723e-05, "loss": 0.5216, "step": 717 }, { "epoch": 2.262785208497246, "grad_norm": 0.3744491655107618, "learning_rate": 2.6656058491990867e-05, "loss": 0.4743, "step": 718 }, { "epoch": 2.2659323367427224, "grad_norm": 0.5243504074727018, "learning_rate": 2.661449243497412e-05, "loss": 0.525, "step": 719 }, { "epoch": 2.2690794649881982, "grad_norm": 0.43302902154306294, "learning_rate": 2.657289427411291e-05, "loss": 0.4997, "step": 720 }, { "epoch": 2.2722265932336745, "grad_norm": 0.4225068744947921, "learning_rate": 2.6531264211306442e-05, "loss": 0.5153, "step": 721 }, { "epoch": 2.2753737214791503, "grad_norm": 0.45512676490493853, "learning_rate": 2.6489602448608795e-05, "loss": 0.5275, "step": 722 }, { "epoch": 2.278520849724626, "grad_norm": 0.3718800854251584, "learning_rate": 2.6447909188227874e-05, "loss": 0.4815, "step": 723 }, { "epoch": 2.2816679779701023, "grad_norm": 0.49964042658507957, "learning_rate": 2.640618463252448e-05, "loss": 0.5093, "step": 724 }, { "epoch": 2.284815106215578, "grad_norm": 0.5301891798377791, "learning_rate": 2.63644289840113e-05, "loss": 0.5242, "step": 725 }, { "epoch": 2.2879622344610544, "grad_norm": 0.4050871646973943, "learning_rate": 2.6322642445351942e-05, "loss": 0.5035, "step": 726 }, { "epoch": 2.29110936270653, "grad_norm": 0.5023627156664985, "learning_rate": 2.6280825219359916e-05, "loss": 0.507, "step": 727 }, { "epoch": 2.2942564909520065, "grad_norm": 0.4391094098568942, "learning_rate": 2.6238977508997705e-05, "loss": 0.5149, "step": 728 }, { "epoch": 2.2974036191974823, "grad_norm": 0.47537818242731106, "learning_rate": 2.6197099517375728e-05, "loss": 0.5014, "step": 729 }, { "epoch": 2.3005507474429585, "grad_norm": 0.46340780511359153, "learning_rate": 2.61551914477514e-05, "loss": 0.5378, "step": 730 }, { "epoch": 2.3036978756884343, "grad_norm": 0.4381885150917152, "learning_rate": 2.611325350352808e-05, "loss": 0.5302, "step": 731 }, { "epoch": 2.30684500393391, "grad_norm": 0.43651726345372543, "learning_rate": 2.6071285888254173e-05, "loss": 0.4827, "step": 732 }, { "epoch": 2.3099921321793864, "grad_norm": 0.42700123656874067, "learning_rate": 2.602928880562206e-05, "loss": 0.5209, "step": 733 }, { "epoch": 2.313139260424862, "grad_norm": 0.4936338227402977, "learning_rate": 2.5987262459467168e-05, "loss": 0.5234, "step": 734 }, { "epoch": 2.3162863886703384, "grad_norm": 0.6422115819077465, "learning_rate": 2.5945207053766927e-05, "loss": 0.4934, "step": 735 }, { "epoch": 2.3194335169158142, "grad_norm": 0.5426504118993586, "learning_rate": 2.5903122792639835e-05, "loss": 0.4959, "step": 736 }, { "epoch": 2.3225806451612905, "grad_norm": 0.5610780996366858, "learning_rate": 2.5861009880344435e-05, "loss": 0.5072, "step": 737 }, { "epoch": 2.3257277734067663, "grad_norm": 0.6128983248924292, "learning_rate": 2.5818868521278333e-05, "loss": 0.5295, "step": 738 }, { "epoch": 2.3288749016522425, "grad_norm": 0.4668022048915129, "learning_rate": 2.577669891997718e-05, "loss": 0.4872, "step": 739 }, { "epoch": 2.3320220298977183, "grad_norm": 0.5249195533669971, "learning_rate": 2.573450128111374e-05, "loss": 0.5111, "step": 740 }, { "epoch": 2.335169158143194, "grad_norm": 0.501737921275211, "learning_rate": 2.569227580949683e-05, "loss": 0.5173, "step": 741 }, { "epoch": 2.3383162863886704, "grad_norm": 0.5066876036326522, "learning_rate": 2.565002271007037e-05, "loss": 0.502, "step": 742 }, { "epoch": 2.341463414634146, "grad_norm": 0.4365145077451115, "learning_rate": 2.5607742187912362e-05, "loss": 0.5084, "step": 743 }, { "epoch": 2.3446105428796225, "grad_norm": 0.44179872832361533, "learning_rate": 2.5565434448233915e-05, "loss": 0.5012, "step": 744 }, { "epoch": 2.3477576711250983, "grad_norm": 0.38725032898097067, "learning_rate": 2.5523099696378238e-05, "loss": 0.5246, "step": 745 }, { "epoch": 2.3509047993705745, "grad_norm": 0.4516415072913574, "learning_rate": 2.5480738137819642e-05, "loss": 0.4923, "step": 746 }, { "epoch": 2.3540519276160503, "grad_norm": 0.405281408168675, "learning_rate": 2.5438349978162552e-05, "loss": 0.5248, "step": 747 }, { "epoch": 2.3571990558615266, "grad_norm": 0.45641439852412824, "learning_rate": 2.5395935423140487e-05, "loss": 0.5216, "step": 748 }, { "epoch": 2.3603461841070024, "grad_norm": 0.3726940888910086, "learning_rate": 2.5353494678615107e-05, "loss": 0.4828, "step": 749 }, { "epoch": 2.363493312352478, "grad_norm": 0.4749642740007426, "learning_rate": 2.5311027950575152e-05, "loss": 0.495, "step": 750 }, { "epoch": 2.3666404405979544, "grad_norm": 0.3717895586016363, "learning_rate": 2.5268535445135505e-05, "loss": 0.5197, "step": 751 }, { "epoch": 2.3697875688434302, "grad_norm": 0.5304840030716524, "learning_rate": 2.522601736853613e-05, "loss": 0.4806, "step": 752 }, { "epoch": 2.3729346970889065, "grad_norm": 0.46220541231520146, "learning_rate": 2.5183473927141125e-05, "loss": 0.5116, "step": 753 }, { "epoch": 2.3760818253343823, "grad_norm": 0.5043416489424374, "learning_rate": 2.514090532743769e-05, "loss": 0.5078, "step": 754 }, { "epoch": 2.3792289535798585, "grad_norm": 0.42879087005379485, "learning_rate": 2.509831177603514e-05, "loss": 0.4947, "step": 755 }, { "epoch": 2.3823760818253343, "grad_norm": 0.4390099762856732, "learning_rate": 2.505569347966387e-05, "loss": 0.5158, "step": 756 }, { "epoch": 2.3855232100708106, "grad_norm": 0.42762212558355356, "learning_rate": 2.5013050645174414e-05, "loss": 0.501, "step": 757 }, { "epoch": 2.3886703383162864, "grad_norm": 0.39660686265271283, "learning_rate": 2.497038347953637e-05, "loss": 0.5072, "step": 758 }, { "epoch": 2.391817466561762, "grad_norm": 0.44332823895817053, "learning_rate": 2.4927692189837448e-05, "loss": 0.5079, "step": 759 }, { "epoch": 2.3949645948072384, "grad_norm": 0.3841557280664548, "learning_rate": 2.4884976983282426e-05, "loss": 0.5008, "step": 760 }, { "epoch": 2.3981117230527143, "grad_norm": 0.3783162289122301, "learning_rate": 2.48422380671922e-05, "loss": 0.5075, "step": 761 }, { "epoch": 2.4012588512981905, "grad_norm": 0.4626314996814243, "learning_rate": 2.4799475649002706e-05, "loss": 0.5179, "step": 762 }, { "epoch": 2.4044059795436663, "grad_norm": 0.35098996182801706, "learning_rate": 2.4756689936263958e-05, "loss": 0.5106, "step": 763 }, { "epoch": 2.4075531077891426, "grad_norm": 0.5176720851002166, "learning_rate": 2.471388113663904e-05, "loss": 0.5474, "step": 764 }, { "epoch": 2.4107002360346184, "grad_norm": 0.44450352974703067, "learning_rate": 2.4671049457903086e-05, "loss": 0.443, "step": 765 }, { "epoch": 2.4138473642800946, "grad_norm": 0.5072356340865095, "learning_rate": 2.4628195107942273e-05, "loss": 0.5415, "step": 766 }, { "epoch": 2.4169944925255704, "grad_norm": 0.41936557184075446, "learning_rate": 2.4585318294752818e-05, "loss": 0.4955, "step": 767 }, { "epoch": 2.420141620771046, "grad_norm": 0.5116342896251873, "learning_rate": 2.4542419226439953e-05, "loss": 0.5317, "step": 768 }, { "epoch": 2.4232887490165225, "grad_norm": 0.4557793409894804, "learning_rate": 2.449949811121695e-05, "loss": 0.4922, "step": 769 }, { "epoch": 2.4264358772619983, "grad_norm": 0.5252751532185341, "learning_rate": 2.445655515740406e-05, "loss": 0.5344, "step": 770 }, { "epoch": 2.4295830055074745, "grad_norm": 0.47915013961773817, "learning_rate": 2.441359057342755e-05, "loss": 0.4823, "step": 771 }, { "epoch": 2.4327301337529503, "grad_norm": 0.47466054181163075, "learning_rate": 2.437060456781865e-05, "loss": 0.5184, "step": 772 }, { "epoch": 2.4358772619984266, "grad_norm": 0.5095317232071647, "learning_rate": 2.432759734921259e-05, "loss": 0.5309, "step": 773 }, { "epoch": 2.4390243902439024, "grad_norm": 0.412356192481546, "learning_rate": 2.428456912634753e-05, "loss": 0.4912, "step": 774 }, { "epoch": 2.4421715184893786, "grad_norm": 0.5168874991069228, "learning_rate": 2.424152010806359e-05, "loss": 0.5102, "step": 775 }, { "epoch": 2.4453186467348544, "grad_norm": 0.4218110874274139, "learning_rate": 2.4198450503301816e-05, "loss": 0.5119, "step": 776 }, { "epoch": 2.4484657749803302, "grad_norm": 0.46960683665524694, "learning_rate": 2.415536052110318e-05, "loss": 0.4883, "step": 777 }, { "epoch": 2.4516129032258065, "grad_norm": 0.38171463482040097, "learning_rate": 2.4112250370607546e-05, "loss": 0.5033, "step": 778 }, { "epoch": 2.4547600314712823, "grad_norm": 0.5018899134437422, "learning_rate": 2.4069120261052682e-05, "loss": 0.5322, "step": 779 }, { "epoch": 2.4579071597167585, "grad_norm": 0.41906891191421164, "learning_rate": 2.4025970401773204e-05, "loss": 0.4813, "step": 780 }, { "epoch": 2.4610542879622344, "grad_norm": 0.48223066804804066, "learning_rate": 2.3982801002199612e-05, "loss": 0.5062, "step": 781 }, { "epoch": 2.4642014162077106, "grad_norm": 0.5738880479428738, "learning_rate": 2.393961227185723e-05, "loss": 0.5011, "step": 782 }, { "epoch": 2.4673485444531864, "grad_norm": 0.5092728168679931, "learning_rate": 2.38964044203652e-05, "loss": 0.5038, "step": 783 }, { "epoch": 2.4704956726986627, "grad_norm": 0.5672892970502025, "learning_rate": 2.385317765743548e-05, "loss": 0.512, "step": 784 }, { "epoch": 2.4736428009441385, "grad_norm": 0.48443665358354104, "learning_rate": 2.3809932192871826e-05, "loss": 0.5249, "step": 785 }, { "epoch": 2.4767899291896143, "grad_norm": 0.4661612625300044, "learning_rate": 2.3766668236568736e-05, "loss": 0.5161, "step": 786 }, { "epoch": 2.4799370574350905, "grad_norm": 0.5142533804492694, "learning_rate": 2.3723385998510478e-05, "loss": 0.5235, "step": 787 }, { "epoch": 2.4830841856805663, "grad_norm": 0.3731275373448457, "learning_rate": 2.3680085688770048e-05, "loss": 0.489, "step": 788 }, { "epoch": 2.4862313139260426, "grad_norm": 0.4812773095674535, "learning_rate": 2.3636767517508162e-05, "loss": 0.5204, "step": 789 }, { "epoch": 2.4893784421715184, "grad_norm": 0.36359194843424747, "learning_rate": 2.359343169497221e-05, "loss": 0.514, "step": 790 }, { "epoch": 2.4925255704169946, "grad_norm": 0.4419525373424479, "learning_rate": 2.355007843149526e-05, "loss": 0.4946, "step": 791 }, { "epoch": 2.4956726986624704, "grad_norm": 0.42979935733957053, "learning_rate": 2.3506707937495042e-05, "loss": 0.5281, "step": 792 }, { "epoch": 2.4988198269079467, "grad_norm": 0.3835614105262742, "learning_rate": 2.3463320423472903e-05, "loss": 0.5143, "step": 793 }, { "epoch": 2.5019669551534225, "grad_norm": 0.43444497283325684, "learning_rate": 2.34199161000128e-05, "loss": 0.5177, "step": 794 }, { "epoch": 2.5051140833988983, "grad_norm": 0.4134676064163985, "learning_rate": 2.337649517778028e-05, "loss": 0.5084, "step": 795 }, { "epoch": 2.5082612116443745, "grad_norm": 0.4192423352575172, "learning_rate": 2.3333057867521452e-05, "loss": 0.5109, "step": 796 }, { "epoch": 2.5114083398898503, "grad_norm": 0.47607754776561306, "learning_rate": 2.3289604380061958e-05, "loss": 0.514, "step": 797 }, { "epoch": 2.5145554681353266, "grad_norm": 0.43895415692577056, "learning_rate": 2.3246134926305975e-05, "loss": 0.4877, "step": 798 }, { "epoch": 2.5177025963808024, "grad_norm": 0.45510815079628525, "learning_rate": 2.3202649717235148e-05, "loss": 0.5133, "step": 799 }, { "epoch": 2.5208497246262787, "grad_norm": 0.4427380510366315, "learning_rate": 2.315914896390761e-05, "loss": 0.5046, "step": 800 }, { "epoch": 2.5239968528717545, "grad_norm": 0.45212416154203056, "learning_rate": 2.3115632877456934e-05, "loss": 0.5093, "step": 801 }, { "epoch": 2.5271439811172307, "grad_norm": 0.4848734648886371, "learning_rate": 2.3072101669091117e-05, "loss": 0.5403, "step": 802 }, { "epoch": 2.5302911093627065, "grad_norm": 0.4284645582640883, "learning_rate": 2.3028555550091536e-05, "loss": 0.5223, "step": 803 }, { "epoch": 2.5334382376081823, "grad_norm": 0.5065448594186726, "learning_rate": 2.2984994731811958e-05, "loss": 0.51, "step": 804 }, { "epoch": 2.5365853658536586, "grad_norm": 0.392577515342448, "learning_rate": 2.2941419425677484e-05, "loss": 0.5427, "step": 805 }, { "epoch": 2.5397324940991344, "grad_norm": 0.480807545800186, "learning_rate": 2.289782984318353e-05, "loss": 0.5121, "step": 806 }, { "epoch": 2.5428796223446106, "grad_norm": 0.4237156745350743, "learning_rate": 2.2854226195894804e-05, "loss": 0.4915, "step": 807 }, { "epoch": 2.5460267505900864, "grad_norm": 0.40940864808502503, "learning_rate": 2.2810608695444292e-05, "loss": 0.504, "step": 808 }, { "epoch": 2.5491738788355627, "grad_norm": 0.4119851823926001, "learning_rate": 2.2766977553532187e-05, "loss": 0.5299, "step": 809 }, { "epoch": 2.5523210070810385, "grad_norm": 0.4130762816212127, "learning_rate": 2.2723332981924937e-05, "loss": 0.4969, "step": 810 }, { "epoch": 2.5554681353265147, "grad_norm": 0.41074305886180484, "learning_rate": 2.2679675192454123e-05, "loss": 0.4796, "step": 811 }, { "epoch": 2.5586152635719905, "grad_norm": 0.4539999707531148, "learning_rate": 2.2636004397015512e-05, "loss": 0.5463, "step": 812 }, { "epoch": 2.5617623918174663, "grad_norm": 0.4264741420489925, "learning_rate": 2.2592320807567977e-05, "loss": 0.4768, "step": 813 }, { "epoch": 2.5649095200629426, "grad_norm": 0.4286763576549067, "learning_rate": 2.2548624636132516e-05, "loss": 0.5112, "step": 814 }, { "epoch": 2.568056648308419, "grad_norm": 0.44492687170314177, "learning_rate": 2.2504916094791155e-05, "loss": 0.508, "step": 815 }, { "epoch": 2.5712037765538946, "grad_norm": 0.37885862660162245, "learning_rate": 2.2461195395685994e-05, "loss": 0.5072, "step": 816 }, { "epoch": 2.5743509047993705, "grad_norm": 0.5106741265250734, "learning_rate": 2.2417462751018113e-05, "loss": 0.5493, "step": 817 }, { "epoch": 2.5774980330448467, "grad_norm": 0.3632295363346517, "learning_rate": 2.23737183730466e-05, "loss": 0.4631, "step": 818 }, { "epoch": 2.5806451612903225, "grad_norm": 0.5365749079329822, "learning_rate": 2.2329962474087455e-05, "loss": 0.5158, "step": 819 }, { "epoch": 2.5837922895357988, "grad_norm": 0.4654117353997409, "learning_rate": 2.2286195266512635e-05, "loss": 0.5182, "step": 820 }, { "epoch": 2.5869394177812746, "grad_norm": 0.4088610719508472, "learning_rate": 2.224241696274894e-05, "loss": 0.5151, "step": 821 }, { "epoch": 2.5900865460267504, "grad_norm": 0.45576138898697316, "learning_rate": 2.2198627775277078e-05, "loss": 0.529, "step": 822 }, { "epoch": 2.5932336742722266, "grad_norm": 0.4131002917308441, "learning_rate": 2.215482791663054e-05, "loss": 0.4844, "step": 823 }, { "epoch": 2.596380802517703, "grad_norm": 0.4606000370803025, "learning_rate": 2.2111017599394627e-05, "loss": 0.5081, "step": 824 }, { "epoch": 2.5995279307631787, "grad_norm": 0.502321230958265, "learning_rate": 2.20671970362054e-05, "loss": 0.4879, "step": 825 }, { "epoch": 2.6026750590086545, "grad_norm": 0.42224326757300795, "learning_rate": 2.2023366439748647e-05, "loss": 0.4969, "step": 826 }, { "epoch": 2.6058221872541307, "grad_norm": 0.5269049401237401, "learning_rate": 2.1979526022758857e-05, "loss": 0.5288, "step": 827 }, { "epoch": 2.6089693154996065, "grad_norm": 0.46230735850829585, "learning_rate": 2.1935675998018175e-05, "loss": 0.5053, "step": 828 }, { "epoch": 2.612116443745083, "grad_norm": 0.4727993429576673, "learning_rate": 2.1891816578355387e-05, "loss": 0.5197, "step": 829 }, { "epoch": 2.6152635719905586, "grad_norm": 0.44264214533923524, "learning_rate": 2.1847947976644882e-05, "loss": 0.5018, "step": 830 }, { "epoch": 2.6184107002360344, "grad_norm": 0.5055955604260659, "learning_rate": 2.18040704058056e-05, "loss": 0.5169, "step": 831 }, { "epoch": 2.6215578284815106, "grad_norm": 0.43677389530029376, "learning_rate": 2.1760184078800016e-05, "loss": 0.5045, "step": 832 }, { "epoch": 2.624704956726987, "grad_norm": 0.5768947879521789, "learning_rate": 2.1716289208633116e-05, "loss": 0.5246, "step": 833 }, { "epoch": 2.6278520849724627, "grad_norm": 0.4712226478373909, "learning_rate": 2.1672386008351346e-05, "loss": 0.4935, "step": 834 }, { "epoch": 2.6309992132179385, "grad_norm": 0.5337996631076506, "learning_rate": 2.162847469104157e-05, "loss": 0.5203, "step": 835 }, { "epoch": 2.6341463414634148, "grad_norm": 0.4367835591952792, "learning_rate": 2.1584555469830067e-05, "loss": 0.4775, "step": 836 }, { "epoch": 2.6372934697088906, "grad_norm": 0.513188259032418, "learning_rate": 2.154062855788146e-05, "loss": 0.5008, "step": 837 }, { "epoch": 2.640440597954367, "grad_norm": 0.45447000330227766, "learning_rate": 2.1496694168397716e-05, "loss": 0.5321, "step": 838 }, { "epoch": 2.6435877261998426, "grad_norm": 0.47125461614152103, "learning_rate": 2.1452752514617087e-05, "loss": 0.4861, "step": 839 }, { "epoch": 2.6467348544453184, "grad_norm": 0.4409432391062593, "learning_rate": 2.140880380981307e-05, "loss": 0.5233, "step": 840 }, { "epoch": 2.6498819826907947, "grad_norm": 0.5309669026394624, "learning_rate": 2.1364848267293424e-05, "loss": 0.5338, "step": 841 }, { "epoch": 2.653029110936271, "grad_norm": 0.40586300361985456, "learning_rate": 2.1320886100399045e-05, "loss": 0.5037, "step": 842 }, { "epoch": 2.6561762391817467, "grad_norm": 0.540051626366727, "learning_rate": 2.1276917522503017e-05, "loss": 0.528, "step": 843 }, { "epoch": 2.6593233674272225, "grad_norm": 0.4006507834778298, "learning_rate": 2.1232942747009516e-05, "loss": 0.4866, "step": 844 }, { "epoch": 2.662470495672699, "grad_norm": 0.6074678023209782, "learning_rate": 2.1188961987352825e-05, "loss": 0.4863, "step": 845 }, { "epoch": 2.6656176239181746, "grad_norm": 0.524924268220035, "learning_rate": 2.1144975456996254e-05, "loss": 0.5609, "step": 846 }, { "epoch": 2.668764752163651, "grad_norm": 0.48996765687142846, "learning_rate": 2.1100983369431113e-05, "loss": 0.4959, "step": 847 }, { "epoch": 2.6719118804091266, "grad_norm": 0.4249594980122473, "learning_rate": 2.1056985938175702e-05, "loss": 0.5125, "step": 848 }, { "epoch": 2.6750590086546024, "grad_norm": 0.49834813656701993, "learning_rate": 2.1012983376774255e-05, "loss": 0.5047, "step": 849 }, { "epoch": 2.6782061369000787, "grad_norm": 0.4168575322233032, "learning_rate": 2.09689758987959e-05, "loss": 0.5106, "step": 850 }, { "epoch": 2.681353265145555, "grad_norm": 0.46898762750512035, "learning_rate": 2.0924963717833625e-05, "loss": 0.4981, "step": 851 }, { "epoch": 2.6845003933910307, "grad_norm": 0.45576203072274146, "learning_rate": 2.0880947047503245e-05, "loss": 0.4946, "step": 852 }, { "epoch": 2.6876475216365066, "grad_norm": 0.4738564187252312, "learning_rate": 2.0836926101442375e-05, "loss": 0.5331, "step": 853 }, { "epoch": 2.690794649881983, "grad_norm": 0.4302791146012412, "learning_rate": 2.079290109330937e-05, "loss": 0.4961, "step": 854 }, { "epoch": 2.6939417781274586, "grad_norm": 0.47679421081246964, "learning_rate": 2.074887223678231e-05, "loss": 0.5207, "step": 855 }, { "epoch": 2.697088906372935, "grad_norm": 0.4209024643198245, "learning_rate": 2.070483974555794e-05, "loss": 0.5094, "step": 856 }, { "epoch": 2.7002360346184107, "grad_norm": 0.40617414636341587, "learning_rate": 2.066080383335067e-05, "loss": 0.5175, "step": 857 }, { "epoch": 2.7033831628638865, "grad_norm": 0.42113681289201627, "learning_rate": 2.061676471389149e-05, "loss": 0.4998, "step": 858 }, { "epoch": 2.7065302911093627, "grad_norm": 0.39058660308978405, "learning_rate": 2.0572722600926963e-05, "loss": 0.5258, "step": 859 }, { "epoch": 2.709677419354839, "grad_norm": 0.3853478587959275, "learning_rate": 2.0528677708218182e-05, "loss": 0.5054, "step": 860 }, { "epoch": 2.7128245476003148, "grad_norm": 0.3661149507703519, "learning_rate": 2.0484630249539744e-05, "loss": 0.5043, "step": 861 }, { "epoch": 2.7159716758457906, "grad_norm": 0.40711054029867527, "learning_rate": 2.0440580438678683e-05, "loss": 0.5244, "step": 862 }, { "epoch": 2.719118804091267, "grad_norm": 0.3652482622197375, "learning_rate": 2.0396528489433462e-05, "loss": 0.4899, "step": 863 }, { "epoch": 2.7222659323367426, "grad_norm": 0.3720015216795421, "learning_rate": 2.03524746156129e-05, "loss": 0.4915, "step": 864 }, { "epoch": 2.725413060582219, "grad_norm": 0.44032053432840434, "learning_rate": 2.0308419031035198e-05, "loss": 0.5275, "step": 865 }, { "epoch": 2.7285601888276947, "grad_norm": 0.3737368381561298, "learning_rate": 2.026436194952682e-05, "loss": 0.5182, "step": 866 }, { "epoch": 2.7317073170731705, "grad_norm": 0.4122697478594307, "learning_rate": 2.0220303584921517e-05, "loss": 0.5256, "step": 867 }, { "epoch": 2.7348544453186467, "grad_norm": 0.36665473236295937, "learning_rate": 2.0176244151059255e-05, "loss": 0.5173, "step": 868 }, { "epoch": 2.738001573564123, "grad_norm": 0.3335486591184441, "learning_rate": 2.0132183861785212e-05, "loss": 0.4572, "step": 869 }, { "epoch": 2.741148701809599, "grad_norm": 0.38036549112647583, "learning_rate": 2.008812293094869e-05, "loss": 0.5377, "step": 870 }, { "epoch": 2.7442958300550746, "grad_norm": 0.33037565691132226, "learning_rate": 2.0044061572402115e-05, "loss": 0.491, "step": 871 }, { "epoch": 2.747442958300551, "grad_norm": 0.39753101673110847, "learning_rate": 2e-05, "loss": 0.5177, "step": 872 }, { "epoch": 2.7505900865460267, "grad_norm": 0.37934933259758424, "learning_rate": 1.9955938427597892e-05, "loss": 0.523, "step": 873 }, { "epoch": 2.753737214791503, "grad_norm": 0.4113073150527321, "learning_rate": 1.9911877069051317e-05, "loss": 0.501, "step": 874 }, { "epoch": 2.7568843430369787, "grad_norm": 0.36383957044919724, "learning_rate": 1.9867816138214794e-05, "loss": 0.4869, "step": 875 }, { "epoch": 2.7600314712824545, "grad_norm": 0.40346499233864774, "learning_rate": 1.9823755848940745e-05, "loss": 0.5307, "step": 876 }, { "epoch": 2.7631785995279308, "grad_norm": 0.40358164566370097, "learning_rate": 1.9779696415078493e-05, "loss": 0.5199, "step": 877 }, { "epoch": 2.766325727773407, "grad_norm": 0.3387651787698826, "learning_rate": 1.973563805047319e-05, "loss": 0.4933, "step": 878 }, { "epoch": 2.769472856018883, "grad_norm": 0.4119862988912321, "learning_rate": 1.969158096896481e-05, "loss": 0.505, "step": 879 }, { "epoch": 2.7726199842643586, "grad_norm": 0.398916067627908, "learning_rate": 1.9647525384387102e-05, "loss": 0.5274, "step": 880 }, { "epoch": 2.775767112509835, "grad_norm": 0.3925892974223439, "learning_rate": 1.9603471510566545e-05, "loss": 0.5213, "step": 881 }, { "epoch": 2.7789142407553107, "grad_norm": 0.4005281636322755, "learning_rate": 1.955941956132132e-05, "loss": 0.5029, "step": 882 }, { "epoch": 2.782061369000787, "grad_norm": 0.33952223831530753, "learning_rate": 1.951536975046026e-05, "loss": 0.5198, "step": 883 }, { "epoch": 2.7852084972462627, "grad_norm": 0.38608198397536103, "learning_rate": 1.9471322291781818e-05, "loss": 0.4766, "step": 884 }, { "epoch": 2.7883556254917385, "grad_norm": 0.365009789979654, "learning_rate": 1.9427277399073047e-05, "loss": 0.5072, "step": 885 }, { "epoch": 2.791502753737215, "grad_norm": 0.35042837456840825, "learning_rate": 1.9383235286108522e-05, "loss": 0.5237, "step": 886 }, { "epoch": 2.794649881982691, "grad_norm": 0.36211165712974785, "learning_rate": 1.9339196166649333e-05, "loss": 0.5082, "step": 887 }, { "epoch": 2.797797010228167, "grad_norm": 0.329398288687774, "learning_rate": 1.9295160254442062e-05, "loss": 0.4907, "step": 888 }, { "epoch": 2.8009441384736427, "grad_norm": 0.3921121599600544, "learning_rate": 1.9251127763217695e-05, "loss": 0.5087, "step": 889 }, { "epoch": 2.804091266719119, "grad_norm": 0.39222971203676776, "learning_rate": 1.9207098906690632e-05, "loss": 0.5155, "step": 890 }, { "epoch": 2.8072383949645947, "grad_norm": 0.3869489751568991, "learning_rate": 1.916307389855763e-05, "loss": 0.4923, "step": 891 }, { "epoch": 2.810385523210071, "grad_norm": 0.36534142344404397, "learning_rate": 1.911905295249676e-05, "loss": 0.5173, "step": 892 }, { "epoch": 2.8135326514555468, "grad_norm": 0.4096373953978077, "learning_rate": 1.9075036282166385e-05, "loss": 0.4787, "step": 893 }, { "epoch": 2.8166797797010226, "grad_norm": 0.42907981958111935, "learning_rate": 1.903102410120411e-05, "loss": 0.4999, "step": 894 }, { "epoch": 2.819826907946499, "grad_norm": 0.44754981578982994, "learning_rate": 1.8987016623225748e-05, "loss": 0.5223, "step": 895 }, { "epoch": 2.822974036191975, "grad_norm": 0.4503356738206301, "learning_rate": 1.8943014061824304e-05, "loss": 0.483, "step": 896 }, { "epoch": 2.826121164437451, "grad_norm": 0.4462637475879935, "learning_rate": 1.889901663056889e-05, "loss": 0.5146, "step": 897 }, { "epoch": 2.8292682926829267, "grad_norm": 0.4758553248752999, "learning_rate": 1.8855024543003756e-05, "loss": 0.4987, "step": 898 }, { "epoch": 2.832415420928403, "grad_norm": 0.3818873003232349, "learning_rate": 1.8811038012647175e-05, "loss": 0.5232, "step": 899 }, { "epoch": 2.8355625491738787, "grad_norm": 0.4459217250813105, "learning_rate": 1.8767057252990484e-05, "loss": 0.4975, "step": 900 }, { "epoch": 2.838709677419355, "grad_norm": 0.45703475054822507, "learning_rate": 1.8723082477496993e-05, "loss": 0.5226, "step": 901 }, { "epoch": 2.841856805664831, "grad_norm": 0.49145431991499045, "learning_rate": 1.8679113899600965e-05, "loss": 0.5098, "step": 902 }, { "epoch": 2.8450039339103066, "grad_norm": 0.3858560750864749, "learning_rate": 1.8635151732706586e-05, "loss": 0.512, "step": 903 }, { "epoch": 2.848151062155783, "grad_norm": 0.3978300138355392, "learning_rate": 1.859119619018693e-05, "loss": 0.4671, "step": 904 }, { "epoch": 2.851298190401259, "grad_norm": 0.40494272314425184, "learning_rate": 1.854724748538292e-05, "loss": 0.5072, "step": 905 }, { "epoch": 2.854445318646735, "grad_norm": 0.4247470229056908, "learning_rate": 1.850330583160229e-05, "loss": 0.5315, "step": 906 }, { "epoch": 2.8575924468922107, "grad_norm": 0.4169321625496455, "learning_rate": 1.8459371442118542e-05, "loss": 0.5246, "step": 907 }, { "epoch": 2.860739575137687, "grad_norm": 0.3901805457711373, "learning_rate": 1.8415444530169936e-05, "loss": 0.5006, "step": 908 }, { "epoch": 2.8638867033831628, "grad_norm": 0.4121320993719406, "learning_rate": 1.837152530895844e-05, "loss": 0.507, "step": 909 }, { "epoch": 2.867033831628639, "grad_norm": 0.36255560340653853, "learning_rate": 1.8327613991648657e-05, "loss": 0.501, "step": 910 }, { "epoch": 2.870180959874115, "grad_norm": 0.48965568319475056, "learning_rate": 1.8283710791366887e-05, "loss": 0.4897, "step": 911 }, { "epoch": 2.8733280881195906, "grad_norm": 0.388852081772549, "learning_rate": 1.823981592119999e-05, "loss": 0.5012, "step": 912 }, { "epoch": 2.876475216365067, "grad_norm": 0.42848323102553554, "learning_rate": 1.8195929594194404e-05, "loss": 0.4902, "step": 913 }, { "epoch": 2.879622344610543, "grad_norm": 0.47008473589718597, "learning_rate": 1.8152052023355125e-05, "loss": 0.5589, "step": 914 }, { "epoch": 2.882769472856019, "grad_norm": 0.30454578165861085, "learning_rate": 1.8108183421644613e-05, "loss": 0.4709, "step": 915 }, { "epoch": 2.8859166011014947, "grad_norm": 0.4002304069091809, "learning_rate": 1.806432400198183e-05, "loss": 0.5426, "step": 916 }, { "epoch": 2.889063729346971, "grad_norm": 0.3787420784015964, "learning_rate": 1.8020473977241157e-05, "loss": 0.4764, "step": 917 }, { "epoch": 2.892210857592447, "grad_norm": 0.4353640762286048, "learning_rate": 1.797663356025136e-05, "loss": 0.5275, "step": 918 }, { "epoch": 2.895357985837923, "grad_norm": 0.5073628969017314, "learning_rate": 1.7932802963794607e-05, "loss": 0.5153, "step": 919 }, { "epoch": 2.898505114083399, "grad_norm": 0.5041788326127704, "learning_rate": 1.7888982400605376e-05, "loss": 0.4932, "step": 920 }, { "epoch": 2.9016522423288746, "grad_norm": 0.460386672312264, "learning_rate": 1.7845172083369465e-05, "loss": 0.5119, "step": 921 }, { "epoch": 2.904799370574351, "grad_norm": 0.4781432319925081, "learning_rate": 1.7801372224722925e-05, "loss": 0.5004, "step": 922 }, { "epoch": 2.907946498819827, "grad_norm": 0.40584542775039373, "learning_rate": 1.775758303725106e-05, "loss": 0.5166, "step": 923 }, { "epoch": 2.911093627065303, "grad_norm": 0.42543709277206193, "learning_rate": 1.7713804733487375e-05, "loss": 0.508, "step": 924 }, { "epoch": 2.9142407553107788, "grad_norm": 0.42925628769889695, "learning_rate": 1.7670037525912555e-05, "loss": 0.506, "step": 925 }, { "epoch": 2.917387883556255, "grad_norm": 0.3766803868387005, "learning_rate": 1.762628162695341e-05, "loss": 0.5205, "step": 926 }, { "epoch": 2.920535011801731, "grad_norm": 0.45357831441605395, "learning_rate": 1.7582537248981894e-05, "loss": 0.4778, "step": 927 }, { "epoch": 2.923682140047207, "grad_norm": 0.42810065630689936, "learning_rate": 1.753880460431401e-05, "loss": 0.5002, "step": 928 }, { "epoch": 2.926829268292683, "grad_norm": 0.4755212300543142, "learning_rate": 1.749508390520885e-05, "loss": 0.5384, "step": 929 }, { "epoch": 2.9299763965381587, "grad_norm": 0.4293108713905954, "learning_rate": 1.7451375363867487e-05, "loss": 0.5221, "step": 930 }, { "epoch": 2.933123524783635, "grad_norm": 0.42645458489471416, "learning_rate": 1.7407679192432023e-05, "loss": 0.4949, "step": 931 }, { "epoch": 2.936270653029111, "grad_norm": 0.45544102637961936, "learning_rate": 1.736399560298449e-05, "loss": 0.5314, "step": 932 }, { "epoch": 2.939417781274587, "grad_norm": 0.4357317492875381, "learning_rate": 1.732032480754589e-05, "loss": 0.4868, "step": 933 }, { "epoch": 2.9425649095200628, "grad_norm": 0.5080522514835767, "learning_rate": 1.7276667018075073e-05, "loss": 0.5223, "step": 934 }, { "epoch": 2.945712037765539, "grad_norm": 0.4545220250229832, "learning_rate": 1.7233022446467817e-05, "loss": 0.4655, "step": 935 }, { "epoch": 2.948859166011015, "grad_norm": 0.5764080721118271, "learning_rate": 1.7189391304555715e-05, "loss": 0.5433, "step": 936 }, { "epoch": 2.952006294256491, "grad_norm": 0.5051853544982342, "learning_rate": 1.71457738041052e-05, "loss": 0.4882, "step": 937 }, { "epoch": 2.955153422501967, "grad_norm": 0.541977388868933, "learning_rate": 1.7102170156816473e-05, "loss": 0.5092, "step": 938 }, { "epoch": 2.9583005507474427, "grad_norm": 0.4421182710596497, "learning_rate": 1.705858057432252e-05, "loss": 0.512, "step": 939 }, { "epoch": 2.961447678992919, "grad_norm": 0.4291759732245689, "learning_rate": 1.7015005268188042e-05, "loss": 0.5096, "step": 940 }, { "epoch": 2.964594807238395, "grad_norm": 0.42281901679826367, "learning_rate": 1.6971444449908474e-05, "loss": 0.496, "step": 941 }, { "epoch": 2.967741935483871, "grad_norm": 0.36550645510303176, "learning_rate": 1.6927898330908893e-05, "loss": 0.5298, "step": 942 }, { "epoch": 2.970889063729347, "grad_norm": 0.4439902202892732, "learning_rate": 1.6884367122543072e-05, "loss": 0.4919, "step": 943 }, { "epoch": 2.974036191974823, "grad_norm": 0.4145819697915999, "learning_rate": 1.6840851036092395e-05, "loss": 0.5012, "step": 944 }, { "epoch": 2.977183320220299, "grad_norm": 0.4024149607221758, "learning_rate": 1.6797350282764856e-05, "loss": 0.535, "step": 945 }, { "epoch": 2.980330448465775, "grad_norm": 0.4088679260911552, "learning_rate": 1.6753865073694028e-05, "loss": 0.5207, "step": 946 }, { "epoch": 2.983477576711251, "grad_norm": 0.4722988645871877, "learning_rate": 1.6710395619938042e-05, "loss": 0.499, "step": 947 }, { "epoch": 2.9866247049567267, "grad_norm": 0.33656951076586683, "learning_rate": 1.666694213247855e-05, "loss": 0.5037, "step": 948 }, { "epoch": 2.989771833202203, "grad_norm": 0.4038311667361262, "learning_rate": 1.6623504822219726e-05, "loss": 0.5221, "step": 949 }, { "epoch": 2.992918961447679, "grad_norm": 0.3656238746602209, "learning_rate": 1.658008389998721e-05, "loss": 0.4887, "step": 950 }, { "epoch": 2.996066089693155, "grad_norm": 0.3597010342945152, "learning_rate": 1.6536679576527104e-05, "loss": 0.5313, "step": 951 }, { "epoch": 3.000786782061369, "grad_norm": 0.8437011095934643, "learning_rate": 1.6493292062504965e-05, "loss": 0.873, "step": 952 }, { "epoch": 3.003933910306845, "grad_norm": 0.4894489456737364, "learning_rate": 1.6449921568504747e-05, "loss": 0.4624, "step": 953 }, { "epoch": 3.0070810385523212, "grad_norm": 0.5196796511583233, "learning_rate": 1.6406568305027798e-05, "loss": 0.4646, "step": 954 }, { "epoch": 3.010228166797797, "grad_norm": 0.6421523874225777, "learning_rate": 1.6363232482491844e-05, "loss": 0.4526, "step": 955 }, { "epoch": 3.013375295043273, "grad_norm": 0.47988438785390297, "learning_rate": 1.631991431122995e-05, "loss": 0.4308, "step": 956 }, { "epoch": 3.016522423288749, "grad_norm": 0.607462624981888, "learning_rate": 1.627661400148953e-05, "loss": 0.4859, "step": 957 }, { "epoch": 3.019669551534225, "grad_norm": 0.5030069793452936, "learning_rate": 1.6233331763431274e-05, "loss": 0.4513, "step": 958 }, { "epoch": 3.022816679779701, "grad_norm": 0.5778067720286014, "learning_rate": 1.6190067807128184e-05, "loss": 0.4531, "step": 959 }, { "epoch": 3.025963808025177, "grad_norm": 0.39106884195501385, "learning_rate": 1.6146822342564525e-05, "loss": 0.4201, "step": 960 }, { "epoch": 3.029110936270653, "grad_norm": 0.6507261847344724, "learning_rate": 1.6103595579634806e-05, "loss": 0.4846, "step": 961 }, { "epoch": 3.032258064516129, "grad_norm": 0.38933170208126056, "learning_rate": 1.606038772814278e-05, "loss": 0.4406, "step": 962 }, { "epoch": 3.0354051927616053, "grad_norm": 0.5285339792483837, "learning_rate": 1.6017198997800395e-05, "loss": 0.4531, "step": 963 }, { "epoch": 3.038552321007081, "grad_norm": 0.4647220568403595, "learning_rate": 1.5974029598226796e-05, "loss": 0.4513, "step": 964 }, { "epoch": 3.041699449252557, "grad_norm": 0.39006146074109477, "learning_rate": 1.5930879738947328e-05, "loss": 0.4525, "step": 965 }, { "epoch": 3.044846577498033, "grad_norm": 0.5111098953041673, "learning_rate": 1.588774962939246e-05, "loss": 0.4518, "step": 966 }, { "epoch": 3.047993705743509, "grad_norm": 0.40886909787213843, "learning_rate": 1.5844639478896827e-05, "loss": 0.457, "step": 967 }, { "epoch": 3.051140833988985, "grad_norm": 0.41611581011446236, "learning_rate": 1.580154949669819e-05, "loss": 0.4295, "step": 968 }, { "epoch": 3.054287962234461, "grad_norm": 0.3846880322150094, "learning_rate": 1.5758479891936418e-05, "loss": 0.4377, "step": 969 }, { "epoch": 3.0574350904799372, "grad_norm": 0.39308263395636117, "learning_rate": 1.5715430873652476e-05, "loss": 0.449, "step": 970 }, { "epoch": 3.060582218725413, "grad_norm": 0.3664501208137125, "learning_rate": 1.5672402650787412e-05, "loss": 0.4367, "step": 971 }, { "epoch": 3.0637293469708893, "grad_norm": 0.39215891536789105, "learning_rate": 1.5629395432181352e-05, "loss": 0.4569, "step": 972 }, { "epoch": 3.066876475216365, "grad_norm": 0.37791276065723745, "learning_rate": 1.5586409426572462e-05, "loss": 0.4553, "step": 973 }, { "epoch": 3.070023603461841, "grad_norm": 0.34823341311056166, "learning_rate": 1.554344484259595e-05, "loss": 0.4417, "step": 974 }, { "epoch": 3.073170731707317, "grad_norm": 0.42528363706310823, "learning_rate": 1.5500501888783057e-05, "loss": 0.4421, "step": 975 }, { "epoch": 3.076317859952793, "grad_norm": 0.35592182294418906, "learning_rate": 1.545758077356005e-05, "loss": 0.4596, "step": 976 }, { "epoch": 3.079464988198269, "grad_norm": 0.41726134174420965, "learning_rate": 1.541468170524719e-05, "loss": 0.4256, "step": 977 }, { "epoch": 3.082612116443745, "grad_norm": 0.3579509357206158, "learning_rate": 1.537180489205773e-05, "loss": 0.4766, "step": 978 }, { "epoch": 3.0857592446892212, "grad_norm": 0.34613018627695974, "learning_rate": 1.5328950542096917e-05, "loss": 0.43, "step": 979 }, { "epoch": 3.088906372934697, "grad_norm": 0.37351932093944284, "learning_rate": 1.5286118863360963e-05, "loss": 0.4533, "step": 980 }, { "epoch": 3.0920535011801733, "grad_norm": 0.3788549663096128, "learning_rate": 1.5243310063736052e-05, "loss": 0.4673, "step": 981 }, { "epoch": 3.095200629425649, "grad_norm": 0.37077490540161007, "learning_rate": 1.5200524350997306e-05, "loss": 0.4488, "step": 982 }, { "epoch": 3.098347757671125, "grad_norm": 0.3667195042366108, "learning_rate": 1.5157761932807806e-05, "loss": 0.4555, "step": 983 }, { "epoch": 3.101494885916601, "grad_norm": 0.3756336351908015, "learning_rate": 1.5115023016717576e-05, "loss": 0.455, "step": 984 }, { "epoch": 3.104642014162077, "grad_norm": 0.3467248778353456, "learning_rate": 1.5072307810162559e-05, "loss": 0.4429, "step": 985 }, { "epoch": 3.107789142407553, "grad_norm": 0.4074814463065111, "learning_rate": 1.5029616520463636e-05, "loss": 0.457, "step": 986 }, { "epoch": 3.110936270653029, "grad_norm": 0.3735336842092654, "learning_rate": 1.498694935482559e-05, "loss": 0.4599, "step": 987 }, { "epoch": 3.1140833988985053, "grad_norm": 0.3645860933631925, "learning_rate": 1.4944306520336129e-05, "loss": 0.4395, "step": 988 }, { "epoch": 3.117230527143981, "grad_norm": 0.3954959214109424, "learning_rate": 1.4901688223964871e-05, "loss": 0.4217, "step": 989 }, { "epoch": 3.1203776553894573, "grad_norm": 0.3285213122250933, "learning_rate": 1.4859094672562314e-05, "loss": 0.4578, "step": 990 }, { "epoch": 3.123524783634933, "grad_norm": 0.45616500289860873, "learning_rate": 1.4816526072858881e-05, "loss": 0.4545, "step": 991 }, { "epoch": 3.126671911880409, "grad_norm": 0.35479473494648844, "learning_rate": 1.4773982631463879e-05, "loss": 0.4504, "step": 992 }, { "epoch": 3.129819040125885, "grad_norm": 0.40625773248569635, "learning_rate": 1.4731464554864503e-05, "loss": 0.4594, "step": 993 }, { "epoch": 3.132966168371361, "grad_norm": 0.3417485916649555, "learning_rate": 1.468897204942485e-05, "loss": 0.4064, "step": 994 }, { "epoch": 3.1361132966168372, "grad_norm": 0.44443922220128873, "learning_rate": 1.4646505321384896e-05, "loss": 0.481, "step": 995 }, { "epoch": 3.139260424862313, "grad_norm": 0.33497076522190666, "learning_rate": 1.4604064576859513e-05, "loss": 0.4439, "step": 996 }, { "epoch": 3.1424075531077893, "grad_norm": 0.40460214907111397, "learning_rate": 1.4561650021837461e-05, "loss": 0.4535, "step": 997 }, { "epoch": 3.145554681353265, "grad_norm": 0.3548757014421584, "learning_rate": 1.4519261862180365e-05, "loss": 0.4305, "step": 998 }, { "epoch": 3.1487018095987414, "grad_norm": 0.39006944874130967, "learning_rate": 1.447690030362177e-05, "loss": 0.4605, "step": 999 }, { "epoch": 3.151848937844217, "grad_norm": 0.2947807614670378, "learning_rate": 1.4434565551766091e-05, "loss": 0.4375, "step": 1000 }, { "epoch": 3.154996066089693, "grad_norm": 0.37151093567185317, "learning_rate": 1.4392257812087644e-05, "loss": 0.4437, "step": 1001 }, { "epoch": 3.158143194335169, "grad_norm": 0.3043566527179534, "learning_rate": 1.4349977289929639e-05, "loss": 0.4434, "step": 1002 }, { "epoch": 3.161290322580645, "grad_norm": 0.3239420861001395, "learning_rate": 1.4307724190503174e-05, "loss": 0.4602, "step": 1003 }, { "epoch": 3.1644374508261213, "grad_norm": 0.3022915343958959, "learning_rate": 1.4265498718886263e-05, "loss": 0.4432, "step": 1004 }, { "epoch": 3.167584579071597, "grad_norm": 0.32662353296688856, "learning_rate": 1.4223301080022829e-05, "loss": 0.4655, "step": 1005 }, { "epoch": 3.1707317073170733, "grad_norm": 0.30216819737098594, "learning_rate": 1.4181131478721679e-05, "loss": 0.4488, "step": 1006 }, { "epoch": 3.173878835562549, "grad_norm": 0.3194738978026303, "learning_rate": 1.4138990119655573e-05, "loss": 0.4664, "step": 1007 }, { "epoch": 3.1770259638080254, "grad_norm": 0.3238992239390259, "learning_rate": 1.4096877207360172e-05, "loss": 0.4247, "step": 1008 }, { "epoch": 3.180173092053501, "grad_norm": 0.38746919789270406, "learning_rate": 1.4054792946233082e-05, "loss": 0.4439, "step": 1009 }, { "epoch": 3.183320220298977, "grad_norm": 0.31269921601318473, "learning_rate": 1.4012737540532842e-05, "loss": 0.4728, "step": 1010 }, { "epoch": 3.1864673485444532, "grad_norm": 0.4583973254397915, "learning_rate": 1.3970711194377944e-05, "loss": 0.4425, "step": 1011 }, { "epoch": 3.189614476789929, "grad_norm": 0.3107178198172973, "learning_rate": 1.3928714111745834e-05, "loss": 0.4473, "step": 1012 }, { "epoch": 3.1927616050354053, "grad_norm": 0.42357819649857154, "learning_rate": 1.3886746496471927e-05, "loss": 0.4447, "step": 1013 }, { "epoch": 3.195908733280881, "grad_norm": 0.337426375904954, "learning_rate": 1.3844808552248612e-05, "loss": 0.4678, "step": 1014 }, { "epoch": 3.1990558615263573, "grad_norm": 0.42860966362762376, "learning_rate": 1.3802900482624275e-05, "loss": 0.4345, "step": 1015 }, { "epoch": 3.202202989771833, "grad_norm": 0.3879836551122409, "learning_rate": 1.3761022491002298e-05, "loss": 0.4367, "step": 1016 }, { "epoch": 3.2053501180173094, "grad_norm": 0.4089210534095531, "learning_rate": 1.3719174780640087e-05, "loss": 0.4589, "step": 1017 }, { "epoch": 3.208497246262785, "grad_norm": 0.3593147408746166, "learning_rate": 1.3677357554648061e-05, "loss": 0.4503, "step": 1018 }, { "epoch": 3.211644374508261, "grad_norm": 0.31622524795270396, "learning_rate": 1.36355710159887e-05, "loss": 0.4558, "step": 1019 }, { "epoch": 3.2147915027537373, "grad_norm": 0.4132536639954306, "learning_rate": 1.3593815367475518e-05, "loss": 0.4373, "step": 1020 }, { "epoch": 3.217938630999213, "grad_norm": 0.3606321436614752, "learning_rate": 1.3552090811772134e-05, "loss": 0.4504, "step": 1021 }, { "epoch": 3.2210857592446893, "grad_norm": 0.36333999573846615, "learning_rate": 1.3510397551391212e-05, "loss": 0.4623, "step": 1022 }, { "epoch": 3.224232887490165, "grad_norm": 0.29550053984309993, "learning_rate": 1.3468735788693563e-05, "loss": 0.4287, "step": 1023 }, { "epoch": 3.2273800157356414, "grad_norm": 0.43844592374705915, "learning_rate": 1.3427105725887098e-05, "loss": 0.4675, "step": 1024 }, { "epoch": 3.230527143981117, "grad_norm": 0.34132693908368855, "learning_rate": 1.3385507565025884e-05, "loss": 0.4495, "step": 1025 }, { "epoch": 3.2336742722265934, "grad_norm": 0.435235912189035, "learning_rate": 1.334394150800914e-05, "loss": 0.4518, "step": 1026 }, { "epoch": 3.2368214004720692, "grad_norm": 0.35002633272777456, "learning_rate": 1.3302407756580278e-05, "loss": 0.4442, "step": 1027 }, { "epoch": 3.239968528717545, "grad_norm": 0.5022707232854282, "learning_rate": 1.32609065123259e-05, "loss": 0.4543, "step": 1028 }, { "epoch": 3.2431156569630213, "grad_norm": 0.3200367426841411, "learning_rate": 1.3219437976674847e-05, "loss": 0.4368, "step": 1029 }, { "epoch": 3.246262785208497, "grad_norm": 0.5138494915176968, "learning_rate": 1.317800235089719e-05, "loss": 0.464, "step": 1030 }, { "epoch": 3.2494099134539733, "grad_norm": 0.3687799468130162, "learning_rate": 1.313659983610328e-05, "loss": 0.4695, "step": 1031 }, { "epoch": 3.252557041699449, "grad_norm": 0.47713155015281183, "learning_rate": 1.3095230633242761e-05, "loss": 0.4455, "step": 1032 }, { "epoch": 3.2557041699449254, "grad_norm": 0.37374667859633126, "learning_rate": 1.3053894943103598e-05, "loss": 0.4389, "step": 1033 }, { "epoch": 3.258851298190401, "grad_norm": 0.3842402902889691, "learning_rate": 1.3012592966311091e-05, "loss": 0.4884, "step": 1034 }, { "epoch": 3.2619984264358775, "grad_norm": 0.3547447427297586, "learning_rate": 1.2971324903326923e-05, "loss": 0.4232, "step": 1035 }, { "epoch": 3.2651455546813533, "grad_norm": 0.32157783163039644, "learning_rate": 1.293009095444816e-05, "loss": 0.4414, "step": 1036 }, { "epoch": 3.2682926829268295, "grad_norm": 0.3546515803316329, "learning_rate": 1.2888891319806312e-05, "loss": 0.4535, "step": 1037 }, { "epoch": 3.2714398111723053, "grad_norm": 0.33391924791649685, "learning_rate": 1.284772619936632e-05, "loss": 0.4627, "step": 1038 }, { "epoch": 3.274586939417781, "grad_norm": 0.34842719790074683, "learning_rate": 1.2806595792925616e-05, "loss": 0.4348, "step": 1039 }, { "epoch": 3.2777340676632574, "grad_norm": 0.31720262229680574, "learning_rate": 1.2765500300113163e-05, "loss": 0.4465, "step": 1040 }, { "epoch": 3.280881195908733, "grad_norm": 0.37969959728816255, "learning_rate": 1.2724439920388445e-05, "loss": 0.4726, "step": 1041 }, { "epoch": 3.2840283241542094, "grad_norm": 0.3205555271128335, "learning_rate": 1.268341485304053e-05, "loss": 0.4179, "step": 1042 }, { "epoch": 3.2871754523996852, "grad_norm": 0.38765876310575315, "learning_rate": 1.2642425297187101e-05, "loss": 0.4647, "step": 1043 }, { "epoch": 3.2903225806451615, "grad_norm": 0.3079512941344033, "learning_rate": 1.260147145177348e-05, "loss": 0.4545, "step": 1044 }, { "epoch": 3.2934697088906373, "grad_norm": 0.4183847497019012, "learning_rate": 1.256055351557167e-05, "loss": 0.4565, "step": 1045 }, { "epoch": 3.2966168371361135, "grad_norm": 0.3070683731820513, "learning_rate": 1.2519671687179375e-05, "loss": 0.4438, "step": 1046 }, { "epoch": 3.2997639653815893, "grad_norm": 0.399817426938315, "learning_rate": 1.2478826165019053e-05, "loss": 0.4262, "step": 1047 }, { "epoch": 3.302911093627065, "grad_norm": 0.3547125394607051, "learning_rate": 1.243801714733696e-05, "loss": 0.4658, "step": 1048 }, { "epoch": 3.3060582218725414, "grad_norm": 0.3436731206723249, "learning_rate": 1.2397244832202153e-05, "loss": 0.4504, "step": 1049 }, { "epoch": 3.309205350118017, "grad_norm": 0.38057310070493267, "learning_rate": 1.2356509417505573e-05, "loss": 0.4501, "step": 1050 }, { "epoch": 3.3123524783634934, "grad_norm": 0.27293264885045054, "learning_rate": 1.231581110095905e-05, "loss": 0.4407, "step": 1051 }, { "epoch": 3.3154996066089693, "grad_norm": 0.40319367595969435, "learning_rate": 1.2275150080094348e-05, "loss": 0.4636, "step": 1052 }, { "epoch": 3.3186467348544455, "grad_norm": 0.31214404066637935, "learning_rate": 1.2234526552262243e-05, "loss": 0.4537, "step": 1053 }, { "epoch": 3.3217938630999213, "grad_norm": 0.3196911749183487, "learning_rate": 1.21939407146315e-05, "loss": 0.4464, "step": 1054 }, { "epoch": 3.3249409913453976, "grad_norm": 0.43833460528156976, "learning_rate": 1.2153392764187974e-05, "loss": 0.4575, "step": 1055 }, { "epoch": 3.3280881195908734, "grad_norm": 0.3320927741275933, "learning_rate": 1.2112882897733634e-05, "loss": 0.4548, "step": 1056 }, { "epoch": 3.331235247836349, "grad_norm": 0.38130010599050557, "learning_rate": 1.2072411311885588e-05, "loss": 0.4433, "step": 1057 }, { "epoch": 3.3343823760818254, "grad_norm": 0.32380680107010973, "learning_rate": 1.2031978203075172e-05, "loss": 0.4331, "step": 1058 }, { "epoch": 3.337529504327301, "grad_norm": 0.36742670403821587, "learning_rate": 1.1991583767546948e-05, "loss": 0.4616, "step": 1059 }, { "epoch": 3.3406766325727775, "grad_norm": 0.36279961379407005, "learning_rate": 1.1951228201357794e-05, "loss": 0.4571, "step": 1060 }, { "epoch": 3.3438237608182533, "grad_norm": 0.27086173597488794, "learning_rate": 1.1910911700375924e-05, "loss": 0.4171, "step": 1061 }, { "epoch": 3.3469708890637295, "grad_norm": 0.40509428003250936, "learning_rate": 1.1870634460279937e-05, "loss": 0.4959, "step": 1062 }, { "epoch": 3.3501180173092053, "grad_norm": 0.2788427389494092, "learning_rate": 1.1830396676557889e-05, "loss": 0.4259, "step": 1063 }, { "epoch": 3.3532651455546816, "grad_norm": 0.30665073285225036, "learning_rate": 1.1790198544506333e-05, "loss": 0.4427, "step": 1064 }, { "epoch": 3.3564122738001574, "grad_norm": 0.32614144159278, "learning_rate": 1.1750040259229365e-05, "loss": 0.4537, "step": 1065 }, { "epoch": 3.359559402045633, "grad_norm": 0.3266438015263831, "learning_rate": 1.170992201563769e-05, "loss": 0.4392, "step": 1066 }, { "epoch": 3.3627065302911094, "grad_norm": 0.2993872663278503, "learning_rate": 1.1669844008447654e-05, "loss": 0.4505, "step": 1067 }, { "epoch": 3.3658536585365852, "grad_norm": 0.2878451066832284, "learning_rate": 1.1629806432180324e-05, "loss": 0.4244, "step": 1068 }, { "epoch": 3.3690007867820615, "grad_norm": 0.33355834908904675, "learning_rate": 1.1589809481160539e-05, "loss": 0.4556, "step": 1069 }, { "epoch": 3.3721479150275373, "grad_norm": 0.27525931720756097, "learning_rate": 1.1549853349515939e-05, "loss": 0.4165, "step": 1070 }, { "epoch": 3.3752950432730136, "grad_norm": 0.31458748223848476, "learning_rate": 1.1509938231176068e-05, "loss": 0.4807, "step": 1071 }, { "epoch": 3.3784421715184894, "grad_norm": 0.3033894833023403, "learning_rate": 1.1470064319871393e-05, "loss": 0.4384, "step": 1072 }, { "epoch": 3.3815892997639656, "grad_norm": 0.29732541952252484, "learning_rate": 1.1430231809132386e-05, "loss": 0.4831, "step": 1073 }, { "epoch": 3.3847364280094414, "grad_norm": 0.29115088294335617, "learning_rate": 1.139044089228858e-05, "loss": 0.4428, "step": 1074 }, { "epoch": 3.387883556254917, "grad_norm": 0.296430263712168, "learning_rate": 1.1350691762467644e-05, "loss": 0.4515, "step": 1075 }, { "epoch": 3.3910306845003935, "grad_norm": 0.28782588365398537, "learning_rate": 1.1310984612594394e-05, "loss": 0.4184, "step": 1076 }, { "epoch": 3.3941778127458693, "grad_norm": 0.3067545371689205, "learning_rate": 1.1271319635389934e-05, "loss": 0.4626, "step": 1077 }, { "epoch": 3.3973249409913455, "grad_norm": 0.3161311488832454, "learning_rate": 1.1231697023370645e-05, "loss": 0.4504, "step": 1078 }, { "epoch": 3.4004720692368213, "grad_norm": 0.31991819718734327, "learning_rate": 1.1192116968847313e-05, "loss": 0.4573, "step": 1079 }, { "epoch": 3.4036191974822976, "grad_norm": 0.2902807527048948, "learning_rate": 1.1152579663924167e-05, "loss": 0.4562, "step": 1080 }, { "epoch": 3.4067663257277734, "grad_norm": 0.345254009382499, "learning_rate": 1.1113085300497919e-05, "loss": 0.4496, "step": 1081 }, { "epoch": 3.4099134539732496, "grad_norm": 0.27394920734180683, "learning_rate": 1.1073634070256895e-05, "loss": 0.4494, "step": 1082 }, { "epoch": 3.4130605822187254, "grad_norm": 0.3148362640218033, "learning_rate": 1.1034226164680066e-05, "loss": 0.4688, "step": 1083 }, { "epoch": 3.4162077104642012, "grad_norm": 0.2943354574773677, "learning_rate": 1.0994861775036101e-05, "loss": 0.44, "step": 1084 }, { "epoch": 3.4193548387096775, "grad_norm": 0.29129387101628995, "learning_rate": 1.0955541092382496e-05, "loss": 0.4464, "step": 1085 }, { "epoch": 3.4225019669551533, "grad_norm": 0.31501719097176634, "learning_rate": 1.0916264307564574e-05, "loss": 0.4651, "step": 1086 }, { "epoch": 3.4256490952006295, "grad_norm": 0.33225178849500403, "learning_rate": 1.0877031611214632e-05, "loss": 0.4587, "step": 1087 }, { "epoch": 3.4287962234461054, "grad_norm": 0.3345423444622172, "learning_rate": 1.083784319375097e-05, "loss": 0.4447, "step": 1088 }, { "epoch": 3.4319433516915816, "grad_norm": 0.27316577067100334, "learning_rate": 1.0798699245376959e-05, "loss": 0.4346, "step": 1089 }, { "epoch": 3.4350904799370574, "grad_norm": 0.30011292227050673, "learning_rate": 1.0759599956080162e-05, "loss": 0.4725, "step": 1090 }, { "epoch": 3.4382376081825337, "grad_norm": 0.29665774985859605, "learning_rate": 1.072054551563138e-05, "loss": 0.4387, "step": 1091 }, { "epoch": 3.4413847364280095, "grad_norm": 0.3039564711038381, "learning_rate": 1.068153611358372e-05, "loss": 0.4556, "step": 1092 }, { "epoch": 3.4445318646734853, "grad_norm": 0.30191917389875206, "learning_rate": 1.0642571939271723e-05, "loss": 0.4451, "step": 1093 }, { "epoch": 3.4476789929189615, "grad_norm": 0.29199496459557056, "learning_rate": 1.0603653181810379e-05, "loss": 0.4725, "step": 1094 }, { "epoch": 3.4508261211644373, "grad_norm": 0.3271566573252477, "learning_rate": 1.0564780030094272e-05, "loss": 0.4431, "step": 1095 }, { "epoch": 3.4539732494099136, "grad_norm": 0.27957614050990437, "learning_rate": 1.0525952672796636e-05, "loss": 0.4489, "step": 1096 }, { "epoch": 3.4571203776553894, "grad_norm": 0.2911856670468552, "learning_rate": 1.0487171298368412e-05, "loss": 0.4611, "step": 1097 }, { "epoch": 3.4602675059008656, "grad_norm": 0.32032706145968165, "learning_rate": 1.0448436095037385e-05, "loss": 0.4481, "step": 1098 }, { "epoch": 3.4634146341463414, "grad_norm": 0.32197178012548605, "learning_rate": 1.040974725080724e-05, "loss": 0.473, "step": 1099 }, { "epoch": 3.4665617623918177, "grad_norm": 0.3105584033001555, "learning_rate": 1.0371104953456663e-05, "loss": 0.468, "step": 1100 }, { "epoch": 3.4697088906372935, "grad_norm": 0.30202604130512095, "learning_rate": 1.0332509390538404e-05, "loss": 0.4344, "step": 1101 }, { "epoch": 3.4728560188827693, "grad_norm": 0.30955812132114763, "learning_rate": 1.0293960749378384e-05, "loss": 0.4636, "step": 1102 }, { "epoch": 3.4760031471282455, "grad_norm": 0.30252588064347674, "learning_rate": 1.0255459217074803e-05, "loss": 0.4607, "step": 1103 }, { "epoch": 3.4791502753737213, "grad_norm": 0.30519810066545155, "learning_rate": 1.021700498049722e-05, "loss": 0.4227, "step": 1104 }, { "epoch": 3.4822974036191976, "grad_norm": 0.28348020669441537, "learning_rate": 1.017859822628561e-05, "loss": 0.4605, "step": 1105 }, { "epoch": 3.4854445318646734, "grad_norm": 0.28732215547896256, "learning_rate": 1.0140239140849519e-05, "loss": 0.4494, "step": 1106 }, { "epoch": 3.4885916601101497, "grad_norm": 0.3173927002942517, "learning_rate": 1.0101927910367118e-05, "loss": 0.4524, "step": 1107 }, { "epoch": 3.4917387883556255, "grad_norm": 0.2670843455004391, "learning_rate": 1.006366472078432e-05, "loss": 0.4494, "step": 1108 }, { "epoch": 3.4948859166011017, "grad_norm": 0.28811825068841074, "learning_rate": 1.0025449757813852e-05, "loss": 0.4524, "step": 1109 }, { "epoch": 3.4980330448465775, "grad_norm": 0.29011073031991363, "learning_rate": 9.987283206934374e-06, "loss": 0.4568, "step": 1110 }, { "epoch": 3.5011801730920533, "grad_norm": 0.2651530625627191, "learning_rate": 9.949165253389588e-06, "loss": 0.4492, "step": 1111 }, { "epoch": 3.5043273013375296, "grad_norm": 0.28366453872298675, "learning_rate": 9.911096082187324e-06, "loss": 0.4428, "step": 1112 }, { "epoch": 3.5074744295830054, "grad_norm": 0.31212089217996963, "learning_rate": 9.873075878098623e-06, "loss": 0.4498, "step": 1113 }, { "epoch": 3.5106215578284816, "grad_norm": 0.2839621070001912, "learning_rate": 9.835104825656884e-06, "loss": 0.4508, "step": 1114 }, { "epoch": 3.5137686860739574, "grad_norm": 0.3356792889009391, "learning_rate": 9.797183109156938e-06, "loss": 0.473, "step": 1115 }, { "epoch": 3.5169158143194337, "grad_norm": 0.2583265687099093, "learning_rate": 9.759310912654167e-06, "loss": 0.4527, "step": 1116 }, { "epoch": 3.5200629425649095, "grad_norm": 0.3088294300859714, "learning_rate": 9.72148841996359e-06, "loss": 0.4491, "step": 1117 }, { "epoch": 3.5232100708103857, "grad_norm": 0.3100355304231426, "learning_rate": 9.683715814658987e-06, "loss": 0.4532, "step": 1118 }, { "epoch": 3.5263571990558615, "grad_norm": 0.28230509785485414, "learning_rate": 9.645993280072021e-06, "loss": 0.442, "step": 1119 }, { "epoch": 3.5295043273013373, "grad_norm": 0.302124749981287, "learning_rate": 9.608320999291333e-06, "loss": 0.4408, "step": 1120 }, { "epoch": 3.5326514555468136, "grad_norm": 0.31719529084230913, "learning_rate": 9.570699155161633e-06, "loss": 0.4501, "step": 1121 }, { "epoch": 3.5357985837922894, "grad_norm": 0.33943970273737095, "learning_rate": 9.533127930282855e-06, "loss": 0.4703, "step": 1122 }, { "epoch": 3.5389457120377656, "grad_norm": 0.37639643321932154, "learning_rate": 9.49560750700924e-06, "loss": 0.4608, "step": 1123 }, { "epoch": 3.5420928402832415, "grad_norm": 0.26960896872512685, "learning_rate": 9.458138067448469e-06, "loss": 0.4265, "step": 1124 }, { "epoch": 3.5452399685287177, "grad_norm": 0.32993014271365495, "learning_rate": 9.420719793460758e-06, "loss": 0.4337, "step": 1125 }, { "epoch": 3.5483870967741935, "grad_norm": 0.34626577482812293, "learning_rate": 9.383352866657987e-06, "loss": 0.4766, "step": 1126 }, { "epoch": 3.5515342250196698, "grad_norm": 0.30347167116503154, "learning_rate": 9.346037468402831e-06, "loss": 0.4318, "step": 1127 }, { "epoch": 3.5546813532651456, "grad_norm": 0.338884524041918, "learning_rate": 9.308773779807863e-06, "loss": 0.4561, "step": 1128 }, { "epoch": 3.5578284815106214, "grad_norm": 0.2816135680779377, "learning_rate": 9.271561981734687e-06, "loss": 0.4462, "step": 1129 }, { "epoch": 3.5609756097560976, "grad_norm": 0.3152958510112679, "learning_rate": 9.234402254793033e-06, "loss": 0.4359, "step": 1130 }, { "epoch": 3.5641227380015734, "grad_norm": 0.28881402590713195, "learning_rate": 9.19729477933992e-06, "loss": 0.4623, "step": 1131 }, { "epoch": 3.5672698662470497, "grad_norm": 0.2654482197647601, "learning_rate": 9.16023973547876e-06, "loss": 0.4351, "step": 1132 }, { "epoch": 3.5704169944925255, "grad_norm": 0.2793158707050921, "learning_rate": 9.123237303058474e-06, "loss": 0.4537, "step": 1133 }, { "epoch": 3.5735641227380017, "grad_norm": 0.32371965320338886, "learning_rate": 9.086287661672629e-06, "loss": 0.4293, "step": 1134 }, { "epoch": 3.5767112509834775, "grad_norm": 0.3088971938054911, "learning_rate": 9.049390990658579e-06, "loss": 0.4696, "step": 1135 }, { "epoch": 3.579858379228954, "grad_norm": 0.2754420783162646, "learning_rate": 9.012547469096584e-06, "loss": 0.4503, "step": 1136 }, { "epoch": 3.5830055074744296, "grad_norm": 0.2897270638504705, "learning_rate": 8.975757275808936e-06, "loss": 0.4504, "step": 1137 }, { "epoch": 3.5861526357199054, "grad_norm": 0.32953833717635256, "learning_rate": 8.93902058935908e-06, "loss": 0.4516, "step": 1138 }, { "epoch": 3.5892997639653816, "grad_norm": 0.293563094341784, "learning_rate": 8.902337588050783e-06, "loss": 0.4605, "step": 1139 }, { "epoch": 3.5924468922108574, "grad_norm": 0.28541037221224175, "learning_rate": 8.865708449927241e-06, "loss": 0.4385, "step": 1140 }, { "epoch": 3.5955940204563337, "grad_norm": 0.2769667048762866, "learning_rate": 8.829133352770236e-06, "loss": 0.4528, "step": 1141 }, { "epoch": 3.5987411487018095, "grad_norm": 0.3063282915245064, "learning_rate": 8.792612474099213e-06, "loss": 0.4595, "step": 1142 }, { "epoch": 3.6018882769472857, "grad_norm": 0.27297998759691816, "learning_rate": 8.756145991170513e-06, "loss": 0.4387, "step": 1143 }, { "epoch": 3.6050354051927616, "grad_norm": 0.29288749403302455, "learning_rate": 8.719734080976441e-06, "loss": 0.4522, "step": 1144 }, { "epoch": 3.608182533438238, "grad_norm": 0.26484230856498, "learning_rate": 8.683376920244446e-06, "loss": 0.4269, "step": 1145 }, { "epoch": 3.6113296616837136, "grad_norm": 0.2594230460233364, "learning_rate": 8.647074685436223e-06, "loss": 0.4692, "step": 1146 }, { "epoch": 3.6144767899291894, "grad_norm": 0.3145876246673129, "learning_rate": 8.610827552746897e-06, "loss": 0.4552, "step": 1147 }, { "epoch": 3.6176239181746657, "grad_norm": 0.2953504368805096, "learning_rate": 8.57463569810415e-06, "loss": 0.4543, "step": 1148 }, { "epoch": 3.6207710464201415, "grad_norm": 0.2903789889032914, "learning_rate": 8.538499297167385e-06, "loss": 0.4324, "step": 1149 }, { "epoch": 3.6239181746656177, "grad_norm": 0.2951350855679963, "learning_rate": 8.502418525326801e-06, "loss": 0.4382, "step": 1150 }, { "epoch": 3.6270653029110935, "grad_norm": 0.28051588971875474, "learning_rate": 8.466393557702659e-06, "loss": 0.4487, "step": 1151 }, { "epoch": 3.6302124311565698, "grad_norm": 0.32378806547935485, "learning_rate": 8.430424569144345e-06, "loss": 0.4541, "step": 1152 }, { "epoch": 3.6333595594020456, "grad_norm": 0.33107209363046897, "learning_rate": 8.394511734229556e-06, "loss": 0.465, "step": 1153 }, { "epoch": 3.636506687647522, "grad_norm": 0.31121756126159067, "learning_rate": 8.358655227263424e-06, "loss": 0.4587, "step": 1154 }, { "epoch": 3.6396538158929976, "grad_norm": 0.3283802721771546, "learning_rate": 8.322855222277708e-06, "loss": 0.4456, "step": 1155 }, { "epoch": 3.6428009441384734, "grad_norm": 0.2377363192142234, "learning_rate": 8.287111893029929e-06, "loss": 0.4455, "step": 1156 }, { "epoch": 3.6459480723839497, "grad_norm": 0.2932521033552982, "learning_rate": 8.251425413002534e-06, "loss": 0.4541, "step": 1157 }, { "epoch": 3.6490952006294255, "grad_norm": 0.2805611062166459, "learning_rate": 8.215795955402032e-06, "loss": 0.4489, "step": 1158 }, { "epoch": 3.6522423288749017, "grad_norm": 0.2684630930767302, "learning_rate": 8.180223693158175e-06, "loss": 0.4418, "step": 1159 }, { "epoch": 3.6553894571203775, "grad_norm": 0.2833671321964946, "learning_rate": 8.144708798923125e-06, "loss": 0.4673, "step": 1160 }, { "epoch": 3.658536585365854, "grad_norm": 0.2587422980629999, "learning_rate": 8.109251445070602e-06, "loss": 0.4253, "step": 1161 }, { "epoch": 3.6616837136113296, "grad_norm": 0.2886779130244593, "learning_rate": 8.073851803695033e-06, "loss": 0.4686, "step": 1162 }, { "epoch": 3.664830841856806, "grad_norm": 0.2677528808996215, "learning_rate": 8.03851004661076e-06, "loss": 0.4403, "step": 1163 }, { "epoch": 3.6679779701022817, "grad_norm": 0.25135806823271106, "learning_rate": 8.003226345351161e-06, "loss": 0.4565, "step": 1164 }, { "epoch": 3.6711250983477575, "grad_norm": 0.26013779523809955, "learning_rate": 7.968000871167849e-06, "loss": 0.4469, "step": 1165 }, { "epoch": 3.6742722265932337, "grad_norm": 0.2602422338025341, "learning_rate": 7.93283379502982e-06, "loss": 0.4506, "step": 1166 }, { "epoch": 3.6774193548387095, "grad_norm": 0.2526902487256982, "learning_rate": 7.897725287622625e-06, "loss": 0.4476, "step": 1167 }, { "epoch": 3.6805664830841858, "grad_norm": 0.2788501883728098, "learning_rate": 7.862675519347562e-06, "loss": 0.4648, "step": 1168 }, { "epoch": 3.6837136113296616, "grad_norm": 0.26375904220072866, "learning_rate": 7.827684660320832e-06, "loss": 0.4482, "step": 1169 }, { "epoch": 3.686860739575138, "grad_norm": 0.27321586538638337, "learning_rate": 7.792752880372718e-06, "loss": 0.456, "step": 1170 }, { "epoch": 3.6900078678206136, "grad_norm": 0.2605976271837837, "learning_rate": 7.757880349046742e-06, "loss": 0.4167, "step": 1171 }, { "epoch": 3.69315499606609, "grad_norm": 0.3033078670365782, "learning_rate": 7.723067235598882e-06, "loss": 0.4889, "step": 1172 }, { "epoch": 3.6963021243115657, "grad_norm": 0.285808272646413, "learning_rate": 7.688313708996724e-06, "loss": 0.4731, "step": 1173 }, { "epoch": 3.6994492525570415, "grad_norm": 0.25978835534209727, "learning_rate": 7.653619937918633e-06, "loss": 0.4417, "step": 1174 }, { "epoch": 3.7025963808025177, "grad_norm": 0.291667100739017, "learning_rate": 7.618986090752944e-06, "loss": 0.4479, "step": 1175 }, { "epoch": 3.7057435090479935, "grad_norm": 0.27599608659380026, "learning_rate": 7.584412335597164e-06, "loss": 0.4521, "step": 1176 }, { "epoch": 3.70889063729347, "grad_norm": 0.28827831234418355, "learning_rate": 7.5498988402571375e-06, "loss": 0.4696, "step": 1177 }, { "epoch": 3.7120377655389456, "grad_norm": 0.28088824035080084, "learning_rate": 7.515445772246233e-06, "loss": 0.4429, "step": 1178 }, { "epoch": 3.715184893784422, "grad_norm": 0.27258796059875257, "learning_rate": 7.481053298784513e-06, "loss": 0.4414, "step": 1179 }, { "epoch": 3.7183320220298977, "grad_norm": 0.2633113146422214, "learning_rate": 7.446721586797965e-06, "loss": 0.4539, "step": 1180 }, { "epoch": 3.721479150275374, "grad_norm": 0.28264104854960853, "learning_rate": 7.4124508029176634e-06, "loss": 0.4691, "step": 1181 }, { "epoch": 3.7246262785208497, "grad_norm": 0.26255314972538524, "learning_rate": 7.37824111347895e-06, "loss": 0.4478, "step": 1182 }, { "epoch": 3.7277734067663255, "grad_norm": 0.25393959149162687, "learning_rate": 7.344092684520647e-06, "loss": 0.4294, "step": 1183 }, { "epoch": 3.7309205350118018, "grad_norm": 0.26457063732023456, "learning_rate": 7.310005681784249e-06, "loss": 0.4511, "step": 1184 }, { "epoch": 3.7340676632572776, "grad_norm": 0.2980208241476433, "learning_rate": 7.275980270713116e-06, "loss": 0.45, "step": 1185 }, { "epoch": 3.737214791502754, "grad_norm": 0.2619386826593012, "learning_rate": 7.242016616451675e-06, "loss": 0.4706, "step": 1186 }, { "epoch": 3.7403619197482296, "grad_norm": 0.25726156936586797, "learning_rate": 7.208114883844585e-06, "loss": 0.4346, "step": 1187 }, { "epoch": 3.743509047993706, "grad_norm": 0.30187535227273843, "learning_rate": 7.174275237435995e-06, "loss": 0.4524, "step": 1188 }, { "epoch": 3.7466561762391817, "grad_norm": 0.24968257054319973, "learning_rate": 7.140497841468708e-06, "loss": 0.4377, "step": 1189 }, { "epoch": 3.749803304484658, "grad_norm": 0.2898224004095643, "learning_rate": 7.106782859883377e-06, "loss": 0.463, "step": 1190 }, { "epoch": 3.7529504327301337, "grad_norm": 0.2684756971084031, "learning_rate": 7.073130456317728e-06, "loss": 0.4401, "step": 1191 }, { "epoch": 3.7560975609756095, "grad_norm": 0.2782685168228126, "learning_rate": 7.03954079410577e-06, "loss": 0.4541, "step": 1192 }, { "epoch": 3.759244689221086, "grad_norm": 0.2685406178964316, "learning_rate": 7.0060140362769866e-06, "loss": 0.4622, "step": 1193 }, { "epoch": 3.762391817466562, "grad_norm": 0.2734876298756259, "learning_rate": 6.97255034555556e-06, "loss": 0.4446, "step": 1194 }, { "epoch": 3.765538945712038, "grad_norm": 0.2361780988310677, "learning_rate": 6.939149884359548e-06, "loss": 0.4466, "step": 1195 }, { "epoch": 3.7686860739575136, "grad_norm": 0.2392787383755442, "learning_rate": 6.905812814800148e-06, "loss": 0.459, "step": 1196 }, { "epoch": 3.77183320220299, "grad_norm": 0.2649448309081467, "learning_rate": 6.872539298680874e-06, "loss": 0.4605, "step": 1197 }, { "epoch": 3.7749803304484657, "grad_norm": 0.25909568260718674, "learning_rate": 6.8393294974967624e-06, "loss": 0.4326, "step": 1198 }, { "epoch": 3.778127458693942, "grad_norm": 0.2469264004719789, "learning_rate": 6.806183572433634e-06, "loss": 0.4558, "step": 1199 }, { "epoch": 3.7812745869394178, "grad_norm": 0.2659864809268254, "learning_rate": 6.773101684367253e-06, "loss": 0.4501, "step": 1200 }, { "epoch": 3.7844217151848936, "grad_norm": 0.22605570719334842, "learning_rate": 6.740083993862599e-06, "loss": 0.4285, "step": 1201 }, { "epoch": 3.78756884343037, "grad_norm": 0.2568671201381285, "learning_rate": 6.7071306611730605e-06, "loss": 0.4559, "step": 1202 }, { "epoch": 3.790715971675846, "grad_norm": 0.2717829883222361, "learning_rate": 6.674241846239647e-06, "loss": 0.4506, "step": 1203 }, { "epoch": 3.793863099921322, "grad_norm": 0.24909097103686217, "learning_rate": 6.641417708690243e-06, "loss": 0.4466, "step": 1204 }, { "epoch": 3.7970102281667977, "grad_norm": 0.2745372511795683, "learning_rate": 6.60865840783882e-06, "loss": 0.4577, "step": 1205 }, { "epoch": 3.800157356412274, "grad_norm": 0.24699266487226804, "learning_rate": 6.575964102684638e-06, "loss": 0.4268, "step": 1206 }, { "epoch": 3.8033044846577497, "grad_norm": 0.2809825663762849, "learning_rate": 6.543334951911524e-06, "loss": 0.4724, "step": 1207 }, { "epoch": 3.806451612903226, "grad_norm": 0.23191080827372706, "learning_rate": 6.510771113887051e-06, "loss": 0.426, "step": 1208 }, { "epoch": 3.809598741148702, "grad_norm": 0.2548225394219185, "learning_rate": 6.478272746661807e-06, "loss": 0.4324, "step": 1209 }, { "epoch": 3.8127458693941776, "grad_norm": 0.25442728582670243, "learning_rate": 6.445840007968615e-06, "loss": 0.4581, "step": 1210 }, { "epoch": 3.815892997639654, "grad_norm": 0.23164636026813668, "learning_rate": 6.4134730552217505e-06, "loss": 0.4377, "step": 1211 }, { "epoch": 3.81904012588513, "grad_norm": 0.2506709647270749, "learning_rate": 6.3811720455162066e-06, "loss": 0.4449, "step": 1212 }, { "epoch": 3.822187254130606, "grad_norm": 0.24540007687497106, "learning_rate": 6.348937135626922e-06, "loss": 0.4375, "step": 1213 }, { "epoch": 3.8253343823760817, "grad_norm": 0.25688702433945887, "learning_rate": 6.3167684820079935e-06, "loss": 0.4532, "step": 1214 }, { "epoch": 3.828481510621558, "grad_norm": 0.25389579407575735, "learning_rate": 6.284666240791964e-06, "loss": 0.462, "step": 1215 }, { "epoch": 3.8316286388670338, "grad_norm": 0.2518855884586944, "learning_rate": 6.25263056778902e-06, "loss": 0.4383, "step": 1216 }, { "epoch": 3.83477576711251, "grad_norm": 0.24076981553815266, "learning_rate": 6.220661618486268e-06, "loss": 0.4448, "step": 1217 }, { "epoch": 3.837922895357986, "grad_norm": 0.28662980317279635, "learning_rate": 6.188759548046966e-06, "loss": 0.475, "step": 1218 }, { "epoch": 3.8410700236034616, "grad_norm": 0.273283179790291, "learning_rate": 6.156924511309772e-06, "loss": 0.454, "step": 1219 }, { "epoch": 3.844217151848938, "grad_norm": 0.23538840471581582, "learning_rate": 6.125156662787974e-06, "loss": 0.4304, "step": 1220 }, { "epoch": 3.847364280094414, "grad_norm": 0.2696502178915839, "learning_rate": 6.093456156668789e-06, "loss": 0.4327, "step": 1221 }, { "epoch": 3.85051140833989, "grad_norm": 0.24916492807660676, "learning_rate": 6.061823146812551e-06, "loss": 0.4632, "step": 1222 }, { "epoch": 3.8536585365853657, "grad_norm": 0.26232676880028327, "learning_rate": 6.030257786752025e-06, "loss": 0.4574, "step": 1223 }, { "epoch": 3.856805664830842, "grad_norm": 0.26236306850412155, "learning_rate": 5.998760229691609e-06, "loss": 0.4518, "step": 1224 }, { "epoch": 3.859952793076318, "grad_norm": 0.2532097902831472, "learning_rate": 5.9673306285066334e-06, "loss": 0.4417, "step": 1225 }, { "epoch": 3.863099921321794, "grad_norm": 0.25795699902561314, "learning_rate": 5.935969135742594e-06, "loss": 0.4697, "step": 1226 }, { "epoch": 3.86624704956727, "grad_norm": 0.25127140846223195, "learning_rate": 5.904675903614423e-06, "loss": 0.4405, "step": 1227 }, { "epoch": 3.8693941778127456, "grad_norm": 0.2639369068470998, "learning_rate": 5.87345108400573e-06, "loss": 0.4501, "step": 1228 }, { "epoch": 3.872541306058222, "grad_norm": 0.26587113576108584, "learning_rate": 5.842294828468103e-06, "loss": 0.4563, "step": 1229 }, { "epoch": 3.875688434303698, "grad_norm": 0.25849684097964204, "learning_rate": 5.811207288220324e-06, "loss": 0.4448, "step": 1230 }, { "epoch": 3.878835562549174, "grad_norm": 0.2452714319381085, "learning_rate": 5.78018861414769e-06, "loss": 0.4528, "step": 1231 }, { "epoch": 3.8819826907946497, "grad_norm": 0.2979932615250963, "learning_rate": 5.749238956801224e-06, "loss": 0.4604, "step": 1232 }, { "epoch": 3.885129819040126, "grad_norm": 0.2878800183330725, "learning_rate": 5.718358466396989e-06, "loss": 0.4737, "step": 1233 }, { "epoch": 3.888276947285602, "grad_norm": 0.2706243109140571, "learning_rate": 5.6875472928153406e-06, "loss": 0.4477, "step": 1234 }, { "epoch": 3.891424075531078, "grad_norm": 0.2851474406358009, "learning_rate": 5.656805585600205e-06, "loss": 0.4393, "step": 1235 }, { "epoch": 3.894571203776554, "grad_norm": 0.25037111117408634, "learning_rate": 5.626133493958326e-06, "loss": 0.4413, "step": 1236 }, { "epoch": 3.8977183320220297, "grad_norm": 0.251141291032625, "learning_rate": 5.595531166758597e-06, "loss": 0.4342, "step": 1237 }, { "epoch": 3.900865460267506, "grad_norm": 0.28369489479084636, "learning_rate": 5.564998752531274e-06, "loss": 0.4535, "step": 1238 }, { "epoch": 3.904012588512982, "grad_norm": 0.2642259062646044, "learning_rate": 5.534536399467314e-06, "loss": 0.4641, "step": 1239 }, { "epoch": 3.907159716758458, "grad_norm": 0.2662150854932771, "learning_rate": 5.504144255417605e-06, "loss": 0.4522, "step": 1240 }, { "epoch": 3.9103068450039338, "grad_norm": 0.2562572069358635, "learning_rate": 5.473822467892283e-06, "loss": 0.4521, "step": 1241 }, { "epoch": 3.91345397324941, "grad_norm": 0.25491920921375993, "learning_rate": 5.443571184060003e-06, "loss": 0.4441, "step": 1242 }, { "epoch": 3.916601101494886, "grad_norm": 0.27267676966274623, "learning_rate": 5.413390550747235e-06, "loss": 0.455, "step": 1243 }, { "epoch": 3.919748229740362, "grad_norm": 0.25033486517798026, "learning_rate": 5.383280714437518e-06, "loss": 0.4448, "step": 1244 }, { "epoch": 3.922895357985838, "grad_norm": 0.2538139344130827, "learning_rate": 5.3532418212708005e-06, "loss": 0.4566, "step": 1245 }, { "epoch": 3.9260424862313137, "grad_norm": 0.2393368548565171, "learning_rate": 5.323274017042679e-06, "loss": 0.4422, "step": 1246 }, { "epoch": 3.92918961447679, "grad_norm": 0.2641138860959141, "learning_rate": 5.293377447203736e-06, "loss": 0.4545, "step": 1247 }, { "epoch": 3.932336742722266, "grad_norm": 0.28051370043311474, "learning_rate": 5.2635522568588104e-06, "loss": 0.4439, "step": 1248 }, { "epoch": 3.935483870967742, "grad_norm": 0.2668627514694695, "learning_rate": 5.233798590766279e-06, "loss": 0.4441, "step": 1249 }, { "epoch": 3.938630999213218, "grad_norm": 0.27446406089755127, "learning_rate": 5.204116593337391e-06, "loss": 0.4736, "step": 1250 }, { "epoch": 3.941778127458694, "grad_norm": 0.24657020997355675, "learning_rate": 5.174506408635549e-06, "loss": 0.4403, "step": 1251 }, { "epoch": 3.94492525570417, "grad_norm": 0.2539033223879348, "learning_rate": 5.144968180375582e-06, "loss": 0.454, "step": 1252 }, { "epoch": 3.948072383949646, "grad_norm": 0.2688344127945621, "learning_rate": 5.115502051923107e-06, "loss": 0.4385, "step": 1253 }, { "epoch": 3.951219512195122, "grad_norm": 0.24967312836049135, "learning_rate": 5.08610816629377e-06, "loss": 0.4593, "step": 1254 }, { "epoch": 3.9543666404405977, "grad_norm": 0.2503341374401903, "learning_rate": 5.056786666152607e-06, "loss": 0.4561, "step": 1255 }, { "epoch": 3.957513768686074, "grad_norm": 0.24358898694391512, "learning_rate": 5.027537693813318e-06, "loss": 0.4299, "step": 1256 }, { "epoch": 3.96066089693155, "grad_norm": 0.25400102773666905, "learning_rate": 4.998361391237572e-06, "loss": 0.4567, "step": 1257 }, { "epoch": 3.963808025177026, "grad_norm": 0.25044577158164033, "learning_rate": 4.96925790003435e-06, "loss": 0.4395, "step": 1258 }, { "epoch": 3.966955153422502, "grad_norm": 0.2851929917597238, "learning_rate": 4.940227361459235e-06, "loss": 0.4441, "step": 1259 }, { "epoch": 3.970102281667978, "grad_norm": 0.2760376789615726, "learning_rate": 4.9112699164137125e-06, "loss": 0.4564, "step": 1260 }, { "epoch": 3.973249409913454, "grad_norm": 0.24822387612918478, "learning_rate": 4.88238570544453e-06, "loss": 0.4367, "step": 1261 }, { "epoch": 3.97639653815893, "grad_norm": 0.26813541769717697, "learning_rate": 4.8535748687429626e-06, "loss": 0.4568, "step": 1262 }, { "epoch": 3.979543666404406, "grad_norm": 0.2741691569090349, "learning_rate": 4.824837546144183e-06, "loss": 0.4307, "step": 1263 }, { "epoch": 3.9826907946498817, "grad_norm": 0.2638749468978124, "learning_rate": 4.796173877126547e-06, "loss": 0.4514, "step": 1264 }, { "epoch": 3.985837922895358, "grad_norm": 0.2698153680403995, "learning_rate": 4.767584000810923e-06, "loss": 0.447, "step": 1265 }, { "epoch": 3.9889850511408342, "grad_norm": 0.25760541199532266, "learning_rate": 4.73906805596003e-06, "loss": 0.4569, "step": 1266 }, { "epoch": 3.99213217938631, "grad_norm": 0.27869142570317007, "learning_rate": 4.7106261809777555e-06, "loss": 0.4505, "step": 1267 }, { "epoch": 3.995279307631786, "grad_norm": 0.2668849867730668, "learning_rate": 4.682258513908491e-06, "loss": 0.4303, "step": 1268 }, { "epoch": 3.998426435877262, "grad_norm": 0.5231563669429266, "learning_rate": 4.6539651924364386e-06, "loss": 0.7924, "step": 1269 }, { "epoch": 4.003147128245476, "grad_norm": 0.39985400842560725, "learning_rate": 4.6257463538849634e-06, "loss": 0.4139, "step": 1270 }, { "epoch": 4.006294256490952, "grad_norm": 0.34856593564315125, "learning_rate": 4.5976021352159354e-06, "loss": 0.4066, "step": 1271 }, { "epoch": 4.009441384736428, "grad_norm": 0.2697848208316017, "learning_rate": 4.569532673029049e-06, "loss": 0.405, "step": 1272 }, { "epoch": 4.012588512981904, "grad_norm": 0.36536846636389503, "learning_rate": 4.541538103561147e-06, "loss": 0.4099, "step": 1273 }, { "epoch": 4.01573564122738, "grad_norm": 0.3822880389205967, "learning_rate": 4.5136185626855974e-06, "loss": 0.4115, "step": 1274 }, { "epoch": 4.018882769472856, "grad_norm": 0.30382860641169, "learning_rate": 4.4857741859116024e-06, "loss": 0.406, "step": 1275 }, { "epoch": 4.022029897718332, "grad_norm": 0.3476521357529064, "learning_rate": 4.458005108383554e-06, "loss": 0.3933, "step": 1276 }, { "epoch": 4.025177025963808, "grad_norm": 0.38110304808948026, "learning_rate": 4.430311464880368e-06, "loss": 0.4128, "step": 1277 }, { "epoch": 4.028324154209284, "grad_norm": 0.3408670039383935, "learning_rate": 4.402693389814838e-06, "loss": 0.4046, "step": 1278 }, { "epoch": 4.03147128245476, "grad_norm": 0.28165306615311936, "learning_rate": 4.3751510172329854e-06, "loss": 0.4132, "step": 1279 }, { "epoch": 4.034618410700236, "grad_norm": 0.3035311504325217, "learning_rate": 4.347684480813412e-06, "loss": 0.4183, "step": 1280 }, { "epoch": 4.037765538945712, "grad_norm": 0.3209565209551781, "learning_rate": 4.3202939138666225e-06, "loss": 0.4101, "step": 1281 }, { "epoch": 4.040912667191188, "grad_norm": 0.3182494739242946, "learning_rate": 4.292979449334423e-06, "loss": 0.4003, "step": 1282 }, { "epoch": 4.044059795436664, "grad_norm": 0.28165910027825686, "learning_rate": 4.265741219789234e-06, "loss": 0.4075, "step": 1283 }, { "epoch": 4.04720692368214, "grad_norm": 0.2984326713469533, "learning_rate": 4.2385793574334834e-06, "loss": 0.3958, "step": 1284 }, { "epoch": 4.050354051927616, "grad_norm": 0.2800971603115525, "learning_rate": 4.211493994098928e-06, "loss": 0.4103, "step": 1285 }, { "epoch": 4.053501180173092, "grad_norm": 0.28810828941236666, "learning_rate": 4.184485261246032e-06, "loss": 0.4067, "step": 1286 }, { "epoch": 4.056648308418568, "grad_norm": 0.27485623439753526, "learning_rate": 4.157553289963343e-06, "loss": 0.408, "step": 1287 }, { "epoch": 4.059795436664044, "grad_norm": 0.27989550341259173, "learning_rate": 4.130698210966839e-06, "loss": 0.3983, "step": 1288 }, { "epoch": 4.06294256490952, "grad_norm": 0.28334021515417757, "learning_rate": 4.103920154599282e-06, "loss": 0.414, "step": 1289 }, { "epoch": 4.066089693154996, "grad_norm": 0.26098813310869456, "learning_rate": 4.0772192508296136e-06, "loss": 0.3966, "step": 1290 }, { "epoch": 4.069236821400472, "grad_norm": 0.28050338178042683, "learning_rate": 4.0505956292523116e-06, "loss": 0.412, "step": 1291 }, { "epoch": 4.072383949645948, "grad_norm": 0.2505929913804565, "learning_rate": 4.024049419086755e-06, "loss": 0.4124, "step": 1292 }, { "epoch": 4.075531077891424, "grad_norm": 0.2761708626349122, "learning_rate": 3.997580749176597e-06, "loss": 0.4075, "step": 1293 }, { "epoch": 4.0786782061369005, "grad_norm": 0.27161068055765497, "learning_rate": 3.9711897479891485e-06, "loss": 0.4008, "step": 1294 }, { "epoch": 4.081825334382376, "grad_norm": 0.2858129658437941, "learning_rate": 3.944876543614753e-06, "loss": 0.4007, "step": 1295 }, { "epoch": 4.084972462627852, "grad_norm": 0.29589833337185484, "learning_rate": 3.918641263766163e-06, "loss": 0.4125, "step": 1296 }, { "epoch": 4.088119590873328, "grad_norm": 0.2588524865322138, "learning_rate": 3.892484035777921e-06, "loss": 0.4089, "step": 1297 }, { "epoch": 4.091266719118804, "grad_norm": 0.26760010167593534, "learning_rate": 3.866404986605728e-06, "loss": 0.4146, "step": 1298 }, { "epoch": 4.09441384736428, "grad_norm": 0.29495649911223, "learning_rate": 3.840404242825848e-06, "loss": 0.406, "step": 1299 }, { "epoch": 4.097560975609756, "grad_norm": 0.26662575099279967, "learning_rate": 3.814481930634497e-06, "loss": 0.4031, "step": 1300 }, { "epoch": 4.100708103855232, "grad_norm": 0.2583645865911736, "learning_rate": 3.7886381758471944e-06, "loss": 0.4059, "step": 1301 }, { "epoch": 4.103855232100708, "grad_norm": 0.25557494028108535, "learning_rate": 3.7628731038981856e-06, "loss": 0.4055, "step": 1302 }, { "epoch": 4.1070023603461845, "grad_norm": 0.2837872643694464, "learning_rate": 3.7371868398398346e-06, "loss": 0.413, "step": 1303 }, { "epoch": 4.11014948859166, "grad_norm": 0.2513483558936818, "learning_rate": 3.7115795083419937e-06, "loss": 0.4096, "step": 1304 }, { "epoch": 4.113296616837136, "grad_norm": 0.2512125129099929, "learning_rate": 3.6860512336914256e-06, "loss": 0.409, "step": 1305 }, { "epoch": 4.116443745082612, "grad_norm": 0.2576029104676369, "learning_rate": 3.6606021397911605e-06, "loss": 0.3965, "step": 1306 }, { "epoch": 4.119590873328088, "grad_norm": 0.3077600696852511, "learning_rate": 3.635232350159945e-06, "loss": 0.4017, "step": 1307 }, { "epoch": 4.122738001573564, "grad_norm": 0.30096423158758295, "learning_rate": 3.6099419879316065e-06, "loss": 0.4059, "step": 1308 }, { "epoch": 4.12588512981904, "grad_norm": 0.241419047105453, "learning_rate": 3.584731175854479e-06, "loss": 0.4057, "step": 1309 }, { "epoch": 4.129032258064516, "grad_norm": 0.3067465629753672, "learning_rate": 3.559600036290762e-06, "loss": 0.4057, "step": 1310 }, { "epoch": 4.132179386309992, "grad_norm": 0.28376888195603434, "learning_rate": 3.5345486912159954e-06, "loss": 0.4033, "step": 1311 }, { "epoch": 4.1353265145554685, "grad_norm": 0.23183080156041488, "learning_rate": 3.5095772622184177e-06, "loss": 0.4062, "step": 1312 }, { "epoch": 4.138473642800944, "grad_norm": 0.24058407221906955, "learning_rate": 3.48468587049839e-06, "loss": 0.4064, "step": 1313 }, { "epoch": 4.14162077104642, "grad_norm": 0.2489883247653874, "learning_rate": 3.459874636867804e-06, "loss": 0.4071, "step": 1314 }, { "epoch": 4.144767899291896, "grad_norm": 0.2606905433022103, "learning_rate": 3.435143681749504e-06, "loss": 0.408, "step": 1315 }, { "epoch": 4.147915027537372, "grad_norm": 0.2705046247273454, "learning_rate": 3.4104931251766993e-06, "loss": 0.3972, "step": 1316 }, { "epoch": 4.151062155782848, "grad_norm": 0.24481800219341115, "learning_rate": 3.3859230867923842e-06, "loss": 0.4071, "step": 1317 }, { "epoch": 4.154209284028324, "grad_norm": 0.2300331920081832, "learning_rate": 3.3614336858487294e-06, "loss": 0.406, "step": 1318 }, { "epoch": 4.1573564122738, "grad_norm": 0.2622648565386252, "learning_rate": 3.337025041206552e-06, "loss": 0.4069, "step": 1319 }, { "epoch": 4.160503540519276, "grad_norm": 0.24755732541806952, "learning_rate": 3.3126972713347017e-06, "loss": 0.3964, "step": 1320 }, { "epoch": 4.1636506687647525, "grad_norm": 0.24521010796980297, "learning_rate": 3.288450494309512e-06, "loss": 0.4087, "step": 1321 }, { "epoch": 4.166797797010228, "grad_norm": 0.243073517258642, "learning_rate": 3.264284827814186e-06, "loss": 0.3996, "step": 1322 }, { "epoch": 4.169944925255704, "grad_norm": 0.2578493124544258, "learning_rate": 3.240200389138275e-06, "loss": 0.413, "step": 1323 }, { "epoch": 4.17309205350118, "grad_norm": 0.23083384176711083, "learning_rate": 3.2161972951770793e-06, "loss": 0.409, "step": 1324 }, { "epoch": 4.176239181746656, "grad_norm": 0.26371050841393034, "learning_rate": 3.192275662431088e-06, "loss": 0.4149, "step": 1325 }, { "epoch": 4.1793863099921325, "grad_norm": 0.21635862942599873, "learning_rate": 3.168435607005409e-06, "loss": 0.4152, "step": 1326 }, { "epoch": 4.182533438237608, "grad_norm": 0.25038919966086376, "learning_rate": 3.1446772446092087e-06, "loss": 0.3989, "step": 1327 }, { "epoch": 4.185680566483084, "grad_norm": 0.2538265944471299, "learning_rate": 3.1210006905551602e-06, "loss": 0.4111, "step": 1328 }, { "epoch": 4.18882769472856, "grad_norm": 0.2523419884908278, "learning_rate": 3.097406059758874e-06, "loss": 0.4232, "step": 1329 }, { "epoch": 4.191974822974037, "grad_norm": 0.24458106176738695, "learning_rate": 3.073893466738325e-06, "loss": 0.4071, "step": 1330 }, { "epoch": 4.195121951219512, "grad_norm": 0.24334039749426561, "learning_rate": 3.0504630256133326e-06, "loss": 0.4061, "step": 1331 }, { "epoch": 4.198269079464988, "grad_norm": 0.2712419815481148, "learning_rate": 3.0271148501049796e-06, "loss": 0.4025, "step": 1332 }, { "epoch": 4.201416207710464, "grad_norm": 0.24319759647270833, "learning_rate": 3.0038490535350685e-06, "loss": 0.4016, "step": 1333 }, { "epoch": 4.20456333595594, "grad_norm": 0.22524563871518907, "learning_rate": 2.9806657488255665e-06, "loss": 0.407, "step": 1334 }, { "epoch": 4.2077104642014165, "grad_norm": 0.24679305363109214, "learning_rate": 2.9575650484980568e-06, "loss": 0.3904, "step": 1335 }, { "epoch": 4.210857592446892, "grad_norm": 0.2649918330668857, "learning_rate": 2.934547064673212e-06, "loss": 0.4028, "step": 1336 }, { "epoch": 4.214004720692368, "grad_norm": 0.24510719686404595, "learning_rate": 2.911611909070229e-06, "loss": 0.3997, "step": 1337 }, { "epoch": 4.217151848937844, "grad_norm": 0.238894620688521, "learning_rate": 2.888759693006291e-06, "loss": 0.4008, "step": 1338 }, { "epoch": 4.220298977183321, "grad_norm": 0.21793391307390142, "learning_rate": 2.8659905273960233e-06, "loss": 0.4015, "step": 1339 }, { "epoch": 4.223446105428796, "grad_norm": 0.22998316847358136, "learning_rate": 2.8433045227509693e-06, "loss": 0.4047, "step": 1340 }, { "epoch": 4.226593233674272, "grad_norm": 0.23351985066501862, "learning_rate": 2.82070178917905e-06, "loss": 0.4054, "step": 1341 }, { "epoch": 4.229740361919748, "grad_norm": 0.27760093628492105, "learning_rate": 2.798182436384014e-06, "loss": 0.399, "step": 1342 }, { "epoch": 4.232887490165224, "grad_norm": 0.23788980231020992, "learning_rate": 2.7757465736649147e-06, "loss": 0.4104, "step": 1343 }, { "epoch": 4.2360346184107005, "grad_norm": 0.24654087188596363, "learning_rate": 2.753394309915589e-06, "loss": 0.4029, "step": 1344 }, { "epoch": 4.239181746656176, "grad_norm": 0.22498193770029656, "learning_rate": 2.731125753624124e-06, "loss": 0.4177, "step": 1345 }, { "epoch": 4.242328874901652, "grad_norm": 0.24467879466713818, "learning_rate": 2.708941012872326e-06, "loss": 0.4037, "step": 1346 }, { "epoch": 4.245476003147128, "grad_norm": 0.23216696901315087, "learning_rate": 2.6868401953351807e-06, "loss": 0.4084, "step": 1347 }, { "epoch": 4.248623131392605, "grad_norm": 0.22267899491966328, "learning_rate": 2.6648234082803705e-06, "loss": 0.4016, "step": 1348 }, { "epoch": 4.25177025963808, "grad_norm": 0.22320373263218757, "learning_rate": 2.642890758567722e-06, "loss": 0.4014, "step": 1349 }, { "epoch": 4.254917387883556, "grad_norm": 0.2282911262813021, "learning_rate": 2.621042352648693e-06, "loss": 0.4019, "step": 1350 }, { "epoch": 4.258064516129032, "grad_norm": 0.23150782371834555, "learning_rate": 2.5992782965658547e-06, "loss": 0.4047, "step": 1351 }, { "epoch": 4.261211644374509, "grad_norm": 0.22978744608702004, "learning_rate": 2.5775986959523925e-06, "loss": 0.4098, "step": 1352 }, { "epoch": 4.2643587726199845, "grad_norm": 0.237921148030588, "learning_rate": 2.556003656031576e-06, "loss": 0.4071, "step": 1353 }, { "epoch": 4.26750590086546, "grad_norm": 0.23356194839626504, "learning_rate": 2.5344932816162615e-06, "loss": 0.4126, "step": 1354 }, { "epoch": 4.270653029110936, "grad_norm": 0.24938990083349305, "learning_rate": 2.5130676771083585e-06, "loss": 0.3942, "step": 1355 }, { "epoch": 4.273800157356412, "grad_norm": 0.2222933450703462, "learning_rate": 2.4917269464983564e-06, "loss": 0.407, "step": 1356 }, { "epoch": 4.276947285601889, "grad_norm": 0.23038561355673803, "learning_rate": 2.470471193364805e-06, "loss": 0.3991, "step": 1357 }, { "epoch": 4.280094413847364, "grad_norm": 0.24048036243021406, "learning_rate": 2.4493005208738006e-06, "loss": 0.4103, "step": 1358 }, { "epoch": 4.28324154209284, "grad_norm": 0.24150535433511164, "learning_rate": 2.428215031778496e-06, "loss": 0.4026, "step": 1359 }, { "epoch": 4.286388670338316, "grad_norm": 0.24484793703016558, "learning_rate": 2.407214828418607e-06, "loss": 0.3997, "step": 1360 }, { "epoch": 4.289535798583792, "grad_norm": 0.2410295427955201, "learning_rate": 2.386300012719909e-06, "loss": 0.4107, "step": 1361 }, { "epoch": 4.2926829268292686, "grad_norm": 0.23834846513449068, "learning_rate": 2.365470686193745e-06, "loss": 0.4023, "step": 1362 }, { "epoch": 4.295830055074744, "grad_norm": 0.23615554155542462, "learning_rate": 2.3447269499365245e-06, "loss": 0.4108, "step": 1363 }, { "epoch": 4.29897718332022, "grad_norm": 0.2235992225457463, "learning_rate": 2.3240689046292398e-06, "loss": 0.4026, "step": 1364 }, { "epoch": 4.302124311565696, "grad_norm": 0.22355351647845262, "learning_rate": 2.3034966505369918e-06, "loss": 0.4111, "step": 1365 }, { "epoch": 4.305271439811173, "grad_norm": 0.22346679840500028, "learning_rate": 2.2830102875084605e-06, "loss": 0.4071, "step": 1366 }, { "epoch": 4.3084185680566485, "grad_norm": 0.204481079618315, "learning_rate": 2.26260991497548e-06, "loss": 0.4174, "step": 1367 }, { "epoch": 4.311565696302124, "grad_norm": 0.23428268541426928, "learning_rate": 2.242295631952496e-06, "loss": 0.3975, "step": 1368 }, { "epoch": 4.3147128245476, "grad_norm": 0.2529300915882862, "learning_rate": 2.222067537036132e-06, "loss": 0.3963, "step": 1369 }, { "epoch": 4.317859952793077, "grad_norm": 0.23794648521514886, "learning_rate": 2.2019257284046926e-06, "loss": 0.4064, "step": 1370 }, { "epoch": 4.321007081038553, "grad_norm": 0.23514607936361892, "learning_rate": 2.1818703038176703e-06, "loss": 0.4125, "step": 1371 }, { "epoch": 4.324154209284028, "grad_norm": 0.23494478364751686, "learning_rate": 2.1619013606152994e-06, "loss": 0.4045, "step": 1372 }, { "epoch": 4.327301337529504, "grad_norm": 0.22970284055952198, "learning_rate": 2.142018995718078e-06, "loss": 0.4011, "step": 1373 }, { "epoch": 4.33044846577498, "grad_norm": 0.22353700147820285, "learning_rate": 2.122223305626272e-06, "loss": 0.3961, "step": 1374 }, { "epoch": 4.333595594020457, "grad_norm": 0.23004457401194028, "learning_rate": 2.1025143864194876e-06, "loss": 0.4006, "step": 1375 }, { "epoch": 4.3367427222659325, "grad_norm": 0.22271838804401656, "learning_rate": 2.082892333756161e-06, "loss": 0.4107, "step": 1376 }, { "epoch": 4.339889850511408, "grad_norm": 0.2536096244783794, "learning_rate": 2.063357242873134e-06, "loss": 0.4143, "step": 1377 }, { "epoch": 4.343036978756884, "grad_norm": 0.23437814886577593, "learning_rate": 2.0439092085851685e-06, "loss": 0.4137, "step": 1378 }, { "epoch": 4.34618410700236, "grad_norm": 0.2271814604599019, "learning_rate": 2.0245483252844876e-06, "loss": 0.4087, "step": 1379 }, { "epoch": 4.349331235247837, "grad_norm": 0.22962388160368896, "learning_rate": 2.005274686940326e-06, "loss": 0.4027, "step": 1380 }, { "epoch": 4.352478363493312, "grad_norm": 0.22693012118130107, "learning_rate": 1.9860883870984725e-06, "loss": 0.4142, "step": 1381 }, { "epoch": 4.355625491738788, "grad_norm": 0.23266421196656764, "learning_rate": 1.966989518880802e-06, "loss": 0.3987, "step": 1382 }, { "epoch": 4.358772619984264, "grad_norm": 0.23705401380270869, "learning_rate": 1.9479781749848503e-06, "loss": 0.4116, "step": 1383 }, { "epoch": 4.361919748229741, "grad_norm": 0.23435894430026766, "learning_rate": 1.929054447683334e-06, "loss": 0.4022, "step": 1384 }, { "epoch": 4.3650668764752165, "grad_norm": 0.23802959991267317, "learning_rate": 1.9102184288237246e-06, "loss": 0.4176, "step": 1385 }, { "epoch": 4.368214004720692, "grad_norm": 0.22275801058423705, "learning_rate": 1.8914702098277948e-06, "loss": 0.4059, "step": 1386 }, { "epoch": 4.371361132966168, "grad_norm": 0.22939658749057806, "learning_rate": 1.87280988169118e-06, "loss": 0.4094, "step": 1387 }, { "epoch": 4.374508261211645, "grad_norm": 0.25039433961364527, "learning_rate": 1.854237534982919e-06, "loss": 0.407, "step": 1388 }, { "epoch": 4.377655389457121, "grad_norm": 0.2212864081241604, "learning_rate": 1.8357532598450412e-06, "loss": 0.3965, "step": 1389 }, { "epoch": 4.380802517702596, "grad_norm": 0.22895286253748667, "learning_rate": 1.8173571459921024e-06, "loss": 0.3999, "step": 1390 }, { "epoch": 4.383949645948072, "grad_norm": 0.2143332455962388, "learning_rate": 1.7990492827107742e-06, "loss": 0.4143, "step": 1391 }, { "epoch": 4.387096774193548, "grad_norm": 0.21916080732094737, "learning_rate": 1.780829758859388e-06, "loss": 0.4053, "step": 1392 }, { "epoch": 4.390243902439025, "grad_norm": 0.22708688040625066, "learning_rate": 1.7626986628675236e-06, "loss": 0.3959, "step": 1393 }, { "epoch": 4.3933910306845005, "grad_norm": 0.2410409889887991, "learning_rate": 1.7446560827355652e-06, "loss": 0.402, "step": 1394 }, { "epoch": 4.396538158929976, "grad_norm": 0.23099349556256393, "learning_rate": 1.726702106034286e-06, "loss": 0.4148, "step": 1395 }, { "epoch": 4.399685287175452, "grad_norm": 0.22073791862182493, "learning_rate": 1.7088368199044047e-06, "loss": 0.4178, "step": 1396 }, { "epoch": 4.402832415420928, "grad_norm": 0.22316395047578508, "learning_rate": 1.6910603110561895e-06, "loss": 0.4119, "step": 1397 }, { "epoch": 4.405979543666405, "grad_norm": 0.2189088160487407, "learning_rate": 1.6733726657690108e-06, "loss": 0.4025, "step": 1398 }, { "epoch": 4.4091266719118805, "grad_norm": 0.21750370355701165, "learning_rate": 1.6557739698909436e-06, "loss": 0.4111, "step": 1399 }, { "epoch": 4.412273800157356, "grad_norm": 0.23427229679001665, "learning_rate": 1.6382643088383288e-06, "loss": 0.3983, "step": 1400 }, { "epoch": 4.415420928402832, "grad_norm": 0.214624345546693, "learning_rate": 1.620843767595388e-06, "loss": 0.4142, "step": 1401 }, { "epoch": 4.418568056648309, "grad_norm": 0.22190509237137726, "learning_rate": 1.6035124307137828e-06, "loss": 0.4018, "step": 1402 }, { "epoch": 4.421715184893785, "grad_norm": 0.22933036447627164, "learning_rate": 1.5862703823122205e-06, "loss": 0.4041, "step": 1403 }, { "epoch": 4.42486231313926, "grad_norm": 0.22856794413059017, "learning_rate": 1.5691177060760377e-06, "loss": 0.4078, "step": 1404 }, { "epoch": 4.428009441384736, "grad_norm": 0.227335410882579, "learning_rate": 1.5520544852568042e-06, "loss": 0.3982, "step": 1405 }, { "epoch": 4.431156569630213, "grad_norm": 0.21207415831235102, "learning_rate": 1.5350808026719023e-06, "loss": 0.4071, "step": 1406 }, { "epoch": 4.434303697875689, "grad_norm": 0.22335589734890615, "learning_rate": 1.5181967407041454e-06, "loss": 0.3971, "step": 1407 }, { "epoch": 4.4374508261211645, "grad_norm": 0.22022397625368526, "learning_rate": 1.5014023813013578e-06, "loss": 0.4105, "step": 1408 }, { "epoch": 4.44059795436664, "grad_norm": 0.22294865873844572, "learning_rate": 1.4846978059759964e-06, "loss": 0.4152, "step": 1409 }, { "epoch": 4.443745082612116, "grad_norm": 0.23078813748776092, "learning_rate": 1.46808309580474e-06, "loss": 0.4012, "step": 1410 }, { "epoch": 4.446892210857593, "grad_norm": 0.2162240274521999, "learning_rate": 1.451558331428109e-06, "loss": 0.3941, "step": 1411 }, { "epoch": 4.450039339103069, "grad_norm": 0.23037478782989287, "learning_rate": 1.4351235930500497e-06, "loss": 0.3952, "step": 1412 }, { "epoch": 4.453186467348544, "grad_norm": 0.22007340136864595, "learning_rate": 1.4187789604375812e-06, "loss": 0.4118, "step": 1413 }, { "epoch": 4.45633359559402, "grad_norm": 0.23576271215221423, "learning_rate": 1.4025245129203736e-06, "loss": 0.4059, "step": 1414 }, { "epoch": 4.459480723839496, "grad_norm": 0.22441915779719693, "learning_rate": 1.3863603293903839e-06, "loss": 0.4015, "step": 1415 }, { "epoch": 4.462627852084973, "grad_norm": 0.2284380804154695, "learning_rate": 1.3702864883014755e-06, "loss": 0.4128, "step": 1416 }, { "epoch": 4.4657749803304485, "grad_norm": 0.22050257816998334, "learning_rate": 1.3543030676690138e-06, "loss": 0.4102, "step": 1417 }, { "epoch": 4.468922108575924, "grad_norm": 0.22331735465975805, "learning_rate": 1.338410145069511e-06, "loss": 0.4139, "step": 1418 }, { "epoch": 4.4720692368214, "grad_norm": 0.2196984071690986, "learning_rate": 1.3226077976402473e-06, "loss": 0.404, "step": 1419 }, { "epoch": 4.475216365066877, "grad_norm": 0.2109055906141469, "learning_rate": 1.3068961020788761e-06, "loss": 0.4115, "step": 1420 }, { "epoch": 4.478363493312353, "grad_norm": 0.21755200155816148, "learning_rate": 1.2912751346430818e-06, "loss": 0.391, "step": 1421 }, { "epoch": 4.481510621557828, "grad_norm": 0.2132260845561604, "learning_rate": 1.2757449711501835e-06, "loss": 0.4082, "step": 1422 }, { "epoch": 4.484657749803304, "grad_norm": 0.2239953728568239, "learning_rate": 1.2603056869767816e-06, "loss": 0.4102, "step": 1423 }, { "epoch": 4.487804878048781, "grad_norm": 0.21787966690564375, "learning_rate": 1.244957357058394e-06, "loss": 0.4115, "step": 1424 }, { "epoch": 4.490952006294257, "grad_norm": 0.21981977423701385, "learning_rate": 1.2297000558890716e-06, "loss": 0.4033, "step": 1425 }, { "epoch": 4.4940991345397325, "grad_norm": 0.21317215521752114, "learning_rate": 1.2145338575210653e-06, "loss": 0.4047, "step": 1426 }, { "epoch": 4.497246262785208, "grad_norm": 0.22312130217927098, "learning_rate": 1.1994588355644509e-06, "loss": 0.4053, "step": 1427 }, { "epoch": 4.500393391030684, "grad_norm": 0.22447203330251364, "learning_rate": 1.1844750631867675e-06, "loss": 0.4119, "step": 1428 }, { "epoch": 4.503540519276161, "grad_norm": 0.22046801333751878, "learning_rate": 1.1695826131126765e-06, "loss": 0.4191, "step": 1429 }, { "epoch": 4.506687647521637, "grad_norm": 0.22112092363500188, "learning_rate": 1.154781557623592e-06, "loss": 0.4048, "step": 1430 }, { "epoch": 4.5098347757671124, "grad_norm": 0.2200529030796755, "learning_rate": 1.1400719685573458e-06, "loss": 0.4097, "step": 1431 }, { "epoch": 4.512981904012588, "grad_norm": 0.22276401643430727, "learning_rate": 1.125453917307837e-06, "loss": 0.4073, "step": 1432 }, { "epoch": 4.516129032258064, "grad_norm": 0.22154029972838743, "learning_rate": 1.1109274748246634e-06, "loss": 0.4043, "step": 1433 }, { "epoch": 4.519276160503541, "grad_norm": 0.21625363180786117, "learning_rate": 1.0964927116128088e-06, "loss": 0.4038, "step": 1434 }, { "epoch": 4.522423288749017, "grad_norm": 0.22587542625356785, "learning_rate": 1.0821496977322822e-06, "loss": 0.4083, "step": 1435 }, { "epoch": 4.525570416994492, "grad_norm": 0.2298925317259965, "learning_rate": 1.0678985027977795e-06, "loss": 0.4097, "step": 1436 }, { "epoch": 4.528717545239968, "grad_norm": 0.21553526604171172, "learning_rate": 1.0537391959783495e-06, "loss": 0.4169, "step": 1437 }, { "epoch": 4.531864673485445, "grad_norm": 0.2176940353070853, "learning_rate": 1.0396718459970522e-06, "loss": 0.4075, "step": 1438 }, { "epoch": 4.535011801730921, "grad_norm": 0.22147515126537134, "learning_rate": 1.0256965211306347e-06, "loss": 0.4082, "step": 1439 }, { "epoch": 4.5381589299763965, "grad_norm": 0.22956285830976184, "learning_rate": 1.011813289209198e-06, "loss": 0.4068, "step": 1440 }, { "epoch": 4.541306058221872, "grad_norm": 0.20714083744900075, "learning_rate": 9.980222176158527e-07, "loss": 0.4088, "step": 1441 }, { "epoch": 4.544453186467349, "grad_norm": 0.22430766033401095, "learning_rate": 9.84323373286411e-07, "loss": 0.4004, "step": 1442 }, { "epoch": 4.547600314712825, "grad_norm": 0.21407640131514646, "learning_rate": 9.707168227090546e-07, "loss": 0.4037, "step": 1443 }, { "epoch": 4.550747442958301, "grad_norm": 0.2186295462567785, "learning_rate": 9.572026319240147e-07, "loss": 0.4194, "step": 1444 }, { "epoch": 4.553894571203776, "grad_norm": 0.21975559225432, "learning_rate": 9.437808665232429e-07, "loss": 0.4044, "step": 1445 }, { "epoch": 4.557041699449252, "grad_norm": 0.22000522416273766, "learning_rate": 9.304515916500923e-07, "loss": 0.4137, "step": 1446 }, { "epoch": 4.560188827694729, "grad_norm": 0.21013017876360932, "learning_rate": 9.172148719990237e-07, "loss": 0.4073, "step": 1447 }, { "epoch": 4.563335955940205, "grad_norm": 0.21581120172406837, "learning_rate": 9.040707718152664e-07, "loss": 0.4047, "step": 1448 }, { "epoch": 4.5664830841856805, "grad_norm": 0.21451324203213717, "learning_rate": 8.910193548945134e-07, "loss": 0.4151, "step": 1449 }, { "epoch": 4.569630212431156, "grad_norm": 0.22318798632431602, "learning_rate": 8.780606845826179e-07, "loss": 0.4004, "step": 1450 }, { "epoch": 4.572777340676632, "grad_norm": 0.1991294427029985, "learning_rate": 8.651948237752816e-07, "loss": 0.4096, "step": 1451 }, { "epoch": 4.575924468922109, "grad_norm": 0.21945769936920279, "learning_rate": 8.524218349177515e-07, "loss": 0.4058, "step": 1452 }, { "epoch": 4.579071597167585, "grad_norm": 0.21294148461496837, "learning_rate": 8.397417800045083e-07, "loss": 0.4104, "step": 1453 }, { "epoch": 4.58221872541306, "grad_norm": 0.2289947471169245, "learning_rate": 8.271547205789731e-07, "loss": 0.3924, "step": 1454 }, { "epoch": 4.585365853658536, "grad_norm": 0.22297476089583868, "learning_rate": 8.146607177332111e-07, "loss": 0.3915, "step": 1455 }, { "epoch": 4.588512981904013, "grad_norm": 0.2162476472768259, "learning_rate": 8.022598321076369e-07, "loss": 0.3994, "step": 1456 }, { "epoch": 4.591660110149489, "grad_norm": 0.2232886473508073, "learning_rate": 7.899521238907048e-07, "loss": 0.4009, "step": 1457 }, { "epoch": 4.5948072383949645, "grad_norm": 0.23138631812380756, "learning_rate": 7.777376528186286e-07, "loss": 0.4112, "step": 1458 }, { "epoch": 4.59795436664044, "grad_norm": 0.21409653415620175, "learning_rate": 7.656164781750975e-07, "loss": 0.4181, "step": 1459 }, { "epoch": 4.601101494885917, "grad_norm": 0.22193142902845509, "learning_rate": 7.535886587909736e-07, "loss": 0.404, "step": 1460 }, { "epoch": 4.604248623131393, "grad_norm": 0.21959675902623732, "learning_rate": 7.416542530440174e-07, "loss": 0.4057, "step": 1461 }, { "epoch": 4.607395751376869, "grad_norm": 0.21867188925414008, "learning_rate": 7.298133188585921e-07, "loss": 0.4006, "step": 1462 }, { "epoch": 4.610542879622344, "grad_norm": 0.23145111441732844, "learning_rate": 7.180659137054014e-07, "loss": 0.3977, "step": 1463 }, { "epoch": 4.61369000786782, "grad_norm": 0.20970090083827234, "learning_rate": 7.064120946011987e-07, "loss": 0.4074, "step": 1464 }, { "epoch": 4.616837136113297, "grad_norm": 0.2179204076829795, "learning_rate": 6.948519181085101e-07, "loss": 0.4027, "step": 1465 }, { "epoch": 4.619984264358773, "grad_norm": 0.21581183670826473, "learning_rate": 6.833854403353535e-07, "loss": 0.4148, "step": 1466 }, { "epoch": 4.6231313926042485, "grad_norm": 0.21450682726152265, "learning_rate": 6.720127169349888e-07, "loss": 0.412, "step": 1467 }, { "epoch": 4.626278520849724, "grad_norm": 0.2250984916103483, "learning_rate": 6.607338031056243e-07, "loss": 0.4041, "step": 1468 }, { "epoch": 4.6294256490952, "grad_norm": 0.21695604933835538, "learning_rate": 6.49548753590159e-07, "loss": 0.3942, "step": 1469 }, { "epoch": 4.632572777340677, "grad_norm": 0.20819532381718653, "learning_rate": 6.384576226759165e-07, "loss": 0.4117, "step": 1470 }, { "epoch": 4.635719905586153, "grad_norm": 0.22286060436310867, "learning_rate": 6.274604641943783e-07, "loss": 0.4033, "step": 1471 }, { "epoch": 4.6388670338316285, "grad_norm": 0.21036765805313895, "learning_rate": 6.165573315209283e-07, "loss": 0.4036, "step": 1472 }, { "epoch": 4.642014162077104, "grad_norm": 0.21227522537219817, "learning_rate": 6.057482775745938e-07, "loss": 0.4082, "step": 1473 }, { "epoch": 4.645161290322581, "grad_norm": 0.21892049964637353, "learning_rate": 5.950333548177734e-07, "loss": 0.4096, "step": 1474 }, { "epoch": 4.648308418568057, "grad_norm": 0.2176553643569443, "learning_rate": 5.84412615256007e-07, "loss": 0.4014, "step": 1475 }, { "epoch": 4.651455546813533, "grad_norm": 0.2199131756115072, "learning_rate": 5.738861104377003e-07, "loss": 0.4144, "step": 1476 }, { "epoch": 4.654602675059008, "grad_norm": 0.21639846694910558, "learning_rate": 5.634538914539001e-07, "loss": 0.4158, "step": 1477 }, { "epoch": 4.657749803304485, "grad_norm": 0.21197788045529345, "learning_rate": 5.531160089380061e-07, "loss": 0.4098, "step": 1478 }, { "epoch": 4.660896931549961, "grad_norm": 0.2171708383066465, "learning_rate": 5.428725130655732e-07, "loss": 0.4075, "step": 1479 }, { "epoch": 4.664044059795437, "grad_norm": 0.2102587081469045, "learning_rate": 5.327234535540337e-07, "loss": 0.4078, "step": 1480 }, { "epoch": 4.6671911880409125, "grad_norm": 0.2246748030537084, "learning_rate": 5.226688796624757e-07, "loss": 0.4073, "step": 1481 }, { "epoch": 4.670338316286388, "grad_norm": 0.21599194135810337, "learning_rate": 5.127088401913827e-07, "loss": 0.409, "step": 1482 }, { "epoch": 4.673485444531865, "grad_norm": 0.222001456808214, "learning_rate": 5.028433834824186e-07, "loss": 0.407, "step": 1483 }, { "epoch": 4.676632572777341, "grad_norm": 0.22556759982109623, "learning_rate": 4.930725574181838e-07, "loss": 0.4029, "step": 1484 }, { "epoch": 4.679779701022817, "grad_norm": 0.22675132381159563, "learning_rate": 4.833964094219834e-07, "loss": 0.4081, "step": 1485 }, { "epoch": 4.682926829268292, "grad_norm": 0.22311156946152774, "learning_rate": 4.738149864575925e-07, "loss": 0.4072, "step": 1486 }, { "epoch": 4.686073957513768, "grad_norm": 0.21164988462618337, "learning_rate": 4.6432833502903176e-07, "loss": 0.3928, "step": 1487 }, { "epoch": 4.689221085759245, "grad_norm": 0.21501199816999694, "learning_rate": 4.549365011803475e-07, "loss": 0.4077, "step": 1488 }, { "epoch": 4.692368214004721, "grad_norm": 0.20616052059898535, "learning_rate": 4.456395304953853e-07, "loss": 0.3986, "step": 1489 }, { "epoch": 4.6955153422501965, "grad_norm": 0.21548324024487242, "learning_rate": 4.3643746809755695e-07, "loss": 0.4098, "step": 1490 }, { "epoch": 4.698662470495672, "grad_norm": 0.2231505323697405, "learning_rate": 4.27330358649638e-07, "loss": 0.4042, "step": 1491 }, { "epoch": 4.701809598741149, "grad_norm": 0.22843117462309953, "learning_rate": 4.183182463535418e-07, "loss": 0.4096, "step": 1492 }, { "epoch": 4.704956726986625, "grad_norm": 0.20462231224872493, "learning_rate": 4.094011749501103e-07, "loss": 0.4065, "step": 1493 }, { "epoch": 4.708103855232101, "grad_norm": 0.21804695686103368, "learning_rate": 4.005791877188947e-07, "loss": 0.4143, "step": 1494 }, { "epoch": 4.711250983477576, "grad_norm": 0.2086192038711514, "learning_rate": 3.9185232747794843e-07, "loss": 0.3912, "step": 1495 }, { "epoch": 4.714398111723053, "grad_norm": 0.211022943339496, "learning_rate": 3.832206365836255e-07, "loss": 0.4026, "step": 1496 }, { "epoch": 4.717545239968529, "grad_norm": 0.2088349486284656, "learning_rate": 3.7468415693036495e-07, "loss": 0.4075, "step": 1497 }, { "epoch": 4.720692368214005, "grad_norm": 0.2089695668261833, "learning_rate": 3.662429299504955e-07, "loss": 0.4174, "step": 1498 }, { "epoch": 4.7238394964594805, "grad_norm": 0.20713627934049905, "learning_rate": 3.578969966140289e-07, "loss": 0.4091, "step": 1499 }, { "epoch": 4.726986624704956, "grad_norm": 0.2173964909886165, "learning_rate": 3.4964639742846253e-07, "loss": 0.4136, "step": 1500 }, { "epoch": 4.730133752950433, "grad_norm": 0.21628797092246888, "learning_rate": 3.414911724385905e-07, "loss": 0.4062, "step": 1501 }, { "epoch": 4.733280881195909, "grad_norm": 0.20929765491064145, "learning_rate": 3.3343136122629295e-07, "loss": 0.4037, "step": 1502 }, { "epoch": 4.736428009441385, "grad_norm": 0.2225887429417968, "learning_rate": 3.254670029103579e-07, "loss": 0.398, "step": 1503 }, { "epoch": 4.7395751376868605, "grad_norm": 0.24208779005585707, "learning_rate": 3.1759813614628656e-07, "loss": 0.4057, "step": 1504 }, { "epoch": 4.742722265932336, "grad_norm": 0.22395770260059794, "learning_rate": 3.098247991261061e-07, "loss": 0.4049, "step": 1505 }, { "epoch": 4.745869394177813, "grad_norm": 0.2047845376910192, "learning_rate": 3.0214702957818587e-07, "loss": 0.4126, "step": 1506 }, { "epoch": 4.749016522423289, "grad_norm": 0.2133448799717391, "learning_rate": 2.945648647670485e-07, "loss": 0.4091, "step": 1507 }, { "epoch": 4.752163650668765, "grad_norm": 0.2173251110200107, "learning_rate": 2.870783414931988e-07, "loss": 0.396, "step": 1508 }, { "epoch": 4.755310778914241, "grad_norm": 0.2187167201560873, "learning_rate": 2.796874960929352e-07, "loss": 0.4124, "step": 1509 }, { "epoch": 4.758457907159717, "grad_norm": 0.2208294332041325, "learning_rate": 2.723923644381765e-07, "loss": 0.4002, "step": 1510 }, { "epoch": 4.761605035405193, "grad_norm": 0.21817133183867887, "learning_rate": 2.6519298193629526e-07, "loss": 0.4144, "step": 1511 }, { "epoch": 4.764752163650669, "grad_norm": 0.21498445279173387, "learning_rate": 2.580893835299314e-07, "loss": 0.4029, "step": 1512 }, { "epoch": 4.7678992918961445, "grad_norm": 0.21297319478984542, "learning_rate": 2.5108160369683663e-07, "loss": 0.4071, "step": 1513 }, { "epoch": 4.771046420141621, "grad_norm": 0.21291427119757153, "learning_rate": 2.441696764497037e-07, "loss": 0.407, "step": 1514 }, { "epoch": 4.774193548387097, "grad_norm": 0.2159769244693944, "learning_rate": 2.3735363533599065e-07, "loss": 0.3867, "step": 1515 }, { "epoch": 4.777340676632573, "grad_norm": 0.218700499287757, "learning_rate": 2.3063351343777241e-07, "loss": 0.4028, "step": 1516 }, { "epoch": 4.780487804878049, "grad_norm": 0.20727391837895415, "learning_rate": 2.2400934337157176e-07, "loss": 0.407, "step": 1517 }, { "epoch": 4.783634933123524, "grad_norm": 0.22594037043531556, "learning_rate": 2.1748115728820852e-07, "loss": 0.4058, "step": 1518 }, { "epoch": 4.786782061369001, "grad_norm": 0.21081874655691793, "learning_rate": 2.110489868726262e-07, "loss": 0.4177, "step": 1519 }, { "epoch": 4.789929189614477, "grad_norm": 0.22151547664122262, "learning_rate": 2.047128633437634e-07, "loss": 0.4057, "step": 1520 }, { "epoch": 4.793076317859953, "grad_norm": 0.22101898922423646, "learning_rate": 1.9847281745438262e-07, "loss": 0.415, "step": 1521 }, { "epoch": 4.7962234461054285, "grad_norm": 0.2193144165610068, "learning_rate": 1.923288794909306e-07, "loss": 0.4104, "step": 1522 }, { "epoch": 4.799370574350904, "grad_norm": 0.21581312267270572, "learning_rate": 1.862810792733849e-07, "loss": 0.3954, "step": 1523 }, { "epoch": 4.802517702596381, "grad_norm": 0.21917215039327326, "learning_rate": 1.8032944615511638e-07, "loss": 0.4151, "step": 1524 }, { "epoch": 4.805664830841857, "grad_norm": 0.20125184352963307, "learning_rate": 1.7447400902273813e-07, "loss": 0.4118, "step": 1525 }, { "epoch": 4.808811959087333, "grad_norm": 0.21956751608662672, "learning_rate": 1.6871479629597897e-07, "loss": 0.4098, "step": 1526 }, { "epoch": 4.811959087332809, "grad_norm": 0.21515776124286765, "learning_rate": 1.630518359275235e-07, "loss": 0.4022, "step": 1527 }, { "epoch": 4.815106215578285, "grad_norm": 0.21441421698457916, "learning_rate": 1.574851554029011e-07, "loss": 0.4095, "step": 1528 }, { "epoch": 4.818253343823761, "grad_norm": 0.21770150209148334, "learning_rate": 1.5201478174033724e-07, "loss": 0.4007, "step": 1529 }, { "epoch": 4.821400472069237, "grad_norm": 0.21413660508932328, "learning_rate": 1.4664074149062457e-07, "loss": 0.4102, "step": 1530 }, { "epoch": 4.8245476003147125, "grad_norm": 0.2076863584844004, "learning_rate": 1.4136306073699868e-07, "loss": 0.4144, "step": 1531 }, { "epoch": 4.827694728560189, "grad_norm": 0.22291240712551658, "learning_rate": 1.3618176509500257e-07, "loss": 0.4014, "step": 1532 }, { "epoch": 4.830841856805665, "grad_norm": 0.21671960130680187, "learning_rate": 1.3109687971237793e-07, "loss": 0.3931, "step": 1533 }, { "epoch": 4.833988985051141, "grad_norm": 0.203265956592449, "learning_rate": 1.2610842926892076e-07, "loss": 0.4173, "step": 1534 }, { "epoch": 4.837136113296617, "grad_norm": 0.20458987302893475, "learning_rate": 1.2121643797638805e-07, "loss": 0.4206, "step": 1535 }, { "epoch": 4.840283241542092, "grad_norm": 0.21532940470424342, "learning_rate": 1.1642092957835582e-07, "loss": 0.4078, "step": 1536 }, { "epoch": 4.843430369787569, "grad_norm": 0.2052403167798288, "learning_rate": 1.1172192735011689e-07, "loss": 0.4073, "step": 1537 }, { "epoch": 4.846577498033045, "grad_norm": 0.21654113378965234, "learning_rate": 1.0711945409856983e-07, "loss": 0.4066, "step": 1538 }, { "epoch": 4.849724626278521, "grad_norm": 0.20462899076304167, "learning_rate": 1.0261353216209691e-07, "loss": 0.4111, "step": 1539 }, { "epoch": 4.8528717545239966, "grad_norm": 0.2081899249002058, "learning_rate": 9.820418341047078e-08, "loss": 0.4142, "step": 1540 }, { "epoch": 4.856018882769473, "grad_norm": 0.21737945180393162, "learning_rate": 9.389142924473238e-08, "loss": 0.4068, "step": 1541 }, { "epoch": 4.859166011014949, "grad_norm": 0.21645245039465824, "learning_rate": 8.96752905970999e-08, "loss": 0.4037, "step": 1542 }, { "epoch": 4.862313139260425, "grad_norm": 0.21851150478427397, "learning_rate": 8.555578793085994e-08, "loss": 0.4036, "step": 1543 }, { "epoch": 4.865460267505901, "grad_norm": 0.2120811042181584, "learning_rate": 8.153294124027211e-08, "loss": 0.4121, "step": 1544 }, { "epoch": 4.868607395751377, "grad_norm": 0.20575624884205296, "learning_rate": 7.760677005046902e-08, "loss": 0.4069, "step": 1545 }, { "epoch": 4.871754523996853, "grad_norm": 0.21391937617578313, "learning_rate": 7.377729341736306e-08, "loss": 0.4041, "step": 1546 }, { "epoch": 4.874901652242329, "grad_norm": 0.21156486072971747, "learning_rate": 7.004452992755317e-08, "loss": 0.3964, "step": 1547 }, { "epoch": 4.878048780487805, "grad_norm": 0.20671844316636703, "learning_rate": 6.640849769823599e-08, "loss": 0.4057, "step": 1548 }, { "epoch": 4.881195908733281, "grad_norm": 0.21259600584021066, "learning_rate": 6.286921437711924e-08, "loss": 0.3976, "step": 1549 }, { "epoch": 4.884343036978757, "grad_norm": 0.2058375833275837, "learning_rate": 5.9426697142328514e-08, "loss": 0.4107, "step": 1550 }, { "epoch": 4.887490165224233, "grad_norm": 0.20744913477051533, "learning_rate": 5.6080962702333983e-08, "loss": 0.4079, "step": 1551 }, { "epoch": 4.890637293469709, "grad_norm": 0.22577938836544023, "learning_rate": 5.2832027295861566e-08, "loss": 0.3983, "step": 1552 }, { "epoch": 4.893784421715185, "grad_norm": 0.20294764681156627, "learning_rate": 4.967990669181744e-08, "loss": 0.4063, "step": 1553 }, { "epoch": 4.8969315499606605, "grad_norm": 0.20869502528574135, "learning_rate": 4.6624616189214765e-08, "loss": 0.4044, "step": 1554 }, { "epoch": 4.900078678206137, "grad_norm": 0.20440330759726497, "learning_rate": 4.3666170617093774e-08, "loss": 0.4137, "step": 1555 }, { "epoch": 4.903225806451613, "grad_norm": 0.2181494330788591, "learning_rate": 4.080458433444845e-08, "loss": 0.404, "step": 1556 }, { "epoch": 4.906372934697089, "grad_norm": 0.21141190073150423, "learning_rate": 3.803987123016439e-08, "loss": 0.4121, "step": 1557 }, { "epoch": 4.909520062942565, "grad_norm": 0.20552978785557943, "learning_rate": 3.5372044722945534e-08, "loss": 0.4035, "step": 1558 }, { "epoch": 4.912667191188041, "grad_norm": 0.21497441633582837, "learning_rate": 3.2801117761254163e-08, "loss": 0.4148, "step": 1559 }, { "epoch": 4.915814319433517, "grad_norm": 0.2128972917412232, "learning_rate": 3.03271028232377e-08, "loss": 0.3996, "step": 1560 }, { "epoch": 4.918961447678993, "grad_norm": 0.21833114877923207, "learning_rate": 2.7950011916682007e-08, "loss": 0.4065, "step": 1561 }, { "epoch": 4.922108575924469, "grad_norm": 0.2158153544540106, "learning_rate": 2.566985657894483e-08, "loss": 0.4102, "step": 1562 }, { "epoch": 4.925255704169945, "grad_norm": 0.21773560895208463, "learning_rate": 2.3486647876895806e-08, "loss": 0.3977, "step": 1563 }, { "epoch": 4.928402832415421, "grad_norm": 0.21076258848562276, "learning_rate": 2.14003964068743e-08, "loss": 0.3986, "step": 1564 }, { "epoch": 4.931549960660897, "grad_norm": 0.21467255291492235, "learning_rate": 1.9411112294629442e-08, "loss": 0.4124, "step": 1565 }, { "epoch": 4.934697088906373, "grad_norm": 0.20923975584514357, "learning_rate": 1.751880519527571e-08, "loss": 0.4023, "step": 1566 }, { "epoch": 4.937844217151849, "grad_norm": 0.20904400841214246, "learning_rate": 1.5723484293237446e-08, "loss": 0.4082, "step": 1567 }, { "epoch": 4.940991345397325, "grad_norm": 0.20840331694698017, "learning_rate": 1.4025158302217735e-08, "loss": 0.4053, "step": 1568 }, { "epoch": 4.944138473642801, "grad_norm": 0.21324068817526423, "learning_rate": 1.242383546514736e-08, "loss": 0.4033, "step": 1569 }, { "epoch": 4.947285601888277, "grad_norm": 0.21410240816558584, "learning_rate": 1.0919523554140387e-08, "loss": 0.4034, "step": 1570 }, { "epoch": 4.950432730133753, "grad_norm": 0.20995826331255832, "learning_rate": 9.512229870469736e-09, "loss": 0.4041, "step": 1571 }, { "epoch": 4.9535798583792285, "grad_norm": 0.22014255278213607, "learning_rate": 8.201961244520552e-09, "loss": 0.4013, "step": 1572 }, { "epoch": 4.956726986624705, "grad_norm": 0.20658086287862656, "learning_rate": 6.988724035761341e-09, "loss": 0.4055, "step": 1573 }, { "epoch": 4.959874114870181, "grad_norm": 0.21532440781117343, "learning_rate": 5.872524132715107e-09, "loss": 0.407, "step": 1574 }, { "epoch": 4.963021243115657, "grad_norm": 0.22086256851698965, "learning_rate": 4.853366952928262e-09, "loss": 0.4036, "step": 1575 }, { "epoch": 4.966168371361133, "grad_norm": 0.22229005085819714, "learning_rate": 3.931257442937319e-09, "loss": 0.4138, "step": 1576 }, { "epoch": 4.969315499606609, "grad_norm": 0.20761601002360436, "learning_rate": 3.1062000782600134e-09, "loss": 0.397, "step": 1577 }, { "epoch": 4.972462627852085, "grad_norm": 0.2268431980032955, "learning_rate": 2.3781988633619914e-09, "loss": 0.4074, "step": 1578 }, { "epoch": 4.975609756097561, "grad_norm": 0.21131761190647141, "learning_rate": 1.7472573316457132e-09, "loss": 0.4057, "step": 1579 }, { "epoch": 4.978756884343037, "grad_norm": 0.21908993904462645, "learning_rate": 1.2133785454193636e-09, "loss": 0.4021, "step": 1580 }, { "epoch": 4.9819040125885135, "grad_norm": 0.21296216958733533, "learning_rate": 7.765650958990734e-10, "loss": 0.4024, "step": 1581 }, { "epoch": 4.985051140833989, "grad_norm": 0.2097355620791621, "learning_rate": 4.368191031844937e-10, "loss": 0.4038, "step": 1582 }, { "epoch": 4.988198269079465, "grad_norm": 0.2119478059627475, "learning_rate": 1.9414221625435602e-10, "loss": 0.4153, "step": 1583 }, { "epoch": 4.991345397324941, "grad_norm": 0.21027656821222138, "learning_rate": 4.853561295536935e-11, "loss": 0.3986, "step": 1584 }, { "epoch": 4.994492525570417, "grad_norm": 0.20504931593427467, "learning_rate": 0.0, "loss": 0.3896, "step": 1585 }, { "epoch": 4.994492525570417, "step": 1585, "total_flos": 5.954272714247897e+18, "train_loss": 0.5234080795045907, "train_runtime": 72319.3947, "train_samples_per_second": 2.811, "train_steps_per_second": 0.022 } ], "logging_steps": 1.0, "max_steps": 1585, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.954272714247897e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }