{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 26533, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000376889156898956, "grad_norm": 3.3290820403013632, "learning_rate": 3.391107761868877e-08, "loss": 0.8407, "step": 10 }, { "epoch": 0.000753778313797912, "grad_norm": 2.5764460749760327, "learning_rate": 7.15900527505652e-08, "loss": 0.8487, "step": 20 }, { "epoch": 0.0011306674706968681, "grad_norm": 3.1381979941321165, "learning_rate": 1.092690278824416e-07, "loss": 0.8437, "step": 30 }, { "epoch": 0.001507556627595824, "grad_norm": 2.9762629392340583, "learning_rate": 1.4694800301431802e-07, "loss": 0.8462, "step": 40 }, { "epoch": 0.0018844457844947801, "grad_norm": 3.8702318858093343, "learning_rate": 1.8462697814619442e-07, "loss": 0.8557, "step": 50 }, { "epoch": 0.0022613349413937362, "grad_norm": 2.567740305342297, "learning_rate": 2.2230595327807085e-07, "loss": 0.7769, "step": 60 }, { "epoch": 0.002638224098292692, "grad_norm": 2.3831563848971293, "learning_rate": 2.5998492840994723e-07, "loss": 0.825, "step": 70 }, { "epoch": 0.003015113255191648, "grad_norm": 2.2771940101499366, "learning_rate": 2.976639035418237e-07, "loss": 0.8116, "step": 80 }, { "epoch": 0.003392002412090604, "grad_norm": 1.5832075179588132, "learning_rate": 3.353428786737001e-07, "loss": 0.7362, "step": 90 }, { "epoch": 0.0037688915689895602, "grad_norm": 2.3176022862671526, "learning_rate": 3.7302185380557655e-07, "loss": 0.776, "step": 100 }, { "epoch": 0.004145780725888516, "grad_norm": 2.3560080639273617, "learning_rate": 4.107008289374529e-07, "loss": 0.7496, "step": 110 }, { "epoch": 0.0045226698827874725, "grad_norm": 1.439480969015099, "learning_rate": 4.4837980406932935e-07, "loss": 0.7045, "step": 120 }, { "epoch": 0.004899559039686428, "grad_norm": 1.9309695894872856, "learning_rate": 4.860587792012058e-07, "loss": 0.7354, "step": 130 }, { "epoch": 0.005276448196585384, "grad_norm": 1.9566795366950147, "learning_rate": 5.237377543330822e-07, "loss": 0.7171, "step": 140 }, { "epoch": 0.00565333735348434, "grad_norm": 1.9730496032128708, "learning_rate": 5.614167294649587e-07, "loss": 0.7264, "step": 150 }, { "epoch": 0.006030226510383296, "grad_norm": 1.5644394630847538, "learning_rate": 5.99095704596835e-07, "loss": 0.6523, "step": 160 }, { "epoch": 0.006407115667282253, "grad_norm": 1.8613164375921365, "learning_rate": 6.367746797287114e-07, "loss": 0.6955, "step": 170 }, { "epoch": 0.006784004824181208, "grad_norm": 2.0520868176808023, "learning_rate": 6.744536548605879e-07, "loss": 0.7115, "step": 180 }, { "epoch": 0.007160893981080164, "grad_norm": 1.9416771515631597, "learning_rate": 7.121326299924643e-07, "loss": 0.6734, "step": 190 }, { "epoch": 0.0075377831379791205, "grad_norm": 2.4867970424407413, "learning_rate": 7.498116051243407e-07, "loss": 0.6785, "step": 200 }, { "epoch": 0.007914672294878076, "grad_norm": 1.7774834832854693, "learning_rate": 7.874905802562172e-07, "loss": 0.6227, "step": 210 }, { "epoch": 0.008291561451777032, "grad_norm": 1.9812937141799203, "learning_rate": 8.251695553880935e-07, "loss": 0.6806, "step": 220 }, { "epoch": 0.00866845060867599, "grad_norm": 2.921310201335651, "learning_rate": 8.628485305199699e-07, "loss": 0.6409, "step": 230 }, { "epoch": 0.009045339765574945, "grad_norm": 2.158350529344399, "learning_rate": 9.005275056518463e-07, "loss": 0.6757, "step": 240 }, { "epoch": 0.0094222289224739, "grad_norm": 2.0780037382003784, "learning_rate": 9.382064807837228e-07, "loss": 0.6759, "step": 250 }, { "epoch": 0.009799118079372856, "grad_norm": 1.9015358166448175, "learning_rate": 9.758854559155991e-07, "loss": 0.5856, "step": 260 }, { "epoch": 0.010176007236271812, "grad_norm": 1.7097668450833368, "learning_rate": 1.0135644310474755e-06, "loss": 0.6735, "step": 270 }, { "epoch": 0.010552896393170768, "grad_norm": 1.9842362284855692, "learning_rate": 1.051243406179352e-06, "loss": 0.6286, "step": 280 }, { "epoch": 0.010929785550069725, "grad_norm": 1.6319392350237532, "learning_rate": 1.0889223813112285e-06, "loss": 0.5993, "step": 290 }, { "epoch": 0.01130667470696868, "grad_norm": 1.9629194279960003, "learning_rate": 1.126601356443105e-06, "loss": 0.5989, "step": 300 }, { "epoch": 0.011683563863867636, "grad_norm": 1.635011424220465, "learning_rate": 1.1642803315749811e-06, "loss": 0.6292, "step": 310 }, { "epoch": 0.012060453020766592, "grad_norm": 2.7449123371579023, "learning_rate": 1.2019593067068578e-06, "loss": 0.6444, "step": 320 }, { "epoch": 0.012437342177665548, "grad_norm": 1.7611175230514056, "learning_rate": 1.2396382818387342e-06, "loss": 0.6001, "step": 330 }, { "epoch": 0.012814231334564505, "grad_norm": 1.7299587835974657, "learning_rate": 1.2773172569706106e-06, "loss": 0.6373, "step": 340 }, { "epoch": 0.01319112049146346, "grad_norm": 2.1190034707614367, "learning_rate": 1.314996232102487e-06, "loss": 0.6386, "step": 350 }, { "epoch": 0.013568009648362417, "grad_norm": 1.6507907146985035, "learning_rate": 1.3526752072343632e-06, "loss": 0.6025, "step": 360 }, { "epoch": 0.013944898805261372, "grad_norm": 1.6248644379536064, "learning_rate": 1.3903541823662398e-06, "loss": 0.633, "step": 370 }, { "epoch": 0.014321787962160328, "grad_norm": 1.8214328463856349, "learning_rate": 1.4280331574981162e-06, "loss": 0.6231, "step": 380 }, { "epoch": 0.014698677119059285, "grad_norm": 1.7233452084340422, "learning_rate": 1.4657121326299926e-06, "loss": 0.6086, "step": 390 }, { "epoch": 0.015075566275958241, "grad_norm": 1.9949384108347457, "learning_rate": 1.503391107761869e-06, "loss": 0.5944, "step": 400 }, { "epoch": 0.015452455432857197, "grad_norm": 1.7891389236926964, "learning_rate": 1.5410700828937456e-06, "loss": 0.5935, "step": 410 }, { "epoch": 0.015829344589756152, "grad_norm": 1.9798760306416079, "learning_rate": 1.5787490580256218e-06, "loss": 0.6405, "step": 420 }, { "epoch": 0.01620623374665511, "grad_norm": 1.8498442500498027, "learning_rate": 1.6164280331574982e-06, "loss": 0.5948, "step": 430 }, { "epoch": 0.016583122903554064, "grad_norm": 1.7813340027075553, "learning_rate": 1.6541070082893746e-06, "loss": 0.6369, "step": 440 }, { "epoch": 0.01696001206045302, "grad_norm": 1.9276634385297304, "learning_rate": 1.691785983421251e-06, "loss": 0.5752, "step": 450 }, { "epoch": 0.01733690121735198, "grad_norm": 1.6512215587375432, "learning_rate": 1.7294649585531276e-06, "loss": 0.5961, "step": 460 }, { "epoch": 0.017713790374250932, "grad_norm": 1.80925998198896, "learning_rate": 1.7671439336850038e-06, "loss": 0.6343, "step": 470 }, { "epoch": 0.01809067953114989, "grad_norm": 1.837322385048155, "learning_rate": 1.8048229088168804e-06, "loss": 0.6125, "step": 480 }, { "epoch": 0.018467568688048844, "grad_norm": 1.976044009740098, "learning_rate": 1.8425018839487568e-06, "loss": 0.5939, "step": 490 }, { "epoch": 0.0188444578449478, "grad_norm": 2.16596793862554, "learning_rate": 1.880180859080633e-06, "loss": 0.5832, "step": 500 }, { "epoch": 0.01922134700184676, "grad_norm": 1.7490489143874088, "learning_rate": 1.9178598342125096e-06, "loss": 0.6141, "step": 510 }, { "epoch": 0.019598236158745713, "grad_norm": 2.7984276241901394, "learning_rate": 1.955538809344386e-06, "loss": 0.5967, "step": 520 }, { "epoch": 0.01997512531564467, "grad_norm": 1.8460062687463663, "learning_rate": 1.9932177844762624e-06, "loss": 0.5855, "step": 530 }, { "epoch": 0.020352014472543624, "grad_norm": 3.071868251764471, "learning_rate": 2.030896759608139e-06, "loss": 0.5923, "step": 540 }, { "epoch": 0.02072890362944258, "grad_norm": 2.249558954464503, "learning_rate": 2.0685757347400153e-06, "loss": 0.6267, "step": 550 }, { "epoch": 0.021105792786341535, "grad_norm": 1.7739493406466322, "learning_rate": 2.1062547098718917e-06, "loss": 0.5733, "step": 560 }, { "epoch": 0.021482681943240493, "grad_norm": 1.9571521040237465, "learning_rate": 2.143933685003768e-06, "loss": 0.5901, "step": 570 }, { "epoch": 0.02185957110013945, "grad_norm": 1.8862454156341757, "learning_rate": 2.1816126601356445e-06, "loss": 0.5668, "step": 580 }, { "epoch": 0.022236460257038404, "grad_norm": 1.9035008144788996, "learning_rate": 2.219291635267521e-06, "loss": 0.6003, "step": 590 }, { "epoch": 0.02261334941393736, "grad_norm": 2.1401923526726114, "learning_rate": 2.2569706103993973e-06, "loss": 0.6069, "step": 600 }, { "epoch": 0.022990238570836315, "grad_norm": 1.8983635538399102, "learning_rate": 2.2946495855312737e-06, "loss": 0.5551, "step": 610 }, { "epoch": 0.023367127727735273, "grad_norm": 1.8341056062808996, "learning_rate": 2.33232856066315e-06, "loss": 0.6195, "step": 620 }, { "epoch": 0.02374401688463423, "grad_norm": 1.9001660208603897, "learning_rate": 2.3700075357950265e-06, "loss": 0.5787, "step": 630 }, { "epoch": 0.024120906041533184, "grad_norm": 2.024583452910895, "learning_rate": 2.407686510926903e-06, "loss": 0.6006, "step": 640 }, { "epoch": 0.02449779519843214, "grad_norm": 1.9671891126956806, "learning_rate": 2.4453654860587793e-06, "loss": 0.6045, "step": 650 }, { "epoch": 0.024874684355331095, "grad_norm": 1.9647192549218615, "learning_rate": 2.483044461190656e-06, "loss": 0.5632, "step": 660 }, { "epoch": 0.025251573512230053, "grad_norm": 1.6577621142537855, "learning_rate": 2.5207234363225325e-06, "loss": 0.5713, "step": 670 }, { "epoch": 0.02562846266912901, "grad_norm": 1.8104039197666246, "learning_rate": 2.5584024114544085e-06, "loss": 0.6227, "step": 680 }, { "epoch": 0.026005351826027964, "grad_norm": 1.8269715554295154, "learning_rate": 2.596081386586285e-06, "loss": 0.5576, "step": 690 }, { "epoch": 0.02638224098292692, "grad_norm": 2.132660241375296, "learning_rate": 2.6337603617181617e-06, "loss": 0.5985, "step": 700 }, { "epoch": 0.026759130139825876, "grad_norm": 1.9004356716576036, "learning_rate": 2.671439336850038e-06, "loss": 0.5741, "step": 710 }, { "epoch": 0.027136019296724833, "grad_norm": 1.9904870404606059, "learning_rate": 2.709118311981914e-06, "loss": 0.608, "step": 720 }, { "epoch": 0.02751290845362379, "grad_norm": 1.7934156140344455, "learning_rate": 2.7467972871137905e-06, "loss": 0.6041, "step": 730 }, { "epoch": 0.027889797610522744, "grad_norm": 1.9452679410394749, "learning_rate": 2.7844762622456674e-06, "loss": 0.569, "step": 740 }, { "epoch": 0.028266686767421702, "grad_norm": 1.9646256949492564, "learning_rate": 2.8221552373775433e-06, "loss": 0.5566, "step": 750 }, { "epoch": 0.028643575924320656, "grad_norm": 1.6964066105708113, "learning_rate": 2.8598342125094197e-06, "loss": 0.5771, "step": 760 }, { "epoch": 0.029020465081219613, "grad_norm": 1.626500474046293, "learning_rate": 2.8975131876412966e-06, "loss": 0.6028, "step": 770 }, { "epoch": 0.02939735423811857, "grad_norm": 1.705518791196053, "learning_rate": 2.935192162773173e-06, "loss": 0.598, "step": 780 }, { "epoch": 0.029774243395017524, "grad_norm": 2.023608938357471, "learning_rate": 2.972871137905049e-06, "loss": 0.599, "step": 790 }, { "epoch": 0.030151132551916482, "grad_norm": 1.9251923030741178, "learning_rate": 3.0105501130369258e-06, "loss": 0.5705, "step": 800 }, { "epoch": 0.030528021708815436, "grad_norm": 1.8099259001405903, "learning_rate": 3.048229088168802e-06, "loss": 0.5824, "step": 810 }, { "epoch": 0.030904910865714393, "grad_norm": 1.9287634604630308, "learning_rate": 3.085908063300678e-06, "loss": 0.5761, "step": 820 }, { "epoch": 0.03128180002261335, "grad_norm": 1.783048816707533, "learning_rate": 3.123587038432555e-06, "loss": 0.579, "step": 830 }, { "epoch": 0.031658689179512305, "grad_norm": 1.8166051797381484, "learning_rate": 3.1612660135644314e-06, "loss": 0.5877, "step": 840 }, { "epoch": 0.03203557833641126, "grad_norm": 2.038736220774468, "learning_rate": 3.1989449886963074e-06, "loss": 0.572, "step": 850 }, { "epoch": 0.03241246749331022, "grad_norm": 1.738601505380381, "learning_rate": 3.236623963828184e-06, "loss": 0.5777, "step": 860 }, { "epoch": 0.03278935665020918, "grad_norm": 1.6676309446260524, "learning_rate": 3.2743029389600606e-06, "loss": 0.5637, "step": 870 }, { "epoch": 0.03316624580710813, "grad_norm": 2.161220914250185, "learning_rate": 3.311981914091937e-06, "loss": 0.5862, "step": 880 }, { "epoch": 0.033543134964007085, "grad_norm": 1.4967702154384854, "learning_rate": 3.3496608892238134e-06, "loss": 0.5529, "step": 890 }, { "epoch": 0.03392002412090604, "grad_norm": 2.2279038438220877, "learning_rate": 3.38733986435569e-06, "loss": 0.5979, "step": 900 }, { "epoch": 0.034296913277805, "grad_norm": 1.8345971384276711, "learning_rate": 3.4250188394875662e-06, "loss": 0.5669, "step": 910 }, { "epoch": 0.03467380243470396, "grad_norm": 1.8142008601019335, "learning_rate": 3.462697814619443e-06, "loss": 0.6041, "step": 920 }, { "epoch": 0.03505069159160291, "grad_norm": 1.6693434904318734, "learning_rate": 3.500376789751319e-06, "loss": 0.5272, "step": 930 }, { "epoch": 0.035427580748501865, "grad_norm": 1.945433750394992, "learning_rate": 3.5380557648831954e-06, "loss": 0.558, "step": 940 }, { "epoch": 0.03580446990540082, "grad_norm": 2.1129599329826614, "learning_rate": 3.5757347400150723e-06, "loss": 0.5801, "step": 950 }, { "epoch": 0.03618135906229978, "grad_norm": 1.905120951845058, "learning_rate": 3.6134137151469482e-06, "loss": 0.572, "step": 960 }, { "epoch": 0.03655824821919874, "grad_norm": 1.972585987519169, "learning_rate": 3.6510926902788246e-06, "loss": 0.5995, "step": 970 }, { "epoch": 0.03693513737609769, "grad_norm": 1.9036055814375914, "learning_rate": 3.688771665410701e-06, "loss": 0.5993, "step": 980 }, { "epoch": 0.037312026532996645, "grad_norm": 1.8646464012423685, "learning_rate": 3.7264506405425774e-06, "loss": 0.5733, "step": 990 }, { "epoch": 0.0376889156898956, "grad_norm": 2.3089435664838933, "learning_rate": 3.764129615674454e-06, "loss": 0.5835, "step": 1000 }, { "epoch": 0.03806580484679456, "grad_norm": 2.17191968568418, "learning_rate": 3.8018085908063303e-06, "loss": 0.5412, "step": 1010 }, { "epoch": 0.03844269400369352, "grad_norm": 1.8949936547455895, "learning_rate": 3.839487565938207e-06, "loss": 0.5966, "step": 1020 }, { "epoch": 0.03881958316059247, "grad_norm": 2.473982294301116, "learning_rate": 3.877166541070083e-06, "loss": 0.5615, "step": 1030 }, { "epoch": 0.039196472317491425, "grad_norm": 1.7089999900943973, "learning_rate": 3.9148455162019595e-06, "loss": 0.5752, "step": 1040 }, { "epoch": 0.03957336147439038, "grad_norm": 1.85392931543439, "learning_rate": 3.952524491333836e-06, "loss": 0.578, "step": 1050 }, { "epoch": 0.03995025063128934, "grad_norm": 1.4905002182549854, "learning_rate": 3.990203466465712e-06, "loss": 0.553, "step": 1060 }, { "epoch": 0.04032713978818829, "grad_norm": 1.7544817835449993, "learning_rate": 4.027882441597589e-06, "loss": 0.5564, "step": 1070 }, { "epoch": 0.04070402894508725, "grad_norm": 1.798461388686054, "learning_rate": 4.065561416729465e-06, "loss": 0.5635, "step": 1080 }, { "epoch": 0.041080918101986205, "grad_norm": 1.7675958852701312, "learning_rate": 4.1032403918613415e-06, "loss": 0.5683, "step": 1090 }, { "epoch": 0.04145780725888516, "grad_norm": 2.1395567530774895, "learning_rate": 4.140919366993218e-06, "loss": 0.5678, "step": 1100 }, { "epoch": 0.04183469641578412, "grad_norm": 1.703017057556802, "learning_rate": 4.178598342125095e-06, "loss": 0.552, "step": 1110 }, { "epoch": 0.04221158557268307, "grad_norm": 1.6184789818203074, "learning_rate": 4.216277317256971e-06, "loss": 0.5655, "step": 1120 }, { "epoch": 0.04258847472958203, "grad_norm": 2.0996907497711574, "learning_rate": 4.253956292388847e-06, "loss": 0.5525, "step": 1130 }, { "epoch": 0.042965363886480985, "grad_norm": 1.8890470204414929, "learning_rate": 4.291635267520724e-06, "loss": 0.5608, "step": 1140 }, { "epoch": 0.04334225304337994, "grad_norm": 2.080468775436257, "learning_rate": 4.3293142426526e-06, "loss": 0.5511, "step": 1150 }, { "epoch": 0.0437191422002789, "grad_norm": 1.8224041675858285, "learning_rate": 4.366993217784476e-06, "loss": 0.5727, "step": 1160 }, { "epoch": 0.04409603135717785, "grad_norm": 1.6149909377743559, "learning_rate": 4.4046721929163536e-06, "loss": 0.5448, "step": 1170 }, { "epoch": 0.04447292051407681, "grad_norm": 1.9061771805744654, "learning_rate": 4.442351168048229e-06, "loss": 0.5637, "step": 1180 }, { "epoch": 0.044849809670975765, "grad_norm": 1.807877264857849, "learning_rate": 4.4800301431801055e-06, "loss": 0.5511, "step": 1190 }, { "epoch": 0.04522669882787472, "grad_norm": 2.0181868598887487, "learning_rate": 4.517709118311983e-06, "loss": 0.5582, "step": 1200 }, { "epoch": 0.04560358798477368, "grad_norm": 1.573360107816235, "learning_rate": 4.555388093443859e-06, "loss": 0.5644, "step": 1210 }, { "epoch": 0.04598047714167263, "grad_norm": 1.771584321526413, "learning_rate": 4.593067068575735e-06, "loss": 0.5689, "step": 1220 }, { "epoch": 0.04635736629857159, "grad_norm": 2.0063138861207364, "learning_rate": 4.630746043707611e-06, "loss": 0.5724, "step": 1230 }, { "epoch": 0.046734255455470546, "grad_norm": 1.6920600006872095, "learning_rate": 4.668425018839488e-06, "loss": 0.5566, "step": 1240 }, { "epoch": 0.0471111446123695, "grad_norm": 1.8971593808774774, "learning_rate": 4.706103993971364e-06, "loss": 0.5699, "step": 1250 }, { "epoch": 0.04748803376926846, "grad_norm": 1.587611604567327, "learning_rate": 4.74378296910324e-06, "loss": 0.5649, "step": 1260 }, { "epoch": 0.04786492292616741, "grad_norm": 2.0265270332073246, "learning_rate": 4.781461944235118e-06, "loss": 0.5374, "step": 1270 }, { "epoch": 0.04824181208306637, "grad_norm": 2.0590285975116083, "learning_rate": 4.819140919366993e-06, "loss": 0.576, "step": 1280 }, { "epoch": 0.048618701239965326, "grad_norm": 1.7133539806651839, "learning_rate": 4.8568198944988696e-06, "loss": 0.5706, "step": 1290 }, { "epoch": 0.04899559039686428, "grad_norm": 1.8988624169513535, "learning_rate": 4.894498869630747e-06, "loss": 0.5552, "step": 1300 }, { "epoch": 0.04937247955376324, "grad_norm": 2.0372135320392086, "learning_rate": 4.932177844762623e-06, "loss": 0.5459, "step": 1310 }, { "epoch": 0.04974936871066219, "grad_norm": 1.7794743693654322, "learning_rate": 4.969856819894499e-06, "loss": 0.5548, "step": 1320 }, { "epoch": 0.05012625786756115, "grad_norm": 1.7352843761532744, "learning_rate": 5.007535795026376e-06, "loss": 0.571, "step": 1330 }, { "epoch": 0.050503147024460106, "grad_norm": 1.8996845237311992, "learning_rate": 5.0452147701582524e-06, "loss": 0.5493, "step": 1340 }, { "epoch": 0.05088003618135906, "grad_norm": 2.1612445033194962, "learning_rate": 5.082893745290128e-06, "loss": 0.5707, "step": 1350 }, { "epoch": 0.05125692533825802, "grad_norm": 2.022115535247945, "learning_rate": 5.120572720422004e-06, "loss": 0.5541, "step": 1360 }, { "epoch": 0.05163381449515697, "grad_norm": 1.714380161573395, "learning_rate": 5.158251695553881e-06, "loss": 0.5522, "step": 1370 }, { "epoch": 0.05201070365205593, "grad_norm": 1.8375152499536425, "learning_rate": 5.195930670685758e-06, "loss": 0.5607, "step": 1380 }, { "epoch": 0.052387592808954886, "grad_norm": 1.7211038349954213, "learning_rate": 5.2336096458176345e-06, "loss": 0.5429, "step": 1390 }, { "epoch": 0.05276448196585384, "grad_norm": 1.9959664354157889, "learning_rate": 5.271288620949511e-06, "loss": 0.5656, "step": 1400 }, { "epoch": 0.0531413711227528, "grad_norm": 1.7536569082906184, "learning_rate": 5.308967596081387e-06, "loss": 0.5452, "step": 1410 }, { "epoch": 0.05351826027965175, "grad_norm": 1.7231610063386784, "learning_rate": 5.346646571213263e-06, "loss": 0.5353, "step": 1420 }, { "epoch": 0.05389514943655071, "grad_norm": 1.8970119256356177, "learning_rate": 5.384325546345139e-06, "loss": 0.5302, "step": 1430 }, { "epoch": 0.054272038593449666, "grad_norm": 1.5800174842538417, "learning_rate": 5.4220045214770165e-06, "loss": 0.5624, "step": 1440 }, { "epoch": 0.05464892775034862, "grad_norm": 2.0740091270784053, "learning_rate": 5.459683496608893e-06, "loss": 0.5377, "step": 1450 }, { "epoch": 0.05502581690724758, "grad_norm": 1.8294455280243227, "learning_rate": 5.497362471740769e-06, "loss": 0.5566, "step": 1460 }, { "epoch": 0.05540270606414653, "grad_norm": 1.5838428291746807, "learning_rate": 5.535041446872646e-06, "loss": 0.5446, "step": 1470 }, { "epoch": 0.05577959522104549, "grad_norm": 2.1056158207672233, "learning_rate": 5.572720422004522e-06, "loss": 0.5726, "step": 1480 }, { "epoch": 0.056156484377944446, "grad_norm": 1.8677258424310819, "learning_rate": 5.610399397136398e-06, "loss": 0.5678, "step": 1490 }, { "epoch": 0.056533373534843404, "grad_norm": 2.014882289331746, "learning_rate": 5.648078372268275e-06, "loss": 0.5623, "step": 1500 }, { "epoch": 0.05691026269174236, "grad_norm": 1.8459621787433622, "learning_rate": 5.685757347400151e-06, "loss": 0.5775, "step": 1510 }, { "epoch": 0.05728715184864131, "grad_norm": 1.7030968368942396, "learning_rate": 5.723436322532028e-06, "loss": 0.5456, "step": 1520 }, { "epoch": 0.05766404100554027, "grad_norm": 1.7530142515014413, "learning_rate": 5.761115297663904e-06, "loss": 0.5493, "step": 1530 }, { "epoch": 0.058040930162439226, "grad_norm": 1.7511504402842564, "learning_rate": 5.7987942727957805e-06, "loss": 0.5633, "step": 1540 }, { "epoch": 0.058417819319338184, "grad_norm": 2.1931311143367607, "learning_rate": 5.836473247927656e-06, "loss": 0.5581, "step": 1550 }, { "epoch": 0.05879470847623714, "grad_norm": 3.636694230456593, "learning_rate": 5.874152223059534e-06, "loss": 0.546, "step": 1560 }, { "epoch": 0.05917159763313609, "grad_norm": 2.3094095827220302, "learning_rate": 5.91183119819141e-06, "loss": 0.5875, "step": 1570 }, { "epoch": 0.05954848679003505, "grad_norm": 2.0011971290567794, "learning_rate": 5.949510173323286e-06, "loss": 0.5575, "step": 1580 }, { "epoch": 0.059925375946934006, "grad_norm": 1.6650652223048363, "learning_rate": 5.9871891484551625e-06, "loss": 0.5525, "step": 1590 }, { "epoch": 0.060302265103832964, "grad_norm": 1.9533169899266176, "learning_rate": 6.024868123587039e-06, "loss": 0.574, "step": 1600 }, { "epoch": 0.06067915426073192, "grad_norm": 1.685943647766806, "learning_rate": 6.062547098718915e-06, "loss": 0.5149, "step": 1610 }, { "epoch": 0.06105604341763087, "grad_norm": 1.3728725575465968, "learning_rate": 6.100226073850791e-06, "loss": 0.5436, "step": 1620 }, { "epoch": 0.06143293257452983, "grad_norm": 1.8741392657740152, "learning_rate": 6.137905048982669e-06, "loss": 0.5628, "step": 1630 }, { "epoch": 0.061809821731428786, "grad_norm": 1.9761429074276111, "learning_rate": 6.1755840241145446e-06, "loss": 0.5713, "step": 1640 }, { "epoch": 0.062186710888327744, "grad_norm": 2.03436821453595, "learning_rate": 6.213262999246421e-06, "loss": 0.6002, "step": 1650 }, { "epoch": 0.0625636000452267, "grad_norm": 1.832975464096823, "learning_rate": 6.250941974378297e-06, "loss": 0.5728, "step": 1660 }, { "epoch": 0.06294048920212565, "grad_norm": 1.633511698263684, "learning_rate": 6.288620949510174e-06, "loss": 0.5552, "step": 1670 }, { "epoch": 0.06331737835902461, "grad_norm": 2.090535151172006, "learning_rate": 6.32629992464205e-06, "loss": 0.531, "step": 1680 }, { "epoch": 0.06369426751592357, "grad_norm": 1.6050052308654612, "learning_rate": 6.363978899773927e-06, "loss": 0.5891, "step": 1690 }, { "epoch": 0.06407115667282252, "grad_norm": 2.212232256420831, "learning_rate": 6.401657874905803e-06, "loss": 0.5752, "step": 1700 }, { "epoch": 0.06444804582972148, "grad_norm": 1.877674806453838, "learning_rate": 6.439336850037679e-06, "loss": 0.5778, "step": 1710 }, { "epoch": 0.06482493498662044, "grad_norm": 1.8261529051865222, "learning_rate": 6.477015825169556e-06, "loss": 0.5437, "step": 1720 }, { "epoch": 0.0652018241435194, "grad_norm": 2.1674429445191032, "learning_rate": 6.514694800301432e-06, "loss": 0.5439, "step": 1730 }, { "epoch": 0.06557871330041835, "grad_norm": 1.7036520231489474, "learning_rate": 6.552373775433309e-06, "loss": 0.5716, "step": 1740 }, { "epoch": 0.0659556024573173, "grad_norm": 2.031911396789848, "learning_rate": 6.590052750565186e-06, "loss": 0.5365, "step": 1750 }, { "epoch": 0.06633249161421625, "grad_norm": 1.784884433990894, "learning_rate": 6.627731725697062e-06, "loss": 0.5556, "step": 1760 }, { "epoch": 0.06670938077111521, "grad_norm": 1.7250914216810251, "learning_rate": 6.665410700828938e-06, "loss": 0.577, "step": 1770 }, { "epoch": 0.06708626992801417, "grad_norm": 2.0118741851139417, "learning_rate": 6.703089675960814e-06, "loss": 0.5792, "step": 1780 }, { "epoch": 0.06746315908491313, "grad_norm": 2.083914596235162, "learning_rate": 6.740768651092691e-06, "loss": 0.5817, "step": 1790 }, { "epoch": 0.06784004824181208, "grad_norm": 1.8512470922958455, "learning_rate": 6.778447626224567e-06, "loss": 0.5421, "step": 1800 }, { "epoch": 0.06821693739871104, "grad_norm": 1.8612678054005098, "learning_rate": 6.816126601356444e-06, "loss": 0.5392, "step": 1810 }, { "epoch": 0.06859382655561, "grad_norm": 1.928991115625655, "learning_rate": 6.853805576488321e-06, "loss": 0.5819, "step": 1820 }, { "epoch": 0.06897071571250896, "grad_norm": 1.8008419029962375, "learning_rate": 6.891484551620197e-06, "loss": 0.5547, "step": 1830 }, { "epoch": 0.06934760486940791, "grad_norm": 2.0597436169161973, "learning_rate": 6.929163526752073e-06, "loss": 0.5599, "step": 1840 }, { "epoch": 0.06972449402630686, "grad_norm": 2.0570020874528567, "learning_rate": 6.966842501883949e-06, "loss": 0.5694, "step": 1850 }, { "epoch": 0.07010138318320581, "grad_norm": 1.5952035385299899, "learning_rate": 7.0045214770158254e-06, "loss": 0.5406, "step": 1860 }, { "epoch": 0.07047827234010477, "grad_norm": 1.7271708038895457, "learning_rate": 7.042200452147702e-06, "loss": 0.555, "step": 1870 }, { "epoch": 0.07085516149700373, "grad_norm": 2.0101886368535813, "learning_rate": 7.079879427279579e-06, "loss": 0.5716, "step": 1880 }, { "epoch": 0.07123205065390269, "grad_norm": 1.6065477858483548, "learning_rate": 7.1175584024114555e-06, "loss": 0.5476, "step": 1890 }, { "epoch": 0.07160893981080164, "grad_norm": 2.02286611730447, "learning_rate": 7.155237377543331e-06, "loss": 0.5383, "step": 1900 }, { "epoch": 0.0719858289677006, "grad_norm": 2.037314841983119, "learning_rate": 7.1929163526752075e-06, "loss": 0.5587, "step": 1910 }, { "epoch": 0.07236271812459956, "grad_norm": 1.8264192102516816, "learning_rate": 7.230595327807084e-06, "loss": 0.5553, "step": 1920 }, { "epoch": 0.07273960728149852, "grad_norm": 1.9520150148970015, "learning_rate": 7.26827430293896e-06, "loss": 0.567, "step": 1930 }, { "epoch": 0.07311649643839747, "grad_norm": 1.6905896085482799, "learning_rate": 7.3059532780708375e-06, "loss": 0.5252, "step": 1940 }, { "epoch": 0.07349338559529642, "grad_norm": 1.8762657820598212, "learning_rate": 7.343632253202714e-06, "loss": 0.5696, "step": 1950 }, { "epoch": 0.07387027475219538, "grad_norm": 1.7670005320740287, "learning_rate": 7.38131122833459e-06, "loss": 0.5444, "step": 1960 }, { "epoch": 0.07424716390909433, "grad_norm": 1.7538737345614326, "learning_rate": 7.418990203466466e-06, "loss": 0.5884, "step": 1970 }, { "epoch": 0.07462405306599329, "grad_norm": 1.515985834449344, "learning_rate": 7.456669178598342e-06, "loss": 0.5446, "step": 1980 }, { "epoch": 0.07500094222289225, "grad_norm": 1.5815994265482278, "learning_rate": 7.494348153730219e-06, "loss": 0.5591, "step": 1990 }, { "epoch": 0.0753778313797912, "grad_norm": 2.0488961746529486, "learning_rate": 7.532027128862096e-06, "loss": 0.5706, "step": 2000 }, { "epoch": 0.07575472053669016, "grad_norm": 1.969269141686112, "learning_rate": 7.569706103993972e-06, "loss": 0.558, "step": 2010 }, { "epoch": 0.07613160969358912, "grad_norm": 2.266330513109259, "learning_rate": 7.607385079125849e-06, "loss": 0.5396, "step": 2020 }, { "epoch": 0.07650849885048808, "grad_norm": 1.8646005092443476, "learning_rate": 7.645064054257724e-06, "loss": 0.5423, "step": 2030 }, { "epoch": 0.07688538800738703, "grad_norm": 1.7399985247018814, "learning_rate": 7.682743029389602e-06, "loss": 0.549, "step": 2040 }, { "epoch": 0.07726227716428598, "grad_norm": 1.8255156942684898, "learning_rate": 7.720422004521477e-06, "loss": 0.5549, "step": 2050 }, { "epoch": 0.07763916632118494, "grad_norm": 1.946851326618598, "learning_rate": 7.758100979653354e-06, "loss": 0.5666, "step": 2060 }, { "epoch": 0.07801605547808389, "grad_norm": 1.9532587449674659, "learning_rate": 7.795779954785232e-06, "loss": 0.5711, "step": 2070 }, { "epoch": 0.07839294463498285, "grad_norm": 1.7952693056787907, "learning_rate": 7.833458929917107e-06, "loss": 0.5386, "step": 2080 }, { "epoch": 0.07876983379188181, "grad_norm": 1.8221658229163284, "learning_rate": 7.871137905048983e-06, "loss": 0.5731, "step": 2090 }, { "epoch": 0.07914672294878076, "grad_norm": 1.833428780602818, "learning_rate": 7.90881688018086e-06, "loss": 0.5546, "step": 2100 }, { "epoch": 0.07952361210567972, "grad_norm": 1.7570404308629375, "learning_rate": 7.946495855312736e-06, "loss": 0.5323, "step": 2110 }, { "epoch": 0.07990050126257868, "grad_norm": 1.802778650409238, "learning_rate": 7.984174830444613e-06, "loss": 0.5422, "step": 2120 }, { "epoch": 0.08027739041947764, "grad_norm": 1.6222520770975064, "learning_rate": 8.02185380557649e-06, "loss": 0.5573, "step": 2130 }, { "epoch": 0.08065427957637658, "grad_norm": 1.9266827677857417, "learning_rate": 8.059532780708366e-06, "loss": 0.5559, "step": 2140 }, { "epoch": 0.08103116873327554, "grad_norm": 2.050814084562181, "learning_rate": 8.097211755840241e-06, "loss": 0.551, "step": 2150 }, { "epoch": 0.0814080578901745, "grad_norm": 1.7585001593613225, "learning_rate": 8.134890730972118e-06, "loss": 0.5364, "step": 2160 }, { "epoch": 0.08178494704707345, "grad_norm": 1.8883866215853788, "learning_rate": 8.172569706103994e-06, "loss": 0.5888, "step": 2170 }, { "epoch": 0.08216183620397241, "grad_norm": 1.758532254936197, "learning_rate": 8.210248681235871e-06, "loss": 0.5712, "step": 2180 }, { "epoch": 0.08253872536087137, "grad_norm": 1.7452201238698966, "learning_rate": 8.247927656367748e-06, "loss": 0.562, "step": 2190 }, { "epoch": 0.08291561451777033, "grad_norm": 2.0545039008121373, "learning_rate": 8.285606631499624e-06, "loss": 0.5326, "step": 2200 }, { "epoch": 0.08329250367466928, "grad_norm": 1.7697141454393999, "learning_rate": 8.323285606631501e-06, "loss": 0.5777, "step": 2210 }, { "epoch": 0.08366939283156824, "grad_norm": 1.9252812449566288, "learning_rate": 8.360964581763377e-06, "loss": 0.5747, "step": 2220 }, { "epoch": 0.0840462819884672, "grad_norm": 1.8673410308838145, "learning_rate": 8.398643556895252e-06, "loss": 0.5352, "step": 2230 }, { "epoch": 0.08442317114536614, "grad_norm": 1.649056430020697, "learning_rate": 8.43632253202713e-06, "loss": 0.5661, "step": 2240 }, { "epoch": 0.0848000603022651, "grad_norm": 1.8813402816460187, "learning_rate": 8.474001507159007e-06, "loss": 0.5323, "step": 2250 }, { "epoch": 0.08517694945916406, "grad_norm": 1.8849842368697092, "learning_rate": 8.511680482290882e-06, "loss": 0.5502, "step": 2260 }, { "epoch": 0.08555383861606301, "grad_norm": 1.7894509940448735, "learning_rate": 8.54935945742276e-06, "loss": 0.5819, "step": 2270 }, { "epoch": 0.08593072777296197, "grad_norm": 1.7627934782136696, "learning_rate": 8.587038432554635e-06, "loss": 0.5574, "step": 2280 }, { "epoch": 0.08630761692986093, "grad_norm": 1.7684239573461413, "learning_rate": 8.62471740768651e-06, "loss": 0.5421, "step": 2290 }, { "epoch": 0.08668450608675989, "grad_norm": 1.8420421564268776, "learning_rate": 8.662396382818388e-06, "loss": 0.5786, "step": 2300 }, { "epoch": 0.08706139524365884, "grad_norm": 1.600074045318061, "learning_rate": 8.700075357950264e-06, "loss": 0.535, "step": 2310 }, { "epoch": 0.0874382844005578, "grad_norm": 2.151450395002285, "learning_rate": 8.737754333082141e-06, "loss": 0.5821, "step": 2320 }, { "epoch": 0.08781517355745676, "grad_norm": 1.805383691002577, "learning_rate": 8.775433308214018e-06, "loss": 0.5787, "step": 2330 }, { "epoch": 0.0881920627143557, "grad_norm": 1.9600049150179448, "learning_rate": 8.813112283345894e-06, "loss": 0.5501, "step": 2340 }, { "epoch": 0.08856895187125466, "grad_norm": 2.0847026403429485, "learning_rate": 8.85079125847777e-06, "loss": 0.5462, "step": 2350 }, { "epoch": 0.08894584102815362, "grad_norm": 1.7768359780016918, "learning_rate": 8.888470233609646e-06, "loss": 0.5837, "step": 2360 }, { "epoch": 0.08932273018505257, "grad_norm": 2.0158186623423386, "learning_rate": 8.926149208741522e-06, "loss": 0.5616, "step": 2370 }, { "epoch": 0.08969961934195153, "grad_norm": 1.6581403784334712, "learning_rate": 8.9638281838734e-06, "loss": 0.5743, "step": 2380 }, { "epoch": 0.09007650849885049, "grad_norm": 2.5790022328647044, "learning_rate": 9.001507159005277e-06, "loss": 0.5744, "step": 2390 }, { "epoch": 0.09045339765574945, "grad_norm": 1.723560039585966, "learning_rate": 9.039186134137152e-06, "loss": 0.5602, "step": 2400 }, { "epoch": 0.0908302868126484, "grad_norm": 1.351214600774913, "learning_rate": 9.07686510926903e-06, "loss": 0.5331, "step": 2410 }, { "epoch": 0.09120717596954736, "grad_norm": 1.8039553055086206, "learning_rate": 9.114544084400905e-06, "loss": 0.5677, "step": 2420 }, { "epoch": 0.09158406512644632, "grad_norm": 1.9123242622126195, "learning_rate": 9.15222305953278e-06, "loss": 0.5588, "step": 2430 }, { "epoch": 0.09196095428334526, "grad_norm": 2.006300165478271, "learning_rate": 9.189902034664658e-06, "loss": 0.5812, "step": 2440 }, { "epoch": 0.09233784344024422, "grad_norm": 1.9159468022645592, "learning_rate": 9.227581009796535e-06, "loss": 0.5372, "step": 2450 }, { "epoch": 0.09271473259714318, "grad_norm": 1.7917577869195462, "learning_rate": 9.26525998492841e-06, "loss": 0.5675, "step": 2460 }, { "epoch": 0.09309162175404213, "grad_norm": 5.4204153510281, "learning_rate": 9.302938960060288e-06, "loss": 0.5674, "step": 2470 }, { "epoch": 0.09346851091094109, "grad_norm": 1.6618922792887658, "learning_rate": 9.340617935192163e-06, "loss": 0.5745, "step": 2480 }, { "epoch": 0.09384540006784005, "grad_norm": 1.665306637026819, "learning_rate": 9.378296910324039e-06, "loss": 0.5556, "step": 2490 }, { "epoch": 0.094222289224739, "grad_norm": 1.825366672374041, "learning_rate": 9.415975885455916e-06, "loss": 0.5318, "step": 2500 }, { "epoch": 0.09459917838163796, "grad_norm": 1.9696174903266126, "learning_rate": 9.453654860587793e-06, "loss": 0.5411, "step": 2510 }, { "epoch": 0.09497606753853692, "grad_norm": 1.7049399302783044, "learning_rate": 9.491333835719669e-06, "loss": 0.5706, "step": 2520 }, { "epoch": 0.09535295669543588, "grad_norm": 1.7849968576762765, "learning_rate": 9.529012810851546e-06, "loss": 0.5665, "step": 2530 }, { "epoch": 0.09572984585233482, "grad_norm": 1.353686035468744, "learning_rate": 9.566691785983422e-06, "loss": 0.5459, "step": 2540 }, { "epoch": 0.09610673500923378, "grad_norm": 1.7623768145461352, "learning_rate": 9.604370761115297e-06, "loss": 0.5866, "step": 2550 }, { "epoch": 0.09648362416613274, "grad_norm": 1.9641841168234557, "learning_rate": 9.642049736247175e-06, "loss": 0.5466, "step": 2560 }, { "epoch": 0.0968605133230317, "grad_norm": 1.3736072242645836, "learning_rate": 9.679728711379052e-06, "loss": 0.5391, "step": 2570 }, { "epoch": 0.09723740247993065, "grad_norm": 1.654177532173035, "learning_rate": 9.717407686510927e-06, "loss": 0.5592, "step": 2580 }, { "epoch": 0.09761429163682961, "grad_norm": 1.675077992009493, "learning_rate": 9.755086661642805e-06, "loss": 0.5484, "step": 2590 }, { "epoch": 0.09799118079372857, "grad_norm": 1.7745168797445137, "learning_rate": 9.79276563677468e-06, "loss": 0.5656, "step": 2600 }, { "epoch": 0.09836806995062752, "grad_norm": 1.665358523138359, "learning_rate": 9.830444611906557e-06, "loss": 0.5544, "step": 2610 }, { "epoch": 0.09874495910752648, "grad_norm": 2.373990573362573, "learning_rate": 9.868123587038433e-06, "loss": 0.5448, "step": 2620 }, { "epoch": 0.09912184826442544, "grad_norm": 1.6903642288991887, "learning_rate": 9.90580256217031e-06, "loss": 0.5562, "step": 2630 }, { "epoch": 0.09949873742132438, "grad_norm": 1.711502598162347, "learning_rate": 9.943481537302186e-06, "loss": 0.5582, "step": 2640 }, { "epoch": 0.09987562657822334, "grad_norm": 1.7630192145087855, "learning_rate": 9.981160512434063e-06, "loss": 0.5568, "step": 2650 }, { "epoch": 0.1002525157351223, "grad_norm": 1.9391272093641074, "learning_rate": 9.999998918198758e-06, "loss": 0.5814, "step": 2660 }, { "epoch": 0.10062940489202125, "grad_norm": 1.1176379107900862, "learning_rate": 9.999990263791625e-06, "loss": 0.5327, "step": 2670 }, { "epoch": 0.10100629404892021, "grad_norm": 1.7144355514907554, "learning_rate": 9.99997295499234e-06, "loss": 0.5424, "step": 2680 }, { "epoch": 0.10138318320581917, "grad_norm": 1.3867888190170987, "learning_rate": 9.99994699183086e-06, "loss": 0.5498, "step": 2690 }, { "epoch": 0.10176007236271813, "grad_norm": 1.6251131139875976, "learning_rate": 9.999912374352125e-06, "loss": 0.5526, "step": 2700 }, { "epoch": 0.10213696151961708, "grad_norm": 1.758258285021875, "learning_rate": 9.999869102616057e-06, "loss": 0.5647, "step": 2710 }, { "epoch": 0.10251385067651604, "grad_norm": 1.8922253129449746, "learning_rate": 9.99981717669755e-06, "loss": 0.569, "step": 2720 }, { "epoch": 0.102890739833415, "grad_norm": 1.6492463045156762, "learning_rate": 9.999756596686483e-06, "loss": 0.5639, "step": 2730 }, { "epoch": 0.10326762899031394, "grad_norm": 1.6884719814636986, "learning_rate": 9.999687362687714e-06, "loss": 0.5302, "step": 2740 }, { "epoch": 0.1036445181472129, "grad_norm": 1.7663588108147437, "learning_rate": 9.999609474821078e-06, "loss": 0.5776, "step": 2750 }, { "epoch": 0.10402140730411186, "grad_norm": 1.6023970608406795, "learning_rate": 9.999522933221389e-06, "loss": 0.5609, "step": 2760 }, { "epoch": 0.10439829646101081, "grad_norm": 1.582018849874625, "learning_rate": 9.999427738038438e-06, "loss": 0.5432, "step": 2770 }, { "epoch": 0.10477518561790977, "grad_norm": 1.8661113330406751, "learning_rate": 9.999323889437004e-06, "loss": 0.5505, "step": 2780 }, { "epoch": 0.10515207477480873, "grad_norm": 2.4377666457132405, "learning_rate": 9.99921138759683e-06, "loss": 0.552, "step": 2790 }, { "epoch": 0.10552896393170769, "grad_norm": 1.9769702999570735, "learning_rate": 9.999090232712648e-06, "loss": 0.5706, "step": 2800 }, { "epoch": 0.10590585308860664, "grad_norm": 1.5852863285358036, "learning_rate": 9.998960424994157e-06, "loss": 0.5468, "step": 2810 }, { "epoch": 0.1062827422455056, "grad_norm": 1.9487316640058903, "learning_rate": 9.998821964666043e-06, "loss": 0.5566, "step": 2820 }, { "epoch": 0.10665963140240456, "grad_norm": 1.470091180676127, "learning_rate": 9.998674851967965e-06, "loss": 0.5375, "step": 2830 }, { "epoch": 0.1070365205593035, "grad_norm": 1.8861024609523855, "learning_rate": 9.998519087154555e-06, "loss": 0.5714, "step": 2840 }, { "epoch": 0.10741340971620246, "grad_norm": 1.8625015944443275, "learning_rate": 9.998354670495426e-06, "loss": 0.5718, "step": 2850 }, { "epoch": 0.10779029887310142, "grad_norm": 1.9173353321183049, "learning_rate": 9.99818160227516e-06, "loss": 0.5591, "step": 2860 }, { "epoch": 0.10816718803000037, "grad_norm": 1.6563786175924313, "learning_rate": 9.997999882793323e-06, "loss": 0.5644, "step": 2870 }, { "epoch": 0.10854407718689933, "grad_norm": 1.8385719564959482, "learning_rate": 9.997809512364447e-06, "loss": 0.5574, "step": 2880 }, { "epoch": 0.10892096634379829, "grad_norm": 1.7214173447884336, "learning_rate": 9.99761049131804e-06, "loss": 0.5575, "step": 2890 }, { "epoch": 0.10929785550069725, "grad_norm": 2.263948770043836, "learning_rate": 9.997402819998585e-06, "loss": 0.5717, "step": 2900 }, { "epoch": 0.1096747446575962, "grad_norm": 1.617474690577473, "learning_rate": 9.997186498765537e-06, "loss": 0.5381, "step": 2910 }, { "epoch": 0.11005163381449516, "grad_norm": 1.9206922286733692, "learning_rate": 9.996961527993322e-06, "loss": 0.5403, "step": 2920 }, { "epoch": 0.11042852297139412, "grad_norm": 1.6363891592183775, "learning_rate": 9.996727908071337e-06, "loss": 0.5504, "step": 2930 }, { "epoch": 0.11080541212829306, "grad_norm": 1.7892900410710342, "learning_rate": 9.99648563940395e-06, "loss": 0.5398, "step": 2940 }, { "epoch": 0.11118230128519202, "grad_norm": 1.837754723538967, "learning_rate": 9.996234722410501e-06, "loss": 0.5501, "step": 2950 }, { "epoch": 0.11155919044209098, "grad_norm": 1.870686763606362, "learning_rate": 9.995975157525298e-06, "loss": 0.5697, "step": 2960 }, { "epoch": 0.11193607959898993, "grad_norm": 1.8670194608446757, "learning_rate": 9.995706945197616e-06, "loss": 0.5819, "step": 2970 }, { "epoch": 0.11231296875588889, "grad_norm": 1.764875781288927, "learning_rate": 9.995430085891698e-06, "loss": 0.5558, "step": 2980 }, { "epoch": 0.11268985791278785, "grad_norm": 1.6461496237741704, "learning_rate": 9.995144580086757e-06, "loss": 0.5528, "step": 2990 }, { "epoch": 0.11306674706968681, "grad_norm": 1.6525100198106808, "learning_rate": 9.99485042827697e-06, "loss": 0.5412, "step": 3000 }, { "epoch": 0.11344363622658576, "grad_norm": 1.7111537561130798, "learning_rate": 9.994547630971476e-06, "loss": 0.556, "step": 3010 }, { "epoch": 0.11382052538348472, "grad_norm": 1.6948046733289588, "learning_rate": 9.994236188694384e-06, "loss": 0.5634, "step": 3020 }, { "epoch": 0.11419741454038368, "grad_norm": 1.7787138971691205, "learning_rate": 9.99391610198476e-06, "loss": 0.5099, "step": 3030 }, { "epoch": 0.11457430369728262, "grad_norm": 1.6976810544764316, "learning_rate": 9.993587371396642e-06, "loss": 0.5526, "step": 3040 }, { "epoch": 0.11495119285418158, "grad_norm": 1.8408592594962616, "learning_rate": 9.99324999749902e-06, "loss": 0.5328, "step": 3050 }, { "epoch": 0.11532808201108054, "grad_norm": 1.7138365866573475, "learning_rate": 9.992903980875849e-06, "loss": 0.5233, "step": 3060 }, { "epoch": 0.1157049711679795, "grad_norm": 1.6023041921101526, "learning_rate": 9.992549322126044e-06, "loss": 0.5562, "step": 3070 }, { "epoch": 0.11608186032487845, "grad_norm": 1.7101785936588327, "learning_rate": 9.992186021863475e-06, "loss": 0.5625, "step": 3080 }, { "epoch": 0.11645874948177741, "grad_norm": 1.7736884499183863, "learning_rate": 9.991814080716974e-06, "loss": 0.5624, "step": 3090 }, { "epoch": 0.11683563863867637, "grad_norm": 2.1118966740309313, "learning_rate": 9.991433499330326e-06, "loss": 0.569, "step": 3100 }, { "epoch": 0.11721252779557532, "grad_norm": 1.882358555790821, "learning_rate": 9.991044278362274e-06, "loss": 0.5556, "step": 3110 }, { "epoch": 0.11758941695247428, "grad_norm": 1.4807120288858666, "learning_rate": 9.990646418486512e-06, "loss": 0.5332, "step": 3120 }, { "epoch": 0.11796630610937324, "grad_norm": 1.9456842492772166, "learning_rate": 9.990239920391687e-06, "loss": 0.5456, "step": 3130 }, { "epoch": 0.11834319526627218, "grad_norm": 1.9332110812656393, "learning_rate": 9.9898247847814e-06, "loss": 0.5914, "step": 3140 }, { "epoch": 0.11872008442317114, "grad_norm": 1.819320948849462, "learning_rate": 9.989401012374207e-06, "loss": 0.5212, "step": 3150 }, { "epoch": 0.1190969735800701, "grad_norm": 1.5937443837733034, "learning_rate": 9.9889686039036e-06, "loss": 0.5719, "step": 3160 }, { "epoch": 0.11947386273696906, "grad_norm": 1.4724382207969984, "learning_rate": 9.988527560118033e-06, "loss": 0.5514, "step": 3170 }, { "epoch": 0.11985075189386801, "grad_norm": 1.7128687990343496, "learning_rate": 9.988077881780896e-06, "loss": 0.5342, "step": 3180 }, { "epoch": 0.12022764105076697, "grad_norm": 1.7843398826770018, "learning_rate": 9.987619569670532e-06, "loss": 0.5425, "step": 3190 }, { "epoch": 0.12060453020766593, "grad_norm": 1.6747602178489263, "learning_rate": 9.987152624580223e-06, "loss": 0.5498, "step": 3200 }, { "epoch": 0.12098141936456488, "grad_norm": 1.9740415780360288, "learning_rate": 9.986677047318199e-06, "loss": 0.5787, "step": 3210 }, { "epoch": 0.12135830852146384, "grad_norm": 1.5401247418757107, "learning_rate": 9.986192838707624e-06, "loss": 0.529, "step": 3220 }, { "epoch": 0.1217351976783628, "grad_norm": 1.6649769987957272, "learning_rate": 9.98569999958661e-06, "loss": 0.5532, "step": 3230 }, { "epoch": 0.12211208683526174, "grad_norm": 2.0615017580666546, "learning_rate": 9.9851985308082e-06, "loss": 0.5306, "step": 3240 }, { "epoch": 0.1224889759921607, "grad_norm": 1.7489740809134746, "learning_rate": 9.98468843324038e-06, "loss": 0.5728, "step": 3250 }, { "epoch": 0.12286586514905966, "grad_norm": 1.9584508485487868, "learning_rate": 9.984169707766065e-06, "loss": 0.5497, "step": 3260 }, { "epoch": 0.12324275430595862, "grad_norm": 1.8986564469950582, "learning_rate": 9.98364235528311e-06, "loss": 0.531, "step": 3270 }, { "epoch": 0.12361964346285757, "grad_norm": 1.796715014709634, "learning_rate": 9.983106376704299e-06, "loss": 0.5604, "step": 3280 }, { "epoch": 0.12399653261975653, "grad_norm": 1.4479882986534132, "learning_rate": 9.98256177295735e-06, "loss": 0.5441, "step": 3290 }, { "epoch": 0.12437342177665549, "grad_norm": 1.9018212839289914, "learning_rate": 9.982008544984902e-06, "loss": 0.586, "step": 3300 }, { "epoch": 0.12475031093355445, "grad_norm": 1.6731029649854103, "learning_rate": 9.981446693744532e-06, "loss": 0.5422, "step": 3310 }, { "epoch": 0.1251272000904534, "grad_norm": 1.8224277036655119, "learning_rate": 9.980876220208738e-06, "loss": 0.574, "step": 3320 }, { "epoch": 0.12550408924735235, "grad_norm": 1.7502821063481213, "learning_rate": 9.980297125364939e-06, "loss": 0.5182, "step": 3330 }, { "epoch": 0.1258809784042513, "grad_norm": 1.6833110345844349, "learning_rate": 9.979709410215483e-06, "loss": 0.5732, "step": 3340 }, { "epoch": 0.12625786756115026, "grad_norm": 2.1392787884736824, "learning_rate": 9.979113075777636e-06, "loss": 0.5421, "step": 3350 }, { "epoch": 0.12663475671804922, "grad_norm": 1.7605201424192343, "learning_rate": 9.978508123083579e-06, "loss": 0.5693, "step": 3360 }, { "epoch": 0.12701164587494818, "grad_norm": 1.814620805028767, "learning_rate": 9.977894553180414e-06, "loss": 0.5044, "step": 3370 }, { "epoch": 0.12738853503184713, "grad_norm": 1.7333387641365299, "learning_rate": 9.977272367130161e-06, "loss": 0.5642, "step": 3380 }, { "epoch": 0.1277654241887461, "grad_norm": 6.5511623986160705, "learning_rate": 9.97664156600975e-06, "loss": 0.539, "step": 3390 }, { "epoch": 0.12814231334564505, "grad_norm": 1.8697849048435933, "learning_rate": 9.97600215091102e-06, "loss": 0.5723, "step": 3400 }, { "epoch": 0.128519202502544, "grad_norm": 1.840910802370967, "learning_rate": 9.975354122940725e-06, "loss": 0.563, "step": 3410 }, { "epoch": 0.12889609165944296, "grad_norm": 1.6208553393789018, "learning_rate": 9.974697483220526e-06, "loss": 0.569, "step": 3420 }, { "epoch": 0.12927298081634192, "grad_norm": 1.7217470801750827, "learning_rate": 9.974032232886988e-06, "loss": 0.561, "step": 3430 }, { "epoch": 0.12964986997324088, "grad_norm": 1.8150748187971744, "learning_rate": 9.973358373091578e-06, "loss": 0.5663, "step": 3440 }, { "epoch": 0.13002675913013984, "grad_norm": 1.8613735972835608, "learning_rate": 9.972675905000672e-06, "loss": 0.5412, "step": 3450 }, { "epoch": 0.1304036482870388, "grad_norm": 1.9863460603182592, "learning_rate": 9.971984829795539e-06, "loss": 0.5483, "step": 3460 }, { "epoch": 0.13078053744393775, "grad_norm": 1.4584507302056484, "learning_rate": 9.971285148672347e-06, "loss": 0.5359, "step": 3470 }, { "epoch": 0.1311574266008367, "grad_norm": 1.6661289692524046, "learning_rate": 9.970576862842165e-06, "loss": 0.5518, "step": 3480 }, { "epoch": 0.13153431575773564, "grad_norm": 1.8117322128837796, "learning_rate": 9.96985997353095e-06, "loss": 0.5294, "step": 3490 }, { "epoch": 0.1319112049146346, "grad_norm": 1.7990258695528125, "learning_rate": 9.969134481979554e-06, "loss": 0.5487, "step": 3500 }, { "epoch": 0.13228809407153355, "grad_norm": 1.6917648391788271, "learning_rate": 9.968400389443715e-06, "loss": 0.5488, "step": 3510 }, { "epoch": 0.1326649832284325, "grad_norm": 1.7815906576006262, "learning_rate": 9.967657697194062e-06, "loss": 0.5435, "step": 3520 }, { "epoch": 0.13304187238533147, "grad_norm": 1.8019408287680707, "learning_rate": 9.966906406516106e-06, "loss": 0.5317, "step": 3530 }, { "epoch": 0.13341876154223042, "grad_norm": 1.5650851056879715, "learning_rate": 9.966146518710244e-06, "loss": 0.5443, "step": 3540 }, { "epoch": 0.13379565069912938, "grad_norm": 1.6846258897654576, "learning_rate": 9.965378035091753e-06, "loss": 0.5384, "step": 3550 }, { "epoch": 0.13417253985602834, "grad_norm": 1.6977199699358678, "learning_rate": 9.964600956990785e-06, "loss": 0.5454, "step": 3560 }, { "epoch": 0.1345494290129273, "grad_norm": 5.101071946029098, "learning_rate": 9.963815285752369e-06, "loss": 0.5174, "step": 3570 }, { "epoch": 0.13492631816982625, "grad_norm": 1.5806913588105131, "learning_rate": 9.963021022736413e-06, "loss": 0.5455, "step": 3580 }, { "epoch": 0.1353032073267252, "grad_norm": 1.540462795524182, "learning_rate": 9.962218169317688e-06, "loss": 0.5385, "step": 3590 }, { "epoch": 0.13568009648362417, "grad_norm": 1.8117995998543306, "learning_rate": 9.961406726885844e-06, "loss": 0.547, "step": 3600 }, { "epoch": 0.13605698564052313, "grad_norm": 1.702664311368578, "learning_rate": 9.960586696845387e-06, "loss": 0.5425, "step": 3610 }, { "epoch": 0.13643387479742208, "grad_norm": 1.6558884575569928, "learning_rate": 9.959758080615694e-06, "loss": 0.5636, "step": 3620 }, { "epoch": 0.13681076395432104, "grad_norm": 1.5110746291185808, "learning_rate": 9.958920879631002e-06, "loss": 0.5568, "step": 3630 }, { "epoch": 0.13718765311122, "grad_norm": 2.139574069411458, "learning_rate": 9.958075095340408e-06, "loss": 0.505, "step": 3640 }, { "epoch": 0.13756454226811896, "grad_norm": 1.9376102058687341, "learning_rate": 9.957220729207862e-06, "loss": 0.5498, "step": 3650 }, { "epoch": 0.1379414314250179, "grad_norm": 2.2672765573662264, "learning_rate": 9.95635778271217e-06, "loss": 0.5435, "step": 3660 }, { "epoch": 0.13831832058191687, "grad_norm": 1.5668715176963142, "learning_rate": 9.955486257346995e-06, "loss": 0.5589, "step": 3670 }, { "epoch": 0.13869520973881583, "grad_norm": 1.7293817862316867, "learning_rate": 9.954606154620841e-06, "loss": 0.5536, "step": 3680 }, { "epoch": 0.13907209889571476, "grad_norm": 1.5203042242279239, "learning_rate": 9.953717476057062e-06, "loss": 0.5157, "step": 3690 }, { "epoch": 0.13944898805261371, "grad_norm": 1.6809131976448295, "learning_rate": 9.952820223193856e-06, "loss": 0.5581, "step": 3700 }, { "epoch": 0.13982587720951267, "grad_norm": 1.6390460530372517, "learning_rate": 9.951914397584262e-06, "loss": 0.518, "step": 3710 }, { "epoch": 0.14020276636641163, "grad_norm": 1.756920987537259, "learning_rate": 9.951000000796159e-06, "loss": 0.5561, "step": 3720 }, { "epoch": 0.1405796555233106, "grad_norm": 1.6270655698945786, "learning_rate": 9.950077034412254e-06, "loss": 0.5417, "step": 3730 }, { "epoch": 0.14095654468020954, "grad_norm": 1.7443369340079147, "learning_rate": 9.949145500030099e-06, "loss": 0.5546, "step": 3740 }, { "epoch": 0.1413334338371085, "grad_norm": 1.9623236145537395, "learning_rate": 9.948205399262066e-06, "loss": 0.5614, "step": 3750 }, { "epoch": 0.14171032299400746, "grad_norm": 1.8286963445588205, "learning_rate": 9.94725673373536e-06, "loss": 0.5194, "step": 3760 }, { "epoch": 0.14208721215090642, "grad_norm": 1.6490782226850094, "learning_rate": 9.946299505092008e-06, "loss": 0.5133, "step": 3770 }, { "epoch": 0.14246410130780537, "grad_norm": 1.459857790438441, "learning_rate": 9.945333714988859e-06, "loss": 0.5274, "step": 3780 }, { "epoch": 0.14284099046470433, "grad_norm": 2.766851100980767, "learning_rate": 9.944359365097584e-06, "loss": 0.5637, "step": 3790 }, { "epoch": 0.1432178796216033, "grad_norm": 1.773166799745728, "learning_rate": 9.943376457104665e-06, "loss": 0.5431, "step": 3800 }, { "epoch": 0.14359476877850225, "grad_norm": 1.8707486173547303, "learning_rate": 9.942384992711402e-06, "loss": 0.5413, "step": 3810 }, { "epoch": 0.1439716579354012, "grad_norm": 1.789830817687169, "learning_rate": 9.9413849736339e-06, "loss": 0.5488, "step": 3820 }, { "epoch": 0.14434854709230016, "grad_norm": 1.7488430625940725, "learning_rate": 9.940376401603077e-06, "loss": 0.5345, "step": 3830 }, { "epoch": 0.14472543624919912, "grad_norm": 1.5053305184156898, "learning_rate": 9.939359278364648e-06, "loss": 0.5323, "step": 3840 }, { "epoch": 0.14510232540609808, "grad_norm": 1.7325706869085533, "learning_rate": 9.938333605679137e-06, "loss": 0.549, "step": 3850 }, { "epoch": 0.14547921456299703, "grad_norm": 2.298650093349881, "learning_rate": 9.937299385321858e-06, "loss": 0.5629, "step": 3860 }, { "epoch": 0.145856103719896, "grad_norm": 2.010120739235166, "learning_rate": 9.936256619082928e-06, "loss": 0.5546, "step": 3870 }, { "epoch": 0.14623299287679495, "grad_norm": 1.6730215675721205, "learning_rate": 9.935205308767251e-06, "loss": 0.5592, "step": 3880 }, { "epoch": 0.14660988203369388, "grad_norm": 1.623819818612049, "learning_rate": 9.934145456194522e-06, "loss": 0.5522, "step": 3890 }, { "epoch": 0.14698677119059284, "grad_norm": 1.8737714445343006, "learning_rate": 9.933077063199217e-06, "loss": 0.5395, "step": 3900 }, { "epoch": 0.1473636603474918, "grad_norm": 1.8896976945133552, "learning_rate": 9.9320001316306e-06, "loss": 0.5155, "step": 3910 }, { "epoch": 0.14774054950439075, "grad_norm": 1.7999273816656425, "learning_rate": 9.930914663352713e-06, "loss": 0.552, "step": 3920 }, { "epoch": 0.1481174386612897, "grad_norm": 1.5451191295487272, "learning_rate": 9.929820660244372e-06, "loss": 0.5308, "step": 3930 }, { "epoch": 0.14849432781818867, "grad_norm": 1.6325314113099194, "learning_rate": 9.928718124199168e-06, "loss": 0.527, "step": 3940 }, { "epoch": 0.14887121697508762, "grad_norm": 2.044279798697509, "learning_rate": 9.927607057125461e-06, "loss": 0.5257, "step": 3950 }, { "epoch": 0.14924810613198658, "grad_norm": 1.6860219589179097, "learning_rate": 9.926487460946375e-06, "loss": 0.5179, "step": 3960 }, { "epoch": 0.14962499528888554, "grad_norm": 1.8333348047186415, "learning_rate": 9.9253593375998e-06, "loss": 0.5467, "step": 3970 }, { "epoch": 0.1500018844457845, "grad_norm": 1.8252914858844747, "learning_rate": 9.924222689038384e-06, "loss": 0.5698, "step": 3980 }, { "epoch": 0.15037877360268345, "grad_norm": 1.636078330532333, "learning_rate": 9.923077517229531e-06, "loss": 0.5817, "step": 3990 }, { "epoch": 0.1507556627595824, "grad_norm": 1.6310153659702036, "learning_rate": 9.921923824155399e-06, "loss": 0.5632, "step": 4000 }, { "epoch": 0.15113255191648137, "grad_norm": 1.703545293572265, "learning_rate": 9.920761611812892e-06, "loss": 0.5362, "step": 4010 }, { "epoch": 0.15150944107338032, "grad_norm": 1.5247266902845538, "learning_rate": 9.919590882213666e-06, "loss": 0.5847, "step": 4020 }, { "epoch": 0.15188633023027928, "grad_norm": 1.8205620209169613, "learning_rate": 9.91841163738411e-06, "loss": 0.5758, "step": 4030 }, { "epoch": 0.15226321938717824, "grad_norm": 1.6632230310524354, "learning_rate": 9.917223879365366e-06, "loss": 0.5373, "step": 4040 }, { "epoch": 0.1526401085440772, "grad_norm": 1.6718013294159164, "learning_rate": 9.916027610213293e-06, "loss": 0.5557, "step": 4050 }, { "epoch": 0.15301699770097615, "grad_norm": 1.5988376223664982, "learning_rate": 9.914822831998498e-06, "loss": 0.5453, "step": 4060 }, { "epoch": 0.1533938868578751, "grad_norm": 1.659616299752992, "learning_rate": 9.913609546806306e-06, "loss": 0.5504, "step": 4070 }, { "epoch": 0.15377077601477407, "grad_norm": 1.4037121435179074, "learning_rate": 9.912387756736773e-06, "loss": 0.5198, "step": 4080 }, { "epoch": 0.154147665171673, "grad_norm": 1.5183583233165177, "learning_rate": 9.911157463904673e-06, "loss": 0.5566, "step": 4090 }, { "epoch": 0.15452455432857196, "grad_norm": 1.680791972841532, "learning_rate": 9.909918670439494e-06, "loss": 0.5403, "step": 4100 }, { "epoch": 0.1549014434854709, "grad_norm": 1.7482553689079507, "learning_rate": 9.908671378485445e-06, "loss": 0.5445, "step": 4110 }, { "epoch": 0.15527833264236987, "grad_norm": 1.3584719888404202, "learning_rate": 9.907415590201442e-06, "loss": 0.5459, "step": 4120 }, { "epoch": 0.15565522179926883, "grad_norm": 1.6501271620909048, "learning_rate": 9.906151307761101e-06, "loss": 0.5549, "step": 4130 }, { "epoch": 0.15603211095616779, "grad_norm": 2.656483215686597, "learning_rate": 9.90487853335275e-06, "loss": 0.5504, "step": 4140 }, { "epoch": 0.15640900011306674, "grad_norm": 1.9444522460365512, "learning_rate": 9.903597269179406e-06, "loss": 0.5713, "step": 4150 }, { "epoch": 0.1567858892699657, "grad_norm": 1.8488494105763427, "learning_rate": 9.902307517458791e-06, "loss": 0.5422, "step": 4160 }, { "epoch": 0.15716277842686466, "grad_norm": 1.8661983415681136, "learning_rate": 9.90100928042331e-06, "loss": 0.5326, "step": 4170 }, { "epoch": 0.15753966758376362, "grad_norm": 1.5515586490627968, "learning_rate": 9.89970256032006e-06, "loss": 0.5282, "step": 4180 }, { "epoch": 0.15791655674066257, "grad_norm": 1.7714176130932602, "learning_rate": 9.898387359410817e-06, "loss": 0.576, "step": 4190 }, { "epoch": 0.15829344589756153, "grad_norm": 1.76322105124278, "learning_rate": 9.89706367997204e-06, "loss": 0.578, "step": 4200 }, { "epoch": 0.1586703350544605, "grad_norm": 1.8101667561564285, "learning_rate": 9.89573152429486e-06, "loss": 0.5534, "step": 4210 }, { "epoch": 0.15904722421135944, "grad_norm": 1.4490662091941684, "learning_rate": 9.894390894685082e-06, "loss": 0.5771, "step": 4220 }, { "epoch": 0.1594241133682584, "grad_norm": 1.540551931413724, "learning_rate": 9.893041793463176e-06, "loss": 0.5581, "step": 4230 }, { "epoch": 0.15980100252515736, "grad_norm": 1.7528415875090508, "learning_rate": 9.89168422296428e-06, "loss": 0.5631, "step": 4240 }, { "epoch": 0.16017789168205632, "grad_norm": 1.8668229751083296, "learning_rate": 9.890318185538183e-06, "loss": 0.5586, "step": 4250 }, { "epoch": 0.16055478083895527, "grad_norm": 1.490437975787403, "learning_rate": 9.88894368354934e-06, "loss": 0.5478, "step": 4260 }, { "epoch": 0.16093166999585423, "grad_norm": 1.8869524648789562, "learning_rate": 9.887560719376848e-06, "loss": 0.5419, "step": 4270 }, { "epoch": 0.16130855915275316, "grad_norm": 1.7852914761656666, "learning_rate": 9.886169295414454e-06, "loss": 0.5486, "step": 4280 }, { "epoch": 0.16168544830965212, "grad_norm": 1.770938471455256, "learning_rate": 9.884769414070551e-06, "loss": 0.5546, "step": 4290 }, { "epoch": 0.16206233746655108, "grad_norm": 2.005194363321807, "learning_rate": 9.883361077768166e-06, "loss": 0.5842, "step": 4300 }, { "epoch": 0.16243922662345003, "grad_norm": 1.6800086790787019, "learning_rate": 9.881944288944964e-06, "loss": 0.5463, "step": 4310 }, { "epoch": 0.162816115780349, "grad_norm": 2.1202763345448115, "learning_rate": 9.880519050053239e-06, "loss": 0.5609, "step": 4320 }, { "epoch": 0.16319300493724795, "grad_norm": 1.7584386869370296, "learning_rate": 9.879085363559911e-06, "loss": 0.5503, "step": 4330 }, { "epoch": 0.1635698940941469, "grad_norm": 1.6620875457762982, "learning_rate": 9.87764323194652e-06, "loss": 0.5751, "step": 4340 }, { "epoch": 0.16394678325104586, "grad_norm": 1.9926548707738148, "learning_rate": 9.876192657709227e-06, "loss": 0.5921, "step": 4350 }, { "epoch": 0.16432367240794482, "grad_norm": 1.820283969890227, "learning_rate": 9.874733643358806e-06, "loss": 0.5522, "step": 4360 }, { "epoch": 0.16470056156484378, "grad_norm": 1.3361373532537333, "learning_rate": 9.873266191420635e-06, "loss": 0.5341, "step": 4370 }, { "epoch": 0.16507745072174274, "grad_norm": 1.5975055201627169, "learning_rate": 9.8717903044347e-06, "loss": 0.5282, "step": 4380 }, { "epoch": 0.1654543398786417, "grad_norm": 1.607578610847404, "learning_rate": 9.870305984955591e-06, "loss": 0.545, "step": 4390 }, { "epoch": 0.16583122903554065, "grad_norm": 1.458228796237765, "learning_rate": 9.868813235552485e-06, "loss": 0.4811, "step": 4400 }, { "epoch": 0.1662081181924396, "grad_norm": 1.6414986429735758, "learning_rate": 9.86731205880916e-06, "loss": 0.5616, "step": 4410 }, { "epoch": 0.16658500734933857, "grad_norm": 1.8317176023383224, "learning_rate": 9.86580245732397e-06, "loss": 0.559, "step": 4420 }, { "epoch": 0.16696189650623752, "grad_norm": 1.7709755238978837, "learning_rate": 9.864284433709859e-06, "loss": 0.549, "step": 4430 }, { "epoch": 0.16733878566313648, "grad_norm": 1.982880798943511, "learning_rate": 9.862757990594348e-06, "loss": 0.5296, "step": 4440 }, { "epoch": 0.16771567482003544, "grad_norm": 1.7614598486822042, "learning_rate": 9.861223130619525e-06, "loss": 0.575, "step": 4450 }, { "epoch": 0.1680925639769344, "grad_norm": 1.3611867438499565, "learning_rate": 9.859679856442058e-06, "loss": 0.5224, "step": 4460 }, { "epoch": 0.16846945313383335, "grad_norm": 4.687044934475357, "learning_rate": 9.858128170733166e-06, "loss": 0.5552, "step": 4470 }, { "epoch": 0.16884634229073228, "grad_norm": 1.59571418195565, "learning_rate": 9.856568076178637e-06, "loss": 0.5229, "step": 4480 }, { "epoch": 0.16922323144763124, "grad_norm": 2.3788819118048736, "learning_rate": 9.85499957547881e-06, "loss": 0.565, "step": 4490 }, { "epoch": 0.1696001206045302, "grad_norm": 1.8329399955214953, "learning_rate": 9.853422671348573e-06, "loss": 0.5343, "step": 4500 }, { "epoch": 0.16997700976142915, "grad_norm": 1.7468489996738379, "learning_rate": 9.85183736651736e-06, "loss": 0.5201, "step": 4510 }, { "epoch": 0.1703538989183281, "grad_norm": 1.6120690658450187, "learning_rate": 9.850243663729151e-06, "loss": 0.5325, "step": 4520 }, { "epoch": 0.17073078807522707, "grad_norm": 1.676405391419548, "learning_rate": 9.848641565742451e-06, "loss": 0.5358, "step": 4530 }, { "epoch": 0.17110767723212603, "grad_norm": 1.5692375310803846, "learning_rate": 9.847031075330305e-06, "loss": 0.5469, "step": 4540 }, { "epoch": 0.17148456638902498, "grad_norm": 1.7013387896565297, "learning_rate": 9.845412195280283e-06, "loss": 0.5415, "step": 4550 }, { "epoch": 0.17186145554592394, "grad_norm": 1.6239671776400033, "learning_rate": 9.843784928394473e-06, "loss": 0.5337, "step": 4560 }, { "epoch": 0.1722383447028229, "grad_norm": 1.7752808165423049, "learning_rate": 9.842149277489482e-06, "loss": 0.5581, "step": 4570 }, { "epoch": 0.17261523385972186, "grad_norm": 1.6381139718605806, "learning_rate": 9.840505245396428e-06, "loss": 0.5512, "step": 4580 }, { "epoch": 0.1729921230166208, "grad_norm": 1.3980621024689306, "learning_rate": 9.838852834960937e-06, "loss": 0.5475, "step": 4590 }, { "epoch": 0.17336901217351977, "grad_norm": 1.8099458012944554, "learning_rate": 9.837192049043138e-06, "loss": 0.5288, "step": 4600 }, { "epoch": 0.17374590133041873, "grad_norm": 1.7008772187496575, "learning_rate": 9.83552289051765e-06, "loss": 0.5559, "step": 4610 }, { "epoch": 0.17412279048731769, "grad_norm": 7.078798933085531, "learning_rate": 9.833845362273595e-06, "loss": 0.538, "step": 4620 }, { "epoch": 0.17449967964421664, "grad_norm": 1.8058836761237198, "learning_rate": 9.832159467214571e-06, "loss": 0.5366, "step": 4630 }, { "epoch": 0.1748765688011156, "grad_norm": 1.6224697567837014, "learning_rate": 9.830465208258667e-06, "loss": 0.5315, "step": 4640 }, { "epoch": 0.17525345795801456, "grad_norm": 1.9433130297018304, "learning_rate": 9.828762588338442e-06, "loss": 0.5412, "step": 4650 }, { "epoch": 0.17563034711491352, "grad_norm": 1.6991522803860752, "learning_rate": 9.827051610400933e-06, "loss": 0.5403, "step": 4660 }, { "epoch": 0.17600723627181247, "grad_norm": 1.705218216007172, "learning_rate": 9.825332277407637e-06, "loss": 0.5556, "step": 4670 }, { "epoch": 0.1763841254287114, "grad_norm": 1.688592333353397, "learning_rate": 9.823604592334519e-06, "loss": 0.551, "step": 4680 }, { "epoch": 0.17676101458561036, "grad_norm": 1.41612023446685, "learning_rate": 9.821868558171996e-06, "loss": 0.5173, "step": 4690 }, { "epoch": 0.17713790374250932, "grad_norm": 1.6816541249040635, "learning_rate": 9.820124177924939e-06, "loss": 0.5428, "step": 4700 }, { "epoch": 0.17751479289940827, "grad_norm": 1.3749122464689933, "learning_rate": 9.818371454612664e-06, "loss": 0.5341, "step": 4710 }, { "epoch": 0.17789168205630723, "grad_norm": 1.426068903641529, "learning_rate": 9.816610391268927e-06, "loss": 0.4913, "step": 4720 }, { "epoch": 0.1782685712132062, "grad_norm": 1.5924856232245717, "learning_rate": 9.814840990941921e-06, "loss": 0.541, "step": 4730 }, { "epoch": 0.17864546037010515, "grad_norm": 1.4530573814344718, "learning_rate": 9.813063256694268e-06, "loss": 0.5088, "step": 4740 }, { "epoch": 0.1790223495270041, "grad_norm": 1.8559077656986256, "learning_rate": 9.811277191603018e-06, "loss": 0.5432, "step": 4750 }, { "epoch": 0.17939923868390306, "grad_norm": 1.668223036589219, "learning_rate": 9.809482798759636e-06, "loss": 0.5257, "step": 4760 }, { "epoch": 0.17977612784080202, "grad_norm": 1.6197394995482917, "learning_rate": 9.807680081270005e-06, "loss": 0.5453, "step": 4770 }, { "epoch": 0.18015301699770098, "grad_norm": 1.7676016632355371, "learning_rate": 9.805869042254419e-06, "loss": 0.5454, "step": 4780 }, { "epoch": 0.18052990615459993, "grad_norm": 1.630694868729753, "learning_rate": 9.804049684847566e-06, "loss": 0.5327, "step": 4790 }, { "epoch": 0.1809067953114989, "grad_norm": 1.7202050190497062, "learning_rate": 9.802222012198543e-06, "loss": 0.5609, "step": 4800 }, { "epoch": 0.18128368446839785, "grad_norm": 1.6203695895840649, "learning_rate": 9.800386027470836e-06, "loss": 0.5196, "step": 4810 }, { "epoch": 0.1816605736252968, "grad_norm": 1.7217972266536141, "learning_rate": 9.798541733842315e-06, "loss": 0.5357, "step": 4820 }, { "epoch": 0.18203746278219576, "grad_norm": 1.653697053819612, "learning_rate": 9.796689134505234e-06, "loss": 0.5222, "step": 4830 }, { "epoch": 0.18241435193909472, "grad_norm": 1.5596182552523383, "learning_rate": 9.794828232666227e-06, "loss": 0.5552, "step": 4840 }, { "epoch": 0.18279124109599368, "grad_norm": 1.867472446965036, "learning_rate": 9.79295903154629e-06, "loss": 0.5339, "step": 4850 }, { "epoch": 0.18316813025289264, "grad_norm": 1.6702383439481707, "learning_rate": 9.791081534380796e-06, "loss": 0.5151, "step": 4860 }, { "epoch": 0.1835450194097916, "grad_norm": 1.3291743718971485, "learning_rate": 9.789195744419463e-06, "loss": 0.5607, "step": 4870 }, { "epoch": 0.18392190856669052, "grad_norm": 1.5921907230443966, "learning_rate": 9.787301664926376e-06, "loss": 0.5339, "step": 4880 }, { "epoch": 0.18429879772358948, "grad_norm": 1.8736268796358457, "learning_rate": 9.78539929917996e-06, "loss": 0.5302, "step": 4890 }, { "epoch": 0.18467568688048844, "grad_norm": 1.8805034291822773, "learning_rate": 9.783488650472988e-06, "loss": 0.5445, "step": 4900 }, { "epoch": 0.1850525760373874, "grad_norm": 1.7419614721480812, "learning_rate": 9.781569722112564e-06, "loss": 0.5428, "step": 4910 }, { "epoch": 0.18542946519428635, "grad_norm": 1.9542910752412086, "learning_rate": 9.779642517420129e-06, "loss": 0.538, "step": 4920 }, { "epoch": 0.1858063543511853, "grad_norm": 1.5668762657315158, "learning_rate": 9.777707039731443e-06, "loss": 0.5516, "step": 4930 }, { "epoch": 0.18618324350808427, "grad_norm": 1.603854576231463, "learning_rate": 9.775763292396591e-06, "loss": 0.5535, "step": 4940 }, { "epoch": 0.18656013266498322, "grad_norm": 1.675798754667643, "learning_rate": 9.773811278779972e-06, "loss": 0.524, "step": 4950 }, { "epoch": 0.18693702182188218, "grad_norm": 2.348035421509291, "learning_rate": 9.771851002260288e-06, "loss": 0.5478, "step": 4960 }, { "epoch": 0.18731391097878114, "grad_norm": 1.6072598485464797, "learning_rate": 9.769882466230546e-06, "loss": 0.5287, "step": 4970 }, { "epoch": 0.1876908001356801, "grad_norm": 1.746528796476459, "learning_rate": 9.767905674098051e-06, "loss": 0.5475, "step": 4980 }, { "epoch": 0.18806768929257905, "grad_norm": 1.386585738427929, "learning_rate": 9.765920629284396e-06, "loss": 0.5098, "step": 4990 }, { "epoch": 0.188444578449478, "grad_norm": 1.8437041912808763, "learning_rate": 9.763927335225458e-06, "loss": 0.5286, "step": 5000 }, { "epoch": 0.18882146760637697, "grad_norm": 1.9482474264160459, "learning_rate": 9.761925795371394e-06, "loss": 0.5442, "step": 5010 }, { "epoch": 0.18919835676327593, "grad_norm": 1.7719952789842508, "learning_rate": 9.759916013186633e-06, "loss": 0.5461, "step": 5020 }, { "epoch": 0.18957524592017488, "grad_norm": 1.6946989567888198, "learning_rate": 9.757897992149868e-06, "loss": 0.5798, "step": 5030 }, { "epoch": 0.18995213507707384, "grad_norm": 1.4374415950854045, "learning_rate": 9.755871735754058e-06, "loss": 0.5406, "step": 5040 }, { "epoch": 0.1903290242339728, "grad_norm": 1.6824526905803994, "learning_rate": 9.753837247506415e-06, "loss": 0.574, "step": 5050 }, { "epoch": 0.19070591339087176, "grad_norm": 1.394126270724554, "learning_rate": 9.751794530928394e-06, "loss": 0.5327, "step": 5060 }, { "epoch": 0.1910828025477707, "grad_norm": 1.8155055295556857, "learning_rate": 9.749743589555696e-06, "loss": 0.5551, "step": 5070 }, { "epoch": 0.19145969170466964, "grad_norm": 1.6444154255861156, "learning_rate": 9.747684426938259e-06, "loss": 0.5324, "step": 5080 }, { "epoch": 0.1918365808615686, "grad_norm": 1.693478657126227, "learning_rate": 9.74561704664025e-06, "loss": 0.5293, "step": 5090 }, { "epoch": 0.19221347001846756, "grad_norm": 1.7991179686677223, "learning_rate": 9.743541452240062e-06, "loss": 0.5306, "step": 5100 }, { "epoch": 0.19259035917536652, "grad_norm": 1.6938448201059806, "learning_rate": 9.7414576473303e-06, "loss": 0.5557, "step": 5110 }, { "epoch": 0.19296724833226547, "grad_norm": 1.708193540793155, "learning_rate": 9.739365635517786e-06, "loss": 0.5527, "step": 5120 }, { "epoch": 0.19334413748916443, "grad_norm": 1.623469387205484, "learning_rate": 9.737265420423545e-06, "loss": 0.5677, "step": 5130 }, { "epoch": 0.1937210266460634, "grad_norm": 1.5336259453452732, "learning_rate": 9.735157005682802e-06, "loss": 0.5304, "step": 5140 }, { "epoch": 0.19409791580296235, "grad_norm": 1.6029263301801304, "learning_rate": 9.733040394944972e-06, "loss": 0.5036, "step": 5150 }, { "epoch": 0.1944748049598613, "grad_norm": 1.4027867586519995, "learning_rate": 9.73091559187366e-06, "loss": 0.5571, "step": 5160 }, { "epoch": 0.19485169411676026, "grad_norm": 1.422018727809762, "learning_rate": 9.728782600146646e-06, "loss": 0.512, "step": 5170 }, { "epoch": 0.19522858327365922, "grad_norm": 1.7086928414235798, "learning_rate": 9.726641423455889e-06, "loss": 0.5479, "step": 5180 }, { "epoch": 0.19560547243055817, "grad_norm": 1.4865928775148851, "learning_rate": 9.724492065507512e-06, "loss": 0.5303, "step": 5190 }, { "epoch": 0.19598236158745713, "grad_norm": 1.7652350174862295, "learning_rate": 9.722334530021798e-06, "loss": 0.568, "step": 5200 }, { "epoch": 0.1963592507443561, "grad_norm": 1.6654751000696695, "learning_rate": 9.720168820733189e-06, "loss": 0.5425, "step": 5210 }, { "epoch": 0.19673613990125505, "grad_norm": 1.6237026180449996, "learning_rate": 9.717994941390269e-06, "loss": 0.5256, "step": 5220 }, { "epoch": 0.197113029058154, "grad_norm": 1.4809115306845473, "learning_rate": 9.71581289575577e-06, "loss": 0.5559, "step": 5230 }, { "epoch": 0.19748991821505296, "grad_norm": 1.6736674947367551, "learning_rate": 9.71362268760655e-06, "loss": 0.5379, "step": 5240 }, { "epoch": 0.19786680737195192, "grad_norm": 1.6771556923236743, "learning_rate": 9.711424320733605e-06, "loss": 0.5539, "step": 5250 }, { "epoch": 0.19824369652885088, "grad_norm": 1.7283773691144386, "learning_rate": 9.709217798942045e-06, "loss": 0.5289, "step": 5260 }, { "epoch": 0.19862058568574983, "grad_norm": 1.4438252244390533, "learning_rate": 9.7070031260511e-06, "loss": 0.5562, "step": 5270 }, { "epoch": 0.19899747484264876, "grad_norm": 1.612252201951247, "learning_rate": 9.704780305894107e-06, "loss": 0.5508, "step": 5280 }, { "epoch": 0.19937436399954772, "grad_norm": 1.520592481703519, "learning_rate": 9.702549342318503e-06, "loss": 0.522, "step": 5290 }, { "epoch": 0.19975125315644668, "grad_norm": 1.6900037644390822, "learning_rate": 9.700310239185824e-06, "loss": 0.5495, "step": 5300 }, { "epoch": 0.20012814231334564, "grad_norm": 1.709571109702106, "learning_rate": 9.698063000371693e-06, "loss": 0.5348, "step": 5310 }, { "epoch": 0.2005050314702446, "grad_norm": 1.730178730299619, "learning_rate": 9.695807629765815e-06, "loss": 0.5141, "step": 5320 }, { "epoch": 0.20088192062714355, "grad_norm": 1.6296835391691085, "learning_rate": 9.69354413127197e-06, "loss": 0.5361, "step": 5330 }, { "epoch": 0.2012588097840425, "grad_norm": 1.6987810718130525, "learning_rate": 9.691272508808006e-06, "loss": 0.5452, "step": 5340 }, { "epoch": 0.20163569894094147, "grad_norm": 1.650234117971004, "learning_rate": 9.68899276630583e-06, "loss": 0.5197, "step": 5350 }, { "epoch": 0.20201258809784042, "grad_norm": 1.7189731868307763, "learning_rate": 9.68670490771141e-06, "loss": 0.5582, "step": 5360 }, { "epoch": 0.20238947725473938, "grad_norm": 1.629642772885024, "learning_rate": 9.68440893698476e-06, "loss": 0.5467, "step": 5370 }, { "epoch": 0.20276636641163834, "grad_norm": 1.714044057568933, "learning_rate": 9.682104858099932e-06, "loss": 0.5586, "step": 5380 }, { "epoch": 0.2031432555685373, "grad_norm": 1.7776904536246745, "learning_rate": 9.679792675045015e-06, "loss": 0.5527, "step": 5390 }, { "epoch": 0.20352014472543625, "grad_norm": 1.8482409209459776, "learning_rate": 9.677472391822122e-06, "loss": 0.5444, "step": 5400 }, { "epoch": 0.2038970338823352, "grad_norm": 2.039065180414514, "learning_rate": 9.675144012447393e-06, "loss": 0.5495, "step": 5410 }, { "epoch": 0.20427392303923417, "grad_norm": 1.6471101788933047, "learning_rate": 9.672807540950976e-06, "loss": 0.5304, "step": 5420 }, { "epoch": 0.20465081219613313, "grad_norm": 1.7782838901225566, "learning_rate": 9.670462981377024e-06, "loss": 0.539, "step": 5430 }, { "epoch": 0.20502770135303208, "grad_norm": 1.5599236676909736, "learning_rate": 9.668110337783696e-06, "loss": 0.5278, "step": 5440 }, { "epoch": 0.20540459050993104, "grad_norm": 1.7551032509348412, "learning_rate": 9.665749614243139e-06, "loss": 0.5173, "step": 5450 }, { "epoch": 0.20578147966683, "grad_norm": 1.5194286251590126, "learning_rate": 9.663380814841487e-06, "loss": 0.5209, "step": 5460 }, { "epoch": 0.20615836882372895, "grad_norm": 1.9229370370271754, "learning_rate": 9.66100394367885e-06, "loss": 0.5366, "step": 5470 }, { "epoch": 0.20653525798062788, "grad_norm": 1.6741605984890093, "learning_rate": 9.658619004869314e-06, "loss": 0.5479, "step": 5480 }, { "epoch": 0.20691214713752684, "grad_norm": 1.5030012618627315, "learning_rate": 9.656226002540923e-06, "loss": 0.5151, "step": 5490 }, { "epoch": 0.2072890362944258, "grad_norm": 1.8829102820108654, "learning_rate": 9.653824940835683e-06, "loss": 0.5398, "step": 5500 }, { "epoch": 0.20766592545132476, "grad_norm": 1.7502902417744142, "learning_rate": 9.651415823909547e-06, "loss": 0.5356, "step": 5510 }, { "epoch": 0.20804281460822371, "grad_norm": 1.8452401375252057, "learning_rate": 9.648998655932414e-06, "loss": 0.5392, "step": 5520 }, { "epoch": 0.20841970376512267, "grad_norm": 1.6582272984116684, "learning_rate": 9.646573441088112e-06, "loss": 0.5128, "step": 5530 }, { "epoch": 0.20879659292202163, "grad_norm": 1.5467497129190437, "learning_rate": 9.644140183574407e-06, "loss": 0.5551, "step": 5540 }, { "epoch": 0.2091734820789206, "grad_norm": 1.9957256803945376, "learning_rate": 9.641698887602973e-06, "loss": 0.5658, "step": 5550 }, { "epoch": 0.20955037123581954, "grad_norm": 1.6565250884502132, "learning_rate": 9.63924955739941e-06, "loss": 0.5103, "step": 5560 }, { "epoch": 0.2099272603927185, "grad_norm": 1.730711290180862, "learning_rate": 9.636792197203218e-06, "loss": 0.5311, "step": 5570 }, { "epoch": 0.21030414954961746, "grad_norm": 1.6469079302286813, "learning_rate": 9.634326811267796e-06, "loss": 0.558, "step": 5580 }, { "epoch": 0.21068103870651642, "grad_norm": 2.1727142231298737, "learning_rate": 9.631853403860437e-06, "loss": 0.5467, "step": 5590 }, { "epoch": 0.21105792786341537, "grad_norm": 1.7971977696668728, "learning_rate": 9.629371979262314e-06, "loss": 0.5043, "step": 5600 }, { "epoch": 0.21143481702031433, "grad_norm": 10.80688874918979, "learning_rate": 9.626882541768484e-06, "loss": 0.5577, "step": 5610 }, { "epoch": 0.2118117061772133, "grad_norm": 2.323690498795392, "learning_rate": 9.624385095687865e-06, "loss": 0.571, "step": 5620 }, { "epoch": 0.21218859533411225, "grad_norm": 1.719967739675491, "learning_rate": 9.621879645343245e-06, "loss": 0.5411, "step": 5630 }, { "epoch": 0.2125654844910112, "grad_norm": 1.532181810090769, "learning_rate": 9.619366195071258e-06, "loss": 0.5415, "step": 5640 }, { "epoch": 0.21294237364791016, "grad_norm": 1.7442472684705252, "learning_rate": 9.616844749222391e-06, "loss": 0.555, "step": 5650 }, { "epoch": 0.21331926280480912, "grad_norm": 1.8977510610017796, "learning_rate": 9.614315312160972e-06, "loss": 0.5259, "step": 5660 }, { "epoch": 0.21369615196170808, "grad_norm": 1.5012868113061877, "learning_rate": 9.611777888265153e-06, "loss": 0.5602, "step": 5670 }, { "epoch": 0.214073041118607, "grad_norm": 1.7027261474156072, "learning_rate": 9.609232481926917e-06, "loss": 0.4989, "step": 5680 }, { "epoch": 0.21444993027550596, "grad_norm": 1.6913027886629168, "learning_rate": 9.606679097552061e-06, "loss": 0.5414, "step": 5690 }, { "epoch": 0.21482681943240492, "grad_norm": 1.7822692679518921, "learning_rate": 9.604117739560192e-06, "loss": 0.5307, "step": 5700 }, { "epoch": 0.21520370858930388, "grad_norm": 1.7521449233376833, "learning_rate": 9.601548412384718e-06, "loss": 0.5366, "step": 5710 }, { "epoch": 0.21558059774620283, "grad_norm": 1.4927823655216998, "learning_rate": 9.59897112047284e-06, "loss": 0.5457, "step": 5720 }, { "epoch": 0.2159574869031018, "grad_norm": 1.5719965609870012, "learning_rate": 9.596385868285547e-06, "loss": 0.5744, "step": 5730 }, { "epoch": 0.21633437606000075, "grad_norm": 1.6228135876923027, "learning_rate": 9.593792660297603e-06, "loss": 0.5361, "step": 5740 }, { "epoch": 0.2167112652168997, "grad_norm": 1.869501122929772, "learning_rate": 9.591191500997545e-06, "loss": 0.5824, "step": 5750 }, { "epoch": 0.21708815437379866, "grad_norm": 1.6116057455831496, "learning_rate": 9.588582394887674e-06, "loss": 0.5208, "step": 5760 }, { "epoch": 0.21746504353069762, "grad_norm": 1.7598646289067172, "learning_rate": 9.58596534648404e-06, "loss": 0.5172, "step": 5770 }, { "epoch": 0.21784193268759658, "grad_norm": 1.459229389644524, "learning_rate": 9.583340360316452e-06, "loss": 0.5247, "step": 5780 }, { "epoch": 0.21821882184449554, "grad_norm": 1.3090949493088406, "learning_rate": 9.580707440928442e-06, "loss": 0.5795, "step": 5790 }, { "epoch": 0.2185957110013945, "grad_norm": 1.6277194603630043, "learning_rate": 9.578066592877289e-06, "loss": 0.5192, "step": 5800 }, { "epoch": 0.21897260015829345, "grad_norm": 1.3790098723023494, "learning_rate": 9.575417820733985e-06, "loss": 0.5348, "step": 5810 }, { "epoch": 0.2193494893151924, "grad_norm": 1.3968735415508085, "learning_rate": 9.572761129083245e-06, "loss": 0.512, "step": 5820 }, { "epoch": 0.21972637847209137, "grad_norm": 1.7812282571098073, "learning_rate": 9.570096522523484e-06, "loss": 0.543, "step": 5830 }, { "epoch": 0.22010326762899032, "grad_norm": 1.834347893716607, "learning_rate": 9.567424005666825e-06, "loss": 0.5329, "step": 5840 }, { "epoch": 0.22048015678588928, "grad_norm": 1.8129108228485702, "learning_rate": 9.564743583139076e-06, "loss": 0.5384, "step": 5850 }, { "epoch": 0.22085704594278824, "grad_norm": 1.6223887647007407, "learning_rate": 9.562055259579731e-06, "loss": 0.5268, "step": 5860 }, { "epoch": 0.2212339350996872, "grad_norm": 1.606954920038498, "learning_rate": 9.559359039641962e-06, "loss": 0.5414, "step": 5870 }, { "epoch": 0.22161082425658613, "grad_norm": 1.629843365300857, "learning_rate": 9.556654927992609e-06, "loss": 0.5242, "step": 5880 }, { "epoch": 0.22198771341348508, "grad_norm": 1.7527089646543905, "learning_rate": 9.553942929312166e-06, "loss": 0.5272, "step": 5890 }, { "epoch": 0.22236460257038404, "grad_norm": 1.6581171602617495, "learning_rate": 9.551223048294785e-06, "loss": 0.5116, "step": 5900 }, { "epoch": 0.222741491727283, "grad_norm": 2.235545487251775, "learning_rate": 9.548495289648254e-06, "loss": 0.5385, "step": 5910 }, { "epoch": 0.22311838088418195, "grad_norm": 2.0034168319325243, "learning_rate": 9.545759658094007e-06, "loss": 0.5557, "step": 5920 }, { "epoch": 0.2234952700410809, "grad_norm": 1.4966777192122558, "learning_rate": 9.543016158367093e-06, "loss": 0.5405, "step": 5930 }, { "epoch": 0.22387215919797987, "grad_norm": 1.7766848501696213, "learning_rate": 9.540264795216192e-06, "loss": 0.5104, "step": 5940 }, { "epoch": 0.22424904835487883, "grad_norm": 1.6670384095637465, "learning_rate": 9.537505573403582e-06, "loss": 0.5436, "step": 5950 }, { "epoch": 0.22462593751177778, "grad_norm": 1.5467554161425514, "learning_rate": 9.534738497705153e-06, "loss": 0.557, "step": 5960 }, { "epoch": 0.22500282666867674, "grad_norm": 1.7813974332712823, "learning_rate": 9.531963572910388e-06, "loss": 0.559, "step": 5970 }, { "epoch": 0.2253797158255757, "grad_norm": 1.7053744486978277, "learning_rate": 9.529180803822351e-06, "loss": 0.5489, "step": 5980 }, { "epoch": 0.22575660498247466, "grad_norm": 1.243354982475666, "learning_rate": 9.526390195257686e-06, "loss": 0.4922, "step": 5990 }, { "epoch": 0.22613349413937361, "grad_norm": 1.8099541389369207, "learning_rate": 9.523591752046608e-06, "loss": 0.5443, "step": 6000 }, { "epoch": 0.22651038329627257, "grad_norm": 1.6063302471417757, "learning_rate": 9.52078547903289e-06, "loss": 0.519, "step": 6010 }, { "epoch": 0.22688727245317153, "grad_norm": 1.480600706868146, "learning_rate": 9.51797138107386e-06, "loss": 0.516, "step": 6020 }, { "epoch": 0.2272641616100705, "grad_norm": 1.4421735699209184, "learning_rate": 9.51514946304039e-06, "loss": 0.5507, "step": 6030 }, { "epoch": 0.22764105076696944, "grad_norm": 1.6185750934964662, "learning_rate": 9.512319729816886e-06, "loss": 0.5155, "step": 6040 }, { "epoch": 0.2280179399238684, "grad_norm": 1.6957212178717491, "learning_rate": 9.50948218630128e-06, "loss": 0.5272, "step": 6050 }, { "epoch": 0.22839482908076736, "grad_norm": 1.7169594191615623, "learning_rate": 9.506636837405025e-06, "loss": 0.5314, "step": 6060 }, { "epoch": 0.22877171823766632, "grad_norm": 1.5477125080399536, "learning_rate": 9.503783688053085e-06, "loss": 0.5431, "step": 6070 }, { "epoch": 0.22914860739456525, "grad_norm": 1.6785046483352886, "learning_rate": 9.500922743183922e-06, "loss": 0.5126, "step": 6080 }, { "epoch": 0.2295254965514642, "grad_norm": 1.890254359102995, "learning_rate": 9.498054007749498e-06, "loss": 0.5337, "step": 6090 }, { "epoch": 0.22990238570836316, "grad_norm": 1.643088361181434, "learning_rate": 9.495177486715247e-06, "loss": 0.5793, "step": 6100 }, { "epoch": 0.23027927486526212, "grad_norm": 1.735778264496379, "learning_rate": 9.492293185060095e-06, "loss": 0.5551, "step": 6110 }, { "epoch": 0.23065616402216108, "grad_norm": 1.4390586582782128, "learning_rate": 9.489401107776425e-06, "loss": 0.5262, "step": 6120 }, { "epoch": 0.23103305317906003, "grad_norm": 1.8374375522238977, "learning_rate": 9.486501259870078e-06, "loss": 0.5526, "step": 6130 }, { "epoch": 0.231409942335959, "grad_norm": 1.279671922211445, "learning_rate": 9.48359364636035e-06, "loss": 0.5341, "step": 6140 }, { "epoch": 0.23178683149285795, "grad_norm": 1.628189415008674, "learning_rate": 9.480678272279976e-06, "loss": 0.5088, "step": 6150 }, { "epoch": 0.2321637206497569, "grad_norm": 1.6997013770455123, "learning_rate": 9.477755142675125e-06, "loss": 0.5629, "step": 6160 }, { "epoch": 0.23254060980665586, "grad_norm": 3.5623010997302944, "learning_rate": 9.474824262605386e-06, "loss": 0.5488, "step": 6170 }, { "epoch": 0.23291749896355482, "grad_norm": 1.7466850826897597, "learning_rate": 9.47188563714377e-06, "loss": 0.534, "step": 6180 }, { "epoch": 0.23329438812045378, "grad_norm": 1.6695888230856442, "learning_rate": 9.468939271376688e-06, "loss": 0.5062, "step": 6190 }, { "epoch": 0.23367127727735273, "grad_norm": 1.835339155823516, "learning_rate": 9.46598517040395e-06, "loss": 0.5274, "step": 6200 }, { "epoch": 0.2340481664342517, "grad_norm": 1.6711258470843053, "learning_rate": 9.463023339338758e-06, "loss": 0.5395, "step": 6210 }, { "epoch": 0.23442505559115065, "grad_norm": 1.7881856636235314, "learning_rate": 9.46005378330769e-06, "loss": 0.546, "step": 6220 }, { "epoch": 0.2348019447480496, "grad_norm": 1.6790671962686543, "learning_rate": 9.457076507450697e-06, "loss": 0.5061, "step": 6230 }, { "epoch": 0.23517883390494856, "grad_norm": 1.3732357055205477, "learning_rate": 9.45409151692109e-06, "loss": 0.521, "step": 6240 }, { "epoch": 0.23555572306184752, "grad_norm": 1.7725150216025622, "learning_rate": 9.451098816885538e-06, "loss": 0.5429, "step": 6250 }, { "epoch": 0.23593261221874648, "grad_norm": 1.714678055331704, "learning_rate": 9.448098412524047e-06, "loss": 0.5266, "step": 6260 }, { "epoch": 0.2363095013756454, "grad_norm": 1.8750459001067499, "learning_rate": 9.445090309029965e-06, "loss": 0.5624, "step": 6270 }, { "epoch": 0.23668639053254437, "grad_norm": 1.7380553727353028, "learning_rate": 9.442074511609965e-06, "loss": 0.5223, "step": 6280 }, { "epoch": 0.23706327968944332, "grad_norm": 1.6267385236512102, "learning_rate": 9.439051025484032e-06, "loss": 0.5341, "step": 6290 }, { "epoch": 0.23744016884634228, "grad_norm": 1.831598701186007, "learning_rate": 9.436019855885466e-06, "loss": 0.5406, "step": 6300 }, { "epoch": 0.23781705800324124, "grad_norm": 2.6302827853025486, "learning_rate": 9.432981008060861e-06, "loss": 0.5331, "step": 6310 }, { "epoch": 0.2381939471601402, "grad_norm": 1.2440710954030016, "learning_rate": 9.429934487270105e-06, "loss": 0.5129, "step": 6320 }, { "epoch": 0.23857083631703915, "grad_norm": 1.637294830817524, "learning_rate": 9.426880298786366e-06, "loss": 0.539, "step": 6330 }, { "epoch": 0.2389477254739381, "grad_norm": 1.446625909816762, "learning_rate": 9.423818447896081e-06, "loss": 0.505, "step": 6340 }, { "epoch": 0.23932461463083707, "grad_norm": 2.0064803572085936, "learning_rate": 9.420748939898955e-06, "loss": 0.5741, "step": 6350 }, { "epoch": 0.23970150378773603, "grad_norm": 1.9422133263523575, "learning_rate": 9.417671780107941e-06, "loss": 0.5398, "step": 6360 }, { "epoch": 0.24007839294463498, "grad_norm": 1.7232411604328788, "learning_rate": 9.414586973849241e-06, "loss": 0.5293, "step": 6370 }, { "epoch": 0.24045528210153394, "grad_norm": 1.6201469087371054, "learning_rate": 9.411494526462286e-06, "loss": 0.5269, "step": 6380 }, { "epoch": 0.2408321712584329, "grad_norm": 1.70158816865489, "learning_rate": 9.408394443299743e-06, "loss": 0.5716, "step": 6390 }, { "epoch": 0.24120906041533186, "grad_norm": 1.837108473006738, "learning_rate": 9.405286729727483e-06, "loss": 0.5208, "step": 6400 }, { "epoch": 0.2415859495722308, "grad_norm": 1.7073086270634739, "learning_rate": 9.402171391124597e-06, "loss": 0.5428, "step": 6410 }, { "epoch": 0.24196283872912977, "grad_norm": 3.051981097028862, "learning_rate": 9.399048432883363e-06, "loss": 0.5158, "step": 6420 }, { "epoch": 0.24233972788602873, "grad_norm": 1.7864500392491454, "learning_rate": 9.395917860409255e-06, "loss": 0.5241, "step": 6430 }, { "epoch": 0.24271661704292768, "grad_norm": 1.5968823208409386, "learning_rate": 9.392779679120924e-06, "loss": 0.5214, "step": 6440 }, { "epoch": 0.24309350619982664, "grad_norm": 1.5815922668690165, "learning_rate": 9.38963389445019e-06, "loss": 0.5503, "step": 6450 }, { "epoch": 0.2434703953567256, "grad_norm": 1.6823529848515881, "learning_rate": 9.386480511842035e-06, "loss": 0.5109, "step": 6460 }, { "epoch": 0.24384728451362453, "grad_norm": 1.7524818808137763, "learning_rate": 9.38331953675459e-06, "loss": 0.5373, "step": 6470 }, { "epoch": 0.2442241736705235, "grad_norm": 1.5806519372358918, "learning_rate": 9.380150974659132e-06, "loss": 0.5259, "step": 6480 }, { "epoch": 0.24460106282742244, "grad_norm": 1.7916689002740533, "learning_rate": 9.376974831040066e-06, "loss": 0.5658, "step": 6490 }, { "epoch": 0.2449779519843214, "grad_norm": 1.7760513276744359, "learning_rate": 9.373791111394921e-06, "loss": 0.5571, "step": 6500 }, { "epoch": 0.24535484114122036, "grad_norm": 1.531656356531642, "learning_rate": 9.37059982123434e-06, "loss": 0.5324, "step": 6510 }, { "epoch": 0.24573173029811932, "grad_norm": 1.7434004811232835, "learning_rate": 9.367400966082067e-06, "loss": 0.5402, "step": 6520 }, { "epoch": 0.24610861945501827, "grad_norm": 1.9091316224501476, "learning_rate": 9.364194551474947e-06, "loss": 0.5314, "step": 6530 }, { "epoch": 0.24648550861191723, "grad_norm": 1.4637280981673166, "learning_rate": 9.3609805829629e-06, "loss": 0.5214, "step": 6540 }, { "epoch": 0.2468623977688162, "grad_norm": 1.6182959856878305, "learning_rate": 9.357759066108928e-06, "loss": 0.5547, "step": 6550 }, { "epoch": 0.24723928692571515, "grad_norm": 1.9292790559109196, "learning_rate": 9.354530006489093e-06, "loss": 0.5423, "step": 6560 }, { "epoch": 0.2476161760826141, "grad_norm": 2.1285426080176415, "learning_rate": 9.351293409692519e-06, "loss": 0.5643, "step": 6570 }, { "epoch": 0.24799306523951306, "grad_norm": 1.8555701825811315, "learning_rate": 9.34804928132137e-06, "loss": 0.484, "step": 6580 }, { "epoch": 0.24836995439641202, "grad_norm": 1.933502698125089, "learning_rate": 9.344797626990851e-06, "loss": 0.532, "step": 6590 }, { "epoch": 0.24874684355331098, "grad_norm": 1.719405207009378, "learning_rate": 9.341538452329191e-06, "loss": 0.5411, "step": 6600 }, { "epoch": 0.24912373271020993, "grad_norm": 1.6862541493581185, "learning_rate": 9.338271762977633e-06, "loss": 0.5173, "step": 6610 }, { "epoch": 0.2495006218671089, "grad_norm": 1.5377963927275895, "learning_rate": 9.334997564590434e-06, "loss": 0.5156, "step": 6620 }, { "epoch": 0.24987751102400785, "grad_norm": 1.677774421001133, "learning_rate": 9.331715862834842e-06, "loss": 0.5083, "step": 6630 }, { "epoch": 0.2502544001809068, "grad_norm": 1.5490491612940884, "learning_rate": 9.328426663391096e-06, "loss": 0.5145, "step": 6640 }, { "epoch": 0.25063128933780576, "grad_norm": 1.5642521485987577, "learning_rate": 9.325129971952412e-06, "loss": 0.5261, "step": 6650 }, { "epoch": 0.2510081784947047, "grad_norm": 1.6706919162023979, "learning_rate": 9.32182579422497e-06, "loss": 0.529, "step": 6660 }, { "epoch": 0.2513850676516037, "grad_norm": 1.5818205221721195, "learning_rate": 9.318514135927916e-06, "loss": 0.5369, "step": 6670 }, { "epoch": 0.2517619568085026, "grad_norm": 2.0562318160924495, "learning_rate": 9.315195002793335e-06, "loss": 0.5234, "step": 6680 }, { "epoch": 0.2521388459654016, "grad_norm": 1.67745815477172, "learning_rate": 9.311868400566255e-06, "loss": 0.5211, "step": 6690 }, { "epoch": 0.2525157351223005, "grad_norm": 1.6591773189380454, "learning_rate": 9.308534335004633e-06, "loss": 0.5539, "step": 6700 }, { "epoch": 0.2528926242791995, "grad_norm": 1.4697051603169902, "learning_rate": 9.305192811879342e-06, "loss": 0.5181, "step": 6710 }, { "epoch": 0.25326951343609844, "grad_norm": 1.6681508210359333, "learning_rate": 9.301843836974162e-06, "loss": 0.5602, "step": 6720 }, { "epoch": 0.2536464025929974, "grad_norm": 2.3108255647817724, "learning_rate": 9.298487416085774e-06, "loss": 0.5067, "step": 6730 }, { "epoch": 0.25402329174989635, "grad_norm": 1.7726965314764336, "learning_rate": 9.295123555023746e-06, "loss": 0.5397, "step": 6740 }, { "epoch": 0.25440018090679534, "grad_norm": 1.6529476204478943, "learning_rate": 9.291752259610521e-06, "loss": 0.5437, "step": 6750 }, { "epoch": 0.25477707006369427, "grad_norm": 1.623414078675652, "learning_rate": 9.288373535681417e-06, "loss": 0.5206, "step": 6760 }, { "epoch": 0.2551539592205932, "grad_norm": 1.4719649974173148, "learning_rate": 9.284987389084602e-06, "loss": 0.569, "step": 6770 }, { "epoch": 0.2555308483774922, "grad_norm": 1.625628891396421, "learning_rate": 9.281593825681102e-06, "loss": 0.5398, "step": 6780 }, { "epoch": 0.2559077375343911, "grad_norm": 1.792916883420293, "learning_rate": 9.278192851344765e-06, "loss": 0.5551, "step": 6790 }, { "epoch": 0.2562846266912901, "grad_norm": 2.156619467272627, "learning_rate": 9.274784471962283e-06, "loss": 0.4974, "step": 6800 }, { "epoch": 0.256661515848189, "grad_norm": 1.6687247274318509, "learning_rate": 9.271368693433153e-06, "loss": 0.5422, "step": 6810 }, { "epoch": 0.257038405005088, "grad_norm": 1.7150335026062424, "learning_rate": 9.267945521669687e-06, "loss": 0.5179, "step": 6820 }, { "epoch": 0.25741529416198694, "grad_norm": 1.5067882971197635, "learning_rate": 9.264514962596989e-06, "loss": 0.5207, "step": 6830 }, { "epoch": 0.2577921833188859, "grad_norm": 2.1289911856188657, "learning_rate": 9.261077022152953e-06, "loss": 0.5437, "step": 6840 }, { "epoch": 0.25816907247578486, "grad_norm": 1.9539761400644868, "learning_rate": 9.257631706288246e-06, "loss": 0.5517, "step": 6850 }, { "epoch": 0.25854596163268384, "grad_norm": 1.9497689876502742, "learning_rate": 9.254179020966303e-06, "loss": 0.5288, "step": 6860 }, { "epoch": 0.25892285078958277, "grad_norm": 1.5902800114741822, "learning_rate": 9.250718972163312e-06, "loss": 0.5244, "step": 6870 }, { "epoch": 0.25929973994648176, "grad_norm": 1.7856883546213593, "learning_rate": 9.247251565868214e-06, "loss": 0.5054, "step": 6880 }, { "epoch": 0.2596766291033807, "grad_norm": 1.5741551964446796, "learning_rate": 9.243776808082675e-06, "loss": 0.5371, "step": 6890 }, { "epoch": 0.26005351826027967, "grad_norm": 1.648666621240082, "learning_rate": 9.240294704821091e-06, "loss": 0.5355, "step": 6900 }, { "epoch": 0.2604304074171786, "grad_norm": 1.9842160004462486, "learning_rate": 9.236805262110571e-06, "loss": 0.531, "step": 6910 }, { "epoch": 0.2608072965740776, "grad_norm": 1.6690746816867539, "learning_rate": 9.233308485990929e-06, "loss": 0.5199, "step": 6920 }, { "epoch": 0.2611841857309765, "grad_norm": 2.000871754392917, "learning_rate": 9.229804382514668e-06, "loss": 0.5287, "step": 6930 }, { "epoch": 0.2615610748878755, "grad_norm": 1.6694534259040406, "learning_rate": 9.226292957746982e-06, "loss": 0.5063, "step": 6940 }, { "epoch": 0.26193796404477443, "grad_norm": 1.7676982056441022, "learning_rate": 9.222774217765728e-06, "loss": 0.542, "step": 6950 }, { "epoch": 0.2623148532016734, "grad_norm": 1.4980337618111739, "learning_rate": 9.21924816866143e-06, "loss": 0.5138, "step": 6960 }, { "epoch": 0.26269174235857234, "grad_norm": 1.5116056583689286, "learning_rate": 9.215714816537265e-06, "loss": 0.5265, "step": 6970 }, { "epoch": 0.2630686315154713, "grad_norm": 1.4865966539266282, "learning_rate": 9.212174167509044e-06, "loss": 0.5403, "step": 6980 }, { "epoch": 0.26344552067237026, "grad_norm": 1.6837642502881358, "learning_rate": 9.208626227705212e-06, "loss": 0.5434, "step": 6990 }, { "epoch": 0.2638224098292692, "grad_norm": 1.7206211694739486, "learning_rate": 9.205071003266838e-06, "loss": 0.5293, "step": 7000 }, { "epoch": 0.2641992989861682, "grad_norm": 1.5568568822181916, "learning_rate": 9.201508500347592e-06, "loss": 0.539, "step": 7010 }, { "epoch": 0.2645761881430671, "grad_norm": 2.515135490034705, "learning_rate": 9.197938725113745e-06, "loss": 0.5273, "step": 7020 }, { "epoch": 0.2649530772999661, "grad_norm": 1.7780698740100387, "learning_rate": 9.194361683744156e-06, "loss": 0.5249, "step": 7030 }, { "epoch": 0.265329966456865, "grad_norm": 1.9741428022356151, "learning_rate": 9.190777382430262e-06, "loss": 0.5067, "step": 7040 }, { "epoch": 0.265706855613764, "grad_norm": 1.5854337997969816, "learning_rate": 9.187185827376065e-06, "loss": 0.5162, "step": 7050 }, { "epoch": 0.26608374477066293, "grad_norm": 1.8032750599953304, "learning_rate": 9.183587024798122e-06, "loss": 0.5337, "step": 7060 }, { "epoch": 0.2664606339275619, "grad_norm": 1.6853602175583824, "learning_rate": 9.179980980925533e-06, "loss": 0.5619, "step": 7070 }, { "epoch": 0.26683752308446085, "grad_norm": 1.668776536624248, "learning_rate": 9.176367701999936e-06, "loss": 0.5306, "step": 7080 }, { "epoch": 0.26721441224135983, "grad_norm": 1.8011152218189725, "learning_rate": 9.172747194275492e-06, "loss": 0.5421, "step": 7090 }, { "epoch": 0.26759130139825876, "grad_norm": 1.8708330614213387, "learning_rate": 9.169119464018865e-06, "loss": 0.5326, "step": 7100 }, { "epoch": 0.26796819055515775, "grad_norm": 1.6632424343194352, "learning_rate": 9.165484517509231e-06, "loss": 0.5538, "step": 7110 }, { "epoch": 0.2683450797120567, "grad_norm": 1.467226646436778, "learning_rate": 9.161842361038255e-06, "loss": 0.5068, "step": 7120 }, { "epoch": 0.26872196886895566, "grad_norm": 1.6200121333407522, "learning_rate": 9.158193000910078e-06, "loss": 0.5388, "step": 7130 }, { "epoch": 0.2690988580258546, "grad_norm": 1.7014337626263312, "learning_rate": 9.15453644344131e-06, "loss": 0.5293, "step": 7140 }, { "epoch": 0.2694757471827536, "grad_norm": 1.7595131636999561, "learning_rate": 9.15087269496102e-06, "loss": 0.5448, "step": 7150 }, { "epoch": 0.2698526363396525, "grad_norm": 1.8761488565801718, "learning_rate": 9.147201761810722e-06, "loss": 0.53, "step": 7160 }, { "epoch": 0.27022952549655144, "grad_norm": 1.5345557406874837, "learning_rate": 9.143523650344373e-06, "loss": 0.5328, "step": 7170 }, { "epoch": 0.2706064146534504, "grad_norm": 1.6268714379432996, "learning_rate": 9.139838366928341e-06, "loss": 0.5676, "step": 7180 }, { "epoch": 0.27098330381034935, "grad_norm": 1.5406506948214973, "learning_rate": 9.136145917941423e-06, "loss": 0.5199, "step": 7190 }, { "epoch": 0.27136019296724834, "grad_norm": 1.6991584106421929, "learning_rate": 9.13244630977481e-06, "loss": 0.5249, "step": 7200 }, { "epoch": 0.27173708212414727, "grad_norm": 1.778739550927463, "learning_rate": 9.128739548832084e-06, "loss": 0.5493, "step": 7210 }, { "epoch": 0.27211397128104625, "grad_norm": 1.5672069369770456, "learning_rate": 9.125025641529212e-06, "loss": 0.548, "step": 7220 }, { "epoch": 0.2724908604379452, "grad_norm": 1.849271034046483, "learning_rate": 9.121304594294526e-06, "loss": 0.5069, "step": 7230 }, { "epoch": 0.27286774959484417, "grad_norm": 1.6198818202816212, "learning_rate": 9.117576413568726e-06, "loss": 0.5491, "step": 7240 }, { "epoch": 0.2732446387517431, "grad_norm": 1.7505219289954272, "learning_rate": 9.113841105804843e-06, "loss": 0.5331, "step": 7250 }, { "epoch": 0.2736215279086421, "grad_norm": 1.5456684152965627, "learning_rate": 9.110098677468258e-06, "loss": 0.5304, "step": 7260 }, { "epoch": 0.273998417065541, "grad_norm": 1.4875862331167544, "learning_rate": 9.106349135036673e-06, "loss": 0.5268, "step": 7270 }, { "epoch": 0.27437530622244, "grad_norm": 1.7648342778101611, "learning_rate": 9.102592485000101e-06, "loss": 0.541, "step": 7280 }, { "epoch": 0.2747521953793389, "grad_norm": 1.585590877192049, "learning_rate": 9.09882873386086e-06, "loss": 0.5159, "step": 7290 }, { "epoch": 0.2751290845362379, "grad_norm": 1.8789850010967337, "learning_rate": 9.095057888133557e-06, "loss": 0.526, "step": 7300 }, { "epoch": 0.27550597369313684, "grad_norm": 1.6350234581153913, "learning_rate": 9.09127995434508e-06, "loss": 0.5094, "step": 7310 }, { "epoch": 0.2758828628500358, "grad_norm": 1.7747303027226409, "learning_rate": 9.087494939034589e-06, "loss": 0.5223, "step": 7320 }, { "epoch": 0.27625975200693476, "grad_norm": 1.6705228626037893, "learning_rate": 9.083702848753496e-06, "loss": 0.5115, "step": 7330 }, { "epoch": 0.27663664116383374, "grad_norm": 1.7334155324512346, "learning_rate": 9.079903690065461e-06, "loss": 0.536, "step": 7340 }, { "epoch": 0.27701353032073267, "grad_norm": 1.7152387686021637, "learning_rate": 9.076097469546378e-06, "loss": 0.5174, "step": 7350 }, { "epoch": 0.27739041947763166, "grad_norm": 2.7121339144898267, "learning_rate": 9.072284193784366e-06, "loss": 0.5278, "step": 7360 }, { "epoch": 0.2777673086345306, "grad_norm": 1.4228173967742475, "learning_rate": 9.068463869379755e-06, "loss": 0.5315, "step": 7370 }, { "epoch": 0.2781441977914295, "grad_norm": 1.5850322389947202, "learning_rate": 9.064636502945074e-06, "loss": 0.5437, "step": 7380 }, { "epoch": 0.2785210869483285, "grad_norm": 1.462469278604766, "learning_rate": 9.060802101105041e-06, "loss": 0.5043, "step": 7390 }, { "epoch": 0.27889797610522743, "grad_norm": 1.991811474599226, "learning_rate": 9.056960670496555e-06, "loss": 0.5347, "step": 7400 }, { "epoch": 0.2792748652621264, "grad_norm": 1.5481573287316113, "learning_rate": 9.053112217768675e-06, "loss": 0.5317, "step": 7410 }, { "epoch": 0.27965175441902534, "grad_norm": 1.9124383091203907, "learning_rate": 9.049256749582621e-06, "loss": 0.5363, "step": 7420 }, { "epoch": 0.28002864357592433, "grad_norm": 1.8423235171267427, "learning_rate": 9.045394272611752e-06, "loss": 0.573, "step": 7430 }, { "epoch": 0.28040553273282326, "grad_norm": 1.8356986741564718, "learning_rate": 9.041524793541557e-06, "loss": 0.527, "step": 7440 }, { "epoch": 0.28078242188972224, "grad_norm": 1.709517929026637, "learning_rate": 9.037648319069648e-06, "loss": 0.5325, "step": 7450 }, { "epoch": 0.2811593110466212, "grad_norm": 1.6511934273562192, "learning_rate": 9.033764855905746e-06, "loss": 0.5107, "step": 7460 }, { "epoch": 0.28153620020352016, "grad_norm": 1.7382038263988389, "learning_rate": 9.029874410771664e-06, "loss": 0.5501, "step": 7470 }, { "epoch": 0.2819130893604191, "grad_norm": 1.5833394804264111, "learning_rate": 9.025976990401304e-06, "loss": 0.5124, "step": 7480 }, { "epoch": 0.2822899785173181, "grad_norm": 1.355478094878015, "learning_rate": 9.022072601540642e-06, "loss": 0.5425, "step": 7490 }, { "epoch": 0.282666867674217, "grad_norm": 1.6618586288433348, "learning_rate": 9.018161250947708e-06, "loss": 0.5291, "step": 7500 }, { "epoch": 0.283043756831116, "grad_norm": 1.273053488731193, "learning_rate": 9.014242945392592e-06, "loss": 0.4874, "step": 7510 }, { "epoch": 0.2834206459880149, "grad_norm": 2.0289155247042383, "learning_rate": 9.010317691657417e-06, "loss": 0.4941, "step": 7520 }, { "epoch": 0.2837975351449139, "grad_norm": 1.4934609822867562, "learning_rate": 9.006385496536334e-06, "loss": 0.5214, "step": 7530 }, { "epoch": 0.28417442430181283, "grad_norm": 1.6263199918676976, "learning_rate": 9.002446366835507e-06, "loss": 0.5493, "step": 7540 }, { "epoch": 0.2845513134587118, "grad_norm": 1.5893668854653922, "learning_rate": 8.998500309373104e-06, "loss": 0.4892, "step": 7550 }, { "epoch": 0.28492820261561075, "grad_norm": 1.41806671358887, "learning_rate": 8.994547330979281e-06, "loss": 0.524, "step": 7560 }, { "epoch": 0.2853050917725097, "grad_norm": 1.5553788303244762, "learning_rate": 8.990587438496183e-06, "loss": 0.5221, "step": 7570 }, { "epoch": 0.28568198092940866, "grad_norm": 1.418345991264022, "learning_rate": 8.986620638777911e-06, "loss": 0.4997, "step": 7580 }, { "epoch": 0.2860588700863076, "grad_norm": 1.508355836892063, "learning_rate": 8.982646938690527e-06, "loss": 0.5395, "step": 7590 }, { "epoch": 0.2864357592432066, "grad_norm": 1.6201465695031843, "learning_rate": 8.978666345112037e-06, "loss": 0.5507, "step": 7600 }, { "epoch": 0.2868126484001055, "grad_norm": 1.57811554793437, "learning_rate": 8.974678864932379e-06, "loss": 0.5004, "step": 7610 }, { "epoch": 0.2871895375570045, "grad_norm": 1.5741977951369208, "learning_rate": 8.970684505053407e-06, "loss": 0.5116, "step": 7620 }, { "epoch": 0.2875664267139034, "grad_norm": 1.676860663266555, "learning_rate": 8.96668327238889e-06, "loss": 0.5461, "step": 7630 }, { "epoch": 0.2879433158708024, "grad_norm": 1.4656181144812006, "learning_rate": 8.962675173864483e-06, "loss": 0.513, "step": 7640 }, { "epoch": 0.28832020502770134, "grad_norm": 1.7396852241340373, "learning_rate": 8.958660216417735e-06, "loss": 0.5341, "step": 7650 }, { "epoch": 0.2886970941846003, "grad_norm": 1.4829830776509865, "learning_rate": 8.954638406998062e-06, "loss": 0.5364, "step": 7660 }, { "epoch": 0.28907398334149925, "grad_norm": 2.003917803432353, "learning_rate": 8.95060975256674e-06, "loss": 0.5107, "step": 7670 }, { "epoch": 0.28945087249839824, "grad_norm": 1.5880349728736471, "learning_rate": 8.946574260096897e-06, "loss": 0.5199, "step": 7680 }, { "epoch": 0.28982776165529717, "grad_norm": 6.4085223096319295, "learning_rate": 8.942531936573487e-06, "loss": 0.5336, "step": 7690 }, { "epoch": 0.29020465081219615, "grad_norm": 1.700617891186804, "learning_rate": 8.9384827889933e-06, "loss": 0.524, "step": 7700 }, { "epoch": 0.2905815399690951, "grad_norm": 1.9072803846307507, "learning_rate": 8.934426824364931e-06, "loss": 0.4835, "step": 7710 }, { "epoch": 0.29095842912599407, "grad_norm": 1.455274287063036, "learning_rate": 8.93036404970877e-06, "loss": 0.5229, "step": 7720 }, { "epoch": 0.291335318282893, "grad_norm": 1.8160201738390154, "learning_rate": 8.926294472057006e-06, "loss": 0.5123, "step": 7730 }, { "epoch": 0.291712207439792, "grad_norm": 1.865480030478559, "learning_rate": 8.922218098453596e-06, "loss": 0.5363, "step": 7740 }, { "epoch": 0.2920890965966909, "grad_norm": 1.5712881260410958, "learning_rate": 8.91813493595426e-06, "loss": 0.5267, "step": 7750 }, { "epoch": 0.2924659857535899, "grad_norm": 1.6468565924570966, "learning_rate": 8.914044991626467e-06, "loss": 0.5313, "step": 7760 }, { "epoch": 0.2928428749104888, "grad_norm": 1.82716894869601, "learning_rate": 8.90994827254943e-06, "loss": 0.5278, "step": 7770 }, { "epoch": 0.29321976406738776, "grad_norm": 1.5036064580532265, "learning_rate": 8.905844785814086e-06, "loss": 0.5086, "step": 7780 }, { "epoch": 0.29359665322428674, "grad_norm": 1.5144138972292562, "learning_rate": 8.901734538523083e-06, "loss": 0.5081, "step": 7790 }, { "epoch": 0.29397354238118567, "grad_norm": 1.7463912063578895, "learning_rate": 8.897617537790775e-06, "loss": 0.5414, "step": 7800 }, { "epoch": 0.29435043153808466, "grad_norm": 1.4528500852259172, "learning_rate": 8.893493790743205e-06, "loss": 0.5077, "step": 7810 }, { "epoch": 0.2947273206949836, "grad_norm": 1.6942113196018895, "learning_rate": 8.889363304518088e-06, "loss": 0.5399, "step": 7820 }, { "epoch": 0.29510420985188257, "grad_norm": 1.5104690108892953, "learning_rate": 8.88522608626481e-06, "loss": 0.5459, "step": 7830 }, { "epoch": 0.2954810990087815, "grad_norm": 1.8645243846646347, "learning_rate": 8.881082143144405e-06, "loss": 0.5134, "step": 7840 }, { "epoch": 0.2958579881656805, "grad_norm": 1.6728891933665688, "learning_rate": 8.876931482329554e-06, "loss": 0.5177, "step": 7850 }, { "epoch": 0.2962348773225794, "grad_norm": 1.2067983902923343, "learning_rate": 8.872774111004553e-06, "loss": 0.5047, "step": 7860 }, { "epoch": 0.2966117664794784, "grad_norm": 1.5099148600629388, "learning_rate": 8.868610036365324e-06, "loss": 0.4986, "step": 7870 }, { "epoch": 0.29698865563637733, "grad_norm": 2.34928629080023, "learning_rate": 8.86443926561939e-06, "loss": 0.5366, "step": 7880 }, { "epoch": 0.2973655447932763, "grad_norm": 1.661924846466901, "learning_rate": 8.860261805985857e-06, "loss": 0.5272, "step": 7890 }, { "epoch": 0.29774243395017524, "grad_norm": 1.6500446518789744, "learning_rate": 8.856077664695418e-06, "loss": 0.5247, "step": 7900 }, { "epoch": 0.29811932310707423, "grad_norm": 1.6550912001046552, "learning_rate": 8.851886848990326e-06, "loss": 0.5073, "step": 7910 }, { "epoch": 0.29849621226397316, "grad_norm": 2.4385332990829394, "learning_rate": 8.847689366124387e-06, "loss": 0.5224, "step": 7920 }, { "epoch": 0.29887310142087214, "grad_norm": 1.7454181558614066, "learning_rate": 8.843485223362947e-06, "loss": 0.5404, "step": 7930 }, { "epoch": 0.2992499905777711, "grad_norm": 1.6197152816080147, "learning_rate": 8.839274427982883e-06, "loss": 0.5249, "step": 7940 }, { "epoch": 0.29962687973467006, "grad_norm": 1.6733433336543917, "learning_rate": 8.835056987272581e-06, "loss": 0.5369, "step": 7950 }, { "epoch": 0.300003768891569, "grad_norm": 1.6407791190885486, "learning_rate": 8.830832908531935e-06, "loss": 0.5036, "step": 7960 }, { "epoch": 0.3003806580484679, "grad_norm": 1.6837408609676576, "learning_rate": 8.826602199072323e-06, "loss": 0.5304, "step": 7970 }, { "epoch": 0.3007575472053669, "grad_norm": 1.5824176969906407, "learning_rate": 8.822364866216606e-06, "loss": 0.4872, "step": 7980 }, { "epoch": 0.30113443636226583, "grad_norm": 1.4977384439375079, "learning_rate": 8.818120917299105e-06, "loss": 0.4893, "step": 7990 }, { "epoch": 0.3015113255191648, "grad_norm": 1.9173818601725474, "learning_rate": 8.813870359665594e-06, "loss": 0.515, "step": 8000 }, { "epoch": 0.30188821467606375, "grad_norm": 1.224532027864391, "learning_rate": 8.809613200673284e-06, "loss": 0.4968, "step": 8010 }, { "epoch": 0.30226510383296273, "grad_norm": 1.507185992220378, "learning_rate": 8.805349447690819e-06, "loss": 0.5283, "step": 8020 }, { "epoch": 0.30264199298986166, "grad_norm": 1.5501471489535523, "learning_rate": 8.801079108098247e-06, "loss": 0.5297, "step": 8030 }, { "epoch": 0.30301888214676065, "grad_norm": 1.6682468431261555, "learning_rate": 8.796802189287021e-06, "loss": 0.5169, "step": 8040 }, { "epoch": 0.3033957713036596, "grad_norm": 1.5793954538427635, "learning_rate": 8.792518698659985e-06, "loss": 0.514, "step": 8050 }, { "epoch": 0.30377266046055856, "grad_norm": 1.5869189857437231, "learning_rate": 8.788228643631353e-06, "loss": 0.5699, "step": 8060 }, { "epoch": 0.3041495496174575, "grad_norm": 1.5017790526890054, "learning_rate": 8.783932031626702e-06, "loss": 0.5333, "step": 8070 }, { "epoch": 0.3045264387743565, "grad_norm": 1.7418008438314914, "learning_rate": 8.779628870082963e-06, "loss": 0.5681, "step": 8080 }, { "epoch": 0.3049033279312554, "grad_norm": 1.5384555545934482, "learning_rate": 8.775319166448397e-06, "loss": 0.4892, "step": 8090 }, { "epoch": 0.3052802170881544, "grad_norm": 1.7242800734457064, "learning_rate": 8.771002928182593e-06, "loss": 0.5323, "step": 8100 }, { "epoch": 0.3056571062450533, "grad_norm": 1.5545232251003573, "learning_rate": 8.76668016275645e-06, "loss": 0.5347, "step": 8110 }, { "epoch": 0.3060339954019523, "grad_norm": 1.5864191351139438, "learning_rate": 8.762350877652161e-06, "loss": 0.5133, "step": 8120 }, { "epoch": 0.30641088455885124, "grad_norm": 1.536374345634808, "learning_rate": 8.758015080363209e-06, "loss": 0.5429, "step": 8130 }, { "epoch": 0.3067877737157502, "grad_norm": 1.3739374982043573, "learning_rate": 8.753672778394348e-06, "loss": 0.5071, "step": 8140 }, { "epoch": 0.30716466287264915, "grad_norm": 1.9356831887412023, "learning_rate": 8.749323979261586e-06, "loss": 0.5598, "step": 8150 }, { "epoch": 0.30754155202954814, "grad_norm": 1.8841720246001954, "learning_rate": 8.744968690492183e-06, "loss": 0.5251, "step": 8160 }, { "epoch": 0.30791844118644707, "grad_norm": 1.4483138834088356, "learning_rate": 8.740606919624628e-06, "loss": 0.542, "step": 8170 }, { "epoch": 0.308295330343346, "grad_norm": 1.739676604765352, "learning_rate": 8.73623867420863e-06, "loss": 0.5161, "step": 8180 }, { "epoch": 0.308672219500245, "grad_norm": 1.4496329719941419, "learning_rate": 8.731863961805108e-06, "loss": 0.5142, "step": 8190 }, { "epoch": 0.3090491086571439, "grad_norm": 1.586205834775067, "learning_rate": 8.727482789986167e-06, "loss": 0.5105, "step": 8200 }, { "epoch": 0.3094259978140429, "grad_norm": 1.5822661719201925, "learning_rate": 8.723095166335105e-06, "loss": 0.4927, "step": 8210 }, { "epoch": 0.3098028869709418, "grad_norm": 1.4744825299579403, "learning_rate": 8.718701098446373e-06, "loss": 0.5154, "step": 8220 }, { "epoch": 0.3101797761278408, "grad_norm": 1.6760657484578674, "learning_rate": 8.714300593925588e-06, "loss": 0.5443, "step": 8230 }, { "epoch": 0.31055666528473974, "grad_norm": 1.6507496475712244, "learning_rate": 8.709893660389502e-06, "loss": 0.5103, "step": 8240 }, { "epoch": 0.3109335544416387, "grad_norm": 1.6267514125886644, "learning_rate": 8.705480305465993e-06, "loss": 0.5204, "step": 8250 }, { "epoch": 0.31131044359853766, "grad_norm": 1.7139044318938548, "learning_rate": 8.701060536794062e-06, "loss": 0.5082, "step": 8260 }, { "epoch": 0.31168733275543664, "grad_norm": 1.5364667287673175, "learning_rate": 8.6966343620238e-06, "loss": 0.5124, "step": 8270 }, { "epoch": 0.31206422191233557, "grad_norm": 1.5973367114004897, "learning_rate": 8.692201788816397e-06, "loss": 0.5401, "step": 8280 }, { "epoch": 0.31244111106923456, "grad_norm": 1.6171012597396603, "learning_rate": 8.687762824844112e-06, "loss": 0.5523, "step": 8290 }, { "epoch": 0.3128180002261335, "grad_norm": 1.6798865965319494, "learning_rate": 8.683317477790267e-06, "loss": 0.5566, "step": 8300 }, { "epoch": 0.31319488938303247, "grad_norm": 1.3965855622429122, "learning_rate": 8.678865755349232e-06, "loss": 0.5215, "step": 8310 }, { "epoch": 0.3135717785399314, "grad_norm": 1.478475036458125, "learning_rate": 8.674407665226412e-06, "loss": 0.548, "step": 8320 }, { "epoch": 0.3139486676968304, "grad_norm": 1.6537961782542907, "learning_rate": 8.669943215138236e-06, "loss": 0.5155, "step": 8330 }, { "epoch": 0.3143255568537293, "grad_norm": 1.5083041356086178, "learning_rate": 8.665472412812137e-06, "loss": 0.5218, "step": 8340 }, { "epoch": 0.3147024460106283, "grad_norm": 1.7094855096711425, "learning_rate": 8.660995265986547e-06, "loss": 0.4969, "step": 8350 }, { "epoch": 0.31507933516752723, "grad_norm": 1.5448154600516473, "learning_rate": 8.656511782410877e-06, "loss": 0.5228, "step": 8360 }, { "epoch": 0.31545622432442616, "grad_norm": 1.8392297958376145, "learning_rate": 8.652021969845508e-06, "loss": 0.5122, "step": 8370 }, { "epoch": 0.31583311348132515, "grad_norm": 1.5057677282206323, "learning_rate": 8.647525836061773e-06, "loss": 0.5232, "step": 8380 }, { "epoch": 0.3162100026382241, "grad_norm": 1.3387979618187744, "learning_rate": 8.643023388841951e-06, "loss": 0.4788, "step": 8390 }, { "epoch": 0.31658689179512306, "grad_norm": 1.7299810103354585, "learning_rate": 8.638514635979242e-06, "loss": 0.5271, "step": 8400 }, { "epoch": 0.316963780952022, "grad_norm": 1.5877959379659248, "learning_rate": 8.633999585277769e-06, "loss": 0.5492, "step": 8410 }, { "epoch": 0.317340670108921, "grad_norm": 1.5926803639365381, "learning_rate": 8.629478244552548e-06, "loss": 0.4928, "step": 8420 }, { "epoch": 0.3177175592658199, "grad_norm": 1.4153581295672915, "learning_rate": 8.624950621629487e-06, "loss": 0.5204, "step": 8430 }, { "epoch": 0.3180944484227189, "grad_norm": 1.515638801797706, "learning_rate": 8.620416724345365e-06, "loss": 0.5266, "step": 8440 }, { "epoch": 0.3184713375796178, "grad_norm": 1.7525263151023882, "learning_rate": 8.615876560547822e-06, "loss": 0.524, "step": 8450 }, { "epoch": 0.3188482267365168, "grad_norm": 1.5277030246398249, "learning_rate": 8.611330138095344e-06, "loss": 0.5071, "step": 8460 }, { "epoch": 0.31922511589341573, "grad_norm": 1.692436940543452, "learning_rate": 8.606777464857254e-06, "loss": 0.518, "step": 8470 }, { "epoch": 0.3196020050503147, "grad_norm": 1.9842826236775062, "learning_rate": 8.60221854871369e-06, "loss": 0.5193, "step": 8480 }, { "epoch": 0.31997889420721365, "grad_norm": 1.7689057994372488, "learning_rate": 8.597653397555597e-06, "loss": 0.5288, "step": 8490 }, { "epoch": 0.32035578336411263, "grad_norm": 1.6259926395499629, "learning_rate": 8.59308201928471e-06, "loss": 0.5614, "step": 8500 }, { "epoch": 0.32073267252101156, "grad_norm": 1.4818216820726773, "learning_rate": 8.588504421813548e-06, "loss": 0.519, "step": 8510 }, { "epoch": 0.32110956167791055, "grad_norm": 1.4755097387688436, "learning_rate": 8.583920613065389e-06, "loss": 0.5363, "step": 8520 }, { "epoch": 0.3214864508348095, "grad_norm": 1.4930239554129483, "learning_rate": 8.579330600974263e-06, "loss": 0.5245, "step": 8530 }, { "epoch": 0.32186333999170846, "grad_norm": 1.5984210950118547, "learning_rate": 8.57473439348494e-06, "loss": 0.5069, "step": 8540 }, { "epoch": 0.3222402291486074, "grad_norm": 1.667635983292806, "learning_rate": 8.570131998552912e-06, "loss": 0.5303, "step": 8550 }, { "epoch": 0.3226171183055063, "grad_norm": 1.616690957226872, "learning_rate": 8.56552342414438e-06, "loss": 0.5231, "step": 8560 }, { "epoch": 0.3229940074624053, "grad_norm": 1.8034906955411334, "learning_rate": 8.560908678236243e-06, "loss": 0.5207, "step": 8570 }, { "epoch": 0.32337089661930424, "grad_norm": 1.629302217435465, "learning_rate": 8.55628776881608e-06, "loss": 0.5111, "step": 8580 }, { "epoch": 0.3237477857762032, "grad_norm": 1.5524410794967876, "learning_rate": 8.551660703882137e-06, "loss": 0.5291, "step": 8590 }, { "epoch": 0.32412467493310215, "grad_norm": 1.682436170015779, "learning_rate": 8.547027491443319e-06, "loss": 0.5203, "step": 8600 }, { "epoch": 0.32450156409000114, "grad_norm": 1.4406822786435207, "learning_rate": 8.542388139519166e-06, "loss": 0.5147, "step": 8610 }, { "epoch": 0.32487845324690007, "grad_norm": 1.7053903855885608, "learning_rate": 8.537742656139854e-06, "loss": 0.5221, "step": 8620 }, { "epoch": 0.32525534240379905, "grad_norm": 1.6508021549322034, "learning_rate": 8.533091049346158e-06, "loss": 0.5087, "step": 8630 }, { "epoch": 0.325632231560698, "grad_norm": 1.7227108959852215, "learning_rate": 8.528433327189464e-06, "loss": 0.5144, "step": 8640 }, { "epoch": 0.32600912071759697, "grad_norm": 1.7003660286486464, "learning_rate": 8.52376949773174e-06, "loss": 0.5178, "step": 8650 }, { "epoch": 0.3263860098744959, "grad_norm": 1.3401486704634595, "learning_rate": 8.51909956904552e-06, "loss": 0.5392, "step": 8660 }, { "epoch": 0.3267628990313949, "grad_norm": 1.5872896977384134, "learning_rate": 8.514423549213899e-06, "loss": 0.5173, "step": 8670 }, { "epoch": 0.3271397881882938, "grad_norm": 1.5745916384042906, "learning_rate": 8.509741446330516e-06, "loss": 0.5014, "step": 8680 }, { "epoch": 0.3275166773451928, "grad_norm": 1.5616168597563413, "learning_rate": 8.505053268499536e-06, "loss": 0.5448, "step": 8690 }, { "epoch": 0.3278935665020917, "grad_norm": 1.6773461052799608, "learning_rate": 8.500359023835643e-06, "loss": 0.5367, "step": 8700 }, { "epoch": 0.3282704556589907, "grad_norm": 1.744777642171872, "learning_rate": 8.49565872046402e-06, "loss": 0.5294, "step": 8710 }, { "epoch": 0.32864734481588964, "grad_norm": 1.4370066131355128, "learning_rate": 8.490952366520332e-06, "loss": 0.5104, "step": 8720 }, { "epoch": 0.3290242339727886, "grad_norm": 1.5686171487455622, "learning_rate": 8.486239970150726e-06, "loss": 0.5082, "step": 8730 }, { "epoch": 0.32940112312968756, "grad_norm": 1.729514880196974, "learning_rate": 8.481521539511802e-06, "loss": 0.5424, "step": 8740 }, { "epoch": 0.32977801228658654, "grad_norm": 1.7026908866007286, "learning_rate": 8.476797082770604e-06, "loss": 0.5319, "step": 8750 }, { "epoch": 0.33015490144348547, "grad_norm": 1.6815280081880184, "learning_rate": 8.472066608104613e-06, "loss": 0.497, "step": 8760 }, { "epoch": 0.3305317906003844, "grad_norm": 1.3354916475471044, "learning_rate": 8.467330123701713e-06, "loss": 0.5199, "step": 8770 }, { "epoch": 0.3309086797572834, "grad_norm": 1.6596093120910274, "learning_rate": 8.462587637760207e-06, "loss": 0.49, "step": 8780 }, { "epoch": 0.3312855689141823, "grad_norm": 1.4490041698465326, "learning_rate": 8.457839158488772e-06, "loss": 0.5177, "step": 8790 }, { "epoch": 0.3316624580710813, "grad_norm": 1.743520901799635, "learning_rate": 8.453084694106468e-06, "loss": 0.5554, "step": 8800 }, { "epoch": 0.33203934722798023, "grad_norm": 1.3601593837886905, "learning_rate": 8.448324252842708e-06, "loss": 0.4939, "step": 8810 }, { "epoch": 0.3324162363848792, "grad_norm": 1.0538420503197117, "learning_rate": 8.443557842937257e-06, "loss": 0.5129, "step": 8820 }, { "epoch": 0.33279312554177815, "grad_norm": 1.580901795857022, "learning_rate": 8.438785472640202e-06, "loss": 0.523, "step": 8830 }, { "epoch": 0.33317001469867713, "grad_norm": 1.6642518860804858, "learning_rate": 8.434007150211957e-06, "loss": 0.5148, "step": 8840 }, { "epoch": 0.33354690385557606, "grad_norm": 1.5678025009658634, "learning_rate": 8.42922288392323e-06, "loss": 0.5189, "step": 8850 }, { "epoch": 0.33392379301247505, "grad_norm": 1.5916235440951905, "learning_rate": 8.424432682055022e-06, "loss": 0.5072, "step": 8860 }, { "epoch": 0.334300682169374, "grad_norm": 1.4657720753841874, "learning_rate": 8.419636552898605e-06, "loss": 0.5275, "step": 8870 }, { "epoch": 0.33467757132627296, "grad_norm": 1.5520680065937231, "learning_rate": 8.414834504755513e-06, "loss": 0.5329, "step": 8880 }, { "epoch": 0.3350544604831719, "grad_norm": 1.4122325295424663, "learning_rate": 8.410026545937522e-06, "loss": 0.5215, "step": 8890 }, { "epoch": 0.3354313496400709, "grad_norm": 1.7208431430034201, "learning_rate": 8.405212684766642e-06, "loss": 0.5266, "step": 8900 }, { "epoch": 0.3358082387969698, "grad_norm": 1.522595875256597, "learning_rate": 8.400392929575098e-06, "loss": 0.521, "step": 8910 }, { "epoch": 0.3361851279538688, "grad_norm": 1.193362753628422, "learning_rate": 8.395567288705315e-06, "loss": 0.473, "step": 8920 }, { "epoch": 0.3365620171107677, "grad_norm": 1.617816412749885, "learning_rate": 8.390735770509909e-06, "loss": 0.5367, "step": 8930 }, { "epoch": 0.3369389062676667, "grad_norm": 3.7585583359479173, "learning_rate": 8.385898383351662e-06, "loss": 0.5273, "step": 8940 }, { "epoch": 0.33731579542456563, "grad_norm": 1.570651388025663, "learning_rate": 8.381055135603526e-06, "loss": 0.5431, "step": 8950 }, { "epoch": 0.33769268458146456, "grad_norm": 1.7069789377376001, "learning_rate": 8.376206035648587e-06, "loss": 0.5534, "step": 8960 }, { "epoch": 0.33806957373836355, "grad_norm": 1.718778224322803, "learning_rate": 8.371351091880064e-06, "loss": 0.5132, "step": 8970 }, { "epoch": 0.3384464628952625, "grad_norm": 1.5304391761250087, "learning_rate": 8.366490312701292e-06, "loss": 0.5227, "step": 8980 }, { "epoch": 0.33882335205216146, "grad_norm": 1.5179430433625931, "learning_rate": 8.361623706525703e-06, "loss": 0.5131, "step": 8990 }, { "epoch": 0.3392002412090604, "grad_norm": 1.7236639439764125, "learning_rate": 8.356751281776818e-06, "loss": 0.5392, "step": 9000 }, { "epoch": 0.3395771303659594, "grad_norm": 1.5735386800178819, "learning_rate": 8.35187304688823e-06, "loss": 0.5347, "step": 9010 }, { "epoch": 0.3399540195228583, "grad_norm": 1.8833870834225461, "learning_rate": 8.346989010303586e-06, "loss": 0.5209, "step": 9020 }, { "epoch": 0.3403309086797573, "grad_norm": 1.4643995674675825, "learning_rate": 8.342099180476575e-06, "loss": 0.5049, "step": 9030 }, { "epoch": 0.3407077978366562, "grad_norm": 1.8572382732988715, "learning_rate": 8.337203565870915e-06, "loss": 0.5256, "step": 9040 }, { "epoch": 0.3410846869935552, "grad_norm": 1.7660917988302332, "learning_rate": 8.332302174960336e-06, "loss": 0.5416, "step": 9050 }, { "epoch": 0.34146157615045414, "grad_norm": 1.8036514572139464, "learning_rate": 8.327395016228567e-06, "loss": 0.5649, "step": 9060 }, { "epoch": 0.3418384653073531, "grad_norm": 1.3940242330097408, "learning_rate": 8.32248209816932e-06, "loss": 0.5114, "step": 9070 }, { "epoch": 0.34221535446425205, "grad_norm": 1.6019993171028344, "learning_rate": 8.317563429286274e-06, "loss": 0.4947, "step": 9080 }, { "epoch": 0.34259224362115104, "grad_norm": 2.247467137710252, "learning_rate": 8.312639018093067e-06, "loss": 0.5569, "step": 9090 }, { "epoch": 0.34296913277804997, "grad_norm": 1.5951663374446334, "learning_rate": 8.307708873113267e-06, "loss": 0.4897, "step": 9100 }, { "epoch": 0.34334602193494895, "grad_norm": 1.738333108559983, "learning_rate": 8.302773002880377e-06, "loss": 0.4987, "step": 9110 }, { "epoch": 0.3437229110918479, "grad_norm": 1.5337169050623085, "learning_rate": 8.297831415937802e-06, "loss": 0.505, "step": 9120 }, { "epoch": 0.34409980024874687, "grad_norm": 1.4472548665213318, "learning_rate": 8.29288412083885e-06, "loss": 0.512, "step": 9130 }, { "epoch": 0.3444766894056458, "grad_norm": 1.4492396478807243, "learning_rate": 8.287931126146696e-06, "loss": 0.4886, "step": 9140 }, { "epoch": 0.3448535785625448, "grad_norm": 1.7630091584083531, "learning_rate": 8.282972440434393e-06, "loss": 0.534, "step": 9150 }, { "epoch": 0.3452304677194437, "grad_norm": 1.3895390793947004, "learning_rate": 8.278008072284841e-06, "loss": 0.4952, "step": 9160 }, { "epoch": 0.34560735687634264, "grad_norm": 1.3777597188749056, "learning_rate": 8.273038030290772e-06, "loss": 0.5084, "step": 9170 }, { "epoch": 0.3459842460332416, "grad_norm": 1.7939386012497827, "learning_rate": 8.268062323054742e-06, "loss": 0.5433, "step": 9180 }, { "epoch": 0.34636113519014056, "grad_norm": 1.4962140700288487, "learning_rate": 8.263080959189114e-06, "loss": 0.5415, "step": 9190 }, { "epoch": 0.34673802434703954, "grad_norm": 1.6671458345630203, "learning_rate": 8.258093947316036e-06, "loss": 0.5137, "step": 9200 }, { "epoch": 0.34711491350393847, "grad_norm": 1.5818238301461247, "learning_rate": 8.253101296067441e-06, "loss": 0.5005, "step": 9210 }, { "epoch": 0.34749180266083746, "grad_norm": 1.6485091022559941, "learning_rate": 8.248103014085014e-06, "loss": 0.526, "step": 9220 }, { "epoch": 0.3478686918177364, "grad_norm": 1.6061788923973597, "learning_rate": 8.243099110020191e-06, "loss": 0.5299, "step": 9230 }, { "epoch": 0.34824558097463537, "grad_norm": 1.5857303046251194, "learning_rate": 8.238089592534143e-06, "loss": 0.5272, "step": 9240 }, { "epoch": 0.3486224701315343, "grad_norm": 1.5733908134969434, "learning_rate": 8.233074470297746e-06, "loss": 0.5027, "step": 9250 }, { "epoch": 0.3489993592884333, "grad_norm": 1.2391810742375864, "learning_rate": 8.228053751991586e-06, "loss": 0.5147, "step": 9260 }, { "epoch": 0.3493762484453322, "grad_norm": 1.6623967687120003, "learning_rate": 8.223027446305939e-06, "loss": 0.562, "step": 9270 }, { "epoch": 0.3497531376022312, "grad_norm": 1.6535672620155857, "learning_rate": 8.217995561940735e-06, "loss": 0.5135, "step": 9280 }, { "epoch": 0.35013002675913013, "grad_norm": 1.628954367480316, "learning_rate": 8.21295810760558e-06, "loss": 0.5491, "step": 9290 }, { "epoch": 0.3505069159160291, "grad_norm": 1.6259980778856178, "learning_rate": 8.207915092019709e-06, "loss": 0.5277, "step": 9300 }, { "epoch": 0.35088380507292805, "grad_norm": 1.7960271399406822, "learning_rate": 8.202866523911985e-06, "loss": 0.5026, "step": 9310 }, { "epoch": 0.35126069422982703, "grad_norm": 1.5027810941773745, "learning_rate": 8.197812412020882e-06, "loss": 0.5219, "step": 9320 }, { "epoch": 0.35163758338672596, "grad_norm": 1.4147518487518373, "learning_rate": 8.192752765094474e-06, "loss": 0.4946, "step": 9330 }, { "epoch": 0.35201447254362495, "grad_norm": 1.7588868083859255, "learning_rate": 8.18768759189041e-06, "loss": 0.5214, "step": 9340 }, { "epoch": 0.3523913617005239, "grad_norm": 1.9046948041639848, "learning_rate": 8.182616901175904e-06, "loss": 0.5327, "step": 9350 }, { "epoch": 0.3527682508574228, "grad_norm": 1.8541351405292703, "learning_rate": 8.177540701727725e-06, "loss": 0.5332, "step": 9360 }, { "epoch": 0.3531451400143218, "grad_norm": 1.4841893890474134, "learning_rate": 8.172459002332174e-06, "loss": 0.5198, "step": 9370 }, { "epoch": 0.3535220291712207, "grad_norm": 1.6262991665753983, "learning_rate": 8.16737181178507e-06, "loss": 0.5204, "step": 9380 }, { "epoch": 0.3538989183281197, "grad_norm": 1.9711118756578987, "learning_rate": 8.16227913889174e-06, "loss": 0.5227, "step": 9390 }, { "epoch": 0.35427580748501863, "grad_norm": 1.6101287024416397, "learning_rate": 8.157180992466999e-06, "loss": 0.5213, "step": 9400 }, { "epoch": 0.3546526966419176, "grad_norm": 1.4345161151869006, "learning_rate": 8.152077381335136e-06, "loss": 0.5156, "step": 9410 }, { "epoch": 0.35502958579881655, "grad_norm": 1.3670300786401688, "learning_rate": 8.146968314329897e-06, "loss": 0.5415, "step": 9420 }, { "epoch": 0.35540647495571553, "grad_norm": 2.0488645347215755, "learning_rate": 8.141853800294474e-06, "loss": 0.495, "step": 9430 }, { "epoch": 0.35578336411261446, "grad_norm": 1.6748625828080144, "learning_rate": 8.136733848081489e-06, "loss": 0.5253, "step": 9440 }, { "epoch": 0.35616025326951345, "grad_norm": 1.749008596784993, "learning_rate": 8.131608466552968e-06, "loss": 0.5306, "step": 9450 }, { "epoch": 0.3565371424264124, "grad_norm": 1.4862728647070822, "learning_rate": 8.126477664580347e-06, "loss": 0.5124, "step": 9460 }, { "epoch": 0.35691403158331136, "grad_norm": 1.6537416257961348, "learning_rate": 8.121341451044433e-06, "loss": 0.53, "step": 9470 }, { "epoch": 0.3572909207402103, "grad_norm": 1.6263866458416927, "learning_rate": 8.116199834835408e-06, "loss": 0.4975, "step": 9480 }, { "epoch": 0.3576678098971093, "grad_norm": 1.5395362965786443, "learning_rate": 8.1110528248528e-06, "loss": 0.5234, "step": 9490 }, { "epoch": 0.3580446990540082, "grad_norm": 1.6712903285736607, "learning_rate": 8.105900430005476e-06, "loss": 0.5306, "step": 9500 }, { "epoch": 0.3584215882109072, "grad_norm": 1.8324911808117614, "learning_rate": 8.10074265921162e-06, "loss": 0.5242, "step": 9510 }, { "epoch": 0.3587984773678061, "grad_norm": 2.351195839028606, "learning_rate": 8.095579521398727e-06, "loss": 0.5219, "step": 9520 }, { "epoch": 0.3591753665247051, "grad_norm": 1.7110044303010266, "learning_rate": 8.090411025503576e-06, "loss": 0.4978, "step": 9530 }, { "epoch": 0.35955225568160404, "grad_norm": 1.6106891478057317, "learning_rate": 8.085237180472222e-06, "loss": 0.4827, "step": 9540 }, { "epoch": 0.359929144838503, "grad_norm": 1.7506503522662198, "learning_rate": 8.080057995259983e-06, "loss": 0.5101, "step": 9550 }, { "epoch": 0.36030603399540195, "grad_norm": 1.4401619042910059, "learning_rate": 8.074873478831412e-06, "loss": 0.4944, "step": 9560 }, { "epoch": 0.3606829231523009, "grad_norm": 1.5447630644836872, "learning_rate": 8.069683640160297e-06, "loss": 0.5043, "step": 9570 }, { "epoch": 0.36105981230919987, "grad_norm": 1.6594696899732704, "learning_rate": 8.064488488229634e-06, "loss": 0.5308, "step": 9580 }, { "epoch": 0.3614367014660988, "grad_norm": 1.50896874784597, "learning_rate": 8.059288032031616e-06, "loss": 0.511, "step": 9590 }, { "epoch": 0.3618135906229978, "grad_norm": 1.463806572060008, "learning_rate": 8.05408228056762e-06, "loss": 0.5112, "step": 9600 }, { "epoch": 0.3621904797798967, "grad_norm": 2.3161399801404206, "learning_rate": 8.048871242848186e-06, "loss": 0.5094, "step": 9610 }, { "epoch": 0.3625673689367957, "grad_norm": 1.6934623824302724, "learning_rate": 8.043654927893003e-06, "loss": 0.5302, "step": 9620 }, { "epoch": 0.3629442580936946, "grad_norm": 1.669003654657236, "learning_rate": 8.038433344730896e-06, "loss": 0.5149, "step": 9630 }, { "epoch": 0.3633211472505936, "grad_norm": 1.627539817842675, "learning_rate": 8.033206502399811e-06, "loss": 0.508, "step": 9640 }, { "epoch": 0.36369803640749254, "grad_norm": 1.606477921617676, "learning_rate": 8.027974409946791e-06, "loss": 0.5388, "step": 9650 }, { "epoch": 0.3640749255643915, "grad_norm": 1.5765425415250627, "learning_rate": 8.02273707642797e-06, "loss": 0.4904, "step": 9660 }, { "epoch": 0.36445181472129046, "grad_norm": 1.377919727134035, "learning_rate": 8.017494510908557e-06, "loss": 0.4736, "step": 9670 }, { "epoch": 0.36482870387818944, "grad_norm": 1.5390007963630454, "learning_rate": 8.012246722462807e-06, "loss": 0.5366, "step": 9680 }, { "epoch": 0.36520559303508837, "grad_norm": 1.6016562038880642, "learning_rate": 8.006993720174026e-06, "loss": 0.5278, "step": 9690 }, { "epoch": 0.36558248219198736, "grad_norm": 1.6926301666587609, "learning_rate": 8.001735513134539e-06, "loss": 0.524, "step": 9700 }, { "epoch": 0.3659593713488863, "grad_norm": 1.4095598406907603, "learning_rate": 7.996472110445682e-06, "loss": 0.5248, "step": 9710 }, { "epoch": 0.36633626050578527, "grad_norm": 1.931772269279146, "learning_rate": 7.99120352121778e-06, "loss": 0.5468, "step": 9720 }, { "epoch": 0.3667131496626842, "grad_norm": 1.7501030367149104, "learning_rate": 7.985929754570138e-06, "loss": 0.522, "step": 9730 }, { "epoch": 0.3670900388195832, "grad_norm": 1.526100066204629, "learning_rate": 7.980650819631028e-06, "loss": 0.5101, "step": 9740 }, { "epoch": 0.3674669279764821, "grad_norm": 1.7385487243528495, "learning_rate": 7.975366725537657e-06, "loss": 0.5216, "step": 9750 }, { "epoch": 0.36784381713338105, "grad_norm": 1.5749812078525176, "learning_rate": 7.970077481436169e-06, "loss": 0.5003, "step": 9760 }, { "epoch": 0.36822070629028003, "grad_norm": 1.9001739721765867, "learning_rate": 7.964783096481624e-06, "loss": 0.5299, "step": 9770 }, { "epoch": 0.36859759544717896, "grad_norm": 1.7061699416032134, "learning_rate": 7.95948357983797e-06, "loss": 0.5293, "step": 9780 }, { "epoch": 0.36897448460407795, "grad_norm": 1.628434299258182, "learning_rate": 7.954178940678048e-06, "loss": 0.5365, "step": 9790 }, { "epoch": 0.3693513737609769, "grad_norm": 1.5914358544888132, "learning_rate": 7.94886918818356e-06, "loss": 0.486, "step": 9800 }, { "epoch": 0.36972826291787586, "grad_norm": 1.8260310549139975, "learning_rate": 7.94355433154506e-06, "loss": 0.5051, "step": 9810 }, { "epoch": 0.3701051520747748, "grad_norm": 1.5743149246238135, "learning_rate": 7.93823437996194e-06, "loss": 0.4909, "step": 9820 }, { "epoch": 0.3704820412316738, "grad_norm": 1.5885416947805906, "learning_rate": 7.932909342642403e-06, "loss": 0.5111, "step": 9830 }, { "epoch": 0.3708589303885727, "grad_norm": 1.806111484800823, "learning_rate": 7.92757922880346e-06, "loss": 0.5213, "step": 9840 }, { "epoch": 0.3712358195454717, "grad_norm": 1.622739814163259, "learning_rate": 7.922244047670908e-06, "loss": 0.5223, "step": 9850 }, { "epoch": 0.3716127087023706, "grad_norm": 1.422381514282876, "learning_rate": 7.916903808479316e-06, "loss": 0.4954, "step": 9860 }, { "epoch": 0.3719895978592696, "grad_norm": 1.4819588262177361, "learning_rate": 7.911558520472007e-06, "loss": 0.5041, "step": 9870 }, { "epoch": 0.37236648701616853, "grad_norm": 1.6630721593622193, "learning_rate": 7.906208192901043e-06, "loss": 0.5031, "step": 9880 }, { "epoch": 0.3727433761730675, "grad_norm": 1.5994053992133728, "learning_rate": 7.900852835027207e-06, "loss": 0.5212, "step": 9890 }, { "epoch": 0.37312026532996645, "grad_norm": 1.6742248574206915, "learning_rate": 7.89549245611999e-06, "loss": 0.509, "step": 9900 }, { "epoch": 0.37349715448686543, "grad_norm": 1.3401945002776963, "learning_rate": 7.890127065457578e-06, "loss": 0.4989, "step": 9910 }, { "epoch": 0.37387404364376436, "grad_norm": 1.7744024916171353, "learning_rate": 7.884756672326824e-06, "loss": 0.5221, "step": 9920 }, { "epoch": 0.37425093280066335, "grad_norm": 1.5151769409007294, "learning_rate": 7.879381286023247e-06, "loss": 0.4874, "step": 9930 }, { "epoch": 0.3746278219575623, "grad_norm": 1.521150560489862, "learning_rate": 7.874000915851e-06, "loss": 0.5243, "step": 9940 }, { "epoch": 0.37500471111446126, "grad_norm": 1.8681950791510447, "learning_rate": 7.868615571122877e-06, "loss": 0.5333, "step": 9950 }, { "epoch": 0.3753816002713602, "grad_norm": 1.7696887020829462, "learning_rate": 7.863225261160264e-06, "loss": 0.5095, "step": 9960 }, { "epoch": 0.3757584894282591, "grad_norm": 1.5476742902685527, "learning_rate": 7.857829995293156e-06, "loss": 0.5138, "step": 9970 }, { "epoch": 0.3761353785851581, "grad_norm": 1.8113224519075866, "learning_rate": 7.852429782860116e-06, "loss": 0.5204, "step": 9980 }, { "epoch": 0.37651226774205704, "grad_norm": 1.5771605683013104, "learning_rate": 7.847024633208277e-06, "loss": 0.5251, "step": 9990 }, { "epoch": 0.376889156898956, "grad_norm": 1.7363856844555856, "learning_rate": 7.841614555693315e-06, "loss": 0.5374, "step": 10000 }, { "epoch": 0.37726604605585495, "grad_norm": 1.4364933283426136, "learning_rate": 7.83619955967943e-06, "loss": 0.5058, "step": 10010 }, { "epoch": 0.37764293521275394, "grad_norm": 1.7783682918591002, "learning_rate": 7.830779654539347e-06, "loss": 0.5219, "step": 10020 }, { "epoch": 0.37801982436965287, "grad_norm": 1.502023352144145, "learning_rate": 7.825354849654276e-06, "loss": 0.5063, "step": 10030 }, { "epoch": 0.37839671352655185, "grad_norm": 1.4662226669968323, "learning_rate": 7.819925154413913e-06, "loss": 0.519, "step": 10040 }, { "epoch": 0.3787736026834508, "grad_norm": 1.7426260620841227, "learning_rate": 7.814490578216418e-06, "loss": 0.5139, "step": 10050 }, { "epoch": 0.37915049184034977, "grad_norm": 1.392450084963678, "learning_rate": 7.809051130468406e-06, "loss": 0.5117, "step": 10060 }, { "epoch": 0.3795273809972487, "grad_norm": 1.3645116377228637, "learning_rate": 7.80360682058491e-06, "loss": 0.4957, "step": 10070 }, { "epoch": 0.3799042701541477, "grad_norm": 1.585986026276787, "learning_rate": 7.798157657989393e-06, "loss": 0.4932, "step": 10080 }, { "epoch": 0.3802811593110466, "grad_norm": 1.732526223305691, "learning_rate": 7.792703652113711e-06, "loss": 0.489, "step": 10090 }, { "epoch": 0.3806580484679456, "grad_norm": 1.5456835026999505, "learning_rate": 7.7872448123981e-06, "loss": 0.5113, "step": 10100 }, { "epoch": 0.3810349376248445, "grad_norm": 1.691666553542104, "learning_rate": 7.781781148291168e-06, "loss": 0.5062, "step": 10110 }, { "epoch": 0.3814118267817435, "grad_norm": 1.4393454733074624, "learning_rate": 7.776312669249871e-06, "loss": 0.5259, "step": 10120 }, { "epoch": 0.38178871593864244, "grad_norm": 1.5606823602350173, "learning_rate": 7.770839384739502e-06, "loss": 0.481, "step": 10130 }, { "epoch": 0.3821656050955414, "grad_norm": 1.5406731978486246, "learning_rate": 7.765361304233669e-06, "loss": 0.5022, "step": 10140 }, { "epoch": 0.38254249425244036, "grad_norm": 1.5789632721608042, "learning_rate": 7.759878437214279e-06, "loss": 0.5205, "step": 10150 }, { "epoch": 0.3829193834093393, "grad_norm": 1.4404684890173407, "learning_rate": 7.75439079317153e-06, "loss": 0.5194, "step": 10160 }, { "epoch": 0.38329627256623827, "grad_norm": 1.5872838579011181, "learning_rate": 7.748898381603885e-06, "loss": 0.5209, "step": 10170 }, { "epoch": 0.3836731617231372, "grad_norm": 1.7373659799447403, "learning_rate": 7.743401212018058e-06, "loss": 0.5338, "step": 10180 }, { "epoch": 0.3840500508800362, "grad_norm": 1.5163208593264923, "learning_rate": 7.737899293929e-06, "loss": 0.5171, "step": 10190 }, { "epoch": 0.3844269400369351, "grad_norm": 1.7459426641170264, "learning_rate": 7.73239263685988e-06, "loss": 0.514, "step": 10200 }, { "epoch": 0.3848038291938341, "grad_norm": 1.4346771101184586, "learning_rate": 7.726881250342072e-06, "loss": 0.4932, "step": 10210 }, { "epoch": 0.38518071835073303, "grad_norm": 1.5111843012047024, "learning_rate": 7.721365143915134e-06, "loss": 0.5125, "step": 10220 }, { "epoch": 0.385557607507632, "grad_norm": 1.9962267341536073, "learning_rate": 7.715844327126796e-06, "loss": 0.5167, "step": 10230 }, { "epoch": 0.38593449666453095, "grad_norm": 1.8161411611054894, "learning_rate": 7.710318809532936e-06, "loss": 0.5158, "step": 10240 }, { "epoch": 0.38631138582142993, "grad_norm": 2.7639886954074973, "learning_rate": 7.704788600697572e-06, "loss": 0.5188, "step": 10250 }, { "epoch": 0.38668827497832886, "grad_norm": 1.4820078773895478, "learning_rate": 7.699253710192846e-06, "loss": 0.5393, "step": 10260 }, { "epoch": 0.38706516413522785, "grad_norm": 1.5410641109210454, "learning_rate": 7.693714147598997e-06, "loss": 0.5153, "step": 10270 }, { "epoch": 0.3874420532921268, "grad_norm": 1.5522185520833145, "learning_rate": 7.68816992250435e-06, "loss": 0.501, "step": 10280 }, { "epoch": 0.38781894244902576, "grad_norm": 1.5919453235813696, "learning_rate": 7.682621044505307e-06, "loss": 0.5129, "step": 10290 }, { "epoch": 0.3881958316059247, "grad_norm": 1.8187432897104705, "learning_rate": 7.67706752320632e-06, "loss": 0.5119, "step": 10300 }, { "epoch": 0.3885727207628237, "grad_norm": 1.7249561612487072, "learning_rate": 7.671509368219876e-06, "loss": 0.4994, "step": 10310 }, { "epoch": 0.3889496099197226, "grad_norm": 1.6848328239504586, "learning_rate": 7.665946589166487e-06, "loss": 0.4925, "step": 10320 }, { "epoch": 0.3893264990766216, "grad_norm": 1.5828091110560405, "learning_rate": 7.660379195674661e-06, "loss": 0.5038, "step": 10330 }, { "epoch": 0.3897033882335205, "grad_norm": 1.5813911515185386, "learning_rate": 7.654807197380905e-06, "loss": 0.5195, "step": 10340 }, { "epoch": 0.3900802773904195, "grad_norm": 1.6125111488328268, "learning_rate": 7.649230603929682e-06, "loss": 0.5015, "step": 10350 }, { "epoch": 0.39045716654731843, "grad_norm": 1.3381134076751229, "learning_rate": 7.643649424973423e-06, "loss": 0.5088, "step": 10360 }, { "epoch": 0.39083405570421736, "grad_norm": 1.7305681398298995, "learning_rate": 7.638063670172484e-06, "loss": 0.5185, "step": 10370 }, { "epoch": 0.39121094486111635, "grad_norm": 1.4448524895747423, "learning_rate": 7.632473349195148e-06, "loss": 0.4951, "step": 10380 }, { "epoch": 0.3915878340180153, "grad_norm": 1.4981197503697667, "learning_rate": 7.626878471717601e-06, "loss": 0.4968, "step": 10390 }, { "epoch": 0.39196472317491426, "grad_norm": 1.7418645645645234, "learning_rate": 7.621279047423913e-06, "loss": 0.5008, "step": 10400 }, { "epoch": 0.3923416123318132, "grad_norm": 1.5178888998524598, "learning_rate": 7.615675086006027e-06, "loss": 0.5289, "step": 10410 }, { "epoch": 0.3927185014887122, "grad_norm": 1.830273252291855, "learning_rate": 7.610066597163737e-06, "loss": 0.5122, "step": 10420 }, { "epoch": 0.3930953906456111, "grad_norm": 1.7128529156361387, "learning_rate": 7.604453590604675e-06, "loss": 0.5195, "step": 10430 }, { "epoch": 0.3934722798025101, "grad_norm": 1.391617720005151, "learning_rate": 7.5988360760442905e-06, "loss": 0.5076, "step": 10440 }, { "epoch": 0.393849168959409, "grad_norm": 1.6877860440671137, "learning_rate": 7.5932140632058395e-06, "loss": 0.4974, "step": 10450 }, { "epoch": 0.394226058116308, "grad_norm": 1.8880275691787758, "learning_rate": 7.587587561820357e-06, "loss": 0.5285, "step": 10460 }, { "epoch": 0.39460294727320694, "grad_norm": 2.1952687274271323, "learning_rate": 7.581956581626659e-06, "loss": 0.4788, "step": 10470 }, { "epoch": 0.3949798364301059, "grad_norm": 1.4679354015620223, "learning_rate": 7.5763211323713e-06, "loss": 0.5148, "step": 10480 }, { "epoch": 0.39535672558700485, "grad_norm": 5.552246755488179, "learning_rate": 7.570681223808581e-06, "loss": 0.5088, "step": 10490 }, { "epoch": 0.39573361474390384, "grad_norm": 1.7064464108546398, "learning_rate": 7.565036865700515e-06, "loss": 0.5036, "step": 10500 }, { "epoch": 0.39611050390080277, "grad_norm": 2.5811736419688125, "learning_rate": 7.559388067816818e-06, "loss": 0.4968, "step": 10510 }, { "epoch": 0.39648739305770175, "grad_norm": 1.5411522330658454, "learning_rate": 7.553734839934892e-06, "loss": 0.5321, "step": 10520 }, { "epoch": 0.3968642822146007, "grad_norm": 7.8958392331494265, "learning_rate": 7.54807719183981e-06, "loss": 0.5144, "step": 10530 }, { "epoch": 0.39724117137149967, "grad_norm": 1.398958685780613, "learning_rate": 7.5424151333242854e-06, "loss": 0.5038, "step": 10540 }, { "epoch": 0.3976180605283986, "grad_norm": 1.8158525212197818, "learning_rate": 7.536748674188679e-06, "loss": 0.5498, "step": 10550 }, { "epoch": 0.3979949496852975, "grad_norm": 1.5804568739533704, "learning_rate": 7.531077824240955e-06, "loss": 0.4943, "step": 10560 }, { "epoch": 0.3983718388421965, "grad_norm": 1.5226993293921613, "learning_rate": 7.5254025932966915e-06, "loss": 0.5197, "step": 10570 }, { "epoch": 0.39874872799909544, "grad_norm": 1.4875198630376472, "learning_rate": 7.519722991179037e-06, "loss": 0.4911, "step": 10580 }, { "epoch": 0.3991256171559944, "grad_norm": 1.4559827608231475, "learning_rate": 7.514039027718714e-06, "loss": 0.5369, "step": 10590 }, { "epoch": 0.39950250631289336, "grad_norm": 1.6204937117910332, "learning_rate": 7.50835071275399e-06, "loss": 0.5126, "step": 10600 }, { "epoch": 0.39987939546979234, "grad_norm": 1.3328838051236007, "learning_rate": 7.502658056130667e-06, "loss": 0.4924, "step": 10610 }, { "epoch": 0.40025628462669127, "grad_norm": 1.6977987639267709, "learning_rate": 7.496961067702061e-06, "loss": 0.5299, "step": 10620 }, { "epoch": 0.40063317378359026, "grad_norm": 1.7016949489371669, "learning_rate": 7.491259757328986e-06, "loss": 0.4688, "step": 10630 }, { "epoch": 0.4010100629404892, "grad_norm": 1.7144533476268096, "learning_rate": 7.4855541348797325e-06, "loss": 0.496, "step": 10640 }, { "epoch": 0.40138695209738817, "grad_norm": 1.6798605375904403, "learning_rate": 7.479844210230063e-06, "loss": 0.4848, "step": 10650 }, { "epoch": 0.4017638412542871, "grad_norm": 1.6026398018607957, "learning_rate": 7.474129993263181e-06, "loss": 0.5436, "step": 10660 }, { "epoch": 0.4021407304111861, "grad_norm": 2.1790501189406974, "learning_rate": 7.468411493869719e-06, "loss": 0.506, "step": 10670 }, { "epoch": 0.402517619568085, "grad_norm": 1.5645837080947917, "learning_rate": 7.462688721947724e-06, "loss": 0.5218, "step": 10680 }, { "epoch": 0.402894508724984, "grad_norm": 1.4088765793630955, "learning_rate": 7.456961687402639e-06, "loss": 0.5237, "step": 10690 }, { "epoch": 0.40327139788188293, "grad_norm": 1.7292916232116071, "learning_rate": 7.451230400147285e-06, "loss": 0.5469, "step": 10700 }, { "epoch": 0.4036482870387819, "grad_norm": 1.5186451953406783, "learning_rate": 7.44549487010184e-06, "loss": 0.4883, "step": 10710 }, { "epoch": 0.40402517619568085, "grad_norm": 1.5087236868153622, "learning_rate": 7.43975510719383e-06, "loss": 0.5093, "step": 10720 }, { "epoch": 0.40440206535257983, "grad_norm": 1.4677033417891505, "learning_rate": 7.434011121358106e-06, "loss": 0.5286, "step": 10730 }, { "epoch": 0.40477895450947876, "grad_norm": 1.4802091202672563, "learning_rate": 7.428262922536829e-06, "loss": 0.5089, "step": 10740 }, { "epoch": 0.4051558436663777, "grad_norm": 1.7355158540184825, "learning_rate": 7.422510520679451e-06, "loss": 0.4905, "step": 10750 }, { "epoch": 0.4055327328232767, "grad_norm": 1.6236103109691675, "learning_rate": 7.416753925742699e-06, "loss": 0.5378, "step": 10760 }, { "epoch": 0.4059096219801756, "grad_norm": 1.6609723115712154, "learning_rate": 7.410993147690559e-06, "loss": 0.5389, "step": 10770 }, { "epoch": 0.4062865111370746, "grad_norm": 1.723187447121083, "learning_rate": 7.405228196494258e-06, "loss": 0.506, "step": 10780 }, { "epoch": 0.4066634002939735, "grad_norm": 1.77371397243615, "learning_rate": 7.399459082132245e-06, "loss": 0.5513, "step": 10790 }, { "epoch": 0.4070402894508725, "grad_norm": 1.61901847701063, "learning_rate": 7.393685814590173e-06, "loss": 0.5186, "step": 10800 }, { "epoch": 0.40741717860777144, "grad_norm": 1.59625784282029, "learning_rate": 7.387908403860888e-06, "loss": 0.5185, "step": 10810 }, { "epoch": 0.4077940677646704, "grad_norm": 1.6692144430702707, "learning_rate": 7.382126859944404e-06, "loss": 0.512, "step": 10820 }, { "epoch": 0.40817095692156935, "grad_norm": 1.4908534978634642, "learning_rate": 7.3763411928478905e-06, "loss": 0.4861, "step": 10830 }, { "epoch": 0.40854784607846834, "grad_norm": 1.8840748788280792, "learning_rate": 7.370551412585653e-06, "loss": 0.5155, "step": 10840 }, { "epoch": 0.40892473523536726, "grad_norm": 1.6633231923886227, "learning_rate": 7.364757529179116e-06, "loss": 0.5222, "step": 10850 }, { "epoch": 0.40930162439226625, "grad_norm": 1.484920688542688, "learning_rate": 7.3589595526568105e-06, "loss": 0.5058, "step": 10860 }, { "epoch": 0.4096785135491652, "grad_norm": 1.9228281526896014, "learning_rate": 7.353157493054342e-06, "loss": 0.5304, "step": 10870 }, { "epoch": 0.41005540270606416, "grad_norm": 1.8251544219160065, "learning_rate": 7.347351360414396e-06, "loss": 0.5211, "step": 10880 }, { "epoch": 0.4104322918629631, "grad_norm": 1.6738905480970352, "learning_rate": 7.341541164786701e-06, "loss": 0.472, "step": 10890 }, { "epoch": 0.4108091810198621, "grad_norm": 1.7440198392451363, "learning_rate": 7.335726916228014e-06, "loss": 0.5083, "step": 10900 }, { "epoch": 0.411186070176761, "grad_norm": 1.6268369569369572, "learning_rate": 7.329908624802118e-06, "loss": 0.5208, "step": 10910 }, { "epoch": 0.41156295933366, "grad_norm": 1.254862433004947, "learning_rate": 7.3240863005797845e-06, "loss": 0.4823, "step": 10920 }, { "epoch": 0.4119398484905589, "grad_norm": 1.396064069324666, "learning_rate": 7.3182599536387685e-06, "loss": 0.5345, "step": 10930 }, { "epoch": 0.4123167376474579, "grad_norm": 1.5132124149170976, "learning_rate": 7.31242959406379e-06, "loss": 0.5, "step": 10940 }, { "epoch": 0.41269362680435684, "grad_norm": 1.6199593366062408, "learning_rate": 7.306595231946509e-06, "loss": 0.52, "step": 10950 }, { "epoch": 0.41307051596125577, "grad_norm": 1.4096476517099632, "learning_rate": 7.300756877385522e-06, "loss": 0.5073, "step": 10960 }, { "epoch": 0.41344740511815475, "grad_norm": 2.2637085400824266, "learning_rate": 7.294914540486324e-06, "loss": 0.5514, "step": 10970 }, { "epoch": 0.4138242942750537, "grad_norm": 1.7076238981325191, "learning_rate": 7.2890682313613145e-06, "loss": 0.5193, "step": 10980 }, { "epoch": 0.41420118343195267, "grad_norm": 1.404089053698958, "learning_rate": 7.283217960129761e-06, "loss": 0.4978, "step": 10990 }, { "epoch": 0.4145780725888516, "grad_norm": 1.765699200284767, "learning_rate": 7.277363736917793e-06, "loss": 0.5045, "step": 11000 }, { "epoch": 0.4149549617457506, "grad_norm": 1.2515597552306361, "learning_rate": 7.271505571858378e-06, "loss": 0.5161, "step": 11010 }, { "epoch": 0.4153318509026495, "grad_norm": 1.5637990689131112, "learning_rate": 7.265643475091308e-06, "loss": 0.5056, "step": 11020 }, { "epoch": 0.4157087400595485, "grad_norm": 1.6950097710271756, "learning_rate": 7.25977745676318e-06, "loss": 0.5305, "step": 11030 }, { "epoch": 0.41608562921644743, "grad_norm": 1.3673559058916098, "learning_rate": 7.253907527027377e-06, "loss": 0.5156, "step": 11040 }, { "epoch": 0.4164625183733464, "grad_norm": 1.7616801139174183, "learning_rate": 7.2480336960440535e-06, "loss": 0.5105, "step": 11050 }, { "epoch": 0.41683940753024534, "grad_norm": 1.5601831166364712, "learning_rate": 7.242155973980118e-06, "loss": 0.4974, "step": 11060 }, { "epoch": 0.41721629668714433, "grad_norm": 1.5959842163354248, "learning_rate": 7.236274371009213e-06, "loss": 0.5112, "step": 11070 }, { "epoch": 0.41759318584404326, "grad_norm": 1.5739266782444512, "learning_rate": 7.2303888973116955e-06, "loss": 0.5146, "step": 11080 }, { "epoch": 0.41797007500094224, "grad_norm": 1.4353518599364972, "learning_rate": 7.224499563074627e-06, "loss": 0.4896, "step": 11090 }, { "epoch": 0.4183469641578412, "grad_norm": 1.6746462092510674, "learning_rate": 7.218606378491748e-06, "loss": 0.5083, "step": 11100 }, { "epoch": 0.41872385331474016, "grad_norm": 1.4785054484483375, "learning_rate": 7.2127093537634655e-06, "loss": 0.504, "step": 11110 }, { "epoch": 0.4191007424716391, "grad_norm": 2.5097265005422287, "learning_rate": 7.20680849909683e-06, "loss": 0.5089, "step": 11120 }, { "epoch": 0.4194776316285381, "grad_norm": 1.936326911057867, "learning_rate": 7.200903824705525e-06, "loss": 0.4966, "step": 11130 }, { "epoch": 0.419854520785437, "grad_norm": 1.3661039009657177, "learning_rate": 7.194995340809845e-06, "loss": 0.4992, "step": 11140 }, { "epoch": 0.42023140994233593, "grad_norm": 1.7262176315985598, "learning_rate": 7.189083057636677e-06, "loss": 0.5468, "step": 11150 }, { "epoch": 0.4206082990992349, "grad_norm": 1.4653322424571276, "learning_rate": 7.183166985419482e-06, "loss": 0.5141, "step": 11160 }, { "epoch": 0.42098518825613385, "grad_norm": 1.401053993164947, "learning_rate": 7.177247134398286e-06, "loss": 0.4912, "step": 11170 }, { "epoch": 0.42136207741303283, "grad_norm": 1.4861938807474284, "learning_rate": 7.171323514819645e-06, "loss": 0.5232, "step": 11180 }, { "epoch": 0.42173896656993176, "grad_norm": 1.4194599929006113, "learning_rate": 7.1653961369366495e-06, "loss": 0.4814, "step": 11190 }, { "epoch": 0.42211585572683075, "grad_norm": 1.76956116217899, "learning_rate": 7.159465011008888e-06, "loss": 0.4932, "step": 11200 }, { "epoch": 0.4224927448837297, "grad_norm": 1.9109196300178812, "learning_rate": 7.15353014730244e-06, "loss": 0.5128, "step": 11210 }, { "epoch": 0.42286963404062866, "grad_norm": 1.6509926306486402, "learning_rate": 7.147591556089851e-06, "loss": 0.505, "step": 11220 }, { "epoch": 0.4232465231975276, "grad_norm": 1.6713792587181258, "learning_rate": 7.141649247650122e-06, "loss": 0.4887, "step": 11230 }, { "epoch": 0.4236234123544266, "grad_norm": 1.668002839224313, "learning_rate": 7.135703232268686e-06, "loss": 0.4888, "step": 11240 }, { "epoch": 0.4240003015113255, "grad_norm": 1.6242132187489502, "learning_rate": 7.1297535202373935e-06, "loss": 0.4965, "step": 11250 }, { "epoch": 0.4243771906682245, "grad_norm": 1.4189185720393587, "learning_rate": 7.1238001218544904e-06, "loss": 0.4619, "step": 11260 }, { "epoch": 0.4247540798251234, "grad_norm": 1.3574884914862415, "learning_rate": 7.117843047424608e-06, "loss": 0.5141, "step": 11270 }, { "epoch": 0.4251309689820224, "grad_norm": 1.5298779197157868, "learning_rate": 7.111882307258737e-06, "loss": 0.4846, "step": 11280 }, { "epoch": 0.42550785813892134, "grad_norm": 1.6463532463625445, "learning_rate": 7.105917911674216e-06, "loss": 0.52, "step": 11290 }, { "epoch": 0.4258847472958203, "grad_norm": 1.8876680862122976, "learning_rate": 7.099949870994706e-06, "loss": 0.5022, "step": 11300 }, { "epoch": 0.42626163645271925, "grad_norm": 1.6209020479441716, "learning_rate": 7.093978195550181e-06, "loss": 0.4856, "step": 11310 }, { "epoch": 0.42663852560961824, "grad_norm": 1.4825939311414624, "learning_rate": 7.088002895676905e-06, "loss": 0.51, "step": 11320 }, { "epoch": 0.42701541476651717, "grad_norm": 1.6454549725646832, "learning_rate": 7.082023981717417e-06, "loss": 0.5298, "step": 11330 }, { "epoch": 0.42739230392341615, "grad_norm": 1.8515246881502785, "learning_rate": 7.07604146402051e-06, "loss": 0.5353, "step": 11340 }, { "epoch": 0.4277691930803151, "grad_norm": 1.664094685534996, "learning_rate": 7.0700553529412155e-06, "loss": 0.5199, "step": 11350 }, { "epoch": 0.428146082237214, "grad_norm": 2.1244311792060975, "learning_rate": 7.064065658840782e-06, "loss": 0.5145, "step": 11360 }, { "epoch": 0.428522971394113, "grad_norm": 1.520255254828738, "learning_rate": 7.058072392086663e-06, "loss": 0.5159, "step": 11370 }, { "epoch": 0.4288998605510119, "grad_norm": 1.6576637569684334, "learning_rate": 7.052075563052496e-06, "loss": 0.4984, "step": 11380 }, { "epoch": 0.4292767497079109, "grad_norm": 1.5125466189919647, "learning_rate": 7.0460751821180825e-06, "loss": 0.5264, "step": 11390 }, { "epoch": 0.42965363886480984, "grad_norm": 1.6676033896976015, "learning_rate": 7.0400712596693735e-06, "loss": 0.5229, "step": 11400 }, { "epoch": 0.4300305280217088, "grad_norm": 1.7189105587329556, "learning_rate": 7.034063806098447e-06, "loss": 0.5207, "step": 11410 }, { "epoch": 0.43040741717860775, "grad_norm": 1.7180985315889115, "learning_rate": 7.0280528318034965e-06, "loss": 0.5022, "step": 11420 }, { "epoch": 0.43078430633550674, "grad_norm": 1.5059034039755612, "learning_rate": 7.022038347188809e-06, "loss": 0.5184, "step": 11430 }, { "epoch": 0.43116119549240567, "grad_norm": 1.6528886008084185, "learning_rate": 7.016020362664744e-06, "loss": 0.5168, "step": 11440 }, { "epoch": 0.43153808464930465, "grad_norm": 1.8617180030972997, "learning_rate": 7.009998888647724e-06, "loss": 0.502, "step": 11450 }, { "epoch": 0.4319149738062036, "grad_norm": 1.3525198323198173, "learning_rate": 7.003973935560206e-06, "loss": 0.4869, "step": 11460 }, { "epoch": 0.43229186296310257, "grad_norm": 1.677808325515248, "learning_rate": 6.997945513830674e-06, "loss": 0.5101, "step": 11470 }, { "epoch": 0.4326687521200015, "grad_norm": 1.5676318820179806, "learning_rate": 6.991913633893612e-06, "loss": 0.5009, "step": 11480 }, { "epoch": 0.4330456412769005, "grad_norm": 1.5538674193725228, "learning_rate": 6.985878306189491e-06, "loss": 0.4667, "step": 11490 }, { "epoch": 0.4334225304337994, "grad_norm": 1.783690739137192, "learning_rate": 6.979839541164754e-06, "loss": 0.5322, "step": 11500 }, { "epoch": 0.4337994195906984, "grad_norm": 1.6669015455087597, "learning_rate": 6.973797349271783e-06, "loss": 0.5029, "step": 11510 }, { "epoch": 0.43417630874759733, "grad_norm": 1.4853940822920297, "learning_rate": 6.967751740968902e-06, "loss": 0.5118, "step": 11520 }, { "epoch": 0.4345531979044963, "grad_norm": 1.6489294125281642, "learning_rate": 6.9617027267203445e-06, "loss": 0.5031, "step": 11530 }, { "epoch": 0.43493008706139524, "grad_norm": 1.449669861501994, "learning_rate": 6.955650316996236e-06, "loss": 0.5022, "step": 11540 }, { "epoch": 0.4353069762182942, "grad_norm": 1.8165282391809885, "learning_rate": 6.949594522272587e-06, "loss": 0.4935, "step": 11550 }, { "epoch": 0.43568386537519316, "grad_norm": 1.5552590851069892, "learning_rate": 6.943535353031258e-06, "loss": 0.5157, "step": 11560 }, { "epoch": 0.4360607545320921, "grad_norm": 1.6072285953371521, "learning_rate": 6.937472819759959e-06, "loss": 0.5416, "step": 11570 }, { "epoch": 0.4364376436889911, "grad_norm": 1.7154718263219901, "learning_rate": 6.931406932952216e-06, "loss": 0.5112, "step": 11580 }, { "epoch": 0.43681453284589, "grad_norm": 1.69530017912524, "learning_rate": 6.92533770310736e-06, "loss": 0.5259, "step": 11590 }, { "epoch": 0.437191422002789, "grad_norm": 1.860148174972511, "learning_rate": 6.919265140730514e-06, "loss": 0.5106, "step": 11600 }, { "epoch": 0.4375683111596879, "grad_norm": 1.5811409620351222, "learning_rate": 6.913189256332566e-06, "loss": 0.5198, "step": 11610 }, { "epoch": 0.4379452003165869, "grad_norm": 1.4378401199137765, "learning_rate": 6.9071100604301496e-06, "loss": 0.5018, "step": 11620 }, { "epoch": 0.43832208947348583, "grad_norm": 1.6434275760246106, "learning_rate": 6.901027563545639e-06, "loss": 0.5117, "step": 11630 }, { "epoch": 0.4386989786303848, "grad_norm": 1.6446268399097024, "learning_rate": 6.894941776207114e-06, "loss": 0.5143, "step": 11640 }, { "epoch": 0.43907586778728375, "grad_norm": 1.5857907742925326, "learning_rate": 6.888852708948354e-06, "loss": 0.5174, "step": 11650 }, { "epoch": 0.43945275694418273, "grad_norm": 1.4020005504081066, "learning_rate": 6.882760372308819e-06, "loss": 0.5229, "step": 11660 }, { "epoch": 0.43982964610108166, "grad_norm": 1.61069613644627, "learning_rate": 6.876664776833616e-06, "loss": 0.4927, "step": 11670 }, { "epoch": 0.44020653525798065, "grad_norm": 1.7140951891727507, "learning_rate": 6.870565933073505e-06, "loss": 0.4936, "step": 11680 }, { "epoch": 0.4405834244148796, "grad_norm": 1.4507770402959719, "learning_rate": 6.864463851584863e-06, "loss": 0.5296, "step": 11690 }, { "epoch": 0.44096031357177856, "grad_norm": 1.460824925207524, "learning_rate": 6.858358542929672e-06, "loss": 0.5255, "step": 11700 }, { "epoch": 0.4413372027286775, "grad_norm": 1.6825791692741976, "learning_rate": 6.852250017675499e-06, "loss": 0.5079, "step": 11710 }, { "epoch": 0.4417140918855765, "grad_norm": 1.7652951082406643, "learning_rate": 6.8461382863954786e-06, "loss": 0.5281, "step": 11720 }, { "epoch": 0.4420909810424754, "grad_norm": 1.6445005156801455, "learning_rate": 6.840023359668297e-06, "loss": 0.5007, "step": 11730 }, { "epoch": 0.4424678701993744, "grad_norm": 3.0505367160131107, "learning_rate": 6.833905248078168e-06, "loss": 0.5027, "step": 11740 }, { "epoch": 0.4428447593562733, "grad_norm": 1.7728481701439127, "learning_rate": 6.82778396221482e-06, "loss": 0.5067, "step": 11750 }, { "epoch": 0.44322164851317225, "grad_norm": 1.4266982652231972, "learning_rate": 6.8216595126734775e-06, "loss": 0.4973, "step": 11760 }, { "epoch": 0.44359853767007124, "grad_norm": 1.4465402471000843, "learning_rate": 6.815531910054834e-06, "loss": 0.5315, "step": 11770 }, { "epoch": 0.44397542682697017, "grad_norm": 1.5371566334005535, "learning_rate": 6.809401164965051e-06, "loss": 0.4947, "step": 11780 }, { "epoch": 0.44435231598386915, "grad_norm": 1.664248932365906, "learning_rate": 6.803267288015718e-06, "loss": 0.5267, "step": 11790 }, { "epoch": 0.4447292051407681, "grad_norm": 1.6494483320654694, "learning_rate": 6.7971302898238545e-06, "loss": 0.4986, "step": 11800 }, { "epoch": 0.44510609429766707, "grad_norm": 1.3037692393364804, "learning_rate": 6.7909901810118785e-06, "loss": 0.4724, "step": 11810 }, { "epoch": 0.445482983454566, "grad_norm": 2.6200187135591713, "learning_rate": 6.784846972207593e-06, "loss": 0.5032, "step": 11820 }, { "epoch": 0.445859872611465, "grad_norm": 1.6525404819183418, "learning_rate": 6.778700674044164e-06, "loss": 0.5413, "step": 11830 }, { "epoch": 0.4462367617683639, "grad_norm": 1.480259604215167, "learning_rate": 6.77255129716011e-06, "loss": 0.5062, "step": 11840 }, { "epoch": 0.4466136509252629, "grad_norm": 1.7299508446770775, "learning_rate": 6.7663988521992744e-06, "loss": 0.5062, "step": 11850 }, { "epoch": 0.4469905400821618, "grad_norm": 1.389929475780395, "learning_rate": 6.760243349810811e-06, "loss": 0.5041, "step": 11860 }, { "epoch": 0.4473674292390608, "grad_norm": 1.795064940876833, "learning_rate": 6.754084800649169e-06, "loss": 0.4923, "step": 11870 }, { "epoch": 0.44774431839595974, "grad_norm": 1.5502942925119751, "learning_rate": 6.747923215374068e-06, "loss": 0.4851, "step": 11880 }, { "epoch": 0.4481212075528587, "grad_norm": 1.635874220277167, "learning_rate": 6.741758604650485e-06, "loss": 0.5009, "step": 11890 }, { "epoch": 0.44849809670975765, "grad_norm": 1.706031522666674, "learning_rate": 6.735590979148629e-06, "loss": 0.487, "step": 11900 }, { "epoch": 0.44887498586665664, "grad_norm": 1.690364938457251, "learning_rate": 6.729420349543934e-06, "loss": 0.4947, "step": 11910 }, { "epoch": 0.44925187502355557, "grad_norm": 1.7433823515896854, "learning_rate": 6.7232467265170295e-06, "loss": 0.5077, "step": 11920 }, { "epoch": 0.44962876418045455, "grad_norm": 1.6126498482759561, "learning_rate": 6.7170701207537285e-06, "loss": 0.5043, "step": 11930 }, { "epoch": 0.4500056533373535, "grad_norm": 1.5461921740338886, "learning_rate": 6.7108905429450035e-06, "loss": 0.5211, "step": 11940 }, { "epoch": 0.4503825424942524, "grad_norm": 1.654741487368281, "learning_rate": 6.704708003786974e-06, "loss": 0.5162, "step": 11950 }, { "epoch": 0.4507594316511514, "grad_norm": 1.5875885105401868, "learning_rate": 6.698522513980884e-06, "loss": 0.5076, "step": 11960 }, { "epoch": 0.45113632080805033, "grad_norm": 1.501080589567821, "learning_rate": 6.692334084233087e-06, "loss": 0.5019, "step": 11970 }, { "epoch": 0.4515132099649493, "grad_norm": 1.6323939585322438, "learning_rate": 6.686142725255021e-06, "loss": 0.5215, "step": 11980 }, { "epoch": 0.45189009912184824, "grad_norm": 1.3624830176460956, "learning_rate": 6.679948447763201e-06, "loss": 0.5026, "step": 11990 }, { "epoch": 0.45226698827874723, "grad_norm": 1.6944099137300341, "learning_rate": 6.673751262479183e-06, "loss": 0.5073, "step": 12000 }, { "epoch": 0.45264387743564616, "grad_norm": 1.5860681363669702, "learning_rate": 6.667551180129565e-06, "loss": 0.5085, "step": 12010 }, { "epoch": 0.45302076659254514, "grad_norm": 1.6331152902584547, "learning_rate": 6.661348211445959e-06, "loss": 0.5195, "step": 12020 }, { "epoch": 0.4533976557494441, "grad_norm": 4.415183248235426, "learning_rate": 6.655142367164967e-06, "loss": 0.5005, "step": 12030 }, { "epoch": 0.45377454490634306, "grad_norm": 1.5798349602106418, "learning_rate": 6.648933658028174e-06, "loss": 0.5045, "step": 12040 }, { "epoch": 0.454151434063242, "grad_norm": 1.435313930078849, "learning_rate": 6.642722094782121e-06, "loss": 0.4641, "step": 12050 }, { "epoch": 0.454528323220141, "grad_norm": 1.7147543461190684, "learning_rate": 6.636507688178291e-06, "loss": 0.5031, "step": 12060 }, { "epoch": 0.4549052123770399, "grad_norm": 1.5410650369487786, "learning_rate": 6.630290448973087e-06, "loss": 0.5173, "step": 12070 }, { "epoch": 0.4552821015339389, "grad_norm": 1.61040530239707, "learning_rate": 6.624070387927811e-06, "loss": 0.5205, "step": 12080 }, { "epoch": 0.4556589906908378, "grad_norm": 1.4352463589168436, "learning_rate": 6.61784751580866e-06, "loss": 0.4747, "step": 12090 }, { "epoch": 0.4560358798477368, "grad_norm": 1.8210569410817206, "learning_rate": 6.611621843386684e-06, "loss": 0.4858, "step": 12100 }, { "epoch": 0.45641276900463573, "grad_norm": 1.9985993173284664, "learning_rate": 6.605393381437792e-06, "loss": 0.496, "step": 12110 }, { "epoch": 0.4567896581615347, "grad_norm": 1.2939840914047982, "learning_rate": 6.599162140742712e-06, "loss": 0.4852, "step": 12120 }, { "epoch": 0.45716654731843365, "grad_norm": 1.6530787032881842, "learning_rate": 6.592928132086984e-06, "loss": 0.5003, "step": 12130 }, { "epoch": 0.45754343647533263, "grad_norm": 1.2909832653537678, "learning_rate": 6.586691366260943e-06, "loss": 0.5254, "step": 12140 }, { "epoch": 0.45792032563223156, "grad_norm": 1.775707858274227, "learning_rate": 6.580451854059693e-06, "loss": 0.5033, "step": 12150 }, { "epoch": 0.4582972147891305, "grad_norm": 1.5901192603045122, "learning_rate": 6.574209606283089e-06, "loss": 0.4797, "step": 12160 }, { "epoch": 0.4586741039460295, "grad_norm": 1.5718931288158366, "learning_rate": 6.56796463373573e-06, "loss": 0.5122, "step": 12170 }, { "epoch": 0.4590509931029284, "grad_norm": 1.5316527090504148, "learning_rate": 6.561716947226918e-06, "loss": 0.5096, "step": 12180 }, { "epoch": 0.4594278822598274, "grad_norm": 1.6069192593187367, "learning_rate": 6.555466557570666e-06, "loss": 0.4749, "step": 12190 }, { "epoch": 0.4598047714167263, "grad_norm": 1.6443701058606435, "learning_rate": 6.549213475585657e-06, "loss": 0.5315, "step": 12200 }, { "epoch": 0.4601816605736253, "grad_norm": 1.6467073011805697, "learning_rate": 6.542957712095236e-06, "loss": 0.4864, "step": 12210 }, { "epoch": 0.46055854973052424, "grad_norm": 1.6443944395717125, "learning_rate": 6.536699277927393e-06, "loss": 0.5069, "step": 12220 }, { "epoch": 0.4609354388874232, "grad_norm": 1.6659779073083896, "learning_rate": 6.530438183914735e-06, "loss": 0.4887, "step": 12230 }, { "epoch": 0.46131232804432215, "grad_norm": 1.802633741824681, "learning_rate": 6.5241744408944776e-06, "loss": 0.4967, "step": 12240 }, { "epoch": 0.46168921720122114, "grad_norm": 1.6619516541296946, "learning_rate": 6.517908059708417e-06, "loss": 0.5076, "step": 12250 }, { "epoch": 0.46206610635812007, "grad_norm": 1.9424205639529566, "learning_rate": 6.511639051202922e-06, "loss": 0.5228, "step": 12260 }, { "epoch": 0.46244299551501905, "grad_norm": 1.864221195142991, "learning_rate": 6.505367426228902e-06, "loss": 0.5092, "step": 12270 }, { "epoch": 0.462819884671918, "grad_norm": 1.5859581361435124, "learning_rate": 6.499093195641801e-06, "loss": 0.4919, "step": 12280 }, { "epoch": 0.46319677382881697, "grad_norm": 1.5769900351453978, "learning_rate": 6.49281637030157e-06, "loss": 0.5003, "step": 12290 }, { "epoch": 0.4635736629857159, "grad_norm": 1.962586555293073, "learning_rate": 6.486536961072651e-06, "loss": 0.5144, "step": 12300 }, { "epoch": 0.4639505521426149, "grad_norm": 1.5960370769135634, "learning_rate": 6.4802549788239585e-06, "loss": 0.4968, "step": 12310 }, { "epoch": 0.4643274412995138, "grad_norm": 1.4471637413402056, "learning_rate": 6.473970434428865e-06, "loss": 0.5133, "step": 12320 }, { "epoch": 0.4647043304564128, "grad_norm": 1.5033062625403697, "learning_rate": 6.467683338765169e-06, "loss": 0.4983, "step": 12330 }, { "epoch": 0.4650812196133117, "grad_norm": 1.8045255687380959, "learning_rate": 6.461393702715093e-06, "loss": 0.5212, "step": 12340 }, { "epoch": 0.46545810877021065, "grad_norm": 1.5049755950938395, "learning_rate": 6.455101537165251e-06, "loss": 0.4964, "step": 12350 }, { "epoch": 0.46583499792710964, "grad_norm": 1.307607429446019, "learning_rate": 6.448806853006642e-06, "loss": 0.4847, "step": 12360 }, { "epoch": 0.46621188708400857, "grad_norm": 1.5148579248421497, "learning_rate": 6.442509661134617e-06, "loss": 0.4749, "step": 12370 }, { "epoch": 0.46658877624090755, "grad_norm": 1.6520404081654247, "learning_rate": 6.436209972448872e-06, "loss": 0.5118, "step": 12380 }, { "epoch": 0.4669656653978065, "grad_norm": 1.4969166628742265, "learning_rate": 6.4299077978534215e-06, "loss": 0.4866, "step": 12390 }, { "epoch": 0.46734255455470547, "grad_norm": 1.688329647541456, "learning_rate": 6.423603148256589e-06, "loss": 0.5215, "step": 12400 }, { "epoch": 0.4677194437116044, "grad_norm": 1.4741523126628993, "learning_rate": 6.417296034570972e-06, "loss": 0.4984, "step": 12410 }, { "epoch": 0.4680963328685034, "grad_norm": 1.4962239650802354, "learning_rate": 6.410986467713446e-06, "loss": 0.5275, "step": 12420 }, { "epoch": 0.4684732220254023, "grad_norm": 1.7317402848543209, "learning_rate": 6.404674458605119e-06, "loss": 0.5235, "step": 12430 }, { "epoch": 0.4688501111823013, "grad_norm": 1.507370223846744, "learning_rate": 6.398360018171335e-06, "loss": 0.5065, "step": 12440 }, { "epoch": 0.46922700033920023, "grad_norm": 1.6389685200877848, "learning_rate": 6.392043157341645e-06, "loss": 0.5139, "step": 12450 }, { "epoch": 0.4696038894960992, "grad_norm": 4.587056243685229, "learning_rate": 6.385723887049788e-06, "loss": 0.5037, "step": 12460 }, { "epoch": 0.46998077865299814, "grad_norm": 1.4469191726379287, "learning_rate": 6.379402218233673e-06, "loss": 0.5139, "step": 12470 }, { "epoch": 0.47035766780989713, "grad_norm": 1.4631499108231787, "learning_rate": 6.373078161835364e-06, "loss": 0.5132, "step": 12480 }, { "epoch": 0.47073455696679606, "grad_norm": 1.494757392645272, "learning_rate": 6.366751728801051e-06, "loss": 0.4952, "step": 12490 }, { "epoch": 0.47111144612369504, "grad_norm": 1.6614354722718154, "learning_rate": 6.360422930081045e-06, "loss": 0.5236, "step": 12500 }, { "epoch": 0.471488335280594, "grad_norm": 1.589063266340144, "learning_rate": 6.3540917766297475e-06, "loss": 0.4754, "step": 12510 }, { "epoch": 0.47186522443749296, "grad_norm": 1.645641474309447, "learning_rate": 6.347758279405636e-06, "loss": 0.5105, "step": 12520 }, { "epoch": 0.4722421135943919, "grad_norm": 1.842229947630362, "learning_rate": 6.341422449371247e-06, "loss": 0.5058, "step": 12530 }, { "epoch": 0.4726190027512908, "grad_norm": 1.6305355750205157, "learning_rate": 6.3350842974931526e-06, "loss": 0.5387, "step": 12540 }, { "epoch": 0.4729958919081898, "grad_norm": 1.618466891577579, "learning_rate": 6.328743834741945e-06, "loss": 0.4999, "step": 12550 }, { "epoch": 0.47337278106508873, "grad_norm": 1.326825185706353, "learning_rate": 6.322401072092216e-06, "loss": 0.5027, "step": 12560 }, { "epoch": 0.4737496702219877, "grad_norm": 1.5969546323969477, "learning_rate": 6.316056020522538e-06, "loss": 0.5222, "step": 12570 }, { "epoch": 0.47412655937888665, "grad_norm": 1.5972771926737852, "learning_rate": 6.309708691015443e-06, "loss": 0.5015, "step": 12580 }, { "epoch": 0.47450344853578563, "grad_norm": 1.5891933279912756, "learning_rate": 6.303359094557411e-06, "loss": 0.4977, "step": 12590 }, { "epoch": 0.47488033769268456, "grad_norm": 1.7999033521409773, "learning_rate": 6.297007242138842e-06, "loss": 0.5161, "step": 12600 }, { "epoch": 0.47525722684958355, "grad_norm": 1.5315463861951017, "learning_rate": 6.290653144754043e-06, "loss": 0.5215, "step": 12610 }, { "epoch": 0.4756341160064825, "grad_norm": 1.6153094914746262, "learning_rate": 6.2842968134012026e-06, "loss": 0.4953, "step": 12620 }, { "epoch": 0.47601100516338146, "grad_norm": 1.665498157343032, "learning_rate": 6.277938259082382e-06, "loss": 0.4995, "step": 12630 }, { "epoch": 0.4763878943202804, "grad_norm": 1.3905032960020645, "learning_rate": 6.271577492803486e-06, "loss": 0.4796, "step": 12640 }, { "epoch": 0.4767647834771794, "grad_norm": 1.6507279959491987, "learning_rate": 6.265214525574248e-06, "loss": 0.5157, "step": 12650 }, { "epoch": 0.4771416726340783, "grad_norm": 1.2843410140916522, "learning_rate": 6.258849368408213e-06, "loss": 0.4858, "step": 12660 }, { "epoch": 0.4775185617909773, "grad_norm": 1.8350039947211565, "learning_rate": 6.252482032322716e-06, "loss": 0.5029, "step": 12670 }, { "epoch": 0.4778954509478762, "grad_norm": 1.5252536968641863, "learning_rate": 6.246112528338864e-06, "loss": 0.4954, "step": 12680 }, { "epoch": 0.4782723401047752, "grad_norm": 15.626822494749767, "learning_rate": 6.239740867481514e-06, "loss": 0.4798, "step": 12690 }, { "epoch": 0.47864922926167414, "grad_norm": 1.4368053909649985, "learning_rate": 6.233367060779258e-06, "loss": 0.4988, "step": 12700 }, { "epoch": 0.4790261184185731, "grad_norm": 1.6672155662880033, "learning_rate": 6.226991119264405e-06, "loss": 0.501, "step": 12710 }, { "epoch": 0.47940300757547205, "grad_norm": 1.7236312698986331, "learning_rate": 6.22061305397295e-06, "loss": 0.49, "step": 12720 }, { "epoch": 0.47977989673237104, "grad_norm": 1.5777802364774114, "learning_rate": 6.214232875944577e-06, "loss": 0.4982, "step": 12730 }, { "epoch": 0.48015678588926997, "grad_norm": 1.6768420216779125, "learning_rate": 6.207850596222616e-06, "loss": 0.4923, "step": 12740 }, { "epoch": 0.4805336750461689, "grad_norm": 1.7758220533639664, "learning_rate": 6.201466225854038e-06, "loss": 0.5189, "step": 12750 }, { "epoch": 0.4809105642030679, "grad_norm": 1.4580963940658098, "learning_rate": 6.195079775889436e-06, "loss": 0.4777, "step": 12760 }, { "epoch": 0.4812874533599668, "grad_norm": 1.5845783588645914, "learning_rate": 6.188691257382998e-06, "loss": 0.5021, "step": 12770 }, { "epoch": 0.4816643425168658, "grad_norm": 1.6276734887673494, "learning_rate": 6.182300681392497e-06, "loss": 0.5087, "step": 12780 }, { "epoch": 0.4820412316737647, "grad_norm": 1.6758298613310316, "learning_rate": 6.175908058979264e-06, "loss": 0.4919, "step": 12790 }, { "epoch": 0.4824181208306637, "grad_norm": 1.733797532127255, "learning_rate": 6.169513401208169e-06, "loss": 0.4979, "step": 12800 }, { "epoch": 0.48279500998756264, "grad_norm": 1.497057452497915, "learning_rate": 6.163116719147615e-06, "loss": 0.5047, "step": 12810 }, { "epoch": 0.4831718991444616, "grad_norm": 1.8210153084730252, "learning_rate": 6.156718023869497e-06, "loss": 0.4924, "step": 12820 }, { "epoch": 0.48354878830136055, "grad_norm": 1.5847792515896284, "learning_rate": 6.150317326449204e-06, "loss": 0.4779, "step": 12830 }, { "epoch": 0.48392567745825954, "grad_norm": 1.7086888789433332, "learning_rate": 6.143914637965585e-06, "loss": 0.5339, "step": 12840 }, { "epoch": 0.48430256661515847, "grad_norm": 1.825053431015552, "learning_rate": 6.137509969500936e-06, "loss": 0.4735, "step": 12850 }, { "epoch": 0.48467945577205745, "grad_norm": 1.4362907044279158, "learning_rate": 6.131103332140983e-06, "loss": 0.487, "step": 12860 }, { "epoch": 0.4850563449289564, "grad_norm": 1.6070484026990965, "learning_rate": 6.124694736974857e-06, "loss": 0.5195, "step": 12870 }, { "epoch": 0.48543323408585537, "grad_norm": 1.4921206455295302, "learning_rate": 6.11828419509508e-06, "loss": 0.4883, "step": 12880 }, { "epoch": 0.4858101232427543, "grad_norm": 1.596099544918074, "learning_rate": 6.111871717597542e-06, "loss": 0.5001, "step": 12890 }, { "epoch": 0.4861870123996533, "grad_norm": 1.6612200565392714, "learning_rate": 6.10545731558148e-06, "loss": 0.4955, "step": 12900 }, { "epoch": 0.4865639015565522, "grad_norm": 1.5500638018436168, "learning_rate": 6.09904100014947e-06, "loss": 0.5034, "step": 12910 }, { "epoch": 0.4869407907134512, "grad_norm": 1.4977064904230966, "learning_rate": 6.092622782407395e-06, "loss": 0.4963, "step": 12920 }, { "epoch": 0.48731767987035013, "grad_norm": 1.556241953096911, "learning_rate": 6.086202673464428e-06, "loss": 0.4838, "step": 12930 }, { "epoch": 0.48769456902724906, "grad_norm": 1.3784052149036907, "learning_rate": 6.079780684433024e-06, "loss": 0.5104, "step": 12940 }, { "epoch": 0.48807145818414804, "grad_norm": 1.862515188918877, "learning_rate": 6.0733568264288825e-06, "loss": 0.4936, "step": 12950 }, { "epoch": 0.488448347341047, "grad_norm": 1.4553084049722098, "learning_rate": 6.066931110570946e-06, "loss": 0.5048, "step": 12960 }, { "epoch": 0.48882523649794596, "grad_norm": 1.8977016581962198, "learning_rate": 6.0605035479813665e-06, "loss": 0.51, "step": 12970 }, { "epoch": 0.4892021256548449, "grad_norm": 1.4849897548302309, "learning_rate": 6.054074149785495e-06, "loss": 0.5085, "step": 12980 }, { "epoch": 0.4895790148117439, "grad_norm": 1.5091872951995677, "learning_rate": 6.047642927111861e-06, "loss": 0.5341, "step": 12990 }, { "epoch": 0.4899559039686428, "grad_norm": 1.6207478659154608, "learning_rate": 6.04120989109215e-06, "loss": 0.4925, "step": 13000 }, { "epoch": 0.4903327931255418, "grad_norm": 1.7085152211604375, "learning_rate": 6.0347750528611885e-06, "loss": 0.5045, "step": 13010 }, { "epoch": 0.4907096822824407, "grad_norm": 1.566132388020068, "learning_rate": 6.028338423556921e-06, "loss": 0.4953, "step": 13020 }, { "epoch": 0.4910865714393397, "grad_norm": 5.227941826091356, "learning_rate": 6.021900014320388e-06, "loss": 0.4873, "step": 13030 }, { "epoch": 0.49146346059623863, "grad_norm": 1.399863983250925, "learning_rate": 6.015459836295719e-06, "loss": 0.5148, "step": 13040 }, { "epoch": 0.4918403497531376, "grad_norm": 1.7472942859455838, "learning_rate": 6.0090179006301e-06, "loss": 0.4913, "step": 13050 }, { "epoch": 0.49221723891003655, "grad_norm": 1.6716498146417993, "learning_rate": 6.002574218473759e-06, "loss": 0.5056, "step": 13060 }, { "epoch": 0.49259412806693553, "grad_norm": 1.4884093526859357, "learning_rate": 5.996128800979949e-06, "loss": 0.491, "step": 13070 }, { "epoch": 0.49297101722383446, "grad_norm": 1.2765637587957481, "learning_rate": 5.989681659304927e-06, "loss": 0.5089, "step": 13080 }, { "epoch": 0.49334790638073345, "grad_norm": 1.515026969598235, "learning_rate": 5.9832328046079305e-06, "loss": 0.5015, "step": 13090 }, { "epoch": 0.4937247955376324, "grad_norm": 1.7949411377094697, "learning_rate": 5.9767822480511685e-06, "loss": 0.5161, "step": 13100 }, { "epoch": 0.49410168469453136, "grad_norm": 1.4675750652458723, "learning_rate": 5.970330000799787e-06, "loss": 0.5104, "step": 13110 }, { "epoch": 0.4944785738514303, "grad_norm": 2.0046846020153444, "learning_rate": 5.963876074021868e-06, "loss": 0.5292, "step": 13120 }, { "epoch": 0.4948554630083293, "grad_norm": 1.5369389040150114, "learning_rate": 5.957420478888393e-06, "loss": 0.4792, "step": 13130 }, { "epoch": 0.4952323521652282, "grad_norm": 1.4186555111290844, "learning_rate": 5.950963226573237e-06, "loss": 0.4947, "step": 13140 }, { "epoch": 0.49560924132212714, "grad_norm": 1.8188924112261224, "learning_rate": 5.944504328253137e-06, "loss": 0.5111, "step": 13150 }, { "epoch": 0.4959861304790261, "grad_norm": 1.5665015635774655, "learning_rate": 5.9380437951076845e-06, "loss": 0.487, "step": 13160 }, { "epoch": 0.49636301963592505, "grad_norm": 1.408767806697384, "learning_rate": 5.931581638319298e-06, "loss": 0.5116, "step": 13170 }, { "epoch": 0.49673990879282404, "grad_norm": 1.58728988540725, "learning_rate": 5.925117869073208e-06, "loss": 0.5059, "step": 13180 }, { "epoch": 0.49711679794972297, "grad_norm": 1.6119437087711728, "learning_rate": 5.918652498557434e-06, "loss": 0.5047, "step": 13190 }, { "epoch": 0.49749368710662195, "grad_norm": 1.6279721324019796, "learning_rate": 5.91218553796277e-06, "loss": 0.4935, "step": 13200 }, { "epoch": 0.4978705762635209, "grad_norm": 1.459557111527884, "learning_rate": 5.905716998482758e-06, "loss": 0.4596, "step": 13210 }, { "epoch": 0.49824746542041987, "grad_norm": 1.939844630405015, "learning_rate": 5.899246891313678e-06, "loss": 0.5423, "step": 13220 }, { "epoch": 0.4986243545773188, "grad_norm": 2.0937651337717273, "learning_rate": 5.892775227654518e-06, "loss": 0.5223, "step": 13230 }, { "epoch": 0.4990012437342178, "grad_norm": 1.6236113152239642, "learning_rate": 5.886302018706964e-06, "loss": 0.5015, "step": 13240 }, { "epoch": 0.4993781328911167, "grad_norm": 1.701791434893844, "learning_rate": 5.879827275675375e-06, "loss": 0.5264, "step": 13250 }, { "epoch": 0.4997550220480157, "grad_norm": 1.452378340458775, "learning_rate": 5.8733510097667664e-06, "loss": 0.4962, "step": 13260 }, { "epoch": 0.5001319112049146, "grad_norm": 1.7488927537673415, "learning_rate": 5.866873232190791e-06, "loss": 0.5223, "step": 13270 }, { "epoch": 0.5005088003618136, "grad_norm": 1.4944535300590263, "learning_rate": 5.860393954159712e-06, "loss": 0.4943, "step": 13280 }, { "epoch": 0.5008856895187126, "grad_norm": 1.3543774753320355, "learning_rate": 5.853913186888397e-06, "loss": 0.5162, "step": 13290 }, { "epoch": 0.5012625786756115, "grad_norm": 1.505410524383986, "learning_rate": 5.847430941594287e-06, "loss": 0.4926, "step": 13300 }, { "epoch": 0.5016394678325105, "grad_norm": 1.8407394343276793, "learning_rate": 5.840947229497382e-06, "loss": 0.4958, "step": 13310 }, { "epoch": 0.5020163569894094, "grad_norm": 1.6072319888354176, "learning_rate": 5.834462061820223e-06, "loss": 0.5221, "step": 13320 }, { "epoch": 0.5023932461463084, "grad_norm": 1.4621312186704514, "learning_rate": 5.827975449787868e-06, "loss": 0.5361, "step": 13330 }, { "epoch": 0.5027701353032074, "grad_norm": 1.478486209469283, "learning_rate": 5.821487404627872e-06, "loss": 0.4898, "step": 13340 }, { "epoch": 0.5031470244601063, "grad_norm": 1.5900748132206215, "learning_rate": 5.814997937570282e-06, "loss": 0.4911, "step": 13350 }, { "epoch": 0.5035239136170052, "grad_norm": 1.6106460671873601, "learning_rate": 5.808507059847591e-06, "loss": 0.5078, "step": 13360 }, { "epoch": 0.5039008027739041, "grad_norm": 1.8201545845564078, "learning_rate": 5.802014782694745e-06, "loss": 0.5254, "step": 13370 }, { "epoch": 0.5042776919308032, "grad_norm": 1.5604700154370892, "learning_rate": 5.795521117349106e-06, "loss": 0.5183, "step": 13380 }, { "epoch": 0.5046545810877021, "grad_norm": 1.476477451573828, "learning_rate": 5.789026075050445e-06, "loss": 0.5158, "step": 13390 }, { "epoch": 0.505031470244601, "grad_norm": 1.8253062366468507, "learning_rate": 5.782529667040908e-06, "loss": 0.5044, "step": 13400 }, { "epoch": 0.5054083594015, "grad_norm": 1.4718401708375746, "learning_rate": 5.7760319045650124e-06, "loss": 0.4713, "step": 13410 }, { "epoch": 0.505785248558399, "grad_norm": 2.0225600662739187, "learning_rate": 5.769532798869617e-06, "loss": 0.5435, "step": 13420 }, { "epoch": 0.5061621377152979, "grad_norm": 1.7805238633538563, "learning_rate": 5.763032361203904e-06, "loss": 0.4927, "step": 13430 }, { "epoch": 0.5065390268721969, "grad_norm": 1.6277608026896737, "learning_rate": 5.756530602819363e-06, "loss": 0.5066, "step": 13440 }, { "epoch": 0.5069159160290958, "grad_norm": 1.4945525295342976, "learning_rate": 5.750027534969771e-06, "loss": 0.4921, "step": 13450 }, { "epoch": 0.5072928051859948, "grad_norm": 1.4820174242789568, "learning_rate": 5.743523168911167e-06, "loss": 0.5075, "step": 13460 }, { "epoch": 0.5076696943428938, "grad_norm": 1.3635089466658887, "learning_rate": 5.7370175159018415e-06, "loss": 0.5046, "step": 13470 }, { "epoch": 0.5080465834997927, "grad_norm": 1.8437545665124715, "learning_rate": 5.730510587202311e-06, "loss": 0.515, "step": 13480 }, { "epoch": 0.5084234726566916, "grad_norm": 1.786451934916694, "learning_rate": 5.7240023940752984e-06, "loss": 0.5066, "step": 13490 }, { "epoch": 0.5088003618135907, "grad_norm": 1.7504343352240117, "learning_rate": 5.71749294778572e-06, "loss": 0.4949, "step": 13500 }, { "epoch": 0.5091772509704896, "grad_norm": 1.4788751754375462, "learning_rate": 5.710982259600656e-06, "loss": 0.4816, "step": 13510 }, { "epoch": 0.5095541401273885, "grad_norm": 1.5291626156746818, "learning_rate": 5.704470340789335e-06, "loss": 0.4725, "step": 13520 }, { "epoch": 0.5099310292842875, "grad_norm": 1.6830268113490698, "learning_rate": 5.697957202623126e-06, "loss": 0.4755, "step": 13530 }, { "epoch": 0.5103079184411864, "grad_norm": 1.495645523541403, "learning_rate": 5.691442856375493e-06, "loss": 0.4848, "step": 13540 }, { "epoch": 0.5106848075980854, "grad_norm": 1.7364770312162106, "learning_rate": 5.684927313322006e-06, "loss": 0.4986, "step": 13550 }, { "epoch": 0.5110616967549844, "grad_norm": 1.646648774368243, "learning_rate": 5.678410584740296e-06, "loss": 0.5062, "step": 13560 }, { "epoch": 0.5114385859118833, "grad_norm": 1.4355497252644054, "learning_rate": 5.671892681910052e-06, "loss": 0.5043, "step": 13570 }, { "epoch": 0.5118154750687822, "grad_norm": 1.5515113579431314, "learning_rate": 5.6653736161129925e-06, "loss": 0.5216, "step": 13580 }, { "epoch": 0.5121923642256813, "grad_norm": 1.668318401565443, "learning_rate": 5.658853398632849e-06, "loss": 0.5192, "step": 13590 }, { "epoch": 0.5125692533825802, "grad_norm": 1.4997427229219318, "learning_rate": 5.6523320407553495e-06, "loss": 0.4884, "step": 13600 }, { "epoch": 0.5129461425394791, "grad_norm": 1.6101341859359553, "learning_rate": 5.6458095537681924e-06, "loss": 0.5099, "step": 13610 }, { "epoch": 0.513323031696378, "grad_norm": 1.606887828085733, "learning_rate": 5.63928594896103e-06, "loss": 0.5299, "step": 13620 }, { "epoch": 0.5136999208532771, "grad_norm": 1.416359664160664, "learning_rate": 5.632761237625455e-06, "loss": 0.4947, "step": 13630 }, { "epoch": 0.514076810010176, "grad_norm": 1.4981521351869527, "learning_rate": 5.626235431054968e-06, "loss": 0.5176, "step": 13640 }, { "epoch": 0.514453699167075, "grad_norm": 1.9146596793411172, "learning_rate": 5.619708540544971e-06, "loss": 0.4981, "step": 13650 }, { "epoch": 0.5148305883239739, "grad_norm": 1.6248776563981115, "learning_rate": 5.61318057739274e-06, "loss": 0.5343, "step": 13660 }, { "epoch": 0.5152074774808729, "grad_norm": 1.8392698085104202, "learning_rate": 5.606651552897404e-06, "loss": 0.4745, "step": 13670 }, { "epoch": 0.5155843666377719, "grad_norm": 1.51974705908948, "learning_rate": 5.6001214783599375e-06, "loss": 0.484, "step": 13680 }, { "epoch": 0.5159612557946708, "grad_norm": 1.5164194761342433, "learning_rate": 5.593590365083126e-06, "loss": 0.4973, "step": 13690 }, { "epoch": 0.5163381449515697, "grad_norm": 1.571042115479235, "learning_rate": 5.587058224371553e-06, "loss": 0.4708, "step": 13700 }, { "epoch": 0.5167150341084688, "grad_norm": 2.1629006116268683, "learning_rate": 5.580525067531585e-06, "loss": 0.4937, "step": 13710 }, { "epoch": 0.5170919232653677, "grad_norm": 1.842138362917266, "learning_rate": 5.57399090587134e-06, "loss": 0.499, "step": 13720 }, { "epoch": 0.5174688124222666, "grad_norm": 1.4590850155680444, "learning_rate": 5.5674557507006846e-06, "loss": 0.4989, "step": 13730 }, { "epoch": 0.5178457015791655, "grad_norm": 1.5433721159408116, "learning_rate": 5.560919613331197e-06, "loss": 0.5099, "step": 13740 }, { "epoch": 0.5182225907360645, "grad_norm": 1.8050635225380678, "learning_rate": 5.554382505076157e-06, "loss": 0.4918, "step": 13750 }, { "epoch": 0.5185994798929635, "grad_norm": 1.3538997043392071, "learning_rate": 5.54784443725053e-06, "loss": 0.4787, "step": 13760 }, { "epoch": 0.5189763690498624, "grad_norm": 1.7172296005933367, "learning_rate": 5.541305421170936e-06, "loss": 0.4926, "step": 13770 }, { "epoch": 0.5193532582067614, "grad_norm": 1.682395972487519, "learning_rate": 5.534765468155641e-06, "loss": 0.4837, "step": 13780 }, { "epoch": 0.5197301473636603, "grad_norm": 1.5176445024464842, "learning_rate": 5.528224589524527e-06, "loss": 0.4976, "step": 13790 }, { "epoch": 0.5201070365205593, "grad_norm": 1.5257635716345566, "learning_rate": 5.521682796599086e-06, "loss": 0.496, "step": 13800 }, { "epoch": 0.5204839256774583, "grad_norm": 1.5260587749295311, "learning_rate": 5.515140100702385e-06, "loss": 0.4884, "step": 13810 }, { "epoch": 0.5208608148343572, "grad_norm": 1.6982387061332391, "learning_rate": 5.508596513159059e-06, "loss": 0.5005, "step": 13820 }, { "epoch": 0.5212377039912561, "grad_norm": 1.4479971134584864, "learning_rate": 5.502052045295286e-06, "loss": 0.4982, "step": 13830 }, { "epoch": 0.5216145931481552, "grad_norm": 1.857022364604521, "learning_rate": 5.495506708438763e-06, "loss": 0.5174, "step": 13840 }, { "epoch": 0.5219914823050541, "grad_norm": 1.6885724207882848, "learning_rate": 5.488960513918695e-06, "loss": 0.522, "step": 13850 }, { "epoch": 0.522368371461953, "grad_norm": 1.4128662344535574, "learning_rate": 5.482413473065775e-06, "loss": 0.5039, "step": 13860 }, { "epoch": 0.522745260618852, "grad_norm": 1.9106751275521132, "learning_rate": 5.475865597212152e-06, "loss": 0.462, "step": 13870 }, { "epoch": 0.523122149775751, "grad_norm": 1.5304732560419125, "learning_rate": 5.469316897691428e-06, "loss": 0.4906, "step": 13880 }, { "epoch": 0.5234990389326499, "grad_norm": 1.6509200854352029, "learning_rate": 5.4627673858386255e-06, "loss": 0.4937, "step": 13890 }, { "epoch": 0.5238759280895489, "grad_norm": 1.692631204971466, "learning_rate": 5.456217072990178e-06, "loss": 0.4782, "step": 13900 }, { "epoch": 0.5242528172464478, "grad_norm": 1.5218720962530914, "learning_rate": 5.4496659704839e-06, "loss": 0.4917, "step": 13910 }, { "epoch": 0.5246297064033468, "grad_norm": 1.50914557920104, "learning_rate": 5.44311408965898e-06, "loss": 0.5226, "step": 13920 }, { "epoch": 0.5250065955602458, "grad_norm": 1.6708689941350754, "learning_rate": 5.436561441855942e-06, "loss": 0.4741, "step": 13930 }, { "epoch": 0.5253834847171447, "grad_norm": 2.0239362250295336, "learning_rate": 5.430008038416653e-06, "loss": 0.4797, "step": 13940 }, { "epoch": 0.5257603738740436, "grad_norm": 1.6524565432727731, "learning_rate": 5.423453890684274e-06, "loss": 0.5416, "step": 13950 }, { "epoch": 0.5261372630309425, "grad_norm": 1.4470951746948137, "learning_rate": 5.416899010003264e-06, "loss": 0.4571, "step": 13960 }, { "epoch": 0.5265141521878416, "grad_norm": 1.713191989373032, "learning_rate": 5.410343407719343e-06, "loss": 0.5086, "step": 13970 }, { "epoch": 0.5268910413447405, "grad_norm": 1.511732776736868, "learning_rate": 5.4037870951794856e-06, "loss": 0.4623, "step": 13980 }, { "epoch": 0.5272679305016394, "grad_norm": 1.6985510028227295, "learning_rate": 5.397230083731894e-06, "loss": 0.4956, "step": 13990 }, { "epoch": 0.5276448196585384, "grad_norm": 1.8917478296516785, "learning_rate": 5.390672384725979e-06, "loss": 0.5007, "step": 14000 }, { "epoch": 0.5280217088154374, "grad_norm": 1.45118679961553, "learning_rate": 5.384114009512343e-06, "loss": 0.4753, "step": 14010 }, { "epoch": 0.5283985979723363, "grad_norm": 1.5887735261780631, "learning_rate": 5.37755496944276e-06, "loss": 0.5109, "step": 14020 }, { "epoch": 0.5287754871292353, "grad_norm": 1.6042273339896562, "learning_rate": 5.37099527587015e-06, "loss": 0.5093, "step": 14030 }, { "epoch": 0.5291523762861342, "grad_norm": 1.5185256947676165, "learning_rate": 5.3644349401485695e-06, "loss": 0.512, "step": 14040 }, { "epoch": 0.5295292654430332, "grad_norm": 1.5903571571587558, "learning_rate": 5.3578739736331846e-06, "loss": 0.5145, "step": 14050 }, { "epoch": 0.5299061545999322, "grad_norm": 1.6617724641502587, "learning_rate": 5.351312387680249e-06, "loss": 0.4734, "step": 14060 }, { "epoch": 0.5302830437568311, "grad_norm": 1.5942708990624488, "learning_rate": 5.344750193647097e-06, "loss": 0.4822, "step": 14070 }, { "epoch": 0.53065993291373, "grad_norm": 1.6417425014333669, "learning_rate": 5.338187402892108e-06, "loss": 0.5148, "step": 14080 }, { "epoch": 0.5310368220706291, "grad_norm": 1.6747154379450402, "learning_rate": 5.331624026774698e-06, "loss": 0.4797, "step": 14090 }, { "epoch": 0.531413711227528, "grad_norm": 1.7559506025087541, "learning_rate": 5.325060076655295e-06, "loss": 0.4998, "step": 14100 }, { "epoch": 0.5317906003844269, "grad_norm": 1.5737701449749422, "learning_rate": 5.3184955638953215e-06, "loss": 0.4771, "step": 14110 }, { "epoch": 0.5321674895413259, "grad_norm": 1.4261947265219483, "learning_rate": 5.311930499857173e-06, "loss": 0.483, "step": 14120 }, { "epoch": 0.5325443786982249, "grad_norm": 1.5998345020742453, "learning_rate": 5.3053648959041995e-06, "loss": 0.4958, "step": 14130 }, { "epoch": 0.5329212678551238, "grad_norm": 1.4631565618451698, "learning_rate": 5.2987987634006845e-06, "loss": 0.4755, "step": 14140 }, { "epoch": 0.5332981570120228, "grad_norm": 1.5593631613031096, "learning_rate": 5.2922321137118285e-06, "loss": 0.4837, "step": 14150 }, { "epoch": 0.5336750461689217, "grad_norm": 1.373456275581913, "learning_rate": 5.285664958203723e-06, "loss": 0.488, "step": 14160 }, { "epoch": 0.5340519353258206, "grad_norm": 1.4401671838017023, "learning_rate": 5.2790973082433415e-06, "loss": 0.4953, "step": 14170 }, { "epoch": 0.5344288244827197, "grad_norm": 1.7640373010765296, "learning_rate": 5.2725291751985085e-06, "loss": 0.464, "step": 14180 }, { "epoch": 0.5348057136396186, "grad_norm": 3.309243521738535, "learning_rate": 5.2659605704378855e-06, "loss": 0.4997, "step": 14190 }, { "epoch": 0.5351826027965175, "grad_norm": 1.7411014099972144, "learning_rate": 5.259391505330952e-06, "loss": 0.487, "step": 14200 }, { "epoch": 0.5355594919534165, "grad_norm": 1.736020357499271, "learning_rate": 5.252821991247983e-06, "loss": 0.5079, "step": 14210 }, { "epoch": 0.5359363811103155, "grad_norm": 1.608976151378783, "learning_rate": 5.246252039560029e-06, "loss": 0.4948, "step": 14220 }, { "epoch": 0.5363132702672144, "grad_norm": 1.4342748366323024, "learning_rate": 5.239681661638902e-06, "loss": 0.4819, "step": 14230 }, { "epoch": 0.5366901594241134, "grad_norm": 1.5495611294979035, "learning_rate": 5.233110868857148e-06, "loss": 0.4804, "step": 14240 }, { "epoch": 0.5370670485810123, "grad_norm": 1.8546486328260592, "learning_rate": 5.2265396725880354e-06, "loss": 0.5026, "step": 14250 }, { "epoch": 0.5374439377379113, "grad_norm": 1.4351730050068885, "learning_rate": 5.219968084205525e-06, "loss": 0.4984, "step": 14260 }, { "epoch": 0.5378208268948103, "grad_norm": 1.5059171293734337, "learning_rate": 5.213396115084261e-06, "loss": 0.486, "step": 14270 }, { "epoch": 0.5381977160517092, "grad_norm": 1.5964828714981016, "learning_rate": 5.206823776599544e-06, "loss": 0.4951, "step": 14280 }, { "epoch": 0.5385746052086081, "grad_norm": 1.4533398425449158, "learning_rate": 5.200251080127318e-06, "loss": 0.5093, "step": 14290 }, { "epoch": 0.5389514943655072, "grad_norm": 1.6285262584640723, "learning_rate": 5.1936780370441395e-06, "loss": 0.4896, "step": 14300 }, { "epoch": 0.5393283835224061, "grad_norm": 1.6246946608117687, "learning_rate": 5.187104658727173e-06, "loss": 0.4775, "step": 14310 }, { "epoch": 0.539705272679305, "grad_norm": 1.4908047445233454, "learning_rate": 5.180530956554158e-06, "loss": 0.4975, "step": 14320 }, { "epoch": 0.5400821618362039, "grad_norm": 1.5621540920543053, "learning_rate": 5.173956941903395e-06, "loss": 0.4687, "step": 14330 }, { "epoch": 0.5404590509931029, "grad_norm": 1.7060966317168311, "learning_rate": 5.167382626153727e-06, "loss": 0.5019, "step": 14340 }, { "epoch": 0.5408359401500019, "grad_norm": 1.7020605336609178, "learning_rate": 5.160808020684519e-06, "loss": 0.4993, "step": 14350 }, { "epoch": 0.5412128293069008, "grad_norm": 1.7730954600238917, "learning_rate": 5.154233136875633e-06, "loss": 0.485, "step": 14360 }, { "epoch": 0.5415897184637998, "grad_norm": 1.9393987985330001, "learning_rate": 5.147657986107417e-06, "loss": 0.5375, "step": 14370 }, { "epoch": 0.5419666076206987, "grad_norm": 1.6471427503287805, "learning_rate": 5.1410825797606816e-06, "loss": 0.4997, "step": 14380 }, { "epoch": 0.5423434967775977, "grad_norm": 1.4369947353354569, "learning_rate": 5.134506929216674e-06, "loss": 0.4739, "step": 14390 }, { "epoch": 0.5427203859344967, "grad_norm": 1.7681719081478857, "learning_rate": 5.127931045857073e-06, "loss": 0.4991, "step": 14400 }, { "epoch": 0.5430972750913956, "grad_norm": 1.6691866607079882, "learning_rate": 5.1213549410639515e-06, "loss": 0.4623, "step": 14410 }, { "epoch": 0.5434741642482945, "grad_norm": 1.4149702534749613, "learning_rate": 5.114778626219772e-06, "loss": 0.4828, "step": 14420 }, { "epoch": 0.5438510534051936, "grad_norm": 1.6627823221828433, "learning_rate": 5.108202112707357e-06, "loss": 0.5027, "step": 14430 }, { "epoch": 0.5442279425620925, "grad_norm": 1.250526612258918, "learning_rate": 5.101625411909874e-06, "loss": 0.4779, "step": 14440 }, { "epoch": 0.5446048317189914, "grad_norm": 1.5283584890240594, "learning_rate": 5.0950485352108145e-06, "loss": 0.494, "step": 14450 }, { "epoch": 0.5449817208758904, "grad_norm": 1.337913464919517, "learning_rate": 5.088471493993977e-06, "loss": 0.459, "step": 14460 }, { "epoch": 0.5453586100327894, "grad_norm": 1.481063864916166, "learning_rate": 5.081894299643439e-06, "loss": 0.5148, "step": 14470 }, { "epoch": 0.5457354991896883, "grad_norm": 1.5902256444957483, "learning_rate": 5.07531696354355e-06, "loss": 0.4776, "step": 14480 }, { "epoch": 0.5461123883465873, "grad_norm": 1.443883382589487, "learning_rate": 5.068739497078898e-06, "loss": 0.5122, "step": 14490 }, { "epoch": 0.5464892775034862, "grad_norm": 1.6573811027305527, "learning_rate": 5.0621619116343e-06, "loss": 0.5163, "step": 14500 }, { "epoch": 0.5468661666603852, "grad_norm": 1.609253979586725, "learning_rate": 5.055584218594782e-06, "loss": 0.4862, "step": 14510 }, { "epoch": 0.5472430558172842, "grad_norm": 1.7791228748733698, "learning_rate": 5.049006429345552e-06, "loss": 0.5079, "step": 14520 }, { "epoch": 0.5476199449741831, "grad_norm": 1.5440262873205584, "learning_rate": 5.0424285552719845e-06, "loss": 0.4881, "step": 14530 }, { "epoch": 0.547996834131082, "grad_norm": 1.67027687894764, "learning_rate": 5.0358506077596035e-06, "loss": 0.4948, "step": 14540 }, { "epoch": 0.548373723287981, "grad_norm": 1.7084316467056804, "learning_rate": 5.029272598194057e-06, "loss": 0.4836, "step": 14550 }, { "epoch": 0.54875061244488, "grad_norm": 1.6824534462487943, "learning_rate": 5.022694537961105e-06, "loss": 0.4904, "step": 14560 }, { "epoch": 0.5491275016017789, "grad_norm": 1.684452437139124, "learning_rate": 5.016116438446588e-06, "loss": 0.5244, "step": 14570 }, { "epoch": 0.5495043907586779, "grad_norm": 1.7169193584179046, "learning_rate": 5.009538311036422e-06, "loss": 0.5139, "step": 14580 }, { "epoch": 0.5498812799155768, "grad_norm": 1.6319964128441222, "learning_rate": 5.002960167116567e-06, "loss": 0.4995, "step": 14590 }, { "epoch": 0.5502581690724758, "grad_norm": 1.7828166102258067, "learning_rate": 4.9963820180730125e-06, "loss": 0.4847, "step": 14600 }, { "epoch": 0.5506350582293748, "grad_norm": 1.4715536669658034, "learning_rate": 4.989803875291759e-06, "loss": 0.5032, "step": 14610 }, { "epoch": 0.5510119473862737, "grad_norm": 1.6193456161052595, "learning_rate": 4.983225750158789e-06, "loss": 0.4738, "step": 14620 }, { "epoch": 0.5513888365431726, "grad_norm": 1.6523635284605276, "learning_rate": 4.976647654060064e-06, "loss": 0.4697, "step": 14630 }, { "epoch": 0.5517657257000717, "grad_norm": 1.5353363387196937, "learning_rate": 4.970069598381489e-06, "loss": 0.4766, "step": 14640 }, { "epoch": 0.5521426148569706, "grad_norm": 1.599767794638566, "learning_rate": 4.963491594508904e-06, "loss": 0.486, "step": 14650 }, { "epoch": 0.5525195040138695, "grad_norm": 1.5072462327621345, "learning_rate": 4.956913653828051e-06, "loss": 0.5029, "step": 14660 }, { "epoch": 0.5528963931707684, "grad_norm": 1.576083520001881, "learning_rate": 4.950335787724571e-06, "loss": 0.5008, "step": 14670 }, { "epoch": 0.5532732823276675, "grad_norm": 1.7206709649650131, "learning_rate": 4.943758007583972e-06, "loss": 0.4899, "step": 14680 }, { "epoch": 0.5536501714845664, "grad_norm": 2.125129340848355, "learning_rate": 4.937180324791616e-06, "loss": 0.4858, "step": 14690 }, { "epoch": 0.5540270606414653, "grad_norm": 1.776386341218708, "learning_rate": 4.930602750732691e-06, "loss": 0.5114, "step": 14700 }, { "epoch": 0.5544039497983643, "grad_norm": 1.8272872066498904, "learning_rate": 4.924025296792202e-06, "loss": 0.4938, "step": 14710 }, { "epoch": 0.5547808389552633, "grad_norm": 1.5539132471817085, "learning_rate": 4.917447974354944e-06, "loss": 0.5012, "step": 14720 }, { "epoch": 0.5551577281121622, "grad_norm": 1.5058070102826557, "learning_rate": 4.910870794805484e-06, "loss": 0.4982, "step": 14730 }, { "epoch": 0.5555346172690612, "grad_norm": 1.710828327508459, "learning_rate": 4.904293769528146e-06, "loss": 0.4756, "step": 14740 }, { "epoch": 0.5559115064259601, "grad_norm": 1.8818911198800967, "learning_rate": 4.8977169099069774e-06, "loss": 0.4877, "step": 14750 }, { "epoch": 0.556288395582859, "grad_norm": 1.5555517297318513, "learning_rate": 4.891140227325749e-06, "loss": 0.5072, "step": 14760 }, { "epoch": 0.5566652847397581, "grad_norm": 1.3837969704058737, "learning_rate": 4.884563733167921e-06, "loss": 0.4493, "step": 14770 }, { "epoch": 0.557042173896657, "grad_norm": 1.605935176191625, "learning_rate": 4.877987438816626e-06, "loss": 0.5043, "step": 14780 }, { "epoch": 0.5574190630535559, "grad_norm": 1.4152291686855751, "learning_rate": 4.8714113556546526e-06, "loss": 0.4896, "step": 14790 }, { "epoch": 0.5577959522104549, "grad_norm": 1.555671007575408, "learning_rate": 4.864835495064422e-06, "loss": 0.4948, "step": 14800 }, { "epoch": 0.5581728413673539, "grad_norm": 1.7501287152105198, "learning_rate": 4.858259868427975e-06, "loss": 0.528, "step": 14810 }, { "epoch": 0.5585497305242528, "grad_norm": 1.4977704984866138, "learning_rate": 4.851684487126942e-06, "loss": 0.4995, "step": 14820 }, { "epoch": 0.5589266196811518, "grad_norm": 1.5881770020676744, "learning_rate": 4.845109362542531e-06, "loss": 0.5033, "step": 14830 }, { "epoch": 0.5593035088380507, "grad_norm": 1.6759640529299542, "learning_rate": 4.838534506055505e-06, "loss": 0.5153, "step": 14840 }, { "epoch": 0.5596803979949497, "grad_norm": 1.444873322408345, "learning_rate": 4.8319599290461644e-06, "loss": 0.4907, "step": 14850 }, { "epoch": 0.5600572871518487, "grad_norm": 1.8264686526034841, "learning_rate": 4.825385642894325e-06, "loss": 0.5306, "step": 14860 }, { "epoch": 0.5604341763087476, "grad_norm": 1.5497290772020473, "learning_rate": 4.818811658979298e-06, "loss": 0.5152, "step": 14870 }, { "epoch": 0.5608110654656465, "grad_norm": 1.37844981662825, "learning_rate": 4.8122379886798714e-06, "loss": 0.4983, "step": 14880 }, { "epoch": 0.5611879546225456, "grad_norm": 1.3444131954230294, "learning_rate": 4.805664643374295e-06, "loss": 0.4925, "step": 14890 }, { "epoch": 0.5615648437794445, "grad_norm": 1.5532769744703083, "learning_rate": 4.799091634440251e-06, "loss": 0.5188, "step": 14900 }, { "epoch": 0.5619417329363434, "grad_norm": 1.555196739372832, "learning_rate": 4.7925189732548396e-06, "loss": 0.4683, "step": 14910 }, { "epoch": 0.5623186220932423, "grad_norm": 1.4684972837534254, "learning_rate": 4.7859466711945616e-06, "loss": 0.479, "step": 14920 }, { "epoch": 0.5626955112501413, "grad_norm": 1.6202095473956792, "learning_rate": 4.7793747396352945e-06, "loss": 0.494, "step": 14930 }, { "epoch": 0.5630724004070403, "grad_norm": 1.7528710855944911, "learning_rate": 4.7728031899522775e-06, "loss": 0.4671, "step": 14940 }, { "epoch": 0.5634492895639392, "grad_norm": 1.8853181007805313, "learning_rate": 4.7662320335200815e-06, "loss": 0.4952, "step": 14950 }, { "epoch": 0.5638261787208382, "grad_norm": 1.5382530586672782, "learning_rate": 4.759661281712605e-06, "loss": 0.4707, "step": 14960 }, { "epoch": 0.5642030678777371, "grad_norm": 1.525234396856901, "learning_rate": 4.753090945903043e-06, "loss": 0.479, "step": 14970 }, { "epoch": 0.5645799570346361, "grad_norm": 1.4131804151431138, "learning_rate": 4.74652103746387e-06, "loss": 0.5002, "step": 14980 }, { "epoch": 0.5649568461915351, "grad_norm": 1.3413955318599295, "learning_rate": 4.739951567766819e-06, "loss": 0.5014, "step": 14990 }, { "epoch": 0.565333735348434, "grad_norm": 1.707719627926162, "learning_rate": 4.733382548182867e-06, "loss": 0.4803, "step": 15000 }, { "epoch": 0.5657106245053329, "grad_norm": 1.783248272136694, "learning_rate": 4.726813990082208e-06, "loss": 0.521, "step": 15010 }, { "epoch": 0.566087513662232, "grad_norm": 1.6966076793346403, "learning_rate": 4.720245904834247e-06, "loss": 0.491, "step": 15020 }, { "epoch": 0.5664644028191309, "grad_norm": 1.718872464504697, "learning_rate": 4.713678303807554e-06, "loss": 0.508, "step": 15030 }, { "epoch": 0.5668412919760298, "grad_norm": 1.3577910290768516, "learning_rate": 4.707111198369875e-06, "loss": 0.4792, "step": 15040 }, { "epoch": 0.5672181811329288, "grad_norm": 2.011765897335492, "learning_rate": 4.700544599888092e-06, "loss": 0.4792, "step": 15050 }, { "epoch": 0.5675950702898278, "grad_norm": 1.5685090926240022, "learning_rate": 4.693978519728214e-06, "loss": 0.487, "step": 15060 }, { "epoch": 0.5679719594467267, "grad_norm": 1.9465868567085707, "learning_rate": 4.687412969255344e-06, "loss": 0.5383, "step": 15070 }, { "epoch": 0.5683488486036257, "grad_norm": 1.9601527980802476, "learning_rate": 4.680847959833678e-06, "loss": 0.5143, "step": 15080 }, { "epoch": 0.5687257377605246, "grad_norm": 1.3360212285500883, "learning_rate": 4.674283502826469e-06, "loss": 0.5006, "step": 15090 }, { "epoch": 0.5691026269174236, "grad_norm": 1.8144442698478411, "learning_rate": 4.667719609596017e-06, "loss": 0.4999, "step": 15100 }, { "epoch": 0.5694795160743226, "grad_norm": 1.8698769948828864, "learning_rate": 4.661156291503648e-06, "loss": 0.4859, "step": 15110 }, { "epoch": 0.5698564052312215, "grad_norm": 1.49460133455783, "learning_rate": 4.654593559909686e-06, "loss": 0.4845, "step": 15120 }, { "epoch": 0.5702332943881204, "grad_norm": 1.4660243676447777, "learning_rate": 4.648031426173445e-06, "loss": 0.5125, "step": 15130 }, { "epoch": 0.5706101835450194, "grad_norm": 1.3870825072557174, "learning_rate": 4.641469901653202e-06, "loss": 0.5016, "step": 15140 }, { "epoch": 0.5709870727019184, "grad_norm": 1.6431723296673835, "learning_rate": 4.634908997706185e-06, "loss": 0.4987, "step": 15150 }, { "epoch": 0.5713639618588173, "grad_norm": 1.7105138212254234, "learning_rate": 4.628348725688535e-06, "loss": 0.4854, "step": 15160 }, { "epoch": 0.5717408510157163, "grad_norm": 1.9455745122203643, "learning_rate": 4.621789096955314e-06, "loss": 0.4913, "step": 15170 }, { "epoch": 0.5721177401726152, "grad_norm": 1.6110602983128681, "learning_rate": 4.615230122860463e-06, "loss": 0.478, "step": 15180 }, { "epoch": 0.5724946293295142, "grad_norm": 1.570236871676212, "learning_rate": 4.608671814756789e-06, "loss": 0.4966, "step": 15190 }, { "epoch": 0.5728715184864132, "grad_norm": 1.6101020345767108, "learning_rate": 4.60211418399595e-06, "loss": 0.4921, "step": 15200 }, { "epoch": 0.5732484076433121, "grad_norm": 1.3277725999257566, "learning_rate": 4.595557241928428e-06, "loss": 0.4492, "step": 15210 }, { "epoch": 0.573625296800211, "grad_norm": 1.4944481859425578, "learning_rate": 4.589000999903514e-06, "loss": 0.508, "step": 15220 }, { "epoch": 0.5740021859571101, "grad_norm": 1.949554647184248, "learning_rate": 4.582445469269293e-06, "loss": 0.4783, "step": 15230 }, { "epoch": 0.574379075114009, "grad_norm": 1.6382300534433607, "learning_rate": 4.575890661372608e-06, "loss": 0.5262, "step": 15240 }, { "epoch": 0.5747559642709079, "grad_norm": 1.7414910131507269, "learning_rate": 4.569336587559058e-06, "loss": 0.5097, "step": 15250 }, { "epoch": 0.5751328534278068, "grad_norm": 1.5308534197031307, "learning_rate": 4.562783259172972e-06, "loss": 0.4743, "step": 15260 }, { "epoch": 0.5755097425847059, "grad_norm": 1.544445790287268, "learning_rate": 4.556230687557387e-06, "loss": 0.4855, "step": 15270 }, { "epoch": 0.5758866317416048, "grad_norm": 1.5649621936574907, "learning_rate": 4.549678884054028e-06, "loss": 0.4827, "step": 15280 }, { "epoch": 0.5762635208985037, "grad_norm": 1.5699103732057733, "learning_rate": 4.543127860003291e-06, "loss": 0.5126, "step": 15290 }, { "epoch": 0.5766404100554027, "grad_norm": 1.6479924327503963, "learning_rate": 4.536577626744229e-06, "loss": 0.5087, "step": 15300 }, { "epoch": 0.5770172992123017, "grad_norm": 1.754860584177109, "learning_rate": 4.53002819561452e-06, "loss": 0.4868, "step": 15310 }, { "epoch": 0.5773941883692006, "grad_norm": 1.538257113940453, "learning_rate": 4.523479577950452e-06, "loss": 0.4828, "step": 15320 }, { "epoch": 0.5777710775260996, "grad_norm": 1.6723499369767234, "learning_rate": 4.516931785086911e-06, "loss": 0.4702, "step": 15330 }, { "epoch": 0.5781479666829985, "grad_norm": 1.441994971838687, "learning_rate": 4.510384828357352e-06, "loss": 0.5074, "step": 15340 }, { "epoch": 0.5785248558398974, "grad_norm": 1.676436195429064, "learning_rate": 4.503838719093785e-06, "loss": 0.498, "step": 15350 }, { "epoch": 0.5789017449967965, "grad_norm": 1.5193324381147915, "learning_rate": 4.4972934686267465e-06, "loss": 0.488, "step": 15360 }, { "epoch": 0.5792786341536954, "grad_norm": 1.644408204909188, "learning_rate": 4.4907490882852945e-06, "loss": 0.4658, "step": 15370 }, { "epoch": 0.5796555233105943, "grad_norm": 1.3830347527818876, "learning_rate": 4.484205589396979e-06, "loss": 0.5044, "step": 15380 }, { "epoch": 0.5800324124674933, "grad_norm": 1.5656512441657606, "learning_rate": 4.477662983287823e-06, "loss": 0.4713, "step": 15390 }, { "epoch": 0.5804093016243923, "grad_norm": 1.8212831299721022, "learning_rate": 4.4711212812823015e-06, "loss": 0.5006, "step": 15400 }, { "epoch": 0.5807861907812912, "grad_norm": 5.814852464154508, "learning_rate": 4.46458049470333e-06, "loss": 0.5273, "step": 15410 }, { "epoch": 0.5811630799381902, "grad_norm": 1.5922280470024501, "learning_rate": 4.458040634872234e-06, "loss": 0.512, "step": 15420 }, { "epoch": 0.5815399690950891, "grad_norm": 1.383810258386157, "learning_rate": 4.451501713108744e-06, "loss": 0.4864, "step": 15430 }, { "epoch": 0.5819168582519881, "grad_norm": 1.5168290662535795, "learning_rate": 4.444963740730953e-06, "loss": 0.5101, "step": 15440 }, { "epoch": 0.5822937474088871, "grad_norm": 1.628078473199888, "learning_rate": 4.438426729055324e-06, "loss": 0.4838, "step": 15450 }, { "epoch": 0.582670636565786, "grad_norm": 1.3493978928610881, "learning_rate": 4.431890689396649e-06, "loss": 0.4719, "step": 15460 }, { "epoch": 0.5830475257226849, "grad_norm": 1.740559583658373, "learning_rate": 4.425355633068041e-06, "loss": 0.5067, "step": 15470 }, { "epoch": 0.583424414879584, "grad_norm": 1.7981946745212039, "learning_rate": 4.418821571380911e-06, "loss": 0.4932, "step": 15480 }, { "epoch": 0.5838013040364829, "grad_norm": 1.5473002005079304, "learning_rate": 4.4122885156449445e-06, "loss": 0.4904, "step": 15490 }, { "epoch": 0.5841781931933818, "grad_norm": 1.8784776516281418, "learning_rate": 4.40575647716809e-06, "loss": 0.5048, "step": 15500 }, { "epoch": 0.5845550823502808, "grad_norm": 1.4719513927457284, "learning_rate": 4.399225467256535e-06, "loss": 0.4481, "step": 15510 }, { "epoch": 0.5849319715071798, "grad_norm": 1.825241349848936, "learning_rate": 4.392695497214688e-06, "loss": 0.4983, "step": 15520 }, { "epoch": 0.5853088606640787, "grad_norm": 1.5528305044356574, "learning_rate": 4.38616657834515e-06, "loss": 0.5041, "step": 15530 }, { "epoch": 0.5856857498209777, "grad_norm": 1.6870571870401214, "learning_rate": 4.3796387219487105e-06, "loss": 0.4874, "step": 15540 }, { "epoch": 0.5860626389778766, "grad_norm": 1.7920265982059933, "learning_rate": 4.373111939324317e-06, "loss": 0.4999, "step": 15550 }, { "epoch": 0.5864395281347755, "grad_norm": 1.341117566474037, "learning_rate": 4.366586241769061e-06, "loss": 0.4557, "step": 15560 }, { "epoch": 0.5868164172916746, "grad_norm": 1.3355923074651992, "learning_rate": 4.36006164057815e-06, "loss": 0.475, "step": 15570 }, { "epoch": 0.5871933064485735, "grad_norm": 1.5410068065396823, "learning_rate": 4.353538147044899e-06, "loss": 0.4794, "step": 15580 }, { "epoch": 0.5875701956054724, "grad_norm": 1.4096817479193515, "learning_rate": 4.347015772460705e-06, "loss": 0.5178, "step": 15590 }, { "epoch": 0.5879470847623713, "grad_norm": 1.7430158797469388, "learning_rate": 4.340494528115028e-06, "loss": 0.4908, "step": 15600 }, { "epoch": 0.5883239739192704, "grad_norm": 1.6723561324160858, "learning_rate": 4.333974425295368e-06, "loss": 0.4748, "step": 15610 }, { "epoch": 0.5887008630761693, "grad_norm": 1.5864053334700638, "learning_rate": 4.327455475287255e-06, "loss": 0.4994, "step": 15620 }, { "epoch": 0.5890777522330682, "grad_norm": 1.524780156258879, "learning_rate": 4.3209376893742185e-06, "loss": 0.4881, "step": 15630 }, { "epoch": 0.5894546413899672, "grad_norm": 1.7296029856373045, "learning_rate": 4.314421078837782e-06, "loss": 0.5231, "step": 15640 }, { "epoch": 0.5898315305468662, "grad_norm": 1.6109569171154967, "learning_rate": 4.3079056549574185e-06, "loss": 0.5001, "step": 15650 }, { "epoch": 0.5902084197037651, "grad_norm": 1.3624010397586623, "learning_rate": 4.301391429010563e-06, "loss": 0.5082, "step": 15660 }, { "epoch": 0.5905853088606641, "grad_norm": 1.7683758241128607, "learning_rate": 4.2948784122725695e-06, "loss": 0.4992, "step": 15670 }, { "epoch": 0.590962198017563, "grad_norm": 1.8030632058982388, "learning_rate": 4.2883666160167004e-06, "loss": 0.4562, "step": 15680 }, { "epoch": 0.591339087174462, "grad_norm": 1.4736588153729777, "learning_rate": 4.281856051514104e-06, "loss": 0.4598, "step": 15690 }, { "epoch": 0.591715976331361, "grad_norm": 1.9040076493277438, "learning_rate": 4.275346730033797e-06, "loss": 0.5181, "step": 15700 }, { "epoch": 0.5920928654882599, "grad_norm": 1.6217096093879626, "learning_rate": 4.268838662842648e-06, "loss": 0.493, "step": 15710 }, { "epoch": 0.5924697546451588, "grad_norm": 1.4714622199259768, "learning_rate": 4.262331861205353e-06, "loss": 0.4911, "step": 15720 }, { "epoch": 0.5928466438020578, "grad_norm": 1.6675985050238766, "learning_rate": 4.255826336384413e-06, "loss": 0.4839, "step": 15730 }, { "epoch": 0.5932235329589568, "grad_norm": 1.730305207931142, "learning_rate": 4.249322099640124e-06, "loss": 0.4613, "step": 15740 }, { "epoch": 0.5936004221158557, "grad_norm": 1.5875923671826002, "learning_rate": 4.2428191622305515e-06, "loss": 0.4882, "step": 15750 }, { "epoch": 0.5939773112727547, "grad_norm": 1.7817428188939046, "learning_rate": 4.2363175354115125e-06, "loss": 0.4735, "step": 15760 }, { "epoch": 0.5943542004296536, "grad_norm": 1.4446917245054236, "learning_rate": 4.229817230436551e-06, "loss": 0.4965, "step": 15770 }, { "epoch": 0.5947310895865526, "grad_norm": 1.5351237013104135, "learning_rate": 4.223318258556929e-06, "loss": 0.4889, "step": 15780 }, { "epoch": 0.5951079787434516, "grad_norm": 1.4127276518276473, "learning_rate": 4.2168206310216e-06, "loss": 0.4777, "step": 15790 }, { "epoch": 0.5954848679003505, "grad_norm": 1.632038239139784, "learning_rate": 4.210324359077188e-06, "loss": 0.4863, "step": 15800 }, { "epoch": 0.5958617570572494, "grad_norm": 1.58550770046582, "learning_rate": 4.20382945396797e-06, "loss": 0.4629, "step": 15810 }, { "epoch": 0.5962386462141485, "grad_norm": 1.5604563182410427, "learning_rate": 4.197335926935862e-06, "loss": 0.4702, "step": 15820 }, { "epoch": 0.5966155353710474, "grad_norm": 1.6694863244839393, "learning_rate": 4.190843789220388e-06, "loss": 0.4841, "step": 15830 }, { "epoch": 0.5969924245279463, "grad_norm": 1.3812318270249444, "learning_rate": 4.184353052058675e-06, "loss": 0.4803, "step": 15840 }, { "epoch": 0.5973693136848452, "grad_norm": 1.542874778740591, "learning_rate": 4.177863726685422e-06, "loss": 0.4774, "step": 15850 }, { "epoch": 0.5977462028417443, "grad_norm": 1.5992161372449223, "learning_rate": 4.1713758243328805e-06, "loss": 0.4642, "step": 15860 }, { "epoch": 0.5981230919986432, "grad_norm": 1.695522458255685, "learning_rate": 4.164889356230845e-06, "loss": 0.4984, "step": 15870 }, { "epoch": 0.5984999811555421, "grad_norm": 1.5064263824602755, "learning_rate": 4.158404333606624e-06, "loss": 0.484, "step": 15880 }, { "epoch": 0.5988768703124411, "grad_norm": 1.5539780173830111, "learning_rate": 4.151920767685028e-06, "loss": 0.5053, "step": 15890 }, { "epoch": 0.5992537594693401, "grad_norm": 1.8156834275952407, "learning_rate": 4.145438669688339e-06, "loss": 0.4891, "step": 15900 }, { "epoch": 0.599630648626239, "grad_norm": 1.6867172865153917, "learning_rate": 4.138958050836305e-06, "loss": 0.4933, "step": 15910 }, { "epoch": 0.600007537783138, "grad_norm": 1.5069489029221794, "learning_rate": 4.132478922346111e-06, "loss": 0.4981, "step": 15920 }, { "epoch": 0.6003844269400369, "grad_norm": 1.645715388170334, "learning_rate": 4.126001295432362e-06, "loss": 0.5092, "step": 15930 }, { "epoch": 0.6007613160969358, "grad_norm": 1.5081305054375247, "learning_rate": 4.119525181307065e-06, "loss": 0.5007, "step": 15940 }, { "epoch": 0.6011382052538349, "grad_norm": 1.7513431875588092, "learning_rate": 4.113050591179608e-06, "loss": 0.4818, "step": 15950 }, { "epoch": 0.6015150944107338, "grad_norm": 1.4446843423733429, "learning_rate": 4.10657753625674e-06, "loss": 0.4891, "step": 15960 }, { "epoch": 0.6018919835676327, "grad_norm": 1.5962355483326147, "learning_rate": 4.100106027742559e-06, "loss": 0.5487, "step": 15970 }, { "epoch": 0.6022688727245317, "grad_norm": 1.5956138956675063, "learning_rate": 4.093636076838474e-06, "loss": 0.4953, "step": 15980 }, { "epoch": 0.6026457618814307, "grad_norm": 1.6376372777266162, "learning_rate": 4.087167694743209e-06, "loss": 0.474, "step": 15990 }, { "epoch": 0.6030226510383296, "grad_norm": 1.7797231365552175, "learning_rate": 4.080700892652769e-06, "loss": 0.5058, "step": 16000 }, { "epoch": 0.6033995401952286, "grad_norm": 1.5483118782326815, "learning_rate": 4.074235681760425e-06, "loss": 0.4455, "step": 16010 }, { "epoch": 0.6037764293521275, "grad_norm": 1.6389537384394115, "learning_rate": 4.067772073256691e-06, "loss": 0.4974, "step": 16020 }, { "epoch": 0.6041533185090265, "grad_norm": 1.5711684676833029, "learning_rate": 4.0613100783293085e-06, "loss": 0.4745, "step": 16030 }, { "epoch": 0.6045302076659255, "grad_norm": 1.583414482094785, "learning_rate": 4.0548497081632275e-06, "loss": 0.4832, "step": 16040 }, { "epoch": 0.6049070968228244, "grad_norm": 1.9771396052065735, "learning_rate": 4.04839097394059e-06, "loss": 0.4997, "step": 16050 }, { "epoch": 0.6052839859797233, "grad_norm": 1.4625015091341529, "learning_rate": 4.0419338868406934e-06, "loss": 0.4469, "step": 16060 }, { "epoch": 0.6056608751366224, "grad_norm": 1.497067817501958, "learning_rate": 4.035478458039998e-06, "loss": 0.4967, "step": 16070 }, { "epoch": 0.6060377642935213, "grad_norm": 1.5120714577785972, "learning_rate": 4.029024698712085e-06, "loss": 0.4829, "step": 16080 }, { "epoch": 0.6064146534504202, "grad_norm": 1.37555735450956, "learning_rate": 4.022572620027653e-06, "loss": 0.4758, "step": 16090 }, { "epoch": 0.6067915426073192, "grad_norm": 1.956108364099901, "learning_rate": 4.016122233154483e-06, "loss": 0.5154, "step": 16100 }, { "epoch": 0.6071684317642182, "grad_norm": 1.426684210517623, "learning_rate": 4.009673549257432e-06, "loss": 0.5068, "step": 16110 }, { "epoch": 0.6075453209211171, "grad_norm": 1.675741874097294, "learning_rate": 4.0032265794984145e-06, "loss": 0.4616, "step": 16120 }, { "epoch": 0.6079222100780161, "grad_norm": 1.666536388372485, "learning_rate": 3.99678133503637e-06, "loss": 0.4775, "step": 16130 }, { "epoch": 0.608299099234915, "grad_norm": 1.4010007944856846, "learning_rate": 3.990337827027256e-06, "loss": 0.5018, "step": 16140 }, { "epoch": 0.6086759883918139, "grad_norm": 1.811100516226975, "learning_rate": 3.983896066624021e-06, "loss": 0.4643, "step": 16150 }, { "epoch": 0.609052877548713, "grad_norm": 1.4452896868562695, "learning_rate": 3.977456064976592e-06, "loss": 0.5072, "step": 16160 }, { "epoch": 0.6094297667056119, "grad_norm": 1.2863215835461435, "learning_rate": 3.97101783323185e-06, "loss": 0.4604, "step": 16170 }, { "epoch": 0.6098066558625108, "grad_norm": 1.6495177612337615, "learning_rate": 3.964581382533618e-06, "loss": 0.4874, "step": 16180 }, { "epoch": 0.6101835450194097, "grad_norm": 1.4527097080833093, "learning_rate": 3.958146724022623e-06, "loss": 0.4744, "step": 16190 }, { "epoch": 0.6105604341763088, "grad_norm": 1.6146711317833942, "learning_rate": 3.951713868836506e-06, "loss": 0.4962, "step": 16200 }, { "epoch": 0.6109373233332077, "grad_norm": 1.8235449585876042, "learning_rate": 3.945282828109774e-06, "loss": 0.4968, "step": 16210 }, { "epoch": 0.6113142124901066, "grad_norm": 1.592201298917257, "learning_rate": 3.938853612973801e-06, "loss": 0.508, "step": 16220 }, { "epoch": 0.6116911016470056, "grad_norm": 1.9031681135985004, "learning_rate": 3.932426234556798e-06, "loss": 0.4793, "step": 16230 }, { "epoch": 0.6120679908039046, "grad_norm": 1.860838297311073, "learning_rate": 3.926000703983795e-06, "loss": 0.4691, "step": 16240 }, { "epoch": 0.6124448799608035, "grad_norm": 1.7168068035412793, "learning_rate": 3.919577032376628e-06, "loss": 0.4927, "step": 16250 }, { "epoch": 0.6128217691177025, "grad_norm": 1.856102414725057, "learning_rate": 3.913155230853915e-06, "loss": 0.4918, "step": 16260 }, { "epoch": 0.6131986582746014, "grad_norm": 1.712306650139487, "learning_rate": 3.906735310531033e-06, "loss": 0.4969, "step": 16270 }, { "epoch": 0.6135755474315004, "grad_norm": 1.6309354412368642, "learning_rate": 3.900317282520104e-06, "loss": 0.4693, "step": 16280 }, { "epoch": 0.6139524365883994, "grad_norm": 1.4778531939363033, "learning_rate": 3.893901157929979e-06, "loss": 0.4735, "step": 16290 }, { "epoch": 0.6143293257452983, "grad_norm": 1.8592722441894747, "learning_rate": 3.8874869478662104e-06, "loss": 0.4552, "step": 16300 }, { "epoch": 0.6147062149021972, "grad_norm": 1.9102009395175383, "learning_rate": 3.881074663431037e-06, "loss": 0.4856, "step": 16310 }, { "epoch": 0.6150831040590963, "grad_norm": 2.834088988880712, "learning_rate": 3.874664315723363e-06, "loss": 0.4862, "step": 16320 }, { "epoch": 0.6154599932159952, "grad_norm": 1.5759267871394191, "learning_rate": 3.8682559158387474e-06, "loss": 0.4749, "step": 16330 }, { "epoch": 0.6158368823728941, "grad_norm": 1.6563437776338659, "learning_rate": 3.861849474869371e-06, "loss": 0.4675, "step": 16340 }, { "epoch": 0.6162137715297931, "grad_norm": 1.7936481213990962, "learning_rate": 3.855445003904024e-06, "loss": 0.4901, "step": 16350 }, { "epoch": 0.616590660686692, "grad_norm": 1.6261629750224562, "learning_rate": 3.849042514028091e-06, "loss": 0.4817, "step": 16360 }, { "epoch": 0.616967549843591, "grad_norm": 1.6014458715286652, "learning_rate": 3.842642016323522e-06, "loss": 0.4757, "step": 16370 }, { "epoch": 0.61734443900049, "grad_norm": 1.6565625346460429, "learning_rate": 3.836243521868828e-06, "loss": 0.4849, "step": 16380 }, { "epoch": 0.6177213281573889, "grad_norm": 1.6412335965766696, "learning_rate": 3.82984704173904e-06, "loss": 0.486, "step": 16390 }, { "epoch": 0.6180982173142878, "grad_norm": 1.8138163964366028, "learning_rate": 3.823452587005712e-06, "loss": 0.5104, "step": 16400 }, { "epoch": 0.6184751064711869, "grad_norm": 1.7953902099489745, "learning_rate": 3.8170601687368905e-06, "loss": 0.479, "step": 16410 }, { "epoch": 0.6188519956280858, "grad_norm": 1.5270507793675938, "learning_rate": 3.8106697979970952e-06, "loss": 0.528, "step": 16420 }, { "epoch": 0.6192288847849847, "grad_norm": 1.5641867404888987, "learning_rate": 3.804281485847301e-06, "loss": 0.4849, "step": 16430 }, { "epoch": 0.6196057739418837, "grad_norm": 1.7507477091577552, "learning_rate": 3.7978952433449223e-06, "loss": 0.4832, "step": 16440 }, { "epoch": 0.6199826630987827, "grad_norm": 1.6054491509073952, "learning_rate": 3.7915110815437883e-06, "loss": 0.4919, "step": 16450 }, { "epoch": 0.6203595522556816, "grad_norm": 1.3576882864362545, "learning_rate": 3.7851290114941335e-06, "loss": 0.4842, "step": 16460 }, { "epoch": 0.6207364414125806, "grad_norm": 1.5934327565220905, "learning_rate": 3.77874904424256e-06, "loss": 0.4787, "step": 16470 }, { "epoch": 0.6211133305694795, "grad_norm": 1.4581770227833064, "learning_rate": 3.7723711908320417e-06, "loss": 0.4991, "step": 16480 }, { "epoch": 0.6214902197263785, "grad_norm": 1.7598577411897167, "learning_rate": 3.7659954623018875e-06, "loss": 0.4655, "step": 16490 }, { "epoch": 0.6218671088832775, "grad_norm": 1.5466125977952652, "learning_rate": 3.759621869687731e-06, "loss": 0.4763, "step": 16500 }, { "epoch": 0.6222439980401764, "grad_norm": 1.5162276323975878, "learning_rate": 3.753250424021506e-06, "loss": 0.4791, "step": 16510 }, { "epoch": 0.6226208871970753, "grad_norm": 1.6745379682639492, "learning_rate": 3.746881136331431e-06, "loss": 0.5151, "step": 16520 }, { "epoch": 0.6229977763539742, "grad_norm": 1.532874094800718, "learning_rate": 3.740514017641993e-06, "loss": 0.4911, "step": 16530 }, { "epoch": 0.6233746655108733, "grad_norm": 1.6604506659265745, "learning_rate": 3.7341490789739205e-06, "loss": 0.4906, "step": 16540 }, { "epoch": 0.6237515546677722, "grad_norm": 1.6456103279322771, "learning_rate": 3.727786331344171e-06, "loss": 0.4923, "step": 16550 }, { "epoch": 0.6241284438246711, "grad_norm": 1.4879389920554817, "learning_rate": 3.7214257857659066e-06, "loss": 0.489, "step": 16560 }, { "epoch": 0.6245053329815701, "grad_norm": 2.2740003134139237, "learning_rate": 3.715067453248481e-06, "loss": 0.5154, "step": 16570 }, { "epoch": 0.6248822221384691, "grad_norm": 1.7286954403484827, "learning_rate": 3.7087113447974153e-06, "loss": 0.4746, "step": 16580 }, { "epoch": 0.625259111295368, "grad_norm": 1.291986369209173, "learning_rate": 3.7023574714143858e-06, "loss": 0.4625, "step": 16590 }, { "epoch": 0.625636000452267, "grad_norm": 1.664446699881965, "learning_rate": 3.69600584409719e-06, "loss": 0.4541, "step": 16600 }, { "epoch": 0.6260128896091659, "grad_norm": 1.5018038978083545, "learning_rate": 3.6896564738397484e-06, "loss": 0.4657, "step": 16610 }, { "epoch": 0.6263897787660649, "grad_norm": 1.5823124760264218, "learning_rate": 3.6833093716320693e-06, "loss": 0.4768, "step": 16620 }, { "epoch": 0.6267666679229639, "grad_norm": 1.9860629829549734, "learning_rate": 3.6769645484602377e-06, "loss": 0.4811, "step": 16630 }, { "epoch": 0.6271435570798628, "grad_norm": 1.839944752571797, "learning_rate": 3.6706220153063904e-06, "loss": 0.48, "step": 16640 }, { "epoch": 0.6275204462367617, "grad_norm": 1.7309100874405818, "learning_rate": 3.664281783148702e-06, "loss": 0.4722, "step": 16650 }, { "epoch": 0.6278973353936608, "grad_norm": 1.581593496476517, "learning_rate": 3.6579438629613682e-06, "loss": 0.475, "step": 16660 }, { "epoch": 0.6282742245505597, "grad_norm": 1.6548205842663573, "learning_rate": 3.651608265714579e-06, "loss": 0.4907, "step": 16670 }, { "epoch": 0.6286511137074586, "grad_norm": 1.5516453686391054, "learning_rate": 3.645275002374502e-06, "loss": 0.4909, "step": 16680 }, { "epoch": 0.6290280028643576, "grad_norm": 1.4834746500429061, "learning_rate": 3.6389440839032687e-06, "loss": 0.5048, "step": 16690 }, { "epoch": 0.6294048920212566, "grad_norm": 1.7024990526554837, "learning_rate": 3.6326155212589507e-06, "loss": 0.4714, "step": 16700 }, { "epoch": 0.6297817811781555, "grad_norm": 1.709920706597902, "learning_rate": 3.6262893253955433e-06, "loss": 0.4828, "step": 16710 }, { "epoch": 0.6301586703350545, "grad_norm": 1.8336853543986507, "learning_rate": 3.6199655072629415e-06, "loss": 0.4847, "step": 16720 }, { "epoch": 0.6305355594919534, "grad_norm": 1.4929651219976985, "learning_rate": 3.613644077806927e-06, "loss": 0.4786, "step": 16730 }, { "epoch": 0.6309124486488523, "grad_norm": 1.7036171691777842, "learning_rate": 3.607325047969149e-06, "loss": 0.4957, "step": 16740 }, { "epoch": 0.6312893378057514, "grad_norm": 1.73108529336121, "learning_rate": 3.6010084286871017e-06, "loss": 0.5082, "step": 16750 }, { "epoch": 0.6316662269626503, "grad_norm": 1.6591602232622107, "learning_rate": 3.5946942308941035e-06, "loss": 0.4894, "step": 16760 }, { "epoch": 0.6320431161195492, "grad_norm": 2.047186622437741, "learning_rate": 3.5883824655192855e-06, "loss": 0.4853, "step": 16770 }, { "epoch": 0.6324200052764481, "grad_norm": 1.6832333116348397, "learning_rate": 3.582073143487568e-06, "loss": 0.5132, "step": 16780 }, { "epoch": 0.6327968944333472, "grad_norm": 1.5899139760429426, "learning_rate": 3.575766275719644e-06, "loss": 0.5026, "step": 16790 }, { "epoch": 0.6331737835902461, "grad_norm": 1.6765166894945724, "learning_rate": 3.5694618731319507e-06, "loss": 0.493, "step": 16800 }, { "epoch": 0.633550672747145, "grad_norm": 1.5669282699472866, "learning_rate": 3.5631599466366683e-06, "loss": 0.4989, "step": 16810 }, { "epoch": 0.633927561904044, "grad_norm": 1.6353842292836174, "learning_rate": 3.556860507141685e-06, "loss": 0.4842, "step": 16820 }, { "epoch": 0.634304451060943, "grad_norm": 1.5863452326347784, "learning_rate": 3.5505635655505877e-06, "loss": 0.4808, "step": 16830 }, { "epoch": 0.634681340217842, "grad_norm": 1.4750527987414617, "learning_rate": 3.5442691327626354e-06, "loss": 0.4833, "step": 16840 }, { "epoch": 0.6350582293747409, "grad_norm": 1.5244616927874408, "learning_rate": 3.5379772196727486e-06, "loss": 0.4848, "step": 16850 }, { "epoch": 0.6354351185316398, "grad_norm": 1.4503143224413209, "learning_rate": 3.5316878371714838e-06, "loss": 0.4737, "step": 16860 }, { "epoch": 0.6358120076885388, "grad_norm": 1.631831674936825, "learning_rate": 3.525400996145023e-06, "loss": 0.4847, "step": 16870 }, { "epoch": 0.6361888968454378, "grad_norm": 1.863352381042274, "learning_rate": 3.5191167074751385e-06, "loss": 0.461, "step": 16880 }, { "epoch": 0.6365657860023367, "grad_norm": 1.6699698482794394, "learning_rate": 3.512834982039196e-06, "loss": 0.4629, "step": 16890 }, { "epoch": 0.6369426751592356, "grad_norm": 1.487044517490085, "learning_rate": 3.506555830710118e-06, "loss": 0.5105, "step": 16900 }, { "epoch": 0.6373195643161347, "grad_norm": 1.4479081048385236, "learning_rate": 3.500279264356374e-06, "loss": 0.5242, "step": 16910 }, { "epoch": 0.6376964534730336, "grad_norm": 1.447171417720663, "learning_rate": 3.4940052938419583e-06, "loss": 0.508, "step": 16920 }, { "epoch": 0.6380733426299325, "grad_norm": 1.6796141341080977, "learning_rate": 3.4877339300263712e-06, "loss": 0.5005, "step": 16930 }, { "epoch": 0.6384502317868315, "grad_norm": 1.6662153763761518, "learning_rate": 3.481465183764602e-06, "loss": 0.4834, "step": 16940 }, { "epoch": 0.6388271209437304, "grad_norm": 1.5754157381604186, "learning_rate": 3.475199065907111e-06, "loss": 0.4874, "step": 16950 }, { "epoch": 0.6392040101006294, "grad_norm": 1.6041426691346072, "learning_rate": 3.4689355872998085e-06, "loss": 0.4865, "step": 16960 }, { "epoch": 0.6395808992575284, "grad_norm": 1.6020448439122437, "learning_rate": 3.4626747587840336e-06, "loss": 0.4804, "step": 16970 }, { "epoch": 0.6399577884144273, "grad_norm": 1.5691356747235539, "learning_rate": 3.4564165911965407e-06, "loss": 0.4491, "step": 16980 }, { "epoch": 0.6403346775713262, "grad_norm": 1.6238593763575764, "learning_rate": 3.4501610953694775e-06, "loss": 0.4721, "step": 16990 }, { "epoch": 0.6407115667282253, "grad_norm": 1.8524954109583753, "learning_rate": 3.4439082821303723e-06, "loss": 0.5065, "step": 17000 }, { "epoch": 0.6410884558851242, "grad_norm": 1.5903092899398827, "learning_rate": 3.4376581623020987e-06, "loss": 0.4751, "step": 17010 }, { "epoch": 0.6414653450420231, "grad_norm": 1.6223821940369376, "learning_rate": 3.43141074670288e-06, "loss": 0.482, "step": 17020 }, { "epoch": 0.6418422341989221, "grad_norm": 1.7154442225202828, "learning_rate": 3.425166046146254e-06, "loss": 0.5018, "step": 17030 }, { "epoch": 0.6422191233558211, "grad_norm": 1.4885853201247903, "learning_rate": 3.4189240714410587e-06, "loss": 0.4554, "step": 17040 }, { "epoch": 0.64259601251272, "grad_norm": 1.7004544435150633, "learning_rate": 3.412684833391413e-06, "loss": 0.4831, "step": 17050 }, { "epoch": 0.642972901669619, "grad_norm": 1.5122082088204818, "learning_rate": 3.406448342796702e-06, "loss": 0.4601, "step": 17060 }, { "epoch": 0.6433497908265179, "grad_norm": 1.5886153641711012, "learning_rate": 3.400214610451553e-06, "loss": 0.5, "step": 17070 }, { "epoch": 0.6437266799834169, "grad_norm": 1.4497829362440928, "learning_rate": 3.393983647145823e-06, "loss": 0.4886, "step": 17080 }, { "epoch": 0.6441035691403159, "grad_norm": 1.8243610592206863, "learning_rate": 3.387755463664567e-06, "loss": 0.4844, "step": 17090 }, { "epoch": 0.6444804582972148, "grad_norm": 1.9217986771041224, "learning_rate": 3.3815300707880394e-06, "loss": 0.4921, "step": 17100 }, { "epoch": 0.6448573474541137, "grad_norm": 1.5791418592871644, "learning_rate": 3.3753074792916574e-06, "loss": 0.482, "step": 17110 }, { "epoch": 0.6452342366110126, "grad_norm": 1.4431656921824283, "learning_rate": 3.369087699945993e-06, "loss": 0.4711, "step": 17120 }, { "epoch": 0.6456111257679117, "grad_norm": 1.662138312216896, "learning_rate": 3.3628707435167467e-06, "loss": 0.4701, "step": 17130 }, { "epoch": 0.6459880149248106, "grad_norm": 1.6549524822340256, "learning_rate": 3.3566566207647354e-06, "loss": 0.4761, "step": 17140 }, { "epoch": 0.6463649040817095, "grad_norm": 1.7699738162116803, "learning_rate": 3.350445342445874e-06, "loss": 0.4945, "step": 17150 }, { "epoch": 0.6467417932386085, "grad_norm": 2.1127375357724083, "learning_rate": 3.344236919311149e-06, "loss": 0.4765, "step": 17160 }, { "epoch": 0.6471186823955075, "grad_norm": 1.5444316703665961, "learning_rate": 3.338031362106607e-06, "loss": 0.4936, "step": 17170 }, { "epoch": 0.6474955715524064, "grad_norm": 1.6616572448362332, "learning_rate": 3.3318286815733335e-06, "loss": 0.4746, "step": 17180 }, { "epoch": 0.6478724607093054, "grad_norm": 1.9913624341143157, "learning_rate": 3.325628888447437e-06, "loss": 0.5077, "step": 17190 }, { "epoch": 0.6482493498662043, "grad_norm": 1.7915552215298538, "learning_rate": 3.319431993460026e-06, "loss": 0.504, "step": 17200 }, { "epoch": 0.6486262390231033, "grad_norm": 1.732835360991205, "learning_rate": 3.3132380073371926e-06, "loss": 0.4925, "step": 17210 }, { "epoch": 0.6490031281800023, "grad_norm": 1.3938136019682936, "learning_rate": 3.3070469407999937e-06, "loss": 0.4801, "step": 17220 }, { "epoch": 0.6493800173369012, "grad_norm": 1.3212224075939478, "learning_rate": 3.3008588045644357e-06, "loss": 0.4609, "step": 17230 }, { "epoch": 0.6497569064938001, "grad_norm": 1.7945950130121113, "learning_rate": 3.2946736093414524e-06, "loss": 0.467, "step": 17240 }, { "epoch": 0.6501337956506992, "grad_norm": 1.7547142990915863, "learning_rate": 3.288491365836881e-06, "loss": 0.5079, "step": 17250 }, { "epoch": 0.6505106848075981, "grad_norm": 1.6591091274236383, "learning_rate": 3.2823120847514577e-06, "loss": 0.4642, "step": 17260 }, { "epoch": 0.650887573964497, "grad_norm": 1.5655640287361217, "learning_rate": 3.2761357767807857e-06, "loss": 0.4641, "step": 17270 }, { "epoch": 0.651264463121396, "grad_norm": 1.7936591153526042, "learning_rate": 3.269962452615326e-06, "loss": 0.471, "step": 17280 }, { "epoch": 0.651641352278295, "grad_norm": 1.5915717374592357, "learning_rate": 3.2637921229403734e-06, "loss": 0.4689, "step": 17290 }, { "epoch": 0.6520182414351939, "grad_norm": 1.775887565559236, "learning_rate": 3.2576247984360372e-06, "loss": 0.4985, "step": 17300 }, { "epoch": 0.6523951305920929, "grad_norm": 1.8241609832963868, "learning_rate": 3.251460489777228e-06, "loss": 0.4983, "step": 17310 }, { "epoch": 0.6527720197489918, "grad_norm": 1.7201193138133724, "learning_rate": 3.2452992076336356e-06, "loss": 0.486, "step": 17320 }, { "epoch": 0.6531489089058907, "grad_norm": 1.6087203461140804, "learning_rate": 3.239140962669711e-06, "loss": 0.4933, "step": 17330 }, { "epoch": 0.6535257980627898, "grad_norm": 1.7218137998329464, "learning_rate": 3.2329857655446483e-06, "loss": 0.4932, "step": 17340 }, { "epoch": 0.6539026872196887, "grad_norm": 1.5206186268024502, "learning_rate": 3.2268336269123646e-06, "loss": 0.4552, "step": 17350 }, { "epoch": 0.6542795763765876, "grad_norm": 1.4491517030153769, "learning_rate": 3.220684557421488e-06, "loss": 0.4773, "step": 17360 }, { "epoch": 0.6546564655334866, "grad_norm": 1.5069597772562586, "learning_rate": 3.21453856771533e-06, "loss": 0.4768, "step": 17370 }, { "epoch": 0.6550333546903856, "grad_norm": 1.5693052306029478, "learning_rate": 3.2083956684318708e-06, "loss": 0.4817, "step": 17380 }, { "epoch": 0.6554102438472845, "grad_norm": 1.6315582314566435, "learning_rate": 3.2022558702037432e-06, "loss": 0.4875, "step": 17390 }, { "epoch": 0.6557871330041835, "grad_norm": 1.8830189051247035, "learning_rate": 3.196119183658213e-06, "loss": 0.5081, "step": 17400 }, { "epoch": 0.6561640221610824, "grad_norm": 1.4994699080361438, "learning_rate": 3.1899856194171607e-06, "loss": 0.4637, "step": 17410 }, { "epoch": 0.6565409113179814, "grad_norm": 1.5681733019464799, "learning_rate": 3.183855188097057e-06, "loss": 0.4754, "step": 17420 }, { "epoch": 0.6569178004748804, "grad_norm": 1.7957738354956814, "learning_rate": 3.177727900308958e-06, "loss": 0.4834, "step": 17430 }, { "epoch": 0.6572946896317793, "grad_norm": 1.4836118332583659, "learning_rate": 3.171603766658472e-06, "loss": 0.487, "step": 17440 }, { "epoch": 0.6576715787886782, "grad_norm": 1.8415155422864224, "learning_rate": 3.1654827977457526e-06, "loss": 0.4965, "step": 17450 }, { "epoch": 0.6580484679455773, "grad_norm": 1.48362840539746, "learning_rate": 3.1593650041654716e-06, "loss": 0.446, "step": 17460 }, { "epoch": 0.6584253571024762, "grad_norm": 1.6394333815884048, "learning_rate": 3.1532503965068073e-06, "loss": 0.4687, "step": 17470 }, { "epoch": 0.6588022462593751, "grad_norm": 1.483403229969005, "learning_rate": 3.1471389853534217e-06, "loss": 0.4596, "step": 17480 }, { "epoch": 0.659179135416274, "grad_norm": 1.3730547797639057, "learning_rate": 3.141030781283449e-06, "loss": 0.4691, "step": 17490 }, { "epoch": 0.6595560245731731, "grad_norm": 1.692266344168683, "learning_rate": 3.134925794869463e-06, "loss": 0.4649, "step": 17500 }, { "epoch": 0.659932913730072, "grad_norm": 1.5190834038718941, "learning_rate": 3.128824036678477e-06, "loss": 0.4734, "step": 17510 }, { "epoch": 0.6603098028869709, "grad_norm": 1.4798475487891651, "learning_rate": 3.1227255172719127e-06, "loss": 0.486, "step": 17520 }, { "epoch": 0.6606866920438699, "grad_norm": 1.5780730702677566, "learning_rate": 3.1166302472055873e-06, "loss": 0.462, "step": 17530 }, { "epoch": 0.6610635812007688, "grad_norm": 1.645103230020474, "learning_rate": 3.11053823702969e-06, "loss": 0.4983, "step": 17540 }, { "epoch": 0.6614404703576678, "grad_norm": 1.6169975944237835, "learning_rate": 3.104449497288772e-06, "loss": 0.4693, "step": 17550 }, { "epoch": 0.6618173595145668, "grad_norm": 1.6644257874809296, "learning_rate": 3.0983640385217224e-06, "loss": 0.537, "step": 17560 }, { "epoch": 0.6621942486714657, "grad_norm": 1.5913845909506998, "learning_rate": 3.092281871261752e-06, "loss": 0.4874, "step": 17570 }, { "epoch": 0.6625711378283646, "grad_norm": 1.6846273341706974, "learning_rate": 3.086203006036371e-06, "loss": 0.4407, "step": 17580 }, { "epoch": 0.6629480269852637, "grad_norm": 1.4314112760740891, "learning_rate": 3.0801274533673776e-06, "loss": 0.4778, "step": 17590 }, { "epoch": 0.6633249161421626, "grad_norm": 1.528021919193015, "learning_rate": 3.0740552237708366e-06, "loss": 0.5041, "step": 17600 }, { "epoch": 0.6637018052990615, "grad_norm": 1.679731483162306, "learning_rate": 3.0679863277570566e-06, "loss": 0.4939, "step": 17610 }, { "epoch": 0.6640786944559605, "grad_norm": 1.4342878274161748, "learning_rate": 3.0619207758305848e-06, "loss": 0.4895, "step": 17620 }, { "epoch": 0.6644555836128595, "grad_norm": 1.7346799588693844, "learning_rate": 3.0558585784901675e-06, "loss": 0.486, "step": 17630 }, { "epoch": 0.6648324727697584, "grad_norm": 1.783807850705795, "learning_rate": 3.0497997462287566e-06, "loss": 0.492, "step": 17640 }, { "epoch": 0.6652093619266574, "grad_norm": 2.0262857478932634, "learning_rate": 3.0437442895334734e-06, "loss": 0.5014, "step": 17650 }, { "epoch": 0.6655862510835563, "grad_norm": 1.5453260298900549, "learning_rate": 3.037692218885599e-06, "loss": 0.4923, "step": 17660 }, { "epoch": 0.6659631402404553, "grad_norm": 2.203790859304095, "learning_rate": 3.0316435447605495e-06, "loss": 0.4846, "step": 17670 }, { "epoch": 0.6663400293973543, "grad_norm": 1.546001802031686, "learning_rate": 3.025598277627866e-06, "loss": 0.4971, "step": 17680 }, { "epoch": 0.6667169185542532, "grad_norm": 1.5128697377492308, "learning_rate": 3.0195564279511925e-06, "loss": 0.4797, "step": 17690 }, { "epoch": 0.6670938077111521, "grad_norm": 2.0573665058922135, "learning_rate": 3.0135180061882564e-06, "loss": 0.4543, "step": 17700 }, { "epoch": 0.6674706968680512, "grad_norm": 1.7400957327471658, "learning_rate": 3.0074830227908514e-06, "loss": 0.4574, "step": 17710 }, { "epoch": 0.6678475860249501, "grad_norm": 1.3380341450114601, "learning_rate": 3.0014514882048195e-06, "loss": 0.5058, "step": 17720 }, { "epoch": 0.668224475181849, "grad_norm": 2.076718786647805, "learning_rate": 2.995423412870036e-06, "loss": 0.4859, "step": 17730 }, { "epoch": 0.668601364338748, "grad_norm": 1.4195821383830665, "learning_rate": 2.9893988072203867e-06, "loss": 0.492, "step": 17740 }, { "epoch": 0.6689782534956469, "grad_norm": 1.8805532322048002, "learning_rate": 2.98337768168375e-06, "loss": 0.483, "step": 17750 }, { "epoch": 0.6693551426525459, "grad_norm": 1.6317624518967453, "learning_rate": 2.977360046681983e-06, "loss": 0.478, "step": 17760 }, { "epoch": 0.6697320318094448, "grad_norm": 2.0847886406534513, "learning_rate": 2.971345912630902e-06, "loss": 0.4872, "step": 17770 }, { "epoch": 0.6701089209663438, "grad_norm": 1.7246432982403883, "learning_rate": 2.965335289940263e-06, "loss": 0.5001, "step": 17780 }, { "epoch": 0.6704858101232427, "grad_norm": 1.6363668958491415, "learning_rate": 2.9593281890137404e-06, "loss": 0.4686, "step": 17790 }, { "epoch": 0.6708626992801417, "grad_norm": 1.7985482020730335, "learning_rate": 2.9533246202489173e-06, "loss": 0.4969, "step": 17800 }, { "epoch": 0.6712395884370407, "grad_norm": 1.6264485996061422, "learning_rate": 2.9473245940372608e-06, "loss": 0.4936, "step": 17810 }, { "epoch": 0.6716164775939396, "grad_norm": 1.6890174531918563, "learning_rate": 2.9413281207641114e-06, "loss": 0.4799, "step": 17820 }, { "epoch": 0.6719933667508385, "grad_norm": 1.5841849546055524, "learning_rate": 2.9353352108086485e-06, "loss": 0.4998, "step": 17830 }, { "epoch": 0.6723702559077376, "grad_norm": 1.6089769681179087, "learning_rate": 2.929345874543896e-06, "loss": 0.4887, "step": 17840 }, { "epoch": 0.6727471450646365, "grad_norm": 1.5674713179458315, "learning_rate": 2.923360122336686e-06, "loss": 0.4847, "step": 17850 }, { "epoch": 0.6731240342215354, "grad_norm": 1.584156919079405, "learning_rate": 2.9173779645476474e-06, "loss": 0.4628, "step": 17860 }, { "epoch": 0.6735009233784344, "grad_norm": 1.6929345771758708, "learning_rate": 2.911399411531188e-06, "loss": 0.4748, "step": 17870 }, { "epoch": 0.6738778125353334, "grad_norm": 1.787787378046894, "learning_rate": 2.9054244736354766e-06, "loss": 0.4735, "step": 17880 }, { "epoch": 0.6742547016922323, "grad_norm": 1.5201306101583867, "learning_rate": 2.899453161202425e-06, "loss": 0.4833, "step": 17890 }, { "epoch": 0.6746315908491313, "grad_norm": 1.9695556108547445, "learning_rate": 2.893485484567669e-06, "loss": 0.4861, "step": 17900 }, { "epoch": 0.6750084800060302, "grad_norm": 1.4351406646686231, "learning_rate": 2.887521454060551e-06, "loss": 0.4673, "step": 17910 }, { "epoch": 0.6753853691629291, "grad_norm": 2.0856261193107732, "learning_rate": 2.881561080004104e-06, "loss": 0.4938, "step": 17920 }, { "epoch": 0.6757622583198282, "grad_norm": 1.609969345939983, "learning_rate": 2.8756043727150295e-06, "loss": 0.4843, "step": 17930 }, { "epoch": 0.6761391474767271, "grad_norm": 1.3392614292717313, "learning_rate": 2.8696513425036874e-06, "loss": 0.4819, "step": 17940 }, { "epoch": 0.676516036633626, "grad_norm": 1.7102447232516564, "learning_rate": 2.8637019996740624e-06, "loss": 0.494, "step": 17950 }, { "epoch": 0.676892925790525, "grad_norm": 1.6998269355745326, "learning_rate": 2.8577563545237686e-06, "loss": 0.4653, "step": 17960 }, { "epoch": 0.677269814947424, "grad_norm": 1.4862506147625136, "learning_rate": 2.8518144173440153e-06, "loss": 0.4922, "step": 17970 }, { "epoch": 0.6776467041043229, "grad_norm": 1.510405588964004, "learning_rate": 2.8458761984195913e-06, "loss": 0.4763, "step": 17980 }, { "epoch": 0.6780235932612219, "grad_norm": 1.5288462993209506, "learning_rate": 2.839941708028856e-06, "loss": 0.4736, "step": 17990 }, { "epoch": 0.6784004824181208, "grad_norm": 1.649944605527973, "learning_rate": 2.8340109564437028e-06, "loss": 0.5076, "step": 18000 }, { "epoch": 0.6787773715750198, "grad_norm": 1.517180625990711, "learning_rate": 2.8280839539295685e-06, "loss": 0.4903, "step": 18010 }, { "epoch": 0.6791542607319188, "grad_norm": 1.6911443889236912, "learning_rate": 2.822160710745392e-06, "loss": 0.4963, "step": 18020 }, { "epoch": 0.6795311498888177, "grad_norm": 1.870392805720581, "learning_rate": 2.8162412371436087e-06, "loss": 0.4636, "step": 18030 }, { "epoch": 0.6799080390457166, "grad_norm": 1.7551336831390325, "learning_rate": 2.8103255433701238e-06, "loss": 0.505, "step": 18040 }, { "epoch": 0.6802849282026157, "grad_norm": 2.112747546430556, "learning_rate": 2.804413639664306e-06, "loss": 0.4923, "step": 18050 }, { "epoch": 0.6806618173595146, "grad_norm": 1.389558866431419, "learning_rate": 2.7985055362589597e-06, "loss": 0.4746, "step": 18060 }, { "epoch": 0.6810387065164135, "grad_norm": 1.8602009007028841, "learning_rate": 2.792601243380321e-06, "loss": 0.4935, "step": 18070 }, { "epoch": 0.6814155956733124, "grad_norm": 1.8232438996039204, "learning_rate": 2.7867007712480145e-06, "loss": 0.4638, "step": 18080 }, { "epoch": 0.6817924848302115, "grad_norm": 1.422637041046491, "learning_rate": 2.780804130075064e-06, "loss": 0.4859, "step": 18090 }, { "epoch": 0.6821693739871104, "grad_norm": 1.79592896537598, "learning_rate": 2.7749113300678576e-06, "loss": 0.4994, "step": 18100 }, { "epoch": 0.6825462631440093, "grad_norm": 1.7163110389066987, "learning_rate": 2.7690223814261358e-06, "loss": 0.4649, "step": 18110 }, { "epoch": 0.6829231523009083, "grad_norm": 1.4437242473003, "learning_rate": 2.7631372943429724e-06, "loss": 0.4818, "step": 18120 }, { "epoch": 0.6833000414578072, "grad_norm": 1.799293879805964, "learning_rate": 2.757256079004758e-06, "loss": 0.5199, "step": 18130 }, { "epoch": 0.6836769306147062, "grad_norm": 1.8978824221262882, "learning_rate": 2.751378745591181e-06, "loss": 0.4743, "step": 18140 }, { "epoch": 0.6840538197716052, "grad_norm": 1.7079497057612991, "learning_rate": 2.74550530427521e-06, "loss": 0.4572, "step": 18150 }, { "epoch": 0.6844307089285041, "grad_norm": 1.6167362676629933, "learning_rate": 2.739635765223079e-06, "loss": 0.4645, "step": 18160 }, { "epoch": 0.684807598085403, "grad_norm": 1.1239553035502026, "learning_rate": 2.7337701385942655e-06, "loss": 0.4916, "step": 18170 }, { "epoch": 0.6851844872423021, "grad_norm": 1.80414731066518, "learning_rate": 2.7279084345414765e-06, "loss": 0.4832, "step": 18180 }, { "epoch": 0.685561376399201, "grad_norm": 1.6220429458848973, "learning_rate": 2.7220506632106304e-06, "loss": 0.4637, "step": 18190 }, { "epoch": 0.6859382655560999, "grad_norm": 1.6869049749820662, "learning_rate": 2.7161968347408325e-06, "loss": 0.4966, "step": 18200 }, { "epoch": 0.6863151547129989, "grad_norm": 1.6657505964750594, "learning_rate": 2.710346959264369e-06, "loss": 0.4561, "step": 18210 }, { "epoch": 0.6866920438698979, "grad_norm": 1.3973709297787218, "learning_rate": 2.7045010469066864e-06, "loss": 0.4933, "step": 18220 }, { "epoch": 0.6870689330267968, "grad_norm": 1.5860018125512794, "learning_rate": 2.6986591077863677e-06, "loss": 0.4857, "step": 18230 }, { "epoch": 0.6874458221836958, "grad_norm": 1.4618918574737128, "learning_rate": 2.692821152015116e-06, "loss": 0.4591, "step": 18240 }, { "epoch": 0.6878227113405947, "grad_norm": 1.638643622605981, "learning_rate": 2.686987189697744e-06, "loss": 0.4951, "step": 18250 }, { "epoch": 0.6881996004974937, "grad_norm": 1.7231621888652404, "learning_rate": 2.6811572309321487e-06, "loss": 0.4845, "step": 18260 }, { "epoch": 0.6885764896543927, "grad_norm": 1.834141519895122, "learning_rate": 2.6753312858093056e-06, "loss": 0.4793, "step": 18270 }, { "epoch": 0.6889533788112916, "grad_norm": 1.401487676831547, "learning_rate": 2.669509364413232e-06, "loss": 0.4864, "step": 18280 }, { "epoch": 0.6893302679681905, "grad_norm": 1.6493898881695628, "learning_rate": 2.6636914768209867e-06, "loss": 0.4856, "step": 18290 }, { "epoch": 0.6897071571250896, "grad_norm": 1.751769186322749, "learning_rate": 2.6578776331026456e-06, "loss": 0.5033, "step": 18300 }, { "epoch": 0.6900840462819885, "grad_norm": 1.3871471388084073, "learning_rate": 2.6520678433212854e-06, "loss": 0.4726, "step": 18310 }, { "epoch": 0.6904609354388874, "grad_norm": 1.5531242747701188, "learning_rate": 2.646262117532966e-06, "loss": 0.4818, "step": 18320 }, { "epoch": 0.6908378245957864, "grad_norm": 1.5566017006302422, "learning_rate": 2.640460465786711e-06, "loss": 0.4743, "step": 18330 }, { "epoch": 0.6912147137526853, "grad_norm": 1.7092683735724328, "learning_rate": 2.634662898124495e-06, "loss": 0.4473, "step": 18340 }, { "epoch": 0.6915916029095843, "grad_norm": 1.7674173470502206, "learning_rate": 2.6288694245812217e-06, "loss": 0.5052, "step": 18350 }, { "epoch": 0.6919684920664833, "grad_norm": 1.7113075330809617, "learning_rate": 2.6230800551847096e-06, "loss": 0.4968, "step": 18360 }, { "epoch": 0.6923453812233822, "grad_norm": 1.666331092982349, "learning_rate": 2.6172947999556723e-06, "loss": 0.4852, "step": 18370 }, { "epoch": 0.6927222703802811, "grad_norm": 1.7496234849403591, "learning_rate": 2.6115136689077037e-06, "loss": 0.4809, "step": 18380 }, { "epoch": 0.6930991595371802, "grad_norm": 1.4830201051445864, "learning_rate": 2.605736672047257e-06, "loss": 0.4579, "step": 18390 }, { "epoch": 0.6934760486940791, "grad_norm": 1.6817677793368029, "learning_rate": 2.5999638193736337e-06, "loss": 0.4922, "step": 18400 }, { "epoch": 0.693852937850978, "grad_norm": 1.4954573467617007, "learning_rate": 2.594195120878954e-06, "loss": 0.5002, "step": 18410 }, { "epoch": 0.6942298270078769, "grad_norm": 1.5504183128022453, "learning_rate": 2.5884305865481572e-06, "loss": 0.4834, "step": 18420 }, { "epoch": 0.694606716164776, "grad_norm": 1.554112427963254, "learning_rate": 2.582670226358971e-06, "loss": 0.4486, "step": 18430 }, { "epoch": 0.6949836053216749, "grad_norm": 1.4901063381514676, "learning_rate": 2.576914050281899e-06, "loss": 0.494, "step": 18440 }, { "epoch": 0.6953604944785738, "grad_norm": 1.5107903863102292, "learning_rate": 2.5711620682801973e-06, "loss": 0.4662, "step": 18450 }, { "epoch": 0.6957373836354728, "grad_norm": 1.4705557147554638, "learning_rate": 2.56541429030987e-06, "loss": 0.4955, "step": 18460 }, { "epoch": 0.6961142727923718, "grad_norm": 1.5353961394271842, "learning_rate": 2.5596707263196386e-06, "loss": 0.4796, "step": 18470 }, { "epoch": 0.6964911619492707, "grad_norm": 1.3895564519970267, "learning_rate": 2.55393138625094e-06, "loss": 0.4634, "step": 18480 }, { "epoch": 0.6968680511061697, "grad_norm": 1.3450383420888639, "learning_rate": 2.548196280037886e-06, "loss": 0.4798, "step": 18490 }, { "epoch": 0.6972449402630686, "grad_norm": 1.8244663013187354, "learning_rate": 2.5424654176072714e-06, "loss": 0.4746, "step": 18500 }, { "epoch": 0.6976218294199676, "grad_norm": 1.5654399272762263, "learning_rate": 2.5367388088785413e-06, "loss": 0.4899, "step": 18510 }, { "epoch": 0.6979987185768666, "grad_norm": 1.4654205360338723, "learning_rate": 2.5310164637637773e-06, "loss": 0.49, "step": 18520 }, { "epoch": 0.6983756077337655, "grad_norm": 1.4467059234382345, "learning_rate": 2.525298392167683e-06, "loss": 0.4427, "step": 18530 }, { "epoch": 0.6987524968906644, "grad_norm": 1.5408140891984066, "learning_rate": 2.519584603987566e-06, "loss": 0.4758, "step": 18540 }, { "epoch": 0.6991293860475634, "grad_norm": 1.7821474974934417, "learning_rate": 2.513875109113316e-06, "loss": 0.4865, "step": 18550 }, { "epoch": 0.6995062752044624, "grad_norm": 1.9953848627201554, "learning_rate": 2.5081699174273955e-06, "loss": 0.4808, "step": 18560 }, { "epoch": 0.6998831643613613, "grad_norm": 1.741343264928784, "learning_rate": 2.5024690388048154e-06, "loss": 0.502, "step": 18570 }, { "epoch": 0.7002600535182603, "grad_norm": 1.6437287473675368, "learning_rate": 2.4967724831131244e-06, "loss": 0.4969, "step": 18580 }, { "epoch": 0.7006369426751592, "grad_norm": 1.6241010396370417, "learning_rate": 2.4910802602123865e-06, "loss": 0.5005, "step": 18590 }, { "epoch": 0.7010138318320582, "grad_norm": 1.4898387628775849, "learning_rate": 2.4853923799551677e-06, "loss": 0.484, "step": 18600 }, { "epoch": 0.7013907209889572, "grad_norm": 1.6400342378919166, "learning_rate": 2.4797088521865138e-06, "loss": 0.4722, "step": 18610 }, { "epoch": 0.7017676101458561, "grad_norm": 1.479505953040376, "learning_rate": 2.474029686743939e-06, "loss": 0.5037, "step": 18620 }, { "epoch": 0.702144499302755, "grad_norm": 1.4589932060948365, "learning_rate": 2.4683548934574115e-06, "loss": 0.4609, "step": 18630 }, { "epoch": 0.7025213884596541, "grad_norm": 1.551678370051871, "learning_rate": 2.462684482149327e-06, "loss": 0.4801, "step": 18640 }, { "epoch": 0.702898277616553, "grad_norm": 1.7192058552759832, "learning_rate": 2.4570184626344944e-06, "loss": 0.4644, "step": 18650 }, { "epoch": 0.7032751667734519, "grad_norm": 1.631280002083957, "learning_rate": 2.451356844720125e-06, "loss": 0.4907, "step": 18660 }, { "epoch": 0.7036520559303509, "grad_norm": 1.6750923039054713, "learning_rate": 2.445699638205809e-06, "loss": 0.5031, "step": 18670 }, { "epoch": 0.7040289450872499, "grad_norm": 1.9691622944065195, "learning_rate": 2.440046852883507e-06, "loss": 0.4536, "step": 18680 }, { "epoch": 0.7044058342441488, "grad_norm": 1.507322764879853, "learning_rate": 2.4343984985375167e-06, "loss": 0.4725, "step": 18690 }, { "epoch": 0.7047827234010478, "grad_norm": 1.7740189927150876, "learning_rate": 2.4287545849444747e-06, "loss": 0.4698, "step": 18700 }, { "epoch": 0.7051596125579467, "grad_norm": 1.5752161156901476, "learning_rate": 2.423115121873328e-06, "loss": 0.4571, "step": 18710 }, { "epoch": 0.7055365017148456, "grad_norm": 1.9762064673353243, "learning_rate": 2.4174801190853196e-06, "loss": 0.4921, "step": 18720 }, { "epoch": 0.7059133908717447, "grad_norm": 1.6174279577484105, "learning_rate": 2.411849586333974e-06, "loss": 0.4975, "step": 18730 }, { "epoch": 0.7062902800286436, "grad_norm": 1.7193669580907476, "learning_rate": 2.406223533365078e-06, "loss": 0.4873, "step": 18740 }, { "epoch": 0.7066671691855425, "grad_norm": 1.7235537874317444, "learning_rate": 2.4006019699166643e-06, "loss": 0.4685, "step": 18750 }, { "epoch": 0.7070440583424414, "grad_norm": 1.63971711730806, "learning_rate": 2.394984905718994e-06, "loss": 0.4818, "step": 18760 }, { "epoch": 0.7074209474993405, "grad_norm": 1.4461491095827899, "learning_rate": 2.3893723504945425e-06, "loss": 0.4914, "step": 18770 }, { "epoch": 0.7077978366562394, "grad_norm": 1.4529129185509562, "learning_rate": 2.3837643139579786e-06, "loss": 0.4628, "step": 18780 }, { "epoch": 0.7081747258131383, "grad_norm": 1.53376236798455, "learning_rate": 2.378160805816151e-06, "loss": 0.4969, "step": 18790 }, { "epoch": 0.7085516149700373, "grad_norm": 1.6666468688910057, "learning_rate": 2.3725618357680697e-06, "loss": 0.4796, "step": 18800 }, { "epoch": 0.7089285041269363, "grad_norm": 1.6506198653287087, "learning_rate": 2.366967413504892e-06, "loss": 0.4854, "step": 18810 }, { "epoch": 0.7093053932838352, "grad_norm": 1.7178732497859122, "learning_rate": 2.361377548709897e-06, "loss": 0.4727, "step": 18820 }, { "epoch": 0.7096822824407342, "grad_norm": 1.4181246211634644, "learning_rate": 2.3557922510584837e-06, "loss": 0.4836, "step": 18830 }, { "epoch": 0.7100591715976331, "grad_norm": 1.5200708457900483, "learning_rate": 2.3502115302181415e-06, "loss": 0.4736, "step": 18840 }, { "epoch": 0.7104360607545321, "grad_norm": 1.8352918119756971, "learning_rate": 2.3446353958484404e-06, "loss": 0.4932, "step": 18850 }, { "epoch": 0.7108129499114311, "grad_norm": 1.6181777322718822, "learning_rate": 2.339063857601006e-06, "loss": 0.481, "step": 18860 }, { "epoch": 0.71118983906833, "grad_norm": 1.6727732163944582, "learning_rate": 2.3334969251195137e-06, "loss": 0.4688, "step": 18870 }, { "epoch": 0.7115667282252289, "grad_norm": 1.5382920829842202, "learning_rate": 2.3279346080396652e-06, "loss": 0.4799, "step": 18880 }, { "epoch": 0.711943617382128, "grad_norm": 1.6126228289797586, "learning_rate": 2.322376915989178e-06, "loss": 0.4579, "step": 18890 }, { "epoch": 0.7123205065390269, "grad_norm": 2.051868736900158, "learning_rate": 2.3168238585877552e-06, "loss": 0.5088, "step": 18900 }, { "epoch": 0.7126973956959258, "grad_norm": 1.485292408430921, "learning_rate": 2.3112754454470847e-06, "loss": 0.4821, "step": 18910 }, { "epoch": 0.7130742848528248, "grad_norm": 1.488486027715864, "learning_rate": 2.305731686170814e-06, "loss": 0.4498, "step": 18920 }, { "epoch": 0.7134511740097237, "grad_norm": 1.5197855092868775, "learning_rate": 2.300192590354534e-06, "loss": 0.466, "step": 18930 }, { "epoch": 0.7138280631666227, "grad_norm": 1.723349595195898, "learning_rate": 2.2946581675857667e-06, "loss": 0.4954, "step": 18940 }, { "epoch": 0.7142049523235217, "grad_norm": 1.8046482293904333, "learning_rate": 2.2891284274439424e-06, "loss": 0.4798, "step": 18950 }, { "epoch": 0.7145818414804206, "grad_norm": 1.5075542664335193, "learning_rate": 2.2836033795003882e-06, "loss": 0.4625, "step": 18960 }, { "epoch": 0.7149587306373195, "grad_norm": 1.7547546524633186, "learning_rate": 2.2780830333183086e-06, "loss": 0.4916, "step": 18970 }, { "epoch": 0.7153356197942186, "grad_norm": 1.5599589166635623, "learning_rate": 2.2725673984527706e-06, "loss": 0.4878, "step": 18980 }, { "epoch": 0.7157125089511175, "grad_norm": 1.5938966644066468, "learning_rate": 2.2670564844506863e-06, "loss": 0.4914, "step": 18990 }, { "epoch": 0.7160893981080164, "grad_norm": 1.758549701257888, "learning_rate": 2.2615503008507965e-06, "loss": 0.46, "step": 19000 }, { "epoch": 0.7164662872649153, "grad_norm": 1.6218207774272693, "learning_rate": 2.256048857183656e-06, "loss": 0.4709, "step": 19010 }, { "epoch": 0.7168431764218144, "grad_norm": 2.0658511464902447, "learning_rate": 2.2505521629716095e-06, "loss": 0.4902, "step": 19020 }, { "epoch": 0.7172200655787133, "grad_norm": 1.6814039721656933, "learning_rate": 2.245060227728785e-06, "loss": 0.4711, "step": 19030 }, { "epoch": 0.7175969547356122, "grad_norm": 1.7568224370312793, "learning_rate": 2.2395730609610777e-06, "loss": 0.4949, "step": 19040 }, { "epoch": 0.7179738438925112, "grad_norm": 1.8189719042512071, "learning_rate": 2.234090672166122e-06, "loss": 0.5321, "step": 19050 }, { "epoch": 0.7183507330494102, "grad_norm": 1.568239365153995, "learning_rate": 2.2286130708332876e-06, "loss": 0.4724, "step": 19060 }, { "epoch": 0.7187276222063091, "grad_norm": 2.0389164815636325, "learning_rate": 2.22314026644365e-06, "loss": 0.4657, "step": 19070 }, { "epoch": 0.7191045113632081, "grad_norm": 1.385299223521904, "learning_rate": 2.2176722684699882e-06, "loss": 0.4835, "step": 19080 }, { "epoch": 0.719481400520107, "grad_norm": 1.4633858575844483, "learning_rate": 2.2122090863767627e-06, "loss": 0.4542, "step": 19090 }, { "epoch": 0.719858289677006, "grad_norm": 1.7748405690606124, "learning_rate": 2.206750729620097e-06, "loss": 0.4782, "step": 19100 }, { "epoch": 0.720235178833905, "grad_norm": 1.5545823449736953, "learning_rate": 2.201297207647757e-06, "loss": 0.4548, "step": 19110 }, { "epoch": 0.7206120679908039, "grad_norm": 2.292728574012736, "learning_rate": 2.195848529899147e-06, "loss": 0.4722, "step": 19120 }, { "epoch": 0.7209889571477028, "grad_norm": 1.716053797160062, "learning_rate": 2.1904047058052842e-06, "loss": 0.4828, "step": 19130 }, { "epoch": 0.7213658463046018, "grad_norm": 1.714097262044458, "learning_rate": 2.1849657447887847e-06, "loss": 0.4735, "step": 19140 }, { "epoch": 0.7217427354615008, "grad_norm": 1.7301553058622174, "learning_rate": 2.1795316562638462e-06, "loss": 0.485, "step": 19150 }, { "epoch": 0.7221196246183997, "grad_norm": 2.437219895216966, "learning_rate": 2.1741024496362344e-06, "loss": 0.455, "step": 19160 }, { "epoch": 0.7224965137752987, "grad_norm": 1.7131734298817816, "learning_rate": 2.1686781343032647e-06, "loss": 0.484, "step": 19170 }, { "epoch": 0.7228734029321976, "grad_norm": 1.4800874315130503, "learning_rate": 2.1632587196537853e-06, "loss": 0.4646, "step": 19180 }, { "epoch": 0.7232502920890966, "grad_norm": 1.6136385919031264, "learning_rate": 2.1578442150681615e-06, "loss": 0.4615, "step": 19190 }, { "epoch": 0.7236271812459956, "grad_norm": 1.678439760720894, "learning_rate": 2.1524346299182626e-06, "loss": 0.4632, "step": 19200 }, { "epoch": 0.7240040704028945, "grad_norm": 1.5309141919283373, "learning_rate": 2.14702997356744e-06, "loss": 0.4585, "step": 19210 }, { "epoch": 0.7243809595597934, "grad_norm": 1.393910668679631, "learning_rate": 2.1416302553705165e-06, "loss": 0.4844, "step": 19220 }, { "epoch": 0.7247578487166925, "grad_norm": 1.6745490665396063, "learning_rate": 2.136235484673761e-06, "loss": 0.4523, "step": 19230 }, { "epoch": 0.7251347378735914, "grad_norm": 1.6646971203310925, "learning_rate": 2.1308456708148896e-06, "loss": 0.4777, "step": 19240 }, { "epoch": 0.7255116270304903, "grad_norm": 1.7651217240185753, "learning_rate": 2.1254608231230312e-06, "loss": 0.4985, "step": 19250 }, { "epoch": 0.7258885161873893, "grad_norm": 1.458188978373751, "learning_rate": 2.120080950918722e-06, "loss": 0.4609, "step": 19260 }, { "epoch": 0.7262654053442883, "grad_norm": 1.3512368514891746, "learning_rate": 2.1147060635138817e-06, "loss": 0.4586, "step": 19270 }, { "epoch": 0.7266422945011872, "grad_norm": 1.6354165388054114, "learning_rate": 2.1093361702118065e-06, "loss": 0.4673, "step": 19280 }, { "epoch": 0.7270191836580862, "grad_norm": 1.6908458833172149, "learning_rate": 2.103971280307146e-06, "loss": 0.4652, "step": 19290 }, { "epoch": 0.7273960728149851, "grad_norm": 1.5384882126868258, "learning_rate": 2.098611403085895e-06, "loss": 0.4553, "step": 19300 }, { "epoch": 0.727772961971884, "grad_norm": 1.3879436105661611, "learning_rate": 2.0932565478253624e-06, "loss": 0.4606, "step": 19310 }, { "epoch": 0.728149851128783, "grad_norm": 1.7839927445388464, "learning_rate": 2.087906723794171e-06, "loss": 0.4913, "step": 19320 }, { "epoch": 0.728526740285682, "grad_norm": 1.5679146539305475, "learning_rate": 2.0825619402522356e-06, "loss": 0.4764, "step": 19330 }, { "epoch": 0.7289036294425809, "grad_norm": 1.6427710422238193, "learning_rate": 2.077222206450743e-06, "loss": 0.4941, "step": 19340 }, { "epoch": 0.7292805185994798, "grad_norm": 1.9163137570823914, "learning_rate": 2.0718875316321413e-06, "loss": 0.4728, "step": 19350 }, { "epoch": 0.7296574077563789, "grad_norm": 1.7523481576736066, "learning_rate": 2.066557925030123e-06, "loss": 0.4837, "step": 19360 }, { "epoch": 0.7300342969132778, "grad_norm": 1.6581516586288727, "learning_rate": 2.0612333958696068e-06, "loss": 0.4649, "step": 19370 }, { "epoch": 0.7304111860701767, "grad_norm": 1.833543727989413, "learning_rate": 2.0559139533667227e-06, "loss": 0.4888, "step": 19380 }, { "epoch": 0.7307880752270757, "grad_norm": 2.948145026805694, "learning_rate": 2.050599606728798e-06, "loss": 0.4679, "step": 19390 }, { "epoch": 0.7311649643839747, "grad_norm": 1.6265548517672246, "learning_rate": 2.045290365154338e-06, "loss": 0.4561, "step": 19400 }, { "epoch": 0.7315418535408736, "grad_norm": 1.5522637397176864, "learning_rate": 2.039986237833012e-06, "loss": 0.4481, "step": 19410 }, { "epoch": 0.7319187426977726, "grad_norm": 1.5713552061547922, "learning_rate": 2.0346872339456385e-06, "loss": 0.4555, "step": 19420 }, { "epoch": 0.7322956318546715, "grad_norm": 1.9565945231630675, "learning_rate": 2.0293933626641677e-06, "loss": 0.4705, "step": 19430 }, { "epoch": 0.7326725210115705, "grad_norm": 1.4336544390295196, "learning_rate": 2.0241046331516596e-06, "loss": 0.475, "step": 19440 }, { "epoch": 0.7330494101684695, "grad_norm": 1.9396889631970387, "learning_rate": 2.018821054562286e-06, "loss": 0.4681, "step": 19450 }, { "epoch": 0.7334262993253684, "grad_norm": 1.567151805567963, "learning_rate": 2.0135426360412945e-06, "loss": 0.4835, "step": 19460 }, { "epoch": 0.7338031884822673, "grad_norm": 1.9444973709798723, "learning_rate": 2.008269386725006e-06, "loss": 0.4698, "step": 19470 }, { "epoch": 0.7341800776391664, "grad_norm": 1.6143737110809373, "learning_rate": 2.003001315740788e-06, "loss": 0.4718, "step": 19480 }, { "epoch": 0.7345569667960653, "grad_norm": 1.4062299288988356, "learning_rate": 1.997738432207048e-06, "loss": 0.486, "step": 19490 }, { "epoch": 0.7349338559529642, "grad_norm": 1.8345777555224132, "learning_rate": 1.9924807452332203e-06, "loss": 0.4982, "step": 19500 }, { "epoch": 0.7353107451098632, "grad_norm": 1.3571830592619372, "learning_rate": 1.9872282639197384e-06, "loss": 0.4541, "step": 19510 }, { "epoch": 0.7356876342667621, "grad_norm": 1.5972192547656638, "learning_rate": 1.981980997358023e-06, "loss": 0.4876, "step": 19520 }, { "epoch": 0.7360645234236611, "grad_norm": 1.6793172446301055, "learning_rate": 1.976738954630475e-06, "loss": 0.4902, "step": 19530 }, { "epoch": 0.7364414125805601, "grad_norm": 1.7207394609259994, "learning_rate": 1.97150214481045e-06, "loss": 0.4631, "step": 19540 }, { "epoch": 0.736818301737459, "grad_norm": 1.7659650208337772, "learning_rate": 1.9662705769622473e-06, "loss": 0.475, "step": 19550 }, { "epoch": 0.7371951908943579, "grad_norm": 2.259295033352332, "learning_rate": 1.9610442601410924e-06, "loss": 0.4748, "step": 19560 }, { "epoch": 0.737572080051257, "grad_norm": 1.4373497805396753, "learning_rate": 1.955823203393122e-06, "loss": 0.4417, "step": 19570 }, { "epoch": 0.7379489692081559, "grad_norm": 1.7830963074391613, "learning_rate": 1.9506074157553674e-06, "loss": 0.4995, "step": 19580 }, { "epoch": 0.7383258583650548, "grad_norm": 1.6506977257382058, "learning_rate": 1.9453969062557413e-06, "loss": 0.4704, "step": 19590 }, { "epoch": 0.7387027475219538, "grad_norm": 1.8345545514824426, "learning_rate": 1.94019168391302e-06, "loss": 0.4513, "step": 19600 }, { "epoch": 0.7390796366788528, "grad_norm": 1.5681906345509673, "learning_rate": 1.9349917577368278e-06, "loss": 0.4633, "step": 19610 }, { "epoch": 0.7394565258357517, "grad_norm": 1.1866445291178405, "learning_rate": 1.929797136727622e-06, "loss": 0.4482, "step": 19620 }, { "epoch": 0.7398334149926507, "grad_norm": 1.7441782836688702, "learning_rate": 1.924607829876679e-06, "loss": 0.4656, "step": 19630 }, { "epoch": 0.7402103041495496, "grad_norm": 1.5560618510253559, "learning_rate": 1.9194238461660715e-06, "loss": 0.4713, "step": 19640 }, { "epoch": 0.7405871933064486, "grad_norm": 1.612264222679392, "learning_rate": 1.9142451945686675e-06, "loss": 0.4888, "step": 19650 }, { "epoch": 0.7409640824633476, "grad_norm": 1.6525004938247971, "learning_rate": 1.909071884048098e-06, "loss": 0.4831, "step": 19660 }, { "epoch": 0.7413409716202465, "grad_norm": 1.8991946449473516, "learning_rate": 1.9039039235587549e-06, "loss": 0.4793, "step": 19670 }, { "epoch": 0.7417178607771454, "grad_norm": 1.661925662248117, "learning_rate": 1.898741322045763e-06, "loss": 0.468, "step": 19680 }, { "epoch": 0.7420947499340445, "grad_norm": 1.6008784105624398, "learning_rate": 1.8935840884449774e-06, "loss": 0.4959, "step": 19690 }, { "epoch": 0.7424716390909434, "grad_norm": 1.8462741745907572, "learning_rate": 1.888432231682958e-06, "loss": 0.447, "step": 19700 }, { "epoch": 0.7428485282478423, "grad_norm": 1.6621308425745434, "learning_rate": 1.8832857606769645e-06, "loss": 0.454, "step": 19710 }, { "epoch": 0.7432254174047412, "grad_norm": 1.4464720693906488, "learning_rate": 1.8781446843349255e-06, "loss": 0.4744, "step": 19720 }, { "epoch": 0.7436023065616402, "grad_norm": 1.5289864438887075, "learning_rate": 1.8730090115554377e-06, "loss": 0.4847, "step": 19730 }, { "epoch": 0.7439791957185392, "grad_norm": 1.499672820897478, "learning_rate": 1.8678787512277441e-06, "loss": 0.4696, "step": 19740 }, { "epoch": 0.7443560848754381, "grad_norm": 1.9522488619546912, "learning_rate": 1.8627539122317184e-06, "loss": 0.51, "step": 19750 }, { "epoch": 0.7447329740323371, "grad_norm": 1.578646901148932, "learning_rate": 1.8576345034378518e-06, "loss": 0.4626, "step": 19760 }, { "epoch": 0.745109863189236, "grad_norm": 1.6897078881033194, "learning_rate": 1.8525205337072356e-06, "loss": 0.4986, "step": 19770 }, { "epoch": 0.745486752346135, "grad_norm": 1.6933365658689057, "learning_rate": 1.8474120118915468e-06, "loss": 0.4989, "step": 19780 }, { "epoch": 0.745863641503034, "grad_norm": 1.5425636755893886, "learning_rate": 1.8423089468330323e-06, "loss": 0.4755, "step": 19790 }, { "epoch": 0.7462405306599329, "grad_norm": 1.7854964281266579, "learning_rate": 1.8372113473644954e-06, "loss": 0.4677, "step": 19800 }, { "epoch": 0.7466174198168318, "grad_norm": 1.5758981880661738, "learning_rate": 1.8321192223092783e-06, "loss": 0.4596, "step": 19810 }, { "epoch": 0.7469943089737309, "grad_norm": 1.7615382948562168, "learning_rate": 1.8270325804812467e-06, "loss": 0.4959, "step": 19820 }, { "epoch": 0.7473711981306298, "grad_norm": 1.6845149695852346, "learning_rate": 1.8219514306847769e-06, "loss": 0.453, "step": 19830 }, { "epoch": 0.7477480872875287, "grad_norm": 1.5622386509669985, "learning_rate": 1.8168757817147408e-06, "loss": 0.4636, "step": 19840 }, { "epoch": 0.7481249764444277, "grad_norm": 1.659724890019801, "learning_rate": 1.8118056423564807e-06, "loss": 0.4773, "step": 19850 }, { "epoch": 0.7485018656013267, "grad_norm": 1.6175612238786405, "learning_rate": 1.8067410213858144e-06, "loss": 0.4586, "step": 19860 }, { "epoch": 0.7488787547582256, "grad_norm": 1.2204409557789897, "learning_rate": 1.8016819275690005e-06, "loss": 0.4876, "step": 19870 }, { "epoch": 0.7492556439151246, "grad_norm": 1.5479726038649555, "learning_rate": 1.7966283696627334e-06, "loss": 0.4903, "step": 19880 }, { "epoch": 0.7496325330720235, "grad_norm": 1.5825271274712456, "learning_rate": 1.791580356414122e-06, "loss": 0.48, "step": 19890 }, { "epoch": 0.7500094222289225, "grad_norm": 1.6729980724400284, "learning_rate": 1.7865378965606816e-06, "loss": 0.4971, "step": 19900 }, { "epoch": 0.7503863113858215, "grad_norm": 1.49855379749909, "learning_rate": 1.7815009988303128e-06, "loss": 0.4559, "step": 19910 }, { "epoch": 0.7507632005427204, "grad_norm": 1.867864298503486, "learning_rate": 1.7764696719412955e-06, "loss": 0.4676, "step": 19920 }, { "epoch": 0.7511400896996193, "grad_norm": 1.398138122231671, "learning_rate": 1.7714439246022563e-06, "loss": 0.4665, "step": 19930 }, { "epoch": 0.7515169788565182, "grad_norm": 1.679272548470173, "learning_rate": 1.7664237655121712e-06, "loss": 0.4845, "step": 19940 }, { "epoch": 0.7518938680134173, "grad_norm": 1.6071742957955386, "learning_rate": 1.7614092033603435e-06, "loss": 0.4762, "step": 19950 }, { "epoch": 0.7522707571703162, "grad_norm": 2.0121603774745, "learning_rate": 1.7564002468263864e-06, "loss": 0.4983, "step": 19960 }, { "epoch": 0.7526476463272151, "grad_norm": 1.6878110868707674, "learning_rate": 1.7513969045802121e-06, "loss": 0.4774, "step": 19970 }, { "epoch": 0.7530245354841141, "grad_norm": 1.5812116606356919, "learning_rate": 1.7463991852820146e-06, "loss": 0.4694, "step": 19980 }, { "epoch": 0.7534014246410131, "grad_norm": 1.6728801843104153, "learning_rate": 1.741407097582255e-06, "loss": 0.4812, "step": 19990 }, { "epoch": 0.753778313797912, "grad_norm": 1.558056793965425, "learning_rate": 1.7364206501216468e-06, "loss": 0.4771, "step": 20000 }, { "epoch": 0.754155202954811, "grad_norm": 1.572534621526344, "learning_rate": 1.7314398515311425e-06, "loss": 0.4825, "step": 20010 }, { "epoch": 0.7545320921117099, "grad_norm": 1.7319010605390628, "learning_rate": 1.7264647104319144e-06, "loss": 0.4924, "step": 20020 }, { "epoch": 0.754908981268609, "grad_norm": 1.8088713988535818, "learning_rate": 1.7214952354353442e-06, "loss": 0.5092, "step": 20030 }, { "epoch": 0.7552858704255079, "grad_norm": 1.6367538520605005, "learning_rate": 1.7165314351430073e-06, "loss": 0.4853, "step": 20040 }, { "epoch": 0.7556627595824068, "grad_norm": 1.8009115510393299, "learning_rate": 1.7115733181466521e-06, "loss": 0.4861, "step": 20050 }, { "epoch": 0.7560396487393057, "grad_norm": 1.647359105340635, "learning_rate": 1.706620893028193e-06, "loss": 0.4872, "step": 20060 }, { "epoch": 0.7564165378962048, "grad_norm": 1.7977265334517736, "learning_rate": 1.7016741683596956e-06, "loss": 0.4861, "step": 20070 }, { "epoch": 0.7567934270531037, "grad_norm": 1.4658968392116936, "learning_rate": 1.696733152703356e-06, "loss": 0.4621, "step": 20080 }, { "epoch": 0.7571703162100026, "grad_norm": 1.400405223101517, "learning_rate": 1.6917978546114844e-06, "loss": 0.4567, "step": 20090 }, { "epoch": 0.7575472053669016, "grad_norm": 1.7826408832538072, "learning_rate": 1.686868282626501e-06, "loss": 0.455, "step": 20100 }, { "epoch": 0.7579240945238005, "grad_norm": 1.369980544067672, "learning_rate": 1.6819444452809097e-06, "loss": 0.4498, "step": 20110 }, { "epoch": 0.7583009836806995, "grad_norm": 1.606503602489862, "learning_rate": 1.6770263510972967e-06, "loss": 0.4784, "step": 20120 }, { "epoch": 0.7586778728375985, "grad_norm": 1.7717643562693965, "learning_rate": 1.6721140085882958e-06, "loss": 0.4983, "step": 20130 }, { "epoch": 0.7590547619944974, "grad_norm": 1.5525199466594755, "learning_rate": 1.6672074262565935e-06, "loss": 0.4724, "step": 20140 }, { "epoch": 0.7594316511513963, "grad_norm": 1.8260818838800927, "learning_rate": 1.6623066125949039e-06, "loss": 0.4855, "step": 20150 }, { "epoch": 0.7598085403082954, "grad_norm": 1.5847972521629938, "learning_rate": 1.6574115760859565e-06, "loss": 0.4962, "step": 20160 }, { "epoch": 0.7601854294651943, "grad_norm": 1.9147538929254044, "learning_rate": 1.6525223252024803e-06, "loss": 0.4906, "step": 20170 }, { "epoch": 0.7605623186220932, "grad_norm": 1.8192442389995647, "learning_rate": 1.6476388684071904e-06, "loss": 0.4461, "step": 20180 }, { "epoch": 0.7609392077789922, "grad_norm": 1.6057172465162213, "learning_rate": 1.6427612141527737e-06, "loss": 0.4661, "step": 20190 }, { "epoch": 0.7613160969358912, "grad_norm": 1.5700052558656017, "learning_rate": 1.6378893708818737e-06, "loss": 0.4579, "step": 20200 }, { "epoch": 0.7616929860927901, "grad_norm": 2.4569585216041556, "learning_rate": 1.6330233470270745e-06, "loss": 0.4794, "step": 20210 }, { "epoch": 0.762069875249689, "grad_norm": 1.889553496160301, "learning_rate": 1.6281631510108886e-06, "loss": 0.442, "step": 20220 }, { "epoch": 0.762446764406588, "grad_norm": 1.5962321807838258, "learning_rate": 1.6233087912457412e-06, "loss": 0.4672, "step": 20230 }, { "epoch": 0.762823653563487, "grad_norm": 1.5965451145332648, "learning_rate": 1.618460276133954e-06, "loss": 0.4676, "step": 20240 }, { "epoch": 0.763200542720386, "grad_norm": 1.6567069219014103, "learning_rate": 1.6136176140677368e-06, "loss": 0.4783, "step": 20250 }, { "epoch": 0.7635774318772849, "grad_norm": 1.4557357506093793, "learning_rate": 1.6087808134291593e-06, "loss": 0.4823, "step": 20260 }, { "epoch": 0.7639543210341838, "grad_norm": 1.876565177528593, "learning_rate": 1.6039498825901568e-06, "loss": 0.4774, "step": 20270 }, { "epoch": 0.7643312101910829, "grad_norm": 1.7263871602732919, "learning_rate": 1.5991248299124978e-06, "loss": 0.4738, "step": 20280 }, { "epoch": 0.7647080993479818, "grad_norm": 1.6090188313058575, "learning_rate": 1.5943056637477804e-06, "loss": 0.4567, "step": 20290 }, { "epoch": 0.7650849885048807, "grad_norm": 1.8159775563602787, "learning_rate": 1.5894923924374077e-06, "loss": 0.4996, "step": 20300 }, { "epoch": 0.7654618776617796, "grad_norm": 1.694825125868995, "learning_rate": 1.5846850243125856e-06, "loss": 0.467, "step": 20310 }, { "epoch": 0.7658387668186786, "grad_norm": 1.4834845730262398, "learning_rate": 1.5798835676942976e-06, "loss": 0.4733, "step": 20320 }, { "epoch": 0.7662156559755776, "grad_norm": 1.6457346282769572, "learning_rate": 1.5750880308933036e-06, "loss": 0.4775, "step": 20330 }, { "epoch": 0.7665925451324765, "grad_norm": 1.6332831296059798, "learning_rate": 1.5702984222101053e-06, "loss": 0.4598, "step": 20340 }, { "epoch": 0.7669694342893755, "grad_norm": 2.0230516472784417, "learning_rate": 1.565514749934951e-06, "loss": 0.501, "step": 20350 }, { "epoch": 0.7673463234462744, "grad_norm": 1.9624821551932368, "learning_rate": 1.5607370223478118e-06, "loss": 0.4863, "step": 20360 }, { "epoch": 0.7677232126031734, "grad_norm": 1.3909923765612713, "learning_rate": 1.5559652477183702e-06, "loss": 0.4592, "step": 20370 }, { "epoch": 0.7681001017600724, "grad_norm": 1.6465814513557648, "learning_rate": 1.5511994343060033e-06, "loss": 0.4755, "step": 20380 }, { "epoch": 0.7684769909169713, "grad_norm": 1.610226051279983, "learning_rate": 1.5464395903597713e-06, "loss": 0.4469, "step": 20390 }, { "epoch": 0.7688538800738702, "grad_norm": 1.7453869621269993, "learning_rate": 1.5416857241184007e-06, "loss": 0.4547, "step": 20400 }, { "epoch": 0.7692307692307693, "grad_norm": 1.310381314751475, "learning_rate": 1.5369378438102728e-06, "loss": 0.4668, "step": 20410 }, { "epoch": 0.7696076583876682, "grad_norm": 1.6872384131950084, "learning_rate": 1.5321959576534073e-06, "loss": 0.5217, "step": 20420 }, { "epoch": 0.7699845475445671, "grad_norm": 1.6894678298188206, "learning_rate": 1.527460073855448e-06, "loss": 0.4665, "step": 20430 }, { "epoch": 0.7703614367014661, "grad_norm": 1.8097958205422997, "learning_rate": 1.52273020061365e-06, "loss": 0.4564, "step": 20440 }, { "epoch": 0.7707383258583651, "grad_norm": 1.7701718906608808, "learning_rate": 1.5180063461148675e-06, "loss": 0.4664, "step": 20450 }, { "epoch": 0.771115215015264, "grad_norm": 1.2961421273522296, "learning_rate": 1.5132885185355294e-06, "loss": 0.4527, "step": 20460 }, { "epoch": 0.771492104172163, "grad_norm": 1.8199021031458331, "learning_rate": 1.5085767260416396e-06, "loss": 0.4945, "step": 20470 }, { "epoch": 0.7718689933290619, "grad_norm": 1.5671269899021723, "learning_rate": 1.5038709767887548e-06, "loss": 0.4816, "step": 20480 }, { "epoch": 0.7722458824859609, "grad_norm": 1.818867610498672, "learning_rate": 1.4991712789219714e-06, "loss": 0.4778, "step": 20490 }, { "epoch": 0.7726227716428599, "grad_norm": 1.5948143698764703, "learning_rate": 1.4944776405759115e-06, "loss": 0.4652, "step": 20500 }, { "epoch": 0.7729996607997588, "grad_norm": 1.4821620142605674, "learning_rate": 1.4897900698747047e-06, "loss": 0.4422, "step": 20510 }, { "epoch": 0.7733765499566577, "grad_norm": 1.8116026121890576, "learning_rate": 1.4851085749319827e-06, "loss": 0.469, "step": 20520 }, { "epoch": 0.7737534391135567, "grad_norm": 1.523784502143446, "learning_rate": 1.4804331638508623e-06, "loss": 0.4773, "step": 20530 }, { "epoch": 0.7741303282704557, "grad_norm": 1.8206430937956362, "learning_rate": 1.4757638447239276e-06, "loss": 0.4724, "step": 20540 }, { "epoch": 0.7745072174273546, "grad_norm": 1.7588855241689347, "learning_rate": 1.4711006256332156e-06, "loss": 0.4382, "step": 20550 }, { "epoch": 0.7748841065842536, "grad_norm": 1.584901601395231, "learning_rate": 1.4664435146502083e-06, "loss": 0.4674, "step": 20560 }, { "epoch": 0.7752609957411525, "grad_norm": 1.7973543634379379, "learning_rate": 1.461792519835814e-06, "loss": 0.4826, "step": 20570 }, { "epoch": 0.7756378848980515, "grad_norm": 1.673141519768556, "learning_rate": 1.4571476492403563e-06, "loss": 0.4753, "step": 20580 }, { "epoch": 0.7760147740549505, "grad_norm": 1.5795162978217505, "learning_rate": 1.452508910903556e-06, "loss": 0.4965, "step": 20590 }, { "epoch": 0.7763916632118494, "grad_norm": 1.71797844150418, "learning_rate": 1.447876312854521e-06, "loss": 0.466, "step": 20600 }, { "epoch": 0.7767685523687483, "grad_norm": 1.6656630817840206, "learning_rate": 1.4432498631117314e-06, "loss": 0.43, "step": 20610 }, { "epoch": 0.7771454415256474, "grad_norm": 1.5139791631000281, "learning_rate": 1.438629569683025e-06, "loss": 0.4855, "step": 20620 }, { "epoch": 0.7775223306825463, "grad_norm": 1.6906826740177294, "learning_rate": 1.4340154405655826e-06, "loss": 0.456, "step": 20630 }, { "epoch": 0.7778992198394452, "grad_norm": 1.6698594166197502, "learning_rate": 1.4294074837459177e-06, "loss": 0.4912, "step": 20640 }, { "epoch": 0.7782761089963441, "grad_norm": 2.250059784898189, "learning_rate": 1.4248057071998578e-06, "loss": 0.4998, "step": 20650 }, { "epoch": 0.7786529981532432, "grad_norm": 1.575246221565116, "learning_rate": 1.420210118892536e-06, "loss": 0.4768, "step": 20660 }, { "epoch": 0.7790298873101421, "grad_norm": 1.5597959754124424, "learning_rate": 1.4156207267783679e-06, "loss": 0.4652, "step": 20670 }, { "epoch": 0.779406776467041, "grad_norm": 1.628610962575096, "learning_rate": 1.4110375388010538e-06, "loss": 0.4876, "step": 20680 }, { "epoch": 0.77978366562394, "grad_norm": 1.644374962717556, "learning_rate": 1.4064605628935479e-06, "loss": 0.4679, "step": 20690 }, { "epoch": 0.780160554780839, "grad_norm": 1.8253088878455022, "learning_rate": 1.4018898069780572e-06, "loss": 0.4772, "step": 20700 }, { "epoch": 0.7805374439377379, "grad_norm": 1.5207862619368617, "learning_rate": 1.3973252789660158e-06, "loss": 0.4574, "step": 20710 }, { "epoch": 0.7809143330946369, "grad_norm": 1.7050624400247496, "learning_rate": 1.3927669867580845e-06, "loss": 0.4554, "step": 20720 }, { "epoch": 0.7812912222515358, "grad_norm": 1.9273191609007996, "learning_rate": 1.3882149382441262e-06, "loss": 0.5077, "step": 20730 }, { "epoch": 0.7816681114084347, "grad_norm": 1.4282728575268597, "learning_rate": 1.3836691413032045e-06, "loss": 0.4596, "step": 20740 }, { "epoch": 0.7820450005653338, "grad_norm": 1.5766792437516781, "learning_rate": 1.37912960380355e-06, "loss": 0.475, "step": 20750 }, { "epoch": 0.7824218897222327, "grad_norm": 1.588717645461143, "learning_rate": 1.3745963336025692e-06, "loss": 0.4701, "step": 20760 }, { "epoch": 0.7827987788791316, "grad_norm": 1.628374898696195, "learning_rate": 1.3700693385468156e-06, "loss": 0.467, "step": 20770 }, { "epoch": 0.7831756680360306, "grad_norm": 1.8101040525930028, "learning_rate": 1.3655486264719832e-06, "loss": 0.4655, "step": 20780 }, { "epoch": 0.7835525571929296, "grad_norm": 1.5363813815541065, "learning_rate": 1.3610342052028897e-06, "loss": 0.4629, "step": 20790 }, { "epoch": 0.7839294463498285, "grad_norm": 1.7639661489622709, "learning_rate": 1.3565260825534653e-06, "loss": 0.4502, "step": 20800 }, { "epoch": 0.7843063355067275, "grad_norm": 1.5089833035960092, "learning_rate": 1.3520242663267375e-06, "loss": 0.4871, "step": 20810 }, { "epoch": 0.7846832246636264, "grad_norm": 1.5073945769296926, "learning_rate": 1.3475287643148178e-06, "loss": 0.4786, "step": 20820 }, { "epoch": 0.7850601138205254, "grad_norm": 1.5526068937170885, "learning_rate": 1.3430395842988886e-06, "loss": 0.4549, "step": 20830 }, { "epoch": 0.7854370029774244, "grad_norm": 1.4425597365312917, "learning_rate": 1.3385567340491901e-06, "loss": 0.453, "step": 20840 }, { "epoch": 0.7858138921343233, "grad_norm": 1.6718319416849392, "learning_rate": 1.334080221325006e-06, "loss": 0.4755, "step": 20850 }, { "epoch": 0.7861907812912222, "grad_norm": 1.4577265197030453, "learning_rate": 1.3296100538746514e-06, "loss": 0.4454, "step": 20860 }, { "epoch": 0.7865676704481213, "grad_norm": 1.3692713929909994, "learning_rate": 1.3251462394354585e-06, "loss": 0.4927, "step": 20870 }, { "epoch": 0.7869445596050202, "grad_norm": 1.6442519408672833, "learning_rate": 1.3206887857337586e-06, "loss": 0.4492, "step": 20880 }, { "epoch": 0.7873214487619191, "grad_norm": 1.6104969996274061, "learning_rate": 1.3162377004848814e-06, "loss": 0.4515, "step": 20890 }, { "epoch": 0.787698337918818, "grad_norm": 1.7066857405575866, "learning_rate": 1.3117929913931277e-06, "loss": 0.4553, "step": 20900 }, { "epoch": 0.788075227075717, "grad_norm": 1.731131533765537, "learning_rate": 1.3073546661517655e-06, "loss": 0.4822, "step": 20910 }, { "epoch": 0.788452116232616, "grad_norm": 1.696849854193515, "learning_rate": 1.3029227324430077e-06, "loss": 0.4536, "step": 20920 }, { "epoch": 0.788829005389515, "grad_norm": 1.6266173084112765, "learning_rate": 1.298497197938008e-06, "loss": 0.4464, "step": 20930 }, { "epoch": 0.7892058945464139, "grad_norm": 1.5230400863153157, "learning_rate": 1.2940780702968464e-06, "loss": 0.4838, "step": 20940 }, { "epoch": 0.7895827837033128, "grad_norm": 1.805355977994606, "learning_rate": 1.2896653571685108e-06, "loss": 0.4808, "step": 20950 }, { "epoch": 0.7899596728602118, "grad_norm": 1.6563216644658791, "learning_rate": 1.2852590661908826e-06, "loss": 0.4793, "step": 20960 }, { "epoch": 0.7903365620171108, "grad_norm": 1.7324681232443127, "learning_rate": 1.280859204990732e-06, "loss": 0.4659, "step": 20970 }, { "epoch": 0.7907134511740097, "grad_norm": 1.6887405218479397, "learning_rate": 1.2764657811836995e-06, "loss": 0.4829, "step": 20980 }, { "epoch": 0.7910903403309086, "grad_norm": 1.723570674808702, "learning_rate": 1.2720788023742819e-06, "loss": 0.495, "step": 20990 }, { "epoch": 0.7914672294878077, "grad_norm": 1.7913544321855928, "learning_rate": 1.267698276155821e-06, "loss": 0.4626, "step": 21000 }, { "epoch": 0.7918441186447066, "grad_norm": 1.7911338951212774, "learning_rate": 1.2633242101104904e-06, "loss": 0.4577, "step": 21010 }, { "epoch": 0.7922210078016055, "grad_norm": 1.8879029948128403, "learning_rate": 1.2589566118092805e-06, "loss": 0.5013, "step": 21020 }, { "epoch": 0.7925978969585045, "grad_norm": 1.602586178784185, "learning_rate": 1.2545954888119882e-06, "loss": 0.4872, "step": 21030 }, { "epoch": 0.7929747861154035, "grad_norm": 1.816604176967041, "learning_rate": 1.2502408486672018e-06, "loss": 0.4898, "step": 21040 }, { "epoch": 0.7933516752723024, "grad_norm": 1.6554081568794934, "learning_rate": 1.2458926989122894e-06, "loss": 0.4739, "step": 21050 }, { "epoch": 0.7937285644292014, "grad_norm": 1.8212477680147816, "learning_rate": 1.2415510470733832e-06, "loss": 0.4574, "step": 21060 }, { "epoch": 0.7941054535861003, "grad_norm": 1.2942280081366884, "learning_rate": 1.2372159006653711e-06, "loss": 0.4775, "step": 21070 }, { "epoch": 0.7944823427429993, "grad_norm": 1.7198494071850723, "learning_rate": 1.2328872671918752e-06, "loss": 0.5035, "step": 21080 }, { "epoch": 0.7948592318998983, "grad_norm": 1.717727877702444, "learning_rate": 1.2285651541452526e-06, "loss": 0.4602, "step": 21090 }, { "epoch": 0.7952361210567972, "grad_norm": 1.7030104890295072, "learning_rate": 1.2242495690065687e-06, "loss": 0.4563, "step": 21100 }, { "epoch": 0.7956130102136961, "grad_norm": 1.550857418455602, "learning_rate": 1.219940519245592e-06, "loss": 0.4906, "step": 21110 }, { "epoch": 0.795989899370595, "grad_norm": 1.1895997553169184, "learning_rate": 1.2156380123207761e-06, "loss": 0.476, "step": 21120 }, { "epoch": 0.7963667885274941, "grad_norm": 1.6911416575004463, "learning_rate": 1.2113420556792539e-06, "loss": 0.4591, "step": 21130 }, { "epoch": 0.796743677684393, "grad_norm": 1.439704510030527, "learning_rate": 1.2070526567568164e-06, "loss": 0.4489, "step": 21140 }, { "epoch": 0.797120566841292, "grad_norm": 1.6272324810392764, "learning_rate": 1.20276982297791e-06, "loss": 0.4763, "step": 21150 }, { "epoch": 0.7974974559981909, "grad_norm": 1.661794252836065, "learning_rate": 1.1984935617556104e-06, "loss": 0.4652, "step": 21160 }, { "epoch": 0.7978743451550899, "grad_norm": 1.6161485954397128, "learning_rate": 1.1942238804916213e-06, "loss": 0.478, "step": 21170 }, { "epoch": 0.7982512343119889, "grad_norm": 1.5127230860148504, "learning_rate": 1.1899607865762563e-06, "loss": 0.4879, "step": 21180 }, { "epoch": 0.7986281234688878, "grad_norm": 1.6559970652967313, "learning_rate": 1.1857042873884272e-06, "loss": 0.4557, "step": 21190 }, { "epoch": 0.7990050126257867, "grad_norm": 1.6399527198328363, "learning_rate": 1.181454390295631e-06, "loss": 0.4691, "step": 21200 }, { "epoch": 0.7993819017826858, "grad_norm": 1.5670234998624126, "learning_rate": 1.1772111026539374e-06, "loss": 0.454, "step": 21210 }, { "epoch": 0.7997587909395847, "grad_norm": 1.7735935156287603, "learning_rate": 1.172974431807975e-06, "loss": 0.4637, "step": 21220 }, { "epoch": 0.8001356800964836, "grad_norm": 1.6014800132057636, "learning_rate": 1.1687443850909208e-06, "loss": 0.4736, "step": 21230 }, { "epoch": 0.8005125692533825, "grad_norm": 1.63595133372094, "learning_rate": 1.1645209698244857e-06, "loss": 0.4642, "step": 21240 }, { "epoch": 0.8008894584102816, "grad_norm": 1.7792664084505718, "learning_rate": 1.1603041933189024e-06, "loss": 0.5004, "step": 21250 }, { "epoch": 0.8012663475671805, "grad_norm": 1.7417841561819936, "learning_rate": 1.1560940628729129e-06, "loss": 0.4851, "step": 21260 }, { "epoch": 0.8016432367240794, "grad_norm": 1.7474085529041983, "learning_rate": 1.1518905857737544e-06, "loss": 0.506, "step": 21270 }, { "epoch": 0.8020201258809784, "grad_norm": 1.4496625205949163, "learning_rate": 1.1476937692971508e-06, "loss": 0.4375, "step": 21280 }, { "epoch": 0.8023970150378774, "grad_norm": 1.6179305557255557, "learning_rate": 1.1435036207072913e-06, "loss": 0.448, "step": 21290 }, { "epoch": 0.8027739041947763, "grad_norm": 1.6365956374761494, "learning_rate": 1.1393201472568322e-06, "loss": 0.4974, "step": 21300 }, { "epoch": 0.8031507933516753, "grad_norm": 1.3450521505509172, "learning_rate": 1.1351433561868697e-06, "loss": 0.4508, "step": 21310 }, { "epoch": 0.8035276825085742, "grad_norm": 1.4538600265225352, "learning_rate": 1.130973254726937e-06, "loss": 0.4614, "step": 21320 }, { "epoch": 0.8039045716654731, "grad_norm": 1.587273756529849, "learning_rate": 1.1268098500949843e-06, "loss": 0.4577, "step": 21330 }, { "epoch": 0.8042814608223722, "grad_norm": 1.6193327198648404, "learning_rate": 1.122653149497373e-06, "loss": 0.4548, "step": 21340 }, { "epoch": 0.8046583499792711, "grad_norm": 1.7408766583273092, "learning_rate": 1.1185031601288627e-06, "loss": 0.4792, "step": 21350 }, { "epoch": 0.80503523913617, "grad_norm": 1.745321600962186, "learning_rate": 1.1143598891725948e-06, "loss": 0.4503, "step": 21360 }, { "epoch": 0.805412128293069, "grad_norm": 1.6321545726426894, "learning_rate": 1.1102233438000786e-06, "loss": 0.4755, "step": 21370 }, { "epoch": 0.805789017449968, "grad_norm": 1.6372516624414968, "learning_rate": 1.1060935311711873e-06, "loss": 0.4491, "step": 21380 }, { "epoch": 0.8061659066068669, "grad_norm": 1.8295447316903106, "learning_rate": 1.1019704584341374e-06, "loss": 0.4645, "step": 21390 }, { "epoch": 0.8065427957637659, "grad_norm": 1.6650412672495543, "learning_rate": 1.097854132725481e-06, "loss": 0.4669, "step": 21400 }, { "epoch": 0.8069196849206648, "grad_norm": 1.5629750029935003, "learning_rate": 1.093744561170092e-06, "loss": 0.469, "step": 21410 }, { "epoch": 0.8072965740775638, "grad_norm": 1.421621390086491, "learning_rate": 1.0896417508811518e-06, "loss": 0.4753, "step": 21420 }, { "epoch": 0.8076734632344628, "grad_norm": 1.6379330117232693, "learning_rate": 1.0855457089601407e-06, "loss": 0.4587, "step": 21430 }, { "epoch": 0.8080503523913617, "grad_norm": 1.7407275341596244, "learning_rate": 1.0814564424968226e-06, "loss": 0.4966, "step": 21440 }, { "epoch": 0.8084272415482606, "grad_norm": 1.846067209557093, "learning_rate": 1.0773739585692356e-06, "loss": 0.5118, "step": 21450 }, { "epoch": 0.8088041307051597, "grad_norm": 1.456547037165189, "learning_rate": 1.0732982642436757e-06, "loss": 0.4466, "step": 21460 }, { "epoch": 0.8091810198620586, "grad_norm": 1.7012035006791435, "learning_rate": 1.0692293665746884e-06, "loss": 0.4926, "step": 21470 }, { "epoch": 0.8095579090189575, "grad_norm": 1.5902195620043145, "learning_rate": 1.065167272605056e-06, "loss": 0.4765, "step": 21480 }, { "epoch": 0.8099347981758565, "grad_norm": 1.527190606227678, "learning_rate": 1.061111989365779e-06, "loss": 0.4694, "step": 21490 }, { "epoch": 0.8103116873327554, "grad_norm": 1.6866265884761789, "learning_rate": 1.0570635238760774e-06, "loss": 0.4623, "step": 21500 }, { "epoch": 0.8106885764896544, "grad_norm": 1.575765143637899, "learning_rate": 1.0530218831433652e-06, "loss": 0.4482, "step": 21510 }, { "epoch": 0.8110654656465534, "grad_norm": 2.4146188153476995, "learning_rate": 1.0489870741632456e-06, "loss": 0.4633, "step": 21520 }, { "epoch": 0.8114423548034523, "grad_norm": 1.6718557277711972, "learning_rate": 1.044959103919494e-06, "loss": 0.4811, "step": 21530 }, { "epoch": 0.8118192439603512, "grad_norm": 1.7873992974852744, "learning_rate": 1.0409379793840518e-06, "loss": 0.4546, "step": 21540 }, { "epoch": 0.8121961331172503, "grad_norm": 1.8256231709433406, "learning_rate": 1.0369237075170091e-06, "loss": 0.4743, "step": 21550 }, { "epoch": 0.8125730222741492, "grad_norm": 1.6028991502176622, "learning_rate": 1.0329162952666e-06, "loss": 0.441, "step": 21560 }, { "epoch": 0.8129499114310481, "grad_norm": 1.7523235315102614, "learning_rate": 1.028915749569177e-06, "loss": 0.4874, "step": 21570 }, { "epoch": 0.813326800587947, "grad_norm": 1.574362879271632, "learning_rate": 1.0249220773492142e-06, "loss": 0.47, "step": 21580 }, { "epoch": 0.8137036897448461, "grad_norm": 1.4217729969659767, "learning_rate": 1.020935285519285e-06, "loss": 0.4655, "step": 21590 }, { "epoch": 0.814080578901745, "grad_norm": 1.7611978152515075, "learning_rate": 1.0169553809800543e-06, "loss": 0.4992, "step": 21600 }, { "epoch": 0.8144574680586439, "grad_norm": 1.4580463296463873, "learning_rate": 1.0129823706202696e-06, "loss": 0.4516, "step": 21610 }, { "epoch": 0.8148343572155429, "grad_norm": 1.5284529560123907, "learning_rate": 1.0090162613167393e-06, "loss": 0.4668, "step": 21620 }, { "epoch": 0.8152112463724419, "grad_norm": 1.6642481846748813, "learning_rate": 1.0050570599343302e-06, "loss": 0.4533, "step": 21630 }, { "epoch": 0.8155881355293408, "grad_norm": 1.3683583910657175, "learning_rate": 1.0011047733259521e-06, "loss": 0.4597, "step": 21640 }, { "epoch": 0.8159650246862398, "grad_norm": 1.8875658078070612, "learning_rate": 9.97159408332547e-07, "loss": 0.4782, "step": 21650 }, { "epoch": 0.8163419138431387, "grad_norm": 1.692047018964554, "learning_rate": 9.932209717830744e-07, "loss": 0.462, "step": 21660 }, { "epoch": 0.8167188030000377, "grad_norm": 1.5378872143686608, "learning_rate": 9.892894704945022e-07, "loss": 0.4656, "step": 21670 }, { "epoch": 0.8170956921569367, "grad_norm": 1.5613665579925253, "learning_rate": 9.85364911271795e-07, "loss": 0.476, "step": 21680 }, { "epoch": 0.8174725813138356, "grad_norm": 1.6137324346165267, "learning_rate": 9.814473009079017e-07, "loss": 0.5071, "step": 21690 }, { "epoch": 0.8178494704707345, "grad_norm": 1.614130393792285, "learning_rate": 9.7753664618374e-07, "loss": 0.4664, "step": 21700 }, { "epoch": 0.8182263596276335, "grad_norm": 1.6240132524422568, "learning_rate": 9.736329538681932e-07, "loss": 0.4683, "step": 21710 }, { "epoch": 0.8186032487845325, "grad_norm": 1.5296419317199343, "learning_rate": 9.697362307180918e-07, "loss": 0.4939, "step": 21720 }, { "epoch": 0.8189801379414314, "grad_norm": 1.5321812462004734, "learning_rate": 9.658464834782033e-07, "loss": 0.4778, "step": 21730 }, { "epoch": 0.8193570270983304, "grad_norm": 1.5185831568058594, "learning_rate": 9.619637188812175e-07, "loss": 0.46, "step": 21740 }, { "epoch": 0.8197339162552293, "grad_norm": 1.6821769094096455, "learning_rate": 9.58087943647743e-07, "loss": 0.4879, "step": 21750 }, { "epoch": 0.8201108054121283, "grad_norm": 1.467098506919944, "learning_rate": 9.542191644862869e-07, "loss": 0.4338, "step": 21760 }, { "epoch": 0.8204876945690273, "grad_norm": 1.5832840990797274, "learning_rate": 9.503573880932527e-07, "loss": 0.4617, "step": 21770 }, { "epoch": 0.8208645837259262, "grad_norm": 1.4725123019965634, "learning_rate": 9.465026211529149e-07, "loss": 0.4591, "step": 21780 }, { "epoch": 0.8212414728828251, "grad_norm": 1.4443596976324486, "learning_rate": 9.42654870337421e-07, "loss": 0.4723, "step": 21790 }, { "epoch": 0.8216183620397242, "grad_norm": 1.7715020744201238, "learning_rate": 9.38814142306772e-07, "loss": 0.4414, "step": 21800 }, { "epoch": 0.8219952511966231, "grad_norm": 1.6263211713573036, "learning_rate": 9.349804437088155e-07, "loss": 0.4657, "step": 21810 }, { "epoch": 0.822372140353522, "grad_norm": 2.675858370147406, "learning_rate": 9.311537811792299e-07, "loss": 0.4547, "step": 21820 }, { "epoch": 0.822749029510421, "grad_norm": 1.6989825689942237, "learning_rate": 9.273341613415155e-07, "loss": 0.4486, "step": 21830 }, { "epoch": 0.82312591866732, "grad_norm": 1.758883283595281, "learning_rate": 9.235215908069828e-07, "loss": 0.4752, "step": 21840 }, { "epoch": 0.8235028078242189, "grad_norm": 1.6500955266665545, "learning_rate": 9.197160761747415e-07, "loss": 0.4578, "step": 21850 }, { "epoch": 0.8238796969811178, "grad_norm": 1.4200589057349262, "learning_rate": 9.159176240316869e-07, "loss": 0.4292, "step": 21860 }, { "epoch": 0.8242565861380168, "grad_norm": 1.7138258286525525, "learning_rate": 9.121262409524906e-07, "loss": 0.479, "step": 21870 }, { "epoch": 0.8246334752949158, "grad_norm": 1.6629136940635971, "learning_rate": 9.08341933499589e-07, "loss": 0.4676, "step": 21880 }, { "epoch": 0.8250103644518147, "grad_norm": 1.7116482891389613, "learning_rate": 9.045647082231729e-07, "loss": 0.4717, "step": 21890 }, { "epoch": 0.8253872536087137, "grad_norm": 1.695232561294525, "learning_rate": 9.007945716611688e-07, "loss": 0.4766, "step": 21900 }, { "epoch": 0.8257641427656126, "grad_norm": 1.7002498190688085, "learning_rate": 8.970315303392379e-07, "loss": 0.4901, "step": 21910 }, { "epoch": 0.8261410319225115, "grad_norm": 1.492022987501469, "learning_rate": 8.93275590770763e-07, "loss": 0.4444, "step": 21920 }, { "epoch": 0.8265179210794106, "grad_norm": 1.7137434822176278, "learning_rate": 8.895267594568302e-07, "loss": 0.4595, "step": 21930 }, { "epoch": 0.8268948102363095, "grad_norm": 1.5218510147047475, "learning_rate": 8.857850428862241e-07, "loss": 0.4631, "step": 21940 }, { "epoch": 0.8272716993932084, "grad_norm": 1.8826119986337635, "learning_rate": 8.820504475354119e-07, "loss": 0.5034, "step": 21950 }, { "epoch": 0.8276485885501074, "grad_norm": 1.5920446251275133, "learning_rate": 8.783229798685361e-07, "loss": 0.467, "step": 21960 }, { "epoch": 0.8280254777070064, "grad_norm": 1.7063227060767632, "learning_rate": 8.746026463374058e-07, "loss": 0.4541, "step": 21970 }, { "epoch": 0.8284023668639053, "grad_norm": 1.618446816633231, "learning_rate": 8.708894533814788e-07, "loss": 0.4466, "step": 21980 }, { "epoch": 0.8287792560208043, "grad_norm": 1.7677621425455639, "learning_rate": 8.671834074278496e-07, "loss": 0.4488, "step": 21990 }, { "epoch": 0.8291561451777032, "grad_norm": 1.8004192808155495, "learning_rate": 8.63484514891248e-07, "loss": 0.4781, "step": 22000 }, { "epoch": 0.8295330343346022, "grad_norm": 1.638331409201194, "learning_rate": 8.597927821740188e-07, "loss": 0.4847, "step": 22010 }, { "epoch": 0.8299099234915012, "grad_norm": 1.547508824093798, "learning_rate": 8.56108215666116e-07, "loss": 0.467, "step": 22020 }, { "epoch": 0.8302868126484001, "grad_norm": 1.6865585478017953, "learning_rate": 8.524308217450883e-07, "loss": 0.4976, "step": 22030 }, { "epoch": 0.830663701805299, "grad_norm": 1.5407631720098116, "learning_rate": 8.487606067760695e-07, "loss": 0.4503, "step": 22040 }, { "epoch": 0.8310405909621981, "grad_norm": 1.7278683855838521, "learning_rate": 8.450975771117686e-07, "loss": 0.4766, "step": 22050 }, { "epoch": 0.831417480119097, "grad_norm": 1.6881225655115248, "learning_rate": 8.414417390924567e-07, "loss": 0.4529, "step": 22060 }, { "epoch": 0.8317943692759959, "grad_norm": 1.5343097557556282, "learning_rate": 8.37793099045957e-07, "loss": 0.4654, "step": 22070 }, { "epoch": 0.8321712584328949, "grad_norm": 1.5443651240466079, "learning_rate": 8.341516632876345e-07, "loss": 0.4725, "step": 22080 }, { "epoch": 0.8325481475897939, "grad_norm": 1.709840635782194, "learning_rate": 8.30517438120384e-07, "loss": 0.4648, "step": 22090 }, { "epoch": 0.8329250367466928, "grad_norm": 1.734867377598363, "learning_rate": 8.268904298346215e-07, "loss": 0.4862, "step": 22100 }, { "epoch": 0.8333019259035918, "grad_norm": 1.5853579920685308, "learning_rate": 8.232706447082644e-07, "loss": 0.4554, "step": 22110 }, { "epoch": 0.8336788150604907, "grad_norm": 1.8218704126887049, "learning_rate": 8.196580890067379e-07, "loss": 0.4712, "step": 22120 }, { "epoch": 0.8340557042173896, "grad_norm": 1.7644284565478414, "learning_rate": 8.160527689829473e-07, "loss": 0.5028, "step": 22130 }, { "epoch": 0.8344325933742887, "grad_norm": 1.6857508469766067, "learning_rate": 8.124546908772768e-07, "loss": 0.4622, "step": 22140 }, { "epoch": 0.8348094825311876, "grad_norm": 1.6855492767191518, "learning_rate": 8.088638609175719e-07, "loss": 0.4948, "step": 22150 }, { "epoch": 0.8351863716880865, "grad_norm": 1.3978162670227476, "learning_rate": 8.052802853191355e-07, "loss": 0.4743, "step": 22160 }, { "epoch": 0.8355632608449854, "grad_norm": 1.6951740065317433, "learning_rate": 8.01703970284713e-07, "loss": 0.4754, "step": 22170 }, { "epoch": 0.8359401500018845, "grad_norm": 1.7573161295973543, "learning_rate": 7.98134922004486e-07, "loss": 0.4871, "step": 22180 }, { "epoch": 0.8363170391587834, "grad_norm": 1.4918859167709155, "learning_rate": 7.945731466560519e-07, "loss": 0.4794, "step": 22190 }, { "epoch": 0.8366939283156823, "grad_norm": 1.7049198529331042, "learning_rate": 7.910186504044237e-07, "loss": 0.4611, "step": 22200 }, { "epoch": 0.8370708174725813, "grad_norm": 1.4292962627440216, "learning_rate": 7.874714394020145e-07, "loss": 0.4175, "step": 22210 }, { "epoch": 0.8374477066294803, "grad_norm": 2.0016660592227145, "learning_rate": 7.839315197886277e-07, "loss": 0.468, "step": 22220 }, { "epoch": 0.8378245957863792, "grad_norm": 1.8041549181742595, "learning_rate": 7.803988976914451e-07, "loss": 0.4661, "step": 22230 }, { "epoch": 0.8382014849432782, "grad_norm": 1.793146229523631, "learning_rate": 7.768735792250176e-07, "loss": 0.4874, "step": 22240 }, { "epoch": 0.8385783741001771, "grad_norm": 1.8151183016027572, "learning_rate": 7.73355570491256e-07, "loss": 0.4688, "step": 22250 }, { "epoch": 0.8389552632570761, "grad_norm": 1.6753537342016527, "learning_rate": 7.698448775794171e-07, "loss": 0.4918, "step": 22260 }, { "epoch": 0.8393321524139751, "grad_norm": 1.5400170574876808, "learning_rate": 7.663415065660951e-07, "loss": 0.4734, "step": 22270 }, { "epoch": 0.839709041570874, "grad_norm": 1.4967554695098977, "learning_rate": 7.628454635152111e-07, "loss": 0.4725, "step": 22280 }, { "epoch": 0.8400859307277729, "grad_norm": 1.8581940585235912, "learning_rate": 7.593567544780028e-07, "loss": 0.4813, "step": 22290 }, { "epoch": 0.8404628198846719, "grad_norm": 1.7617654233248232, "learning_rate": 7.558753854930129e-07, "loss": 0.4798, "step": 22300 }, { "epoch": 0.8408397090415709, "grad_norm": 1.8228739549825173, "learning_rate": 7.52401362586081e-07, "loss": 0.4661, "step": 22310 }, { "epoch": 0.8412165981984698, "grad_norm": 1.5249651201885777, "learning_rate": 7.489346917703261e-07, "loss": 0.4881, "step": 22320 }, { "epoch": 0.8415934873553688, "grad_norm": 1.6186808625257767, "learning_rate": 7.4547537904615e-07, "loss": 0.4733, "step": 22330 }, { "epoch": 0.8419703765122677, "grad_norm": 2.203606849350494, "learning_rate": 7.420234304012119e-07, "loss": 0.4488, "step": 22340 }, { "epoch": 0.8423472656691667, "grad_norm": 1.791129399689499, "learning_rate": 7.385788518104287e-07, "loss": 0.4419, "step": 22350 }, { "epoch": 0.8427241548260657, "grad_norm": 1.586850556241108, "learning_rate": 7.351416492359564e-07, "loss": 0.4632, "step": 22360 }, { "epoch": 0.8431010439829646, "grad_norm": 1.4777261662520846, "learning_rate": 7.317118286271869e-07, "loss": 0.4428, "step": 22370 }, { "epoch": 0.8434779331398635, "grad_norm": 1.6300982151377412, "learning_rate": 7.282893959207354e-07, "loss": 0.4844, "step": 22380 }, { "epoch": 0.8438548222967626, "grad_norm": 1.7666022912394777, "learning_rate": 7.248743570404293e-07, "loss": 0.4825, "step": 22390 }, { "epoch": 0.8442317114536615, "grad_norm": 1.5994223812698616, "learning_rate": 7.214667178972951e-07, "loss": 0.4678, "step": 22400 }, { "epoch": 0.8446086006105604, "grad_norm": 1.5930748274830833, "learning_rate": 7.180664843895536e-07, "loss": 0.4825, "step": 22410 }, { "epoch": 0.8449854897674594, "grad_norm": 1.8952726942929914, "learning_rate": 7.146736624026073e-07, "loss": 0.4619, "step": 22420 }, { "epoch": 0.8453623789243584, "grad_norm": 1.6535903612300786, "learning_rate": 7.112882578090308e-07, "loss": 0.442, "step": 22430 }, { "epoch": 0.8457392680812573, "grad_norm": 1.8113816961588787, "learning_rate": 7.079102764685592e-07, "loss": 0.4727, "step": 22440 }, { "epoch": 0.8461161572381563, "grad_norm": 1.6381559303129163, "learning_rate": 7.045397242280782e-07, "loss": 0.4739, "step": 22450 }, { "epoch": 0.8464930463950552, "grad_norm": 1.6420241463187408, "learning_rate": 7.011766069216153e-07, "loss": 0.4728, "step": 22460 }, { "epoch": 0.8468699355519542, "grad_norm": 1.8579503334291299, "learning_rate": 6.978209303703298e-07, "loss": 0.467, "step": 22470 }, { "epoch": 0.8472468247088532, "grad_norm": 1.8303598632267788, "learning_rate": 6.944727003825014e-07, "loss": 0.459, "step": 22480 }, { "epoch": 0.8476237138657521, "grad_norm": 1.6128953601572278, "learning_rate": 6.91131922753519e-07, "loss": 0.4592, "step": 22490 }, { "epoch": 0.848000603022651, "grad_norm": 1.7095865691859484, "learning_rate": 6.877986032658751e-07, "loss": 0.48, "step": 22500 }, { "epoch": 0.8483774921795499, "grad_norm": 1.4938798824764543, "learning_rate": 6.844727476891521e-07, "loss": 0.4781, "step": 22510 }, { "epoch": 0.848754381336449, "grad_norm": 1.6485957619451577, "learning_rate": 6.811543617800104e-07, "loss": 0.5045, "step": 22520 }, { "epoch": 0.8491312704933479, "grad_norm": 1.837620635777043, "learning_rate": 6.778434512821863e-07, "loss": 0.4748, "step": 22530 }, { "epoch": 0.8495081596502468, "grad_norm": 1.4879964061685897, "learning_rate": 6.745400219264736e-07, "loss": 0.4304, "step": 22540 }, { "epoch": 0.8498850488071458, "grad_norm": 1.7156819983043117, "learning_rate": 6.712440794307191e-07, "loss": 0.466, "step": 22550 }, { "epoch": 0.8502619379640448, "grad_norm": 1.3621122589006487, "learning_rate": 6.67955629499808e-07, "loss": 0.4704, "step": 22560 }, { "epoch": 0.8506388271209437, "grad_norm": 2.0534498348755834, "learning_rate": 6.646746778256591e-07, "loss": 0.4854, "step": 22570 }, { "epoch": 0.8510157162778427, "grad_norm": 1.6005946712317962, "learning_rate": 6.614012300872108e-07, "loss": 0.4695, "step": 22580 }, { "epoch": 0.8513926054347416, "grad_norm": 1.5414012999559674, "learning_rate": 6.581352919504175e-07, "loss": 0.4707, "step": 22590 }, { "epoch": 0.8517694945916406, "grad_norm": 1.8269075507153945, "learning_rate": 6.548768690682295e-07, "loss": 0.4661, "step": 22600 }, { "epoch": 0.8521463837485396, "grad_norm": 1.8104476310135393, "learning_rate": 6.516259670805914e-07, "loss": 0.4715, "step": 22610 }, { "epoch": 0.8525232729054385, "grad_norm": 1.5462248619686105, "learning_rate": 6.483825916144315e-07, "loss": 0.4634, "step": 22620 }, { "epoch": 0.8529001620623374, "grad_norm": 1.7572378321726936, "learning_rate": 6.451467482836493e-07, "loss": 0.47, "step": 22630 }, { "epoch": 0.8532770512192365, "grad_norm": 1.719401134457696, "learning_rate": 6.419184426891062e-07, "loss": 0.4417, "step": 22640 }, { "epoch": 0.8536539403761354, "grad_norm": 1.8799617216498543, "learning_rate": 6.386976804186185e-07, "loss": 0.4543, "step": 22650 }, { "epoch": 0.8540308295330343, "grad_norm": 1.7409813880394684, "learning_rate": 6.354844670469446e-07, "loss": 0.4555, "step": 22660 }, { "epoch": 0.8544077186899333, "grad_norm": 1.7233766286902579, "learning_rate": 6.322788081357767e-07, "loss": 0.4894, "step": 22670 }, { "epoch": 0.8547846078468323, "grad_norm": 1.822979772670787, "learning_rate": 6.290807092337325e-07, "loss": 0.456, "step": 22680 }, { "epoch": 0.8551614970037312, "grad_norm": 1.7840609931799256, "learning_rate": 6.258901758763425e-07, "loss": 0.4467, "step": 22690 }, { "epoch": 0.8555383861606302, "grad_norm": 1.7330476182654266, "learning_rate": 6.227072135860424e-07, "loss": 0.4887, "step": 22700 }, { "epoch": 0.8559152753175291, "grad_norm": 1.634410872367023, "learning_rate": 6.195318278721646e-07, "loss": 0.4589, "step": 22710 }, { "epoch": 0.856292164474428, "grad_norm": 1.8487043942748613, "learning_rate": 6.163640242309271e-07, "loss": 0.4514, "step": 22720 }, { "epoch": 0.8566690536313271, "grad_norm": 1.6693062741828186, "learning_rate": 6.132038081454206e-07, "loss": 0.4654, "step": 22730 }, { "epoch": 0.857045942788226, "grad_norm": 1.6243325415551528, "learning_rate": 6.100511850856083e-07, "loss": 0.4877, "step": 22740 }, { "epoch": 0.8574228319451249, "grad_norm": 1.76930759153765, "learning_rate": 6.069061605083076e-07, "loss": 0.4678, "step": 22750 }, { "epoch": 0.8577997211020238, "grad_norm": 1.6289028881940613, "learning_rate": 6.037687398571846e-07, "loss": 0.4247, "step": 22760 }, { "epoch": 0.8581766102589229, "grad_norm": 1.6696350756219092, "learning_rate": 6.006389285627423e-07, "loss": 0.4883, "step": 22770 }, { "epoch": 0.8585534994158218, "grad_norm": 1.513884778231805, "learning_rate": 5.975167320423137e-07, "loss": 0.4606, "step": 22780 }, { "epoch": 0.8589303885727207, "grad_norm": 1.6184179790577693, "learning_rate": 5.94402155700054e-07, "loss": 0.4408, "step": 22790 }, { "epoch": 0.8593072777296197, "grad_norm": 1.8267950570439317, "learning_rate": 5.912952049269271e-07, "loss": 0.4638, "step": 22800 }, { "epoch": 0.8596841668865187, "grad_norm": 1.411486088307513, "learning_rate": 5.881958851006952e-07, "loss": 0.4581, "step": 22810 }, { "epoch": 0.8600610560434176, "grad_norm": 1.4674642682660017, "learning_rate": 5.851042015859154e-07, "loss": 0.4565, "step": 22820 }, { "epoch": 0.8604379452003166, "grad_norm": 1.3939142639115625, "learning_rate": 5.82020159733927e-07, "loss": 0.4704, "step": 22830 }, { "epoch": 0.8608148343572155, "grad_norm": 1.5415763619326572, "learning_rate": 5.789437648828411e-07, "loss": 0.433, "step": 22840 }, { "epoch": 0.8611917235141145, "grad_norm": 1.6303887934059382, "learning_rate": 5.758750223575344e-07, "loss": 0.4747, "step": 22850 }, { "epoch": 0.8615686126710135, "grad_norm": 1.6198622243082812, "learning_rate": 5.728139374696368e-07, "loss": 0.4741, "step": 22860 }, { "epoch": 0.8619455018279124, "grad_norm": 1.6527922202005432, "learning_rate": 5.697605155175246e-07, "loss": 0.441, "step": 22870 }, { "epoch": 0.8623223909848113, "grad_norm": 1.6598779751043562, "learning_rate": 5.667147617863106e-07, "loss": 0.465, "step": 22880 }, { "epoch": 0.8626992801417104, "grad_norm": 1.601592197543877, "learning_rate": 5.636766815478346e-07, "loss": 0.4664, "step": 22890 }, { "epoch": 0.8630761692986093, "grad_norm": 1.8518020978894572, "learning_rate": 5.606462800606538e-07, "loss": 0.4765, "step": 22900 }, { "epoch": 0.8634530584555082, "grad_norm": 1.8569990976395905, "learning_rate": 5.576235625700355e-07, "loss": 0.4947, "step": 22910 }, { "epoch": 0.8638299476124072, "grad_norm": 1.78143629478858, "learning_rate": 5.546085343079472e-07, "loss": 0.4733, "step": 22920 }, { "epoch": 0.8642068367693061, "grad_norm": 1.7977943755656154, "learning_rate": 5.516012004930432e-07, "loss": 0.4677, "step": 22930 }, { "epoch": 0.8645837259262051, "grad_norm": 1.3894902301565615, "learning_rate": 5.486015663306665e-07, "loss": 0.4603, "step": 22940 }, { "epoch": 0.8649606150831041, "grad_norm": 1.8030057987995889, "learning_rate": 5.456096370128277e-07, "loss": 0.438, "step": 22950 }, { "epoch": 0.865337504240003, "grad_norm": 1.7403239254852352, "learning_rate": 5.426254177182039e-07, "loss": 0.4651, "step": 22960 }, { "epoch": 0.8657143933969019, "grad_norm": 1.5972874364344118, "learning_rate": 5.396489136121241e-07, "loss": 0.4621, "step": 22970 }, { "epoch": 0.866091282553801, "grad_norm": 1.9303613177891243, "learning_rate": 5.366801298465662e-07, "loss": 0.4713, "step": 22980 }, { "epoch": 0.8664681717106999, "grad_norm": 1.6872934044203467, "learning_rate": 5.337190715601426e-07, "loss": 0.4905, "step": 22990 }, { "epoch": 0.8668450608675988, "grad_norm": 1.7126679035862156, "learning_rate": 5.307657438780988e-07, "loss": 0.4803, "step": 23000 }, { "epoch": 0.8672219500244978, "grad_norm": 1.7350738496917792, "learning_rate": 5.278201519122922e-07, "loss": 0.4845, "step": 23010 }, { "epoch": 0.8675988391813968, "grad_norm": 1.787434922425744, "learning_rate": 5.248823007611964e-07, "loss": 0.4893, "step": 23020 }, { "epoch": 0.8679757283382957, "grad_norm": 1.872251424045816, "learning_rate": 5.219521955098833e-07, "loss": 0.4786, "step": 23030 }, { "epoch": 0.8683526174951947, "grad_norm": 1.7715299011721564, "learning_rate": 5.190298412300182e-07, "loss": 0.4791, "step": 23040 }, { "epoch": 0.8687295066520936, "grad_norm": 1.8357529427154649, "learning_rate": 5.161152429798538e-07, "loss": 0.4772, "step": 23050 }, { "epoch": 0.8691063958089926, "grad_norm": 1.6077824935411622, "learning_rate": 5.13208405804213e-07, "loss": 0.4591, "step": 23060 }, { "epoch": 0.8694832849658916, "grad_norm": 2.0708649306446567, "learning_rate": 5.103093347344872e-07, "loss": 0.4645, "step": 23070 }, { "epoch": 0.8698601741227905, "grad_norm": 1.4566190018720826, "learning_rate": 5.074180347886265e-07, "loss": 0.4395, "step": 23080 }, { "epoch": 0.8702370632796894, "grad_norm": 1.8556474690031441, "learning_rate": 5.045345109711291e-07, "loss": 0.4897, "step": 23090 }, { "epoch": 0.8706139524365883, "grad_norm": 1.8191111683824606, "learning_rate": 5.016587682730339e-07, "loss": 0.4656, "step": 23100 }, { "epoch": 0.8709908415934874, "grad_norm": 1.8020154384020783, "learning_rate": 4.98790811671912e-07, "loss": 0.4805, "step": 23110 }, { "epoch": 0.8713677307503863, "grad_norm": 1.6434790782350468, "learning_rate": 4.959306461318563e-07, "loss": 0.4614, "step": 23120 }, { "epoch": 0.8717446199072852, "grad_norm": 1.820609481455034, "learning_rate": 4.930782766034775e-07, "loss": 0.461, "step": 23130 }, { "epoch": 0.8721215090641842, "grad_norm": 1.5856064167725252, "learning_rate": 4.902337080238862e-07, "loss": 0.4856, "step": 23140 }, { "epoch": 0.8724983982210832, "grad_norm": 1.7306241882494389, "learning_rate": 4.873969453166982e-07, "loss": 0.4874, "step": 23150 }, { "epoch": 0.8728752873779821, "grad_norm": 1.5611552008896656, "learning_rate": 4.845679933920122e-07, "loss": 0.4528, "step": 23160 }, { "epoch": 0.8732521765348811, "grad_norm": 1.4214411570399696, "learning_rate": 4.817468571464118e-07, "loss": 0.4712, "step": 23170 }, { "epoch": 0.87362906569178, "grad_norm": 1.4281643083118118, "learning_rate": 4.789335414629481e-07, "loss": 0.4506, "step": 23180 }, { "epoch": 0.874005954848679, "grad_norm": 1.737558818061735, "learning_rate": 4.761280512111377e-07, "loss": 0.4896, "step": 23190 }, { "epoch": 0.874382844005578, "grad_norm": 1.9801433378045732, "learning_rate": 4.733303912469545e-07, "loss": 0.4325, "step": 23200 }, { "epoch": 0.8747597331624769, "grad_norm": 1.5293237221434222, "learning_rate": 4.7054056641281767e-07, "loss": 0.4658, "step": 23210 }, { "epoch": 0.8751366223193758, "grad_norm": 1.5764509216468041, "learning_rate": 4.6775858153758237e-07, "loss": 0.4849, "step": 23220 }, { "epoch": 0.8755135114762749, "grad_norm": 1.4828757798168408, "learning_rate": 4.649844414365357e-07, "loss": 0.4399, "step": 23230 }, { "epoch": 0.8758904006331738, "grad_norm": 1.6081292880236426, "learning_rate": 4.622181509113871e-07, "loss": 0.4631, "step": 23240 }, { "epoch": 0.8762672897900727, "grad_norm": 1.6761532852837226, "learning_rate": 4.594597147502583e-07, "loss": 0.4814, "step": 23250 }, { "epoch": 0.8766441789469717, "grad_norm": 1.7471109969458456, "learning_rate": 4.5670913772767665e-07, "loss": 0.4681, "step": 23260 }, { "epoch": 0.8770210681038707, "grad_norm": 1.670160553063871, "learning_rate": 4.53966424604565e-07, "loss": 0.4832, "step": 23270 }, { "epoch": 0.8773979572607696, "grad_norm": 2.012186736194834, "learning_rate": 4.5123158012823577e-07, "loss": 0.4324, "step": 23280 }, { "epoch": 0.8777748464176686, "grad_norm": 1.6439554732480879, "learning_rate": 4.4850460903238193e-07, "loss": 0.4924, "step": 23290 }, { "epoch": 0.8781517355745675, "grad_norm": 1.5734128031410948, "learning_rate": 4.4578551603706834e-07, "loss": 0.479, "step": 23300 }, { "epoch": 0.8785286247314664, "grad_norm": 1.5385928479066775, "learning_rate": 4.4307430584872247e-07, "loss": 0.4493, "step": 23310 }, { "epoch": 0.8789055138883655, "grad_norm": 1.495012993416791, "learning_rate": 4.403709831601299e-07, "loss": 0.4394, "step": 23320 }, { "epoch": 0.8792824030452644, "grad_norm": 1.7809672218234174, "learning_rate": 4.3767555265042283e-07, "loss": 0.4492, "step": 23330 }, { "epoch": 0.8796592922021633, "grad_norm": 1.7530873048425202, "learning_rate": 4.3498801898507027e-07, "loss": 0.4556, "step": 23340 }, { "epoch": 0.8800361813590623, "grad_norm": 1.6880143734094333, "learning_rate": 4.323083868158784e-07, "loss": 0.4908, "step": 23350 }, { "epoch": 0.8804130705159613, "grad_norm": 1.7763398568413087, "learning_rate": 4.2963666078097267e-07, "loss": 0.464, "step": 23360 }, { "epoch": 0.8807899596728602, "grad_norm": 1.534011001228688, "learning_rate": 4.26972845504795e-07, "loss": 0.466, "step": 23370 }, { "epoch": 0.8811668488297592, "grad_norm": 1.6994612004248872, "learning_rate": 4.2431694559809554e-07, "loss": 0.5048, "step": 23380 }, { "epoch": 0.8815437379866581, "grad_norm": 1.7973858705958985, "learning_rate": 4.216689656579215e-07, "loss": 0.4866, "step": 23390 }, { "epoch": 0.8819206271435571, "grad_norm": 1.7439539082800413, "learning_rate": 4.1902891026761316e-07, "loss": 0.4917, "step": 23400 }, { "epoch": 0.882297516300456, "grad_norm": 1.8633764405786568, "learning_rate": 4.1639678399679586e-07, "loss": 0.474, "step": 23410 }, { "epoch": 0.882674405457355, "grad_norm": 3.0062093045952927, "learning_rate": 4.137725914013696e-07, "loss": 0.4931, "step": 23420 }, { "epoch": 0.8830512946142539, "grad_norm": 1.6834986673559424, "learning_rate": 4.1115633702349943e-07, "loss": 0.4785, "step": 23430 }, { "epoch": 0.883428183771153, "grad_norm": 1.528153535189198, "learning_rate": 4.0854802539161353e-07, "loss": 0.4653, "step": 23440 }, { "epoch": 0.8838050729280519, "grad_norm": 1.9539698847148672, "learning_rate": 4.0594766102039e-07, "loss": 0.4624, "step": 23450 }, { "epoch": 0.8841819620849508, "grad_norm": 1.8627808627003384, "learning_rate": 4.03355248410755e-07, "loss": 0.4597, "step": 23460 }, { "epoch": 0.8845588512418497, "grad_norm": 1.8122593075205515, "learning_rate": 4.007707920498649e-07, "loss": 0.456, "step": 23470 }, { "epoch": 0.8849357403987488, "grad_norm": 1.6711500055529624, "learning_rate": 3.9819429641111074e-07, "loss": 0.4732, "step": 23480 }, { "epoch": 0.8853126295556477, "grad_norm": 1.5192643064244584, "learning_rate": 3.956257659541002e-07, "loss": 0.468, "step": 23490 }, { "epoch": 0.8856895187125466, "grad_norm": 1.813662608056834, "learning_rate": 3.93065205124657e-07, "loss": 0.4752, "step": 23500 }, { "epoch": 0.8860664078694456, "grad_norm": 1.7914587106316096, "learning_rate": 3.905126183548086e-07, "loss": 0.45, "step": 23510 }, { "epoch": 0.8864432970263445, "grad_norm": 1.634445231205134, "learning_rate": 3.879680100627814e-07, "loss": 0.4823, "step": 23520 }, { "epoch": 0.8868201861832435, "grad_norm": 1.5155533043912437, "learning_rate": 3.854313846529917e-07, "loss": 0.4738, "step": 23530 }, { "epoch": 0.8871970753401425, "grad_norm": 1.3069686366746593, "learning_rate": 3.8290274651603844e-07, "loss": 0.4295, "step": 23540 }, { "epoch": 0.8875739644970414, "grad_norm": 1.6900219347834147, "learning_rate": 3.803821000286939e-07, "loss": 0.4713, "step": 23550 }, { "epoch": 0.8879508536539403, "grad_norm": 1.7207203874610701, "learning_rate": 3.7786944955390094e-07, "loss": 0.4833, "step": 23560 }, { "epoch": 0.8883277428108394, "grad_norm": 1.6381226224767984, "learning_rate": 3.7536479944075946e-07, "loss": 0.4789, "step": 23570 }, { "epoch": 0.8887046319677383, "grad_norm": 1.468117460132038, "learning_rate": 3.7286815402452436e-07, "loss": 0.4666, "step": 23580 }, { "epoch": 0.8890815211246372, "grad_norm": 1.5096914213423873, "learning_rate": 3.703795176265912e-07, "loss": 0.4633, "step": 23590 }, { "epoch": 0.8894584102815362, "grad_norm": 1.8984329064476526, "learning_rate": 3.678988945544976e-07, "loss": 0.4806, "step": 23600 }, { "epoch": 0.8898352994384352, "grad_norm": 1.941908868591099, "learning_rate": 3.654262891019067e-07, "loss": 0.4769, "step": 23610 }, { "epoch": 0.8902121885953341, "grad_norm": 1.8348751421543537, "learning_rate": 3.6296170554860954e-07, "loss": 0.4828, "step": 23620 }, { "epoch": 0.8905890777522331, "grad_norm": 1.5016426776021727, "learning_rate": 3.60505148160506e-07, "loss": 0.4548, "step": 23630 }, { "epoch": 0.890965966909132, "grad_norm": 1.7145005520740269, "learning_rate": 3.5805662118960747e-07, "loss": 0.5038, "step": 23640 }, { "epoch": 0.891342856066031, "grad_norm": 1.705443534960991, "learning_rate": 3.5561612887402565e-07, "loss": 0.4417, "step": 23650 }, { "epoch": 0.89171974522293, "grad_norm": 1.4890162691299267, "learning_rate": 3.531836754379625e-07, "loss": 0.462, "step": 23660 }, { "epoch": 0.8920966343798289, "grad_norm": 3.142113368721328, "learning_rate": 3.507592650917091e-07, "loss": 0.5031, "step": 23670 }, { "epoch": 0.8924735235367278, "grad_norm": 1.6644517980326772, "learning_rate": 3.48342902031632e-07, "loss": 0.4791, "step": 23680 }, { "epoch": 0.8928504126936267, "grad_norm": 1.7717700337165432, "learning_rate": 3.459345904401712e-07, "loss": 0.4588, "step": 23690 }, { "epoch": 0.8932273018505258, "grad_norm": 1.7866048077368766, "learning_rate": 3.435343344858283e-07, "loss": 0.4523, "step": 23700 }, { "epoch": 0.8936041910074247, "grad_norm": 1.6579103062766696, "learning_rate": 3.411421383231628e-07, "loss": 0.4613, "step": 23710 }, { "epoch": 0.8939810801643236, "grad_norm": 1.2395669709919346, "learning_rate": 3.387580060927842e-07, "loss": 0.4635, "step": 23720 }, { "epoch": 0.8943579693212226, "grad_norm": 1.637561199613229, "learning_rate": 3.363819419213432e-07, "loss": 0.4709, "step": 23730 }, { "epoch": 0.8947348584781216, "grad_norm": 1.3235058842010576, "learning_rate": 3.3401394992152615e-07, "loss": 0.4477, "step": 23740 }, { "epoch": 0.8951117476350205, "grad_norm": 1.9666315503654384, "learning_rate": 3.316540341920477e-07, "loss": 0.5028, "step": 23750 }, { "epoch": 0.8954886367919195, "grad_norm": 1.5304868239681946, "learning_rate": 3.293021988176409e-07, "loss": 0.494, "step": 23760 }, { "epoch": 0.8958655259488184, "grad_norm": 1.7286366317517883, "learning_rate": 3.269584478690574e-07, "loss": 0.4889, "step": 23770 }, { "epoch": 0.8962424151057174, "grad_norm": 1.3521612831885137, "learning_rate": 3.2462278540305205e-07, "loss": 0.4572, "step": 23780 }, { "epoch": 0.8966193042626164, "grad_norm": 1.5149236125626533, "learning_rate": 3.2229521546238097e-07, "loss": 0.4456, "step": 23790 }, { "epoch": 0.8969961934195153, "grad_norm": 1.673486048314399, "learning_rate": 3.199757420757915e-07, "loss": 0.4624, "step": 23800 }, { "epoch": 0.8973730825764142, "grad_norm": 1.6567774362547631, "learning_rate": 3.176643692580184e-07, "loss": 0.4809, "step": 23810 }, { "epoch": 0.8977499717333133, "grad_norm": 1.8358040993755154, "learning_rate": 3.1536110100977514e-07, "loss": 0.4801, "step": 23820 }, { "epoch": 0.8981268608902122, "grad_norm": 1.6459275011239465, "learning_rate": 3.130659413177478e-07, "loss": 0.4345, "step": 23830 }, { "epoch": 0.8985037500471111, "grad_norm": 1.8349612600952543, "learning_rate": 3.107788941545842e-07, "loss": 0.4614, "step": 23840 }, { "epoch": 0.8988806392040101, "grad_norm": 1.7411885119540658, "learning_rate": 3.0849996347889434e-07, "loss": 0.4839, "step": 23850 }, { "epoch": 0.8992575283609091, "grad_norm": 1.9747342875954943, "learning_rate": 3.0622915323523683e-07, "loss": 0.472, "step": 23860 }, { "epoch": 0.899634417517808, "grad_norm": 1.5726613697693645, "learning_rate": 3.039664673541165e-07, "loss": 0.4805, "step": 23870 }, { "epoch": 0.900011306674707, "grad_norm": 1.7695993345825292, "learning_rate": 3.0171190975197553e-07, "loss": 0.4646, "step": 23880 }, { "epoch": 0.9003881958316059, "grad_norm": 1.6476558150869725, "learning_rate": 2.994654843311856e-07, "loss": 0.4738, "step": 23890 }, { "epoch": 0.9007650849885048, "grad_norm": 1.839852845526703, "learning_rate": 2.972271949800443e-07, "loss": 0.4871, "step": 23900 }, { "epoch": 0.9011419741454039, "grad_norm": 1.6296225754693838, "learning_rate": 2.949970455727652e-07, "loss": 0.4948, "step": 23910 }, { "epoch": 0.9015188633023028, "grad_norm": 1.446451999177883, "learning_rate": 2.9277503996947453e-07, "loss": 0.4458, "step": 23920 }, { "epoch": 0.9018957524592017, "grad_norm": 1.5091359658108487, "learning_rate": 2.905611820162002e-07, "loss": 0.4676, "step": 23930 }, { "epoch": 0.9022726416161007, "grad_norm": 1.4935637256342174, "learning_rate": 2.883554755448692e-07, "loss": 0.4402, "step": 23940 }, { "epoch": 0.9026495307729997, "grad_norm": 1.7255741199144814, "learning_rate": 2.861579243732993e-07, "loss": 0.4814, "step": 23950 }, { "epoch": 0.9030264199298986, "grad_norm": 1.7337660198803386, "learning_rate": 2.8396853230518993e-07, "loss": 0.4631, "step": 23960 }, { "epoch": 0.9034033090867976, "grad_norm": 1.8762174059045174, "learning_rate": 2.8178730313012215e-07, "loss": 0.4773, "step": 23970 }, { "epoch": 0.9037801982436965, "grad_norm": 1.7366313753924991, "learning_rate": 2.796142406235447e-07, "loss": 0.4423, "step": 23980 }, { "epoch": 0.9041570874005955, "grad_norm": 1.6724834106832989, "learning_rate": 2.7744934854677274e-07, "loss": 0.4486, "step": 23990 }, { "epoch": 0.9045339765574945, "grad_norm": 1.935280413678451, "learning_rate": 2.752926306469772e-07, "loss": 0.4678, "step": 24000 }, { "epoch": 0.9049108657143934, "grad_norm": 1.714097876276615, "learning_rate": 2.731440906571825e-07, "loss": 0.4606, "step": 24010 }, { "epoch": 0.9052877548712923, "grad_norm": 1.4926141470560026, "learning_rate": 2.710037322962572e-07, "loss": 0.4681, "step": 24020 }, { "epoch": 0.9056646440281914, "grad_norm": 1.722145408901395, "learning_rate": 2.688715592689101e-07, "loss": 0.4426, "step": 24030 }, { "epoch": 0.9060415331850903, "grad_norm": 1.9291301430266703, "learning_rate": 2.6674757526567895e-07, "loss": 0.4764, "step": 24040 }, { "epoch": 0.9064184223419892, "grad_norm": 1.7351194729472024, "learning_rate": 2.646317839629292e-07, "loss": 0.4556, "step": 24050 }, { "epoch": 0.9067953114988881, "grad_norm": 1.3921208085094439, "learning_rate": 2.625241890228464e-07, "loss": 0.4437, "step": 24060 }, { "epoch": 0.9071722006557872, "grad_norm": 5.211286582046159, "learning_rate": 2.6042479409342734e-07, "loss": 0.4792, "step": 24070 }, { "epoch": 0.9075490898126861, "grad_norm": 1.6508702793082282, "learning_rate": 2.5833360280847707e-07, "loss": 0.4854, "step": 24080 }, { "epoch": 0.907925978969585, "grad_norm": 1.5904848357115617, "learning_rate": 2.562506187876007e-07, "loss": 0.4795, "step": 24090 }, { "epoch": 0.908302868126484, "grad_norm": 1.6275384635774426, "learning_rate": 2.5417584563619647e-07, "loss": 0.4781, "step": 24100 }, { "epoch": 0.9086797572833829, "grad_norm": 1.5228159132695211, "learning_rate": 2.521092869454528e-07, "loss": 0.47, "step": 24110 }, { "epoch": 0.909056646440282, "grad_norm": 1.5131657204163729, "learning_rate": 2.5005094629233726e-07, "loss": 0.4746, "step": 24120 }, { "epoch": 0.9094335355971809, "grad_norm": 1.803558175951124, "learning_rate": 2.4800082723959505e-07, "loss": 0.4672, "step": 24130 }, { "epoch": 0.9098104247540798, "grad_norm": 1.5532182659903644, "learning_rate": 2.459589333357393e-07, "loss": 0.465, "step": 24140 }, { "epoch": 0.9101873139109787, "grad_norm": 1.8682246003822802, "learning_rate": 2.439252681150472e-07, "loss": 0.4661, "step": 24150 }, { "epoch": 0.9105642030678778, "grad_norm": 1.6186148451233608, "learning_rate": 2.418998350975543e-07, "loss": 0.4562, "step": 24160 }, { "epoch": 0.9109410922247767, "grad_norm": 1.9076729059759836, "learning_rate": 2.398826377890423e-07, "loss": 0.4791, "step": 24170 }, { "epoch": 0.9113179813816756, "grad_norm": 1.5131142108015385, "learning_rate": 2.378736796810449e-07, "loss": 0.4574, "step": 24180 }, { "epoch": 0.9116948705385746, "grad_norm": 1.6267367473954135, "learning_rate": 2.3587296425082894e-07, "loss": 0.4434, "step": 24190 }, { "epoch": 0.9120717596954736, "grad_norm": 1.756928999491046, "learning_rate": 2.338804949613982e-07, "loss": 0.4753, "step": 24200 }, { "epoch": 0.9124486488523725, "grad_norm": 1.678600318807189, "learning_rate": 2.3189627526148007e-07, "loss": 0.4892, "step": 24210 }, { "epoch": 0.9128255380092715, "grad_norm": 1.6170139181507344, "learning_rate": 2.299203085855234e-07, "loss": 0.4578, "step": 24220 }, { "epoch": 0.9132024271661704, "grad_norm": 1.5717138547339975, "learning_rate": 2.279525983536951e-07, "loss": 0.4751, "step": 24230 }, { "epoch": 0.9135793163230694, "grad_norm": 1.7504585367583836, "learning_rate": 2.2599314797186857e-07, "loss": 0.4996, "step": 24240 }, { "epoch": 0.9139562054799684, "grad_norm": 1.8589504523667113, "learning_rate": 2.2404196083161968e-07, "loss": 0.464, "step": 24250 }, { "epoch": 0.9143330946368673, "grad_norm": 1.5549339824814492, "learning_rate": 2.2209904031022356e-07, "loss": 0.474, "step": 24260 }, { "epoch": 0.9147099837937662, "grad_norm": 1.5431790462672454, "learning_rate": 2.2016438977064624e-07, "loss": 0.4688, "step": 24270 }, { "epoch": 0.9150868729506653, "grad_norm": 1.5231675790646082, "learning_rate": 2.18238012561538e-07, "loss": 0.4459, "step": 24280 }, { "epoch": 0.9154637621075642, "grad_norm": 1.213036543371949, "learning_rate": 2.1631991201723102e-07, "loss": 0.4214, "step": 24290 }, { "epoch": 0.9158406512644631, "grad_norm": 1.6789791239635883, "learning_rate": 2.1441009145773074e-07, "loss": 0.4608, "step": 24300 }, { "epoch": 0.916217540421362, "grad_norm": 1.646949624163508, "learning_rate": 2.1250855418871008e-07, "loss": 0.4597, "step": 24310 }, { "epoch": 0.916594429578261, "grad_norm": 1.8335341035244128, "learning_rate": 2.106153035015057e-07, "loss": 0.4875, "step": 24320 }, { "epoch": 0.91697131873516, "grad_norm": 1.5713652425889508, "learning_rate": 2.0873034267311131e-07, "loss": 0.4644, "step": 24330 }, { "epoch": 0.917348207892059, "grad_norm": 1.7316298479198824, "learning_rate": 2.0685367496617037e-07, "loss": 0.4834, "step": 24340 }, { "epoch": 0.9177250970489579, "grad_norm": 1.7723989836411413, "learning_rate": 2.0498530362897283e-07, "loss": 0.4794, "step": 24350 }, { "epoch": 0.9181019862058568, "grad_norm": 1.5053617798457812, "learning_rate": 2.031252318954502e-07, "loss": 0.468, "step": 24360 }, { "epoch": 0.9184788753627559, "grad_norm": 1.6353066163088765, "learning_rate": 2.0127346298516426e-07, "loss": 0.5002, "step": 24370 }, { "epoch": 0.9188557645196548, "grad_norm": 1.6543832590665188, "learning_rate": 1.9943000010331005e-07, "loss": 0.4606, "step": 24380 }, { "epoch": 0.9192326536765537, "grad_norm": 1.4329476172975082, "learning_rate": 1.9759484644070347e-07, "loss": 0.4732, "step": 24390 }, { "epoch": 0.9196095428334526, "grad_norm": 1.803480078827421, "learning_rate": 1.9576800517377924e-07, "loss": 0.4525, "step": 24400 }, { "epoch": 0.9199864319903517, "grad_norm": 1.5271475875148783, "learning_rate": 1.9394947946458243e-07, "loss": 0.4276, "step": 24410 }, { "epoch": 0.9203633211472506, "grad_norm": 1.7436847616380555, "learning_rate": 1.9213927246076623e-07, "loss": 0.4601, "step": 24420 }, { "epoch": 0.9207402103041495, "grad_norm": 1.7878744772222794, "learning_rate": 1.9033738729558437e-07, "loss": 0.4736, "step": 24430 }, { "epoch": 0.9211170994610485, "grad_norm": 1.6372286383071935, "learning_rate": 1.8854382708788976e-07, "loss": 0.4829, "step": 24440 }, { "epoch": 0.9214939886179475, "grad_norm": 1.8458554633785587, "learning_rate": 1.8675859494212078e-07, "loss": 0.4831, "step": 24450 }, { "epoch": 0.9218708777748464, "grad_norm": 1.7303090763015152, "learning_rate": 1.84981693948304e-07, "loss": 0.4581, "step": 24460 }, { "epoch": 0.9222477669317454, "grad_norm": 1.6324828539105698, "learning_rate": 1.8321312718204477e-07, "loss": 0.4652, "step": 24470 }, { "epoch": 0.9226246560886443, "grad_norm": 1.6706150940449451, "learning_rate": 1.8145289770452323e-07, "loss": 0.4862, "step": 24480 }, { "epoch": 0.9230015452455432, "grad_norm": 1.6307388216734264, "learning_rate": 1.7970100856248896e-07, "loss": 0.4872, "step": 24490 }, { "epoch": 0.9233784344024423, "grad_norm": 1.584467065772082, "learning_rate": 1.7795746278825465e-07, "loss": 0.476, "step": 24500 }, { "epoch": 0.9237553235593412, "grad_norm": 1.5002376627559966, "learning_rate": 1.7622226339969185e-07, "loss": 0.4608, "step": 24510 }, { "epoch": 0.9241322127162401, "grad_norm": 1.3947526048448347, "learning_rate": 1.7449541340022526e-07, "loss": 0.4641, "step": 24520 }, { "epoch": 0.9245091018731391, "grad_norm": 1.6798334967661213, "learning_rate": 1.7277691577882892e-07, "loss": 0.4469, "step": 24530 }, { "epoch": 0.9248859910300381, "grad_norm": 1.589068317695499, "learning_rate": 1.7106677351001855e-07, "loss": 0.458, "step": 24540 }, { "epoch": 0.925262880186937, "grad_norm": 1.8869275914521926, "learning_rate": 1.693649895538485e-07, "loss": 0.505, "step": 24550 }, { "epoch": 0.925639769343836, "grad_norm": 1.9569489786208991, "learning_rate": 1.6767156685590536e-07, "loss": 0.451, "step": 24560 }, { "epoch": 0.9260166585007349, "grad_norm": 1.6847190222460249, "learning_rate": 1.65986508347305e-07, "loss": 0.4609, "step": 24570 }, { "epoch": 0.9263935476576339, "grad_norm": 1.8408894133770082, "learning_rate": 1.6430981694468162e-07, "loss": 0.4766, "step": 24580 }, { "epoch": 0.9267704368145329, "grad_norm": 1.674533122441784, "learning_rate": 1.626414955501926e-07, "loss": 0.45, "step": 24590 }, { "epoch": 0.9271473259714318, "grad_norm": 1.8741720379805382, "learning_rate": 1.6098154705150416e-07, "loss": 0.4661, "step": 24600 }, { "epoch": 0.9275242151283307, "grad_norm": 1.733162525973201, "learning_rate": 1.593299743217913e-07, "loss": 0.4568, "step": 24610 }, { "epoch": 0.9279011042852298, "grad_norm": 4.339443575234999, "learning_rate": 1.5768678021973016e-07, "loss": 0.4862, "step": 24620 }, { "epoch": 0.9282779934421287, "grad_norm": 1.750943994731508, "learning_rate": 1.5605196758949614e-07, "loss": 0.4562, "step": 24630 }, { "epoch": 0.9286548825990276, "grad_norm": 1.5174387434569223, "learning_rate": 1.5442553926075687e-07, "loss": 0.4362, "step": 24640 }, { "epoch": 0.9290317717559265, "grad_norm": 1.724853168012421, "learning_rate": 1.528074980486677e-07, "loss": 0.4665, "step": 24650 }, { "epoch": 0.9294086609128256, "grad_norm": 2.2911044470949418, "learning_rate": 1.5119784675386607e-07, "loss": 0.4655, "step": 24660 }, { "epoch": 0.9297855500697245, "grad_norm": 1.7545886816782916, "learning_rate": 1.4959658816246836e-07, "loss": 0.4788, "step": 24670 }, { "epoch": 0.9301624392266234, "grad_norm": 1.7138768298267077, "learning_rate": 1.480037250460642e-07, "loss": 0.4483, "step": 24680 }, { "epoch": 0.9305393283835224, "grad_norm": 1.5647732024525882, "learning_rate": 1.4641926016171092e-07, "loss": 0.4594, "step": 24690 }, { "epoch": 0.9309162175404213, "grad_norm": 1.704892085076998, "learning_rate": 1.4484319625193033e-07, "loss": 0.4761, "step": 24700 }, { "epoch": 0.9312931066973203, "grad_norm": 1.8079933221397861, "learning_rate": 1.4327553604470246e-07, "loss": 0.4568, "step": 24710 }, { "epoch": 0.9316699958542193, "grad_norm": 3.18841995794234, "learning_rate": 1.4171628225346234e-07, "loss": 0.4503, "step": 24720 }, { "epoch": 0.9320468850111182, "grad_norm": 1.645937879683613, "learning_rate": 1.4016543757709332e-07, "loss": 0.4734, "step": 24730 }, { "epoch": 0.9324237741680171, "grad_norm": 1.6358083401976105, "learning_rate": 1.3862300469992484e-07, "loss": 0.4707, "step": 24740 }, { "epoch": 0.9328006633249162, "grad_norm": 1.6561818821247405, "learning_rate": 1.3708898629172518e-07, "loss": 0.4728, "step": 24750 }, { "epoch": 0.9331775524818151, "grad_norm": 1.6439915924189832, "learning_rate": 1.3556338500769982e-07, "loss": 0.4671, "step": 24760 }, { "epoch": 0.933554441638714, "grad_norm": 1.3928410136757627, "learning_rate": 1.3404620348848375e-07, "loss": 0.4766, "step": 24770 }, { "epoch": 0.933931330795613, "grad_norm": 1.6629613706509152, "learning_rate": 1.325374443601385e-07, "loss": 0.4634, "step": 24780 }, { "epoch": 0.934308219952512, "grad_norm": 1.3229160640821136, "learning_rate": 1.310371102341479e-07, "loss": 0.4527, "step": 24790 }, { "epoch": 0.9346851091094109, "grad_norm": 1.6985771267573568, "learning_rate": 1.2954520370741408e-07, "loss": 0.4449, "step": 24800 }, { "epoch": 0.9350619982663099, "grad_norm": 1.776417571410387, "learning_rate": 1.280617273622492e-07, "loss": 0.4278, "step": 24810 }, { "epoch": 0.9354388874232088, "grad_norm": 1.9248752747557138, "learning_rate": 1.2658668376637705e-07, "loss": 0.4785, "step": 24820 }, { "epoch": 0.9358157765801078, "grad_norm": 1.5537222976277234, "learning_rate": 1.251200754729226e-07, "loss": 0.4956, "step": 24830 }, { "epoch": 0.9361926657370068, "grad_norm": 1.6183831830487874, "learning_rate": 1.2366190502041186e-07, "loss": 0.4592, "step": 24840 }, { "epoch": 0.9365695548939057, "grad_norm": 1.7490907041034809, "learning_rate": 1.222121749327654e-07, "loss": 0.4916, "step": 24850 }, { "epoch": 0.9369464440508046, "grad_norm": 1.568020359678583, "learning_rate": 1.2077088771929535e-07, "loss": 0.459, "step": 24860 }, { "epoch": 0.9373233332077037, "grad_norm": 1.6766487321093162, "learning_rate": 1.1933804587469843e-07, "loss": 0.473, "step": 24870 }, { "epoch": 0.9377002223646026, "grad_norm": 1.6215811602508874, "learning_rate": 1.1791365187905524e-07, "loss": 0.451, "step": 24880 }, { "epoch": 0.9380771115215015, "grad_norm": 1.7382912119631357, "learning_rate": 1.1649770819782247e-07, "loss": 0.4738, "step": 24890 }, { "epoch": 0.9384540006784005, "grad_norm": 1.882585550263763, "learning_rate": 1.1509021728183301e-07, "loss": 0.4945, "step": 24900 }, { "epoch": 0.9388308898352994, "grad_norm": 1.412211827907065, "learning_rate": 1.1369118156728587e-07, "loss": 0.458, "step": 24910 }, { "epoch": 0.9392077789921984, "grad_norm": 1.8359718454808982, "learning_rate": 1.1230060347574679e-07, "loss": 0.4584, "step": 24920 }, { "epoch": 0.9395846681490974, "grad_norm": 1.489126957012612, "learning_rate": 1.1091848541414262e-07, "loss": 0.4817, "step": 24930 }, { "epoch": 0.9399615573059963, "grad_norm": 2.0450896007301416, "learning_rate": 1.0954482977475533e-07, "loss": 0.4906, "step": 24940 }, { "epoch": 0.9403384464628952, "grad_norm": 1.8209636581396178, "learning_rate": 1.0817963893522132e-07, "loss": 0.4844, "step": 24950 }, { "epoch": 0.9407153356197943, "grad_norm": 1.708902383560193, "learning_rate": 1.0682291525852484e-07, "loss": 0.4711, "step": 24960 }, { "epoch": 0.9410922247766932, "grad_norm": 3.052634153804913, "learning_rate": 1.0547466109299298e-07, "loss": 0.4866, "step": 24970 }, { "epoch": 0.9414691139335921, "grad_norm": 1.6048817510008635, "learning_rate": 1.0413487877229566e-07, "loss": 0.4614, "step": 24980 }, { "epoch": 0.941846003090491, "grad_norm": 1.6211384980618553, "learning_rate": 1.0280357061543622e-07, "loss": 0.496, "step": 24990 }, { "epoch": 0.9422228922473901, "grad_norm": 1.7208732672554823, "learning_rate": 1.0148073892675358e-07, "loss": 0.4515, "step": 25000 }, { "epoch": 0.942599781404289, "grad_norm": 1.242397602277574, "learning_rate": 1.0016638599591122e-07, "loss": 0.4522, "step": 25010 }, { "epoch": 0.942976670561188, "grad_norm": 1.864373295532734, "learning_rate": 9.886051409790042e-08, "loss": 0.4541, "step": 25020 }, { "epoch": 0.9433535597180869, "grad_norm": 1.7432924180401044, "learning_rate": 9.756312549302982e-08, "loss": 0.4807, "step": 25030 }, { "epoch": 0.9437304488749859, "grad_norm": 1.3858687614024912, "learning_rate": 9.627422242692585e-08, "loss": 0.4545, "step": 25040 }, { "epoch": 0.9441073380318848, "grad_norm": 1.7270495665971028, "learning_rate": 9.499380713052785e-08, "loss": 0.4589, "step": 25050 }, { "epoch": 0.9444842271887838, "grad_norm": 1.5950305399452118, "learning_rate": 9.372188182008358e-08, "loss": 0.4564, "step": 25060 }, { "epoch": 0.9448611163456827, "grad_norm": 1.7644110153181087, "learning_rate": 9.245844869714471e-08, "loss": 0.4859, "step": 25070 }, { "epoch": 0.9452380055025816, "grad_norm": 1.440552445480469, "learning_rate": 9.120350994856475e-08, "loss": 0.4771, "step": 25080 }, { "epoch": 0.9456148946594807, "grad_norm": 3.239934112076224, "learning_rate": 8.995706774649504e-08, "loss": 0.4731, "step": 25090 }, { "epoch": 0.9459917838163796, "grad_norm": 1.9158332648079843, "learning_rate": 8.87191242483787e-08, "loss": 0.4597, "step": 25100 }, { "epoch": 0.9463686729732785, "grad_norm": 1.5922507680686822, "learning_rate": 8.748968159695004e-08, "loss": 0.4316, "step": 25110 }, { "epoch": 0.9467455621301775, "grad_norm": 1.792335011843171, "learning_rate": 8.626874192022905e-08, "loss": 0.5036, "step": 25120 }, { "epoch": 0.9471224512870765, "grad_norm": 1.4714034304995909, "learning_rate": 8.505630733151803e-08, "loss": 0.453, "step": 25130 }, { "epoch": 0.9474993404439754, "grad_norm": 1.4587868542295341, "learning_rate": 8.385237992939777e-08, "loss": 0.4731, "step": 25140 }, { "epoch": 0.9478762296008744, "grad_norm": 1.7817219201477925, "learning_rate": 8.265696179772465e-08, "loss": 0.4456, "step": 25150 }, { "epoch": 0.9482531187577733, "grad_norm": 1.621641692161085, "learning_rate": 8.147005500562577e-08, "loss": 0.4435, "step": 25160 }, { "epoch": 0.9486300079146723, "grad_norm": 1.6494310365282547, "learning_rate": 8.029166160749668e-08, "loss": 0.4665, "step": 25170 }, { "epoch": 0.9490068970715713, "grad_norm": 1.4731323966452758, "learning_rate": 7.912178364299694e-08, "loss": 0.4633, "step": 25180 }, { "epoch": 0.9493837862284702, "grad_norm": 1.4539950989850374, "learning_rate": 7.796042313704733e-08, "loss": 0.4761, "step": 25190 }, { "epoch": 0.9497606753853691, "grad_norm": 1.7614354811781805, "learning_rate": 7.680758209982541e-08, "loss": 0.4874, "step": 25200 }, { "epoch": 0.9501375645422682, "grad_norm": 1.5146197995816046, "learning_rate": 7.566326252676226e-08, "loss": 0.4565, "step": 25210 }, { "epoch": 0.9505144536991671, "grad_norm": 1.4964331021798207, "learning_rate": 7.452746639854069e-08, "loss": 0.4732, "step": 25220 }, { "epoch": 0.950891342856066, "grad_norm": 1.6531417972845415, "learning_rate": 7.34001956810898e-08, "loss": 0.4539, "step": 25230 }, { "epoch": 0.951268232012965, "grad_norm": 1.4443997141550056, "learning_rate": 7.228145232558048e-08, "loss": 0.4651, "step": 25240 }, { "epoch": 0.951645121169864, "grad_norm": 1.6488863792857795, "learning_rate": 7.117123826842598e-08, "loss": 0.4655, "step": 25250 }, { "epoch": 0.9520220103267629, "grad_norm": 2.066125366240161, "learning_rate": 7.00695554312758e-08, "loss": 0.4818, "step": 25260 }, { "epoch": 0.9523988994836619, "grad_norm": 1.7989280362677504, "learning_rate": 6.897640572101294e-08, "loss": 0.4592, "step": 25270 }, { "epoch": 0.9527757886405608, "grad_norm": 1.5907695066167318, "learning_rate": 6.789179102974996e-08, "loss": 0.4615, "step": 25280 }, { "epoch": 0.9531526777974597, "grad_norm": 1.6474294146755808, "learning_rate": 6.681571323482628e-08, "loss": 0.4745, "step": 25290 }, { "epoch": 0.9535295669543588, "grad_norm": 1.8575461910907407, "learning_rate": 6.574817419880586e-08, "loss": 0.4843, "step": 25300 }, { "epoch": 0.9539064561112577, "grad_norm": 1.5427872527092095, "learning_rate": 6.468917576947287e-08, "loss": 0.4616, "step": 25310 }, { "epoch": 0.9542833452681566, "grad_norm": 2.4865362259718116, "learning_rate": 6.363871977982827e-08, "loss": 0.4617, "step": 25320 }, { "epoch": 0.9546602344250555, "grad_norm": 1.7543026874769245, "learning_rate": 6.259680804808654e-08, "loss": 0.4666, "step": 25330 }, { "epoch": 0.9550371235819546, "grad_norm": 1.604570679456589, "learning_rate": 6.156344237767453e-08, "loss": 0.494, "step": 25340 }, { "epoch": 0.9554140127388535, "grad_norm": 1.8669803638424052, "learning_rate": 6.053862455722593e-08, "loss": 0.4654, "step": 25350 }, { "epoch": 0.9557909018957524, "grad_norm": 1.3581166661184105, "learning_rate": 5.952235636057902e-08, "loss": 0.4403, "step": 25360 }, { "epoch": 0.9561677910526514, "grad_norm": 1.5400521124436184, "learning_rate": 5.851463954677394e-08, "loss": 0.4653, "step": 25370 }, { "epoch": 0.9565446802095504, "grad_norm": 1.8105900226878286, "learning_rate": 5.7515475860049354e-08, "loss": 0.4874, "step": 25380 }, { "epoch": 0.9569215693664493, "grad_norm": 1.4382499444470058, "learning_rate": 5.652486702984017e-08, "loss": 0.4648, "step": 25390 }, { "epoch": 0.9572984585233483, "grad_norm": 1.7631024504492547, "learning_rate": 5.554281477077206e-08, "loss": 0.4704, "step": 25400 }, { "epoch": 0.9576753476802472, "grad_norm": 1.6429543944439136, "learning_rate": 5.456932078266197e-08, "loss": 0.4655, "step": 25410 }, { "epoch": 0.9580522368371462, "grad_norm": 1.6616938198698554, "learning_rate": 5.360438675051316e-08, "loss": 0.4773, "step": 25420 }, { "epoch": 0.9584291259940452, "grad_norm": 1.7874329868405086, "learning_rate": 5.264801434451239e-08, "loss": 0.4466, "step": 25430 }, { "epoch": 0.9588060151509441, "grad_norm": 1.2294510476080451, "learning_rate": 5.170020522002661e-08, "loss": 0.4711, "step": 25440 }, { "epoch": 0.959182904307843, "grad_norm": 1.9210918144126727, "learning_rate": 5.0760961017602415e-08, "loss": 0.4725, "step": 25450 }, { "epoch": 0.9595597934647421, "grad_norm": 1.8227209874755572, "learning_rate": 4.983028336295881e-08, "loss": 0.4476, "step": 25460 }, { "epoch": 0.959936682621641, "grad_norm": 1.788350611886206, "learning_rate": 4.8908173866990535e-08, "loss": 0.4567, "step": 25470 }, { "epoch": 0.9603135717785399, "grad_norm": 1.775020734539053, "learning_rate": 4.799463412575978e-08, "loss": 0.5112, "step": 25480 }, { "epoch": 0.9606904609354389, "grad_norm": 1.5375809927016806, "learning_rate": 4.7089665720495e-08, "loss": 0.4391, "step": 25490 }, { "epoch": 0.9610673500923378, "grad_norm": 1.5639240748918983, "learning_rate": 4.619327021759046e-08, "loss": 0.4785, "step": 25500 }, { "epoch": 0.9614442392492368, "grad_norm": 1.764963080417895, "learning_rate": 4.5305449168600024e-08, "loss": 0.4736, "step": 25510 }, { "epoch": 0.9618211284061358, "grad_norm": 1.6562005841477103, "learning_rate": 4.4426204110237794e-08, "loss": 0.4747, "step": 25520 }, { "epoch": 0.9621980175630347, "grad_norm": 1.6196235223477182, "learning_rate": 4.35555365643725e-08, "loss": 0.4522, "step": 25530 }, { "epoch": 0.9625749067199336, "grad_norm": 1.5383045592640703, "learning_rate": 4.2693448038026996e-08, "loss": 0.4704, "step": 25540 }, { "epoch": 0.9629517958768327, "grad_norm": 1.8859648544559755, "learning_rate": 4.1839940023374884e-08, "loss": 0.4595, "step": 25550 }, { "epoch": 0.9633286850337316, "grad_norm": 1.703004713270723, "learning_rate": 4.0995013997736644e-08, "loss": 0.4767, "step": 25560 }, { "epoch": 0.9637055741906305, "grad_norm": 1.76025981546061, "learning_rate": 4.015867142358076e-08, "loss": 0.4687, "step": 25570 }, { "epoch": 0.9640824633475295, "grad_norm": 1.9694305314081857, "learning_rate": 3.933091374851594e-08, "loss": 0.4558, "step": 25580 }, { "epoch": 0.9644593525044285, "grad_norm": 1.3980420080759448, "learning_rate": 3.8511742405293875e-08, "loss": 0.4618, "step": 25590 }, { "epoch": 0.9648362416613274, "grad_norm": 1.6493052417006149, "learning_rate": 3.7701158811803694e-08, "loss": 0.504, "step": 25600 }, { "epoch": 0.9652131308182264, "grad_norm": 1.619654736340804, "learning_rate": 3.6899164371068105e-08, "loss": 0.4445, "step": 25610 }, { "epoch": 0.9655900199751253, "grad_norm": 1.843793532222181, "learning_rate": 3.610576047124614e-08, "loss": 0.4668, "step": 25620 }, { "epoch": 0.9659669091320243, "grad_norm": 2.1787530344860424, "learning_rate": 3.5320948485625395e-08, "loss": 0.4555, "step": 25630 }, { "epoch": 0.9663437982889233, "grad_norm": 1.6664586339835825, "learning_rate": 3.454472977262369e-08, "loss": 0.4812, "step": 25640 }, { "epoch": 0.9667206874458222, "grad_norm": 1.9221822760952356, "learning_rate": 3.3777105675782965e-08, "loss": 0.4762, "step": 25650 }, { "epoch": 0.9670975766027211, "grad_norm": 1.5535350148707952, "learning_rate": 3.3018077523769844e-08, "loss": 0.4743, "step": 25660 }, { "epoch": 0.9674744657596202, "grad_norm": 1.6826206259214103, "learning_rate": 3.226764663037285e-08, "loss": 0.4819, "step": 25670 }, { "epoch": 0.9678513549165191, "grad_norm": 1.5372332783224485, "learning_rate": 3.152581429449853e-08, "loss": 0.4529, "step": 25680 }, { "epoch": 0.968228244073418, "grad_norm": 1.6814181811804552, "learning_rate": 3.079258180017142e-08, "loss": 0.4824, "step": 25690 }, { "epoch": 0.9686051332303169, "grad_norm": 1.7034619827760054, "learning_rate": 3.006795041653021e-08, "loss": 0.4618, "step": 25700 }, { "epoch": 0.9689820223872159, "grad_norm": 2.003962281008904, "learning_rate": 2.9351921397826055e-08, "loss": 0.4614, "step": 25710 }, { "epoch": 0.9693589115441149, "grad_norm": 1.787441426014292, "learning_rate": 2.864449598342034e-08, "loss": 0.4533, "step": 25720 }, { "epoch": 0.9697358007010138, "grad_norm": 1.5820307731521495, "learning_rate": 2.794567539778359e-08, "loss": 0.4599, "step": 25730 }, { "epoch": 0.9701126898579128, "grad_norm": 1.7447057484780453, "learning_rate": 2.725546085049047e-08, "loss": 0.4572, "step": 25740 }, { "epoch": 0.9704895790148117, "grad_norm": 1.7232041715746915, "learning_rate": 2.6573853536221992e-08, "loss": 0.475, "step": 25750 }, { "epoch": 0.9708664681717107, "grad_norm": 1.6685687883415614, "learning_rate": 2.590085463475944e-08, "loss": 0.4587, "step": 25760 }, { "epoch": 0.9712433573286097, "grad_norm": 1.7482487866337566, "learning_rate": 2.5236465310984336e-08, "loss": 0.4702, "step": 25770 }, { "epoch": 0.9716202464855086, "grad_norm": 1.7353631005801522, "learning_rate": 2.458068671487568e-08, "loss": 0.462, "step": 25780 }, { "epoch": 0.9719971356424075, "grad_norm": 1.9350104216433035, "learning_rate": 2.3933519981508834e-08, "loss": 0.4588, "step": 25790 }, { "epoch": 0.9723740247993066, "grad_norm": 1.6857080051807094, "learning_rate": 2.3294966231053873e-08, "loss": 0.468, "step": 25800 }, { "epoch": 0.9727509139562055, "grad_norm": 1.7680489987031007, "learning_rate": 2.266502656877001e-08, "loss": 0.4694, "step": 25810 }, { "epoch": 0.9731278031131044, "grad_norm": 1.4689352131990807, "learning_rate": 2.2043702085010056e-08, "loss": 0.4501, "step": 25820 }, { "epoch": 0.9735046922700034, "grad_norm": 1.5690350494704872, "learning_rate": 2.1430993855212635e-08, "loss": 0.4789, "step": 25830 }, { "epoch": 0.9738815814269024, "grad_norm": 1.4356673233784094, "learning_rate": 2.0826902939903304e-08, "loss": 0.4585, "step": 25840 }, { "epoch": 0.9742584705838013, "grad_norm": 1.757515012502445, "learning_rate": 2.023143038469233e-08, "loss": 0.4944, "step": 25850 }, { "epoch": 0.9746353597407003, "grad_norm": 1.6306453233507494, "learning_rate": 1.9644577220271354e-08, "loss": 0.4539, "step": 25860 }, { "epoch": 0.9750122488975992, "grad_norm": 1.4314817446192643, "learning_rate": 1.906634446241451e-08, "loss": 0.4734, "step": 25870 }, { "epoch": 0.9753891380544981, "grad_norm": 1.5806180685416475, "learning_rate": 1.849673311197453e-08, "loss": 0.4399, "step": 25880 }, { "epoch": 0.9757660272113972, "grad_norm": 1.6520753364457346, "learning_rate": 1.7935744154881087e-08, "loss": 0.4566, "step": 25890 }, { "epoch": 0.9761429163682961, "grad_norm": 1.810709467270653, "learning_rate": 1.7383378562139674e-08, "loss": 0.4487, "step": 25900 }, { "epoch": 0.976519805525195, "grad_norm": 1.61252056021322, "learning_rate": 1.683963728983051e-08, "loss": 0.4715, "step": 25910 }, { "epoch": 0.976896694682094, "grad_norm": 2.425481703892548, "learning_rate": 1.630452127910409e-08, "loss": 0.4637, "step": 25920 }, { "epoch": 0.977273583838993, "grad_norm": 1.540972978097639, "learning_rate": 1.5778031456184507e-08, "loss": 0.4608, "step": 25930 }, { "epoch": 0.9776504729958919, "grad_norm": 1.509850379830826, "learning_rate": 1.5260168732362245e-08, "loss": 0.4722, "step": 25940 }, { "epoch": 0.9780273621527908, "grad_norm": 1.6782554143346065, "learning_rate": 1.4750934003996965e-08, "loss": 0.4587, "step": 25950 }, { "epoch": 0.9784042513096898, "grad_norm": 1.5550524539915491, "learning_rate": 1.4250328152514147e-08, "loss": 0.4865, "step": 25960 }, { "epoch": 0.9787811404665888, "grad_norm": 1.6682526527444668, "learning_rate": 1.3758352044402345e-08, "loss": 0.4975, "step": 25970 }, { "epoch": 0.9791580296234877, "grad_norm": 1.613736685118058, "learning_rate": 1.3275006531215384e-08, "loss": 0.4624, "step": 25980 }, { "epoch": 0.9795349187803867, "grad_norm": 1.6329195096750337, "learning_rate": 1.2800292449566265e-08, "loss": 0.4681, "step": 25990 }, { "epoch": 0.9799118079372856, "grad_norm": 1.7759191946586195, "learning_rate": 1.2334210621128827e-08, "loss": 0.4862, "step": 26000 }, { "epoch": 0.9802886970941846, "grad_norm": 1.6432481031917991, "learning_rate": 1.1876761852636642e-08, "loss": 0.472, "step": 26010 }, { "epoch": 0.9806655862510836, "grad_norm": 1.902773292717396, "learning_rate": 1.1427946935878009e-08, "loss": 0.4545, "step": 26020 }, { "epoch": 0.9810424754079825, "grad_norm": 1.6705242880751991, "learning_rate": 1.0987766647699849e-08, "loss": 0.4702, "step": 26030 }, { "epoch": 0.9814193645648814, "grad_norm": 1.971413464598701, "learning_rate": 1.055622175000104e-08, "loss": 0.4794, "step": 26040 }, { "epoch": 0.9817962537217805, "grad_norm": 1.6534906902045443, "learning_rate": 1.01333129897363e-08, "loss": 0.4578, "step": 26050 }, { "epoch": 0.9821731428786794, "grad_norm": 1.54977127095544, "learning_rate": 9.719041098909532e-09, "loss": 0.4777, "step": 26060 }, { "epoch": 0.9825500320355783, "grad_norm": 1.6720729481583012, "learning_rate": 9.31340679457604e-09, "loss": 0.4823, "step": 26070 }, { "epoch": 0.9829269211924773, "grad_norm": 1.558146371421349, "learning_rate": 8.916410778841978e-09, "loss": 0.491, "step": 26080 }, { "epoch": 0.9833038103493762, "grad_norm": 1.4251003076555135, "learning_rate": 8.528053738860453e-09, "loss": 0.4644, "step": 26090 }, { "epoch": 0.9836806995062752, "grad_norm": 1.8009507558620492, "learning_rate": 8.148336346830987e-09, "loss": 0.4888, "step": 26100 }, { "epoch": 0.9840575886631742, "grad_norm": 1.814480801176586, "learning_rate": 7.777259259999503e-09, "loss": 0.4629, "step": 26110 }, { "epoch": 0.9844344778200731, "grad_norm": 2.196528914285027, "learning_rate": 7.4148231206566665e-09, "loss": 0.4731, "step": 26120 }, { "epoch": 0.984811366976972, "grad_norm": 1.7187541640111528, "learning_rate": 7.061028556136773e-09, "loss": 0.4639, "step": 26130 }, { "epoch": 0.9851882561338711, "grad_norm": 1.5937258075062353, "learning_rate": 6.715876178816638e-09, "loss": 0.4674, "step": 26140 }, { "epoch": 0.98556514529077, "grad_norm": 1.8506500710664755, "learning_rate": 6.379366586113933e-09, "loss": 0.4652, "step": 26150 }, { "epoch": 0.9859420344476689, "grad_norm": 1.6855411533697344, "learning_rate": 6.051500360486628e-09, "loss": 0.4527, "step": 26160 }, { "epoch": 0.9863189236045679, "grad_norm": 1.5117238865140932, "learning_rate": 5.732278069432995e-09, "loss": 0.4779, "step": 26170 }, { "epoch": 0.9866958127614669, "grad_norm": 1.6187884945554707, "learning_rate": 5.421700265488827e-09, "loss": 0.4611, "step": 26180 }, { "epoch": 0.9870727019183658, "grad_norm": 1.3918686813033736, "learning_rate": 5.119767486228e-09, "loss": 0.4547, "step": 26190 }, { "epoch": 0.9874495910752648, "grad_norm": 1.6724439609048405, "learning_rate": 4.826480254259691e-09, "loss": 0.4562, "step": 26200 }, { "epoch": 0.9878264802321637, "grad_norm": 1.540870827083842, "learning_rate": 4.541839077230048e-09, "loss": 0.4569, "step": 26210 }, { "epoch": 0.9882033693890627, "grad_norm": 1.5544229067227138, "learning_rate": 4.265844447818856e-09, "loss": 0.4467, "step": 26220 }, { "epoch": 0.9885802585459617, "grad_norm": 1.6957676041843368, "learning_rate": 3.9984968437406515e-09, "loss": 0.4758, "step": 26230 }, { "epoch": 0.9889571477028606, "grad_norm": 1.4992330073607234, "learning_rate": 3.73979672774194e-09, "loss": 0.4748, "step": 26240 }, { "epoch": 0.9893340368597595, "grad_norm": 1.8025093840339432, "learning_rate": 3.4897445476028692e-09, "loss": 0.4879, "step": 26250 }, { "epoch": 0.9897109260166586, "grad_norm": 1.8270840898752077, "learning_rate": 3.2483407361338926e-09, "loss": 0.4738, "step": 26260 }, { "epoch": 0.9900878151735575, "grad_norm": 1.6829652388909495, "learning_rate": 3.0155857111757724e-09, "loss": 0.456, "step": 26270 }, { "epoch": 0.9904647043304564, "grad_norm": 1.5636345290979268, "learning_rate": 2.791479875600689e-09, "loss": 0.454, "step": 26280 }, { "epoch": 0.9908415934873553, "grad_norm": 1.4293143834926183, "learning_rate": 2.5760236173094643e-09, "loss": 0.4662, "step": 26290 }, { "epoch": 0.9912184826442543, "grad_norm": 1.5866582767032813, "learning_rate": 2.369217309231009e-09, "loss": 0.4303, "step": 26300 }, { "epoch": 0.9915953718011533, "grad_norm": 1.6824826996769227, "learning_rate": 2.171061309322875e-09, "loss": 0.4625, "step": 26310 }, { "epoch": 0.9919722609580522, "grad_norm": 1.4469762688853414, "learning_rate": 1.981555960569037e-09, "loss": 0.4659, "step": 26320 }, { "epoch": 0.9923491501149512, "grad_norm": 1.863209068946735, "learning_rate": 1.8007015909815574e-09, "loss": 0.4444, "step": 26330 }, { "epoch": 0.9927260392718501, "grad_norm": 1.3864226632030934, "learning_rate": 1.6284985135978093e-09, "loss": 0.4594, "step": 26340 }, { "epoch": 0.9931029284287491, "grad_norm": 1.8507882555475772, "learning_rate": 1.4649470264810339e-09, "loss": 0.47, "step": 26350 }, { "epoch": 0.9934798175856481, "grad_norm": 1.727307667076328, "learning_rate": 1.3100474127192285e-09, "loss": 0.4829, "step": 26360 }, { "epoch": 0.993856706742547, "grad_norm": 1.5872381530005595, "learning_rate": 1.1637999404257027e-09, "loss": 0.4752, "step": 26370 }, { "epoch": 0.9942335958994459, "grad_norm": 1.7561750947121424, "learning_rate": 1.0262048627374121e-09, "loss": 0.4572, "step": 26380 }, { "epoch": 0.994610485056345, "grad_norm": 1.523030077019712, "learning_rate": 8.972624178149592e-10, "loss": 0.4498, "step": 26390 }, { "epoch": 0.9949873742132439, "grad_norm": 1.8336481206683812, "learning_rate": 7.769728288420375e-10, "loss": 0.4931, "step": 26400 }, { "epoch": 0.9953642633701428, "grad_norm": 1.6093654004058522, "learning_rate": 6.653363040270978e-10, "loss": 0.4906, "step": 26410 }, { "epoch": 0.9957411525270418, "grad_norm": 1.168615916694283, "learning_rate": 5.62353036598351e-10, "loss": 0.4779, "step": 26420 }, { "epoch": 0.9961180416839408, "grad_norm": 1.55589136590255, "learning_rate": 4.680232048087652e-10, "loss": 0.4741, "step": 26430 }, { "epoch": 0.9964949308408397, "grad_norm": 1.7754177648108063, "learning_rate": 3.823469719316242e-10, "loss": 0.4721, "step": 26440 }, { "epoch": 0.9968718199977387, "grad_norm": 1.6083002564281654, "learning_rate": 3.053244862616378e-10, "loss": 0.4733, "step": 26450 }, { "epoch": 0.9972487091546376, "grad_norm": 1.7559817189307763, "learning_rate": 2.369558811171624e-10, "loss": 0.4724, "step": 26460 }, { "epoch": 0.9976255983115366, "grad_norm": 1.6315234881797243, "learning_rate": 1.772412748352048e-10, "loss": 0.4552, "step": 26470 }, { "epoch": 0.9980024874684356, "grad_norm": 1.8229450451292533, "learning_rate": 1.26180770774198e-10, "loss": 0.475, "step": 26480 }, { "epoch": 0.9983793766253345, "grad_norm": 1.6948887666570254, "learning_rate": 8.377445731511114e-11, "loss": 0.4713, "step": 26490 }, { "epoch": 0.9987562657822334, "grad_norm": 1.6821945503509856, "learning_rate": 5.002240785756396e-11, "loss": 0.4639, "step": 26500 }, { "epoch": 0.9991331549391324, "grad_norm": 1.5658822340781955, "learning_rate": 2.4924680822602242e-11, "loss": 0.4707, "step": 26510 }, { "epoch": 0.9995100440960314, "grad_norm": 1.8251970618778395, "learning_rate": 8.481319651032493e-12, "loss": 0.4814, "step": 26520 }, { "epoch": 0.9998869332529303, "grad_norm": 1.5466187547604036, "learning_rate": 6.923528045321703e-13, "loss": 0.4557, "step": 26530 }, { "epoch": 1.0, "step": 26533, "total_flos": 1663266888056832.0, "train_loss": 0.5075337708668052, "train_runtime": 63984.8167, "train_samples_per_second": 3.317, "train_steps_per_second": 0.415 } ], "logging_steps": 10, "max_steps": 26533, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1663266888056832.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }