{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 18779, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000532509718302359, "grad_norm": 23.296741485595703, "learning_rate": 9.584664536741213e-07, "loss": 7.8793, "step": 10 }, { "epoch": 0.001065019436604718, "grad_norm": 19.85402488708496, "learning_rate": 2.023429179978701e-06, "loss": 7.7093, "step": 20 }, { "epoch": 0.001597529154907077, "grad_norm": 10.07481861114502, "learning_rate": 3.08839190628328e-06, "loss": 7.0137, "step": 30 }, { "epoch": 0.002130038873209436, "grad_norm": 5.072307109832764, "learning_rate": 4.153354632587859e-06, "loss": 6.1742, "step": 40 }, { "epoch": 0.002662548591511795, "grad_norm": 3.5329959392547607, "learning_rate": 5.218317358892439e-06, "loss": 5.6068, "step": 50 }, { "epoch": 0.003195058309814154, "grad_norm": 3.8453404903411865, "learning_rate": 6.283280085197018e-06, "loss": 5.1784, "step": 60 }, { "epoch": 0.003727568028116513, "grad_norm": 6.184778213500977, "learning_rate": 7.3482428115015974e-06, "loss": 4.6031, "step": 70 }, { "epoch": 0.004260077746418872, "grad_norm": 4.906091690063477, "learning_rate": 8.413205537806178e-06, "loss": 3.9631, "step": 80 }, { "epoch": 0.004792587464721231, "grad_norm": 9.875988960266113, "learning_rate": 9.478168264110757e-06, "loss": 3.5113, "step": 90 }, { "epoch": 0.00532509718302359, "grad_norm": 5.586822986602783, "learning_rate": 1.0543130990415335e-05, "loss": 3.1884, "step": 100 }, { "epoch": 0.005857606901325949, "grad_norm": 9.180880546569824, "learning_rate": 1.1608093716719916e-05, "loss": 2.9137, "step": 110 }, { "epoch": 0.006390116619628308, "grad_norm": 17.583784103393555, "learning_rate": 1.2673056443024495e-05, "loss": 2.7283, "step": 120 }, { "epoch": 0.006922626337930667, "grad_norm": 16.766233444213867, "learning_rate": 1.3738019169329076e-05, "loss": 2.5699, "step": 130 }, { "epoch": 0.007455136056233026, "grad_norm": 11.388614654541016, "learning_rate": 1.4802981895633653e-05, "loss": 2.4291, "step": 140 }, { "epoch": 0.007987645774535385, "grad_norm": 18.473289489746094, "learning_rate": 1.5867944621938232e-05, "loss": 2.3001, "step": 150 }, { "epoch": 0.008520155492837744, "grad_norm": 12.690078735351562, "learning_rate": 1.693290734824281e-05, "loss": 2.1744, "step": 160 }, { "epoch": 0.009052665211140103, "grad_norm": 10.144042015075684, "learning_rate": 1.799787007454739e-05, "loss": 2.0552, "step": 170 }, { "epoch": 0.009585174929442462, "grad_norm": 11.107041358947754, "learning_rate": 1.906283280085197e-05, "loss": 1.9585, "step": 180 }, { "epoch": 0.010117684647744821, "grad_norm": 14.497051239013672, "learning_rate": 2.0127795527156552e-05, "loss": 1.8718, "step": 190 }, { "epoch": 0.01065019436604718, "grad_norm": 10.508237838745117, "learning_rate": 2.1192758253461128e-05, "loss": 1.8153, "step": 200 }, { "epoch": 0.01118270408434954, "grad_norm": 11.81551742553711, "learning_rate": 2.2257720979765707e-05, "loss": 1.7521, "step": 210 }, { "epoch": 0.011715213802651898, "grad_norm": 7.003968238830566, "learning_rate": 2.332268370607029e-05, "loss": 1.7067, "step": 220 }, { "epoch": 0.012247723520954257, "grad_norm": 9.637007713317871, "learning_rate": 2.438764643237487e-05, "loss": 1.658, "step": 230 }, { "epoch": 0.012780233239256616, "grad_norm": 11.963647842407227, "learning_rate": 2.5452609158679448e-05, "loss": 1.6003, "step": 240 }, { "epoch": 0.013312742957558975, "grad_norm": 15.572464942932129, "learning_rate": 2.6517571884984027e-05, "loss": 1.5396, "step": 250 }, { "epoch": 0.013845252675861335, "grad_norm": 10.560100555419922, "learning_rate": 2.7582534611288606e-05, "loss": 1.4679, "step": 260 }, { "epoch": 0.014377762394163694, "grad_norm": 14.625675201416016, "learning_rate": 2.864749733759319e-05, "loss": 1.3871, "step": 270 }, { "epoch": 0.014910272112466053, "grad_norm": 15.250794410705566, "learning_rate": 2.971246006389776e-05, "loss": 1.2908, "step": 280 }, { "epoch": 0.015442781830768412, "grad_norm": 10.370095252990723, "learning_rate": 3.0777422790202344e-05, "loss": 1.1773, "step": 290 }, { "epoch": 0.01597529154907077, "grad_norm": 14.734580993652344, "learning_rate": 3.1842385516506926e-05, "loss": 1.0634, "step": 300 }, { "epoch": 0.01650780126737313, "grad_norm": 11.359335899353027, "learning_rate": 3.29073482428115e-05, "loss": 0.9367, "step": 310 }, { "epoch": 0.01704031098567549, "grad_norm": 15.065919876098633, "learning_rate": 3.3972310969116084e-05, "loss": 0.851, "step": 320 }, { "epoch": 0.017572820703977848, "grad_norm": 11.290328025817871, "learning_rate": 3.503727369542067e-05, "loss": 0.7698, "step": 330 }, { "epoch": 0.018105330422280207, "grad_norm": 9.410698890686035, "learning_rate": 3.610223642172524e-05, "loss": 0.6942, "step": 340 }, { "epoch": 0.018637840140582566, "grad_norm": 7.125499725341797, "learning_rate": 3.716719914802982e-05, "loss": 0.6353, "step": 350 }, { "epoch": 0.019170349858884925, "grad_norm": 11.152689933776855, "learning_rate": 3.82321618743344e-05, "loss": 0.6036, "step": 360 }, { "epoch": 0.019702859577187284, "grad_norm": 7.263124465942383, "learning_rate": 3.929712460063898e-05, "loss": 0.5523, "step": 370 }, { "epoch": 0.020235369295489643, "grad_norm": 6.285194396972656, "learning_rate": 4.036208732694356e-05, "loss": 0.5131, "step": 380 }, { "epoch": 0.020767879013792002, "grad_norm": 3.8969569206237793, "learning_rate": 4.142705005324814e-05, "loss": 0.4857, "step": 390 }, { "epoch": 0.02130038873209436, "grad_norm": 4.850637912750244, "learning_rate": 4.249201277955272e-05, "loss": 0.4606, "step": 400 }, { "epoch": 0.02183289845039672, "grad_norm": 3.3862061500549316, "learning_rate": 4.355697550585729e-05, "loss": 0.4423, "step": 410 }, { "epoch": 0.02236540816869908, "grad_norm": 3.709831714630127, "learning_rate": 4.4621938232161876e-05, "loss": 0.4269, "step": 420 }, { "epoch": 0.022897917887001438, "grad_norm": 3.6177499294281006, "learning_rate": 4.568690095846646e-05, "loss": 0.4109, "step": 430 }, { "epoch": 0.023430427605303797, "grad_norm": 3.2016589641571045, "learning_rate": 4.6751863684771034e-05, "loss": 0.4038, "step": 440 }, { "epoch": 0.023962937323606156, "grad_norm": 3.1846377849578857, "learning_rate": 4.781682641107562e-05, "loss": 0.3922, "step": 450 }, { "epoch": 0.024495447041908515, "grad_norm": 3.7085001468658447, "learning_rate": 4.88817891373802e-05, "loss": 0.3852, "step": 460 }, { "epoch": 0.025027956760210874, "grad_norm": 1.9565516710281372, "learning_rate": 4.994675186368477e-05, "loss": 0.3727, "step": 470 }, { "epoch": 0.025560466478513233, "grad_norm": 2.378927230834961, "learning_rate": 5.101171458998936e-05, "loss": 0.3654, "step": 480 }, { "epoch": 0.026092976196815592, "grad_norm": 1.7363628149032593, "learning_rate": 5.207667731629393e-05, "loss": 0.3528, "step": 490 }, { "epoch": 0.02662548591511795, "grad_norm": 2.0001909732818604, "learning_rate": 5.314164004259851e-05, "loss": 0.3472, "step": 500 }, { "epoch": 0.02715799563342031, "grad_norm": 2.2477715015411377, "learning_rate": 5.420660276890309e-05, "loss": 0.3334, "step": 510 }, { "epoch": 0.02769050535172267, "grad_norm": 1.9201889038085938, "learning_rate": 5.527156549520767e-05, "loss": 0.3183, "step": 520 }, { "epoch": 0.028223015070025028, "grad_norm": 1.6685700416564941, "learning_rate": 5.633652822151225e-05, "loss": 0.3067, "step": 530 }, { "epoch": 0.028755524788327387, "grad_norm": 1.8929866552352905, "learning_rate": 5.7401490947816826e-05, "loss": 0.293, "step": 540 }, { "epoch": 0.029288034506629746, "grad_norm": 1.649090051651001, "learning_rate": 5.8466453674121415e-05, "loss": 0.2802, "step": 550 }, { "epoch": 0.029820544224932105, "grad_norm": 0.9757211804389954, "learning_rate": 5.953141640042599e-05, "loss": 0.2706, "step": 560 }, { "epoch": 0.030353053943234464, "grad_norm": 1.1477692127227783, "learning_rate": 6.059637912673056e-05, "loss": 0.2616, "step": 570 }, { "epoch": 0.030885563661536823, "grad_norm": 1.2151044607162476, "learning_rate": 6.166134185303514e-05, "loss": 0.2488, "step": 580 }, { "epoch": 0.03141807337983918, "grad_norm": 0.9159660339355469, "learning_rate": 6.272630457933972e-05, "loss": 0.2435, "step": 590 }, { "epoch": 0.03195058309814154, "grad_norm": 0.9337270855903625, "learning_rate": 6.379126730564431e-05, "loss": 0.2341, "step": 600 }, { "epoch": 0.0324830928164439, "grad_norm": 1.3477636575698853, "learning_rate": 6.485623003194888e-05, "loss": 0.2271, "step": 610 }, { "epoch": 0.03301560253474626, "grad_norm": 0.9192898869514465, "learning_rate": 6.592119275825347e-05, "loss": 0.2222, "step": 620 }, { "epoch": 0.033548112253048615, "grad_norm": 0.6390801668167114, "learning_rate": 6.698615548455805e-05, "loss": 0.2165, "step": 630 }, { "epoch": 0.03408062197135098, "grad_norm": 0.6918138265609741, "learning_rate": 6.805111821086262e-05, "loss": 0.213, "step": 640 }, { "epoch": 0.03461313168965333, "grad_norm": 0.6979911923408508, "learning_rate": 6.91160809371672e-05, "loss": 0.2126, "step": 650 }, { "epoch": 0.035145641407955695, "grad_norm": 0.7399368286132812, "learning_rate": 7.018104366347178e-05, "loss": 0.2094, "step": 660 }, { "epoch": 0.03567815112625805, "grad_norm": 0.7001500725746155, "learning_rate": 7.124600638977636e-05, "loss": 0.2083, "step": 670 }, { "epoch": 0.03621066084456041, "grad_norm": 0.7533488273620605, "learning_rate": 7.231096911608094e-05, "loss": 0.207, "step": 680 }, { "epoch": 0.03674317056286277, "grad_norm": 0.6705746054649353, "learning_rate": 7.337593184238552e-05, "loss": 0.2053, "step": 690 }, { "epoch": 0.03727568028116513, "grad_norm": 0.4637382924556732, "learning_rate": 7.44408945686901e-05, "loss": 0.204, "step": 700 }, { "epoch": 0.03780818999946749, "grad_norm": 0.40789374709129333, "learning_rate": 7.550585729499468e-05, "loss": 0.2019, "step": 710 }, { "epoch": 0.03834069971776985, "grad_norm": 0.591678261756897, "learning_rate": 7.657082002129926e-05, "loss": 0.2011, "step": 720 }, { "epoch": 0.038873209436072205, "grad_norm": 0.5219926834106445, "learning_rate": 7.763578274760383e-05, "loss": 0.1987, "step": 730 }, { "epoch": 0.03940571915437457, "grad_norm": 0.47858574986457825, "learning_rate": 7.870074547390842e-05, "loss": 0.1976, "step": 740 }, { "epoch": 0.03993822887267692, "grad_norm": 0.752047061920166, "learning_rate": 7.9765708200213e-05, "loss": 0.1987, "step": 750 }, { "epoch": 0.040470738590979285, "grad_norm": 0.4126583933830261, "learning_rate": 8.083067092651757e-05, "loss": 0.1971, "step": 760 }, { "epoch": 0.04100324830928164, "grad_norm": 0.8404585719108582, "learning_rate": 8.189563365282216e-05, "loss": 0.1948, "step": 770 }, { "epoch": 0.041535758027584004, "grad_norm": 0.5083792209625244, "learning_rate": 8.296059637912672e-05, "loss": 0.194, "step": 780 }, { "epoch": 0.04206826774588636, "grad_norm": 0.3890551030635834, "learning_rate": 8.402555910543131e-05, "loss": 0.1923, "step": 790 }, { "epoch": 0.04260077746418872, "grad_norm": 0.7016918063163757, "learning_rate": 8.509052183173589e-05, "loss": 0.1912, "step": 800 }, { "epoch": 0.04313328718249108, "grad_norm": 0.44527336955070496, "learning_rate": 8.615548455804048e-05, "loss": 0.1919, "step": 810 }, { "epoch": 0.04366579690079344, "grad_norm": 0.3990408778190613, "learning_rate": 8.722044728434506e-05, "loss": 0.1905, "step": 820 }, { "epoch": 0.044198306619095795, "grad_norm": 0.3964357078075409, "learning_rate": 8.828541001064963e-05, "loss": 0.1882, "step": 830 }, { "epoch": 0.04473081633739816, "grad_norm": 0.6267169713973999, "learning_rate": 8.93503727369542e-05, "loss": 0.1894, "step": 840 }, { "epoch": 0.04526332605570051, "grad_norm": 0.3614656329154968, "learning_rate": 9.041533546325878e-05, "loss": 0.1873, "step": 850 }, { "epoch": 0.045795835774002876, "grad_norm": 0.3725983202457428, "learning_rate": 9.148029818956337e-05, "loss": 0.1854, "step": 860 }, { "epoch": 0.04632834549230523, "grad_norm": 0.7198257446289062, "learning_rate": 9.254526091586795e-05, "loss": 0.1854, "step": 870 }, { "epoch": 0.046860855210607594, "grad_norm": 0.5347720980644226, "learning_rate": 9.361022364217252e-05, "loss": 0.1856, "step": 880 }, { "epoch": 0.04739336492890995, "grad_norm": 0.36126938462257385, "learning_rate": 9.467518636847711e-05, "loss": 0.1832, "step": 890 }, { "epoch": 0.04792587464721231, "grad_norm": 0.5364170670509338, "learning_rate": 9.574014909478169e-05, "loss": 0.1837, "step": 900 }, { "epoch": 0.04845838436551467, "grad_norm": 0.3289523422718048, "learning_rate": 9.680511182108626e-05, "loss": 0.1819, "step": 910 }, { "epoch": 0.04899089408381703, "grad_norm": 0.3482621908187866, "learning_rate": 9.787007454739084e-05, "loss": 0.1825, "step": 920 }, { "epoch": 0.049523403802119385, "grad_norm": 0.5768856406211853, "learning_rate": 9.893503727369543e-05, "loss": 0.1829, "step": 930 }, { "epoch": 0.05005591352042175, "grad_norm": 0.337167352437973, "learning_rate": 0.0001, "loss": 0.1821, "step": 940 }, { "epoch": 0.0505884232387241, "grad_norm": 0.2395765632390976, "learning_rate": 9.99999457314795e-05, "loss": 0.1807, "step": 950 }, { "epoch": 0.051120932957026466, "grad_norm": 0.37906885147094727, "learning_rate": 9.999978292608627e-05, "loss": 0.1795, "step": 960 }, { "epoch": 0.05165344267532882, "grad_norm": 0.4830165207386017, "learning_rate": 9.999951158432521e-05, "loss": 0.1801, "step": 970 }, { "epoch": 0.052185952393631184, "grad_norm": 0.3381877839565277, "learning_rate": 9.999913170703776e-05, "loss": 0.1786, "step": 980 }, { "epoch": 0.05271846211193354, "grad_norm": 0.4923486113548279, "learning_rate": 9.999864329540193e-05, "loss": 0.1792, "step": 990 }, { "epoch": 0.0532509718302359, "grad_norm": 0.29676854610443115, "learning_rate": 9.999804635093233e-05, "loss": 0.1775, "step": 1000 }, { "epoch": 0.05378348154853826, "grad_norm": 0.5925562977790833, "learning_rate": 9.999734087548009e-05, "loss": 0.1774, "step": 1010 }, { "epoch": 0.05431599126684062, "grad_norm": 0.5015407800674438, "learning_rate": 9.999652687123293e-05, "loss": 0.1762, "step": 1020 }, { "epoch": 0.054848500985142976, "grad_norm": 0.2539210915565491, "learning_rate": 9.999560434071517e-05, "loss": 0.1756, "step": 1030 }, { "epoch": 0.05538101070344534, "grad_norm": 0.3802579343318939, "learning_rate": 9.999457328678761e-05, "loss": 0.1749, "step": 1040 }, { "epoch": 0.055913520421747694, "grad_norm": 0.3460189998149872, "learning_rate": 9.999343371264757e-05, "loss": 0.1751, "step": 1050 }, { "epoch": 0.056446030140050056, "grad_norm": 0.4987145960330963, "learning_rate": 9.9992185621829e-05, "loss": 0.1732, "step": 1060 }, { "epoch": 0.05697853985835241, "grad_norm": 0.3508310616016388, "learning_rate": 9.999082901820225e-05, "loss": 0.173, "step": 1070 }, { "epoch": 0.057511049576654774, "grad_norm": 0.6434867978096008, "learning_rate": 9.998936390597424e-05, "loss": 0.1733, "step": 1080 }, { "epoch": 0.05804355929495713, "grad_norm": 0.25631335377693176, "learning_rate": 9.998779028968839e-05, "loss": 0.1727, "step": 1090 }, { "epoch": 0.05857606901325949, "grad_norm": 0.7324220538139343, "learning_rate": 9.998610817422456e-05, "loss": 0.1721, "step": 1100 }, { "epoch": 0.05910857873156185, "grad_norm": 0.2557462453842163, "learning_rate": 9.998431756479907e-05, "loss": 0.1718, "step": 1110 }, { "epoch": 0.05964108844986421, "grad_norm": 0.23237043619155884, "learning_rate": 9.998241846696474e-05, "loss": 0.1708, "step": 1120 }, { "epoch": 0.060173598168166566, "grad_norm": 0.7491874098777771, "learning_rate": 9.998041088661076e-05, "loss": 0.1709, "step": 1130 }, { "epoch": 0.06070610788646893, "grad_norm": 0.3187454640865326, "learning_rate": 9.997829482996277e-05, "loss": 0.1706, "step": 1140 }, { "epoch": 0.061238617604771284, "grad_norm": 0.28493937849998474, "learning_rate": 9.997607030358276e-05, "loss": 0.1696, "step": 1150 }, { "epoch": 0.061771127323073646, "grad_norm": 0.6693065166473389, "learning_rate": 9.997373731436915e-05, "loss": 0.1693, "step": 1160 }, { "epoch": 0.062303637041376, "grad_norm": 0.4238905608654022, "learning_rate": 9.997129586955665e-05, "loss": 0.1691, "step": 1170 }, { "epoch": 0.06283614675967836, "grad_norm": 0.5617002248764038, "learning_rate": 9.996874597671633e-05, "loss": 0.1698, "step": 1180 }, { "epoch": 0.06336865647798072, "grad_norm": 0.29317113757133484, "learning_rate": 9.996608764375555e-05, "loss": 0.1684, "step": 1190 }, { "epoch": 0.06390116619628308, "grad_norm": 0.4380682706832886, "learning_rate": 9.996332087891795e-05, "loss": 0.1688, "step": 1200 }, { "epoch": 0.06443367591458544, "grad_norm": 0.7158795595169067, "learning_rate": 9.996044569078347e-05, "loss": 0.1681, "step": 1210 }, { "epoch": 0.0649661856328878, "grad_norm": 0.39761292934417725, "learning_rate": 9.99574620882682e-05, "loss": 0.1679, "step": 1220 }, { "epoch": 0.06549869535119016, "grad_norm": 0.2829475700855255, "learning_rate": 9.995437008062444e-05, "loss": 0.1683, "step": 1230 }, { "epoch": 0.06603120506949252, "grad_norm": 0.22211559116840363, "learning_rate": 9.995116967744076e-05, "loss": 0.1676, "step": 1240 }, { "epoch": 0.06656371478779488, "grad_norm": 0.24345025420188904, "learning_rate": 9.994786088864172e-05, "loss": 0.1654, "step": 1250 }, { "epoch": 0.06709622450609723, "grad_norm": 0.2510230243206024, "learning_rate": 9.994444372448812e-05, "loss": 0.1671, "step": 1260 }, { "epoch": 0.06762873422439959, "grad_norm": 0.26244106888771057, "learning_rate": 9.994091819557676e-05, "loss": 0.1659, "step": 1270 }, { "epoch": 0.06816124394270195, "grad_norm": 0.2892049252986908, "learning_rate": 9.993728431284053e-05, "loss": 0.1652, "step": 1280 }, { "epoch": 0.06869375366100432, "grad_norm": 0.4036615788936615, "learning_rate": 9.993354208754828e-05, "loss": 0.165, "step": 1290 }, { "epoch": 0.06922626337930667, "grad_norm": 0.32690417766571045, "learning_rate": 9.992969153130491e-05, "loss": 0.1646, "step": 1300 }, { "epoch": 0.06975877309760903, "grad_norm": 0.9821091294288635, "learning_rate": 9.992573265605119e-05, "loss": 0.1651, "step": 1310 }, { "epoch": 0.07029128281591139, "grad_norm": 0.23433181643486023, "learning_rate": 9.992166547406383e-05, "loss": 0.1659, "step": 1320 }, { "epoch": 0.07082379253421375, "grad_norm": 0.3498155474662781, "learning_rate": 9.99174899979554e-05, "loss": 0.165, "step": 1330 }, { "epoch": 0.0713563022525161, "grad_norm": 0.19588203728199005, "learning_rate": 9.991320624067431e-05, "loss": 0.1632, "step": 1340 }, { "epoch": 0.07188881197081846, "grad_norm": 0.3548436462879181, "learning_rate": 9.99088142155047e-05, "loss": 0.1639, "step": 1350 }, { "epoch": 0.07242132168912083, "grad_norm": 0.3274150788784027, "learning_rate": 9.990431393606654e-05, "loss": 0.1623, "step": 1360 }, { "epoch": 0.07295383140742319, "grad_norm": 0.5695179104804993, "learning_rate": 9.989970541631544e-05, "loss": 0.1634, "step": 1370 }, { "epoch": 0.07348634112572554, "grad_norm": 0.21706371009349823, "learning_rate": 9.989498867054268e-05, "loss": 0.1619, "step": 1380 }, { "epoch": 0.0740188508440279, "grad_norm": 0.45233970880508423, "learning_rate": 9.989016371337518e-05, "loss": 0.1622, "step": 1390 }, { "epoch": 0.07455136056233026, "grad_norm": 0.5718231797218323, "learning_rate": 9.988523055977541e-05, "loss": 0.1631, "step": 1400 }, { "epoch": 0.07508387028063263, "grad_norm": 0.6669481992721558, "learning_rate": 9.988018922504137e-05, "loss": 0.1625, "step": 1410 }, { "epoch": 0.07561637999893497, "grad_norm": 0.25058674812316895, "learning_rate": 9.987503972480652e-05, "loss": 0.162, "step": 1420 }, { "epoch": 0.07614888971723734, "grad_norm": 0.2735210359096527, "learning_rate": 9.986978207503977e-05, "loss": 0.1617, "step": 1430 }, { "epoch": 0.0766813994355397, "grad_norm": 0.2729678750038147, "learning_rate": 9.98644162920454e-05, "loss": 0.1607, "step": 1440 }, { "epoch": 0.07721390915384206, "grad_norm": 0.20154890418052673, "learning_rate": 9.985894239246298e-05, "loss": 0.1612, "step": 1450 }, { "epoch": 0.07774641887214441, "grad_norm": 0.37646111845970154, "learning_rate": 9.985336039326747e-05, "loss": 0.1602, "step": 1460 }, { "epoch": 0.07827892859044677, "grad_norm": 0.29742431640625, "learning_rate": 9.98476703117689e-05, "loss": 0.1609, "step": 1470 }, { "epoch": 0.07881143830874913, "grad_norm": 0.2643822729587555, "learning_rate": 9.984187216561258e-05, "loss": 0.1612, "step": 1480 }, { "epoch": 0.0793439480270515, "grad_norm": 0.22973056137561798, "learning_rate": 9.98359659727789e-05, "loss": 0.1614, "step": 1490 }, { "epoch": 0.07987645774535385, "grad_norm": 0.24614231288433075, "learning_rate": 9.982995175158327e-05, "loss": 0.1621, "step": 1500 }, { "epoch": 0.08040896746365621, "grad_norm": 0.2537037134170532, "learning_rate": 9.98238295206762e-05, "loss": 0.1608, "step": 1510 }, { "epoch": 0.08094147718195857, "grad_norm": 0.26126566529273987, "learning_rate": 9.981759929904306e-05, "loss": 0.1596, "step": 1520 }, { "epoch": 0.08147398690026093, "grad_norm": 1.0934852361679077, "learning_rate": 9.981126110600411e-05, "loss": 0.1598, "step": 1530 }, { "epoch": 0.08200649661856328, "grad_norm": 0.21107517182826996, "learning_rate": 9.98048149612145e-05, "loss": 0.1601, "step": 1540 }, { "epoch": 0.08253900633686564, "grad_norm": 0.2115686535835266, "learning_rate": 9.979826088466405e-05, "loss": 0.1598, "step": 1550 }, { "epoch": 0.08307151605516801, "grad_norm": 0.23121733963489532, "learning_rate": 9.979159889667738e-05, "loss": 0.1592, "step": 1560 }, { "epoch": 0.08360402577347037, "grad_norm": 0.19117231667041779, "learning_rate": 9.978482901791366e-05, "loss": 0.1591, "step": 1570 }, { "epoch": 0.08413653549177272, "grad_norm": 0.274919331073761, "learning_rate": 9.977795126936671e-05, "loss": 0.1593, "step": 1580 }, { "epoch": 0.08466904521007508, "grad_norm": 0.17615851759910583, "learning_rate": 9.977096567236481e-05, "loss": 0.1586, "step": 1590 }, { "epoch": 0.08520155492837744, "grad_norm": 0.3320156931877136, "learning_rate": 9.976387224857071e-05, "loss": 0.1575, "step": 1600 }, { "epoch": 0.0857340646466798, "grad_norm": 0.56144779920578, "learning_rate": 9.975667101998153e-05, "loss": 0.1587, "step": 1610 }, { "epoch": 0.08626657436498215, "grad_norm": 0.3506183326244354, "learning_rate": 9.974936200892874e-05, "loss": 0.1588, "step": 1620 }, { "epoch": 0.08679908408328452, "grad_norm": 0.22701147198677063, "learning_rate": 9.974194523807796e-05, "loss": 0.1581, "step": 1630 }, { "epoch": 0.08733159380158688, "grad_norm": 0.27218353748321533, "learning_rate": 9.973442073042903e-05, "loss": 0.1584, "step": 1640 }, { "epoch": 0.08786410351988924, "grad_norm": 0.21976235508918762, "learning_rate": 9.972678850931589e-05, "loss": 0.1571, "step": 1650 }, { "epoch": 0.08839661323819159, "grad_norm": 0.22460529208183289, "learning_rate": 9.971904859840653e-05, "loss": 0.157, "step": 1660 }, { "epoch": 0.08892912295649395, "grad_norm": 0.16970294713974, "learning_rate": 9.971120102170283e-05, "loss": 0.1575, "step": 1670 }, { "epoch": 0.08946163267479632, "grad_norm": 0.31050947308540344, "learning_rate": 9.970324580354063e-05, "loss": 0.1568, "step": 1680 }, { "epoch": 0.08999414239309868, "grad_norm": 0.22615467011928558, "learning_rate": 9.969518296858946e-05, "loss": 0.1574, "step": 1690 }, { "epoch": 0.09052665211140103, "grad_norm": 0.15380023419857025, "learning_rate": 9.968701254185271e-05, "loss": 0.1567, "step": 1700 }, { "epoch": 0.09105916182970339, "grad_norm": 0.20212256908416748, "learning_rate": 9.96787345486673e-05, "loss": 0.157, "step": 1710 }, { "epoch": 0.09159167154800575, "grad_norm": 0.24987904727458954, "learning_rate": 9.967034901470377e-05, "loss": 0.1574, "step": 1720 }, { "epoch": 0.09212418126630811, "grad_norm": 0.30531537532806396, "learning_rate": 9.966185596596618e-05, "loss": 0.1557, "step": 1730 }, { "epoch": 0.09265669098461046, "grad_norm": 0.23161855340003967, "learning_rate": 9.965325542879196e-05, "loss": 0.1568, "step": 1740 }, { "epoch": 0.09318920070291282, "grad_norm": 0.5445181727409363, "learning_rate": 9.964454742985188e-05, "loss": 0.1557, "step": 1750 }, { "epoch": 0.09372171042121519, "grad_norm": 0.28040483593940735, "learning_rate": 9.963573199614992e-05, "loss": 0.1573, "step": 1760 }, { "epoch": 0.09425422013951755, "grad_norm": 0.16802328824996948, "learning_rate": 9.962680915502331e-05, "loss": 0.157, "step": 1770 }, { "epoch": 0.0947867298578199, "grad_norm": 0.15088757872581482, "learning_rate": 9.961777893414226e-05, "loss": 0.1568, "step": 1780 }, { "epoch": 0.09531923957612226, "grad_norm": 0.1732264906167984, "learning_rate": 9.960864136151e-05, "loss": 0.1562, "step": 1790 }, { "epoch": 0.09585174929442462, "grad_norm": 0.5040917992591858, "learning_rate": 9.959939646546272e-05, "loss": 0.1558, "step": 1800 }, { "epoch": 0.09638425901272699, "grad_norm": 0.19744379818439484, "learning_rate": 9.959004427466935e-05, "loss": 0.1559, "step": 1810 }, { "epoch": 0.09691676873102933, "grad_norm": 0.18064717948436737, "learning_rate": 9.958058481813158e-05, "loss": 0.1552, "step": 1820 }, { "epoch": 0.0974492784493317, "grad_norm": 0.1613135039806366, "learning_rate": 9.957101812518377e-05, "loss": 0.1556, "step": 1830 }, { "epoch": 0.09798178816763406, "grad_norm": 0.20252278447151184, "learning_rate": 9.956134422549275e-05, "loss": 0.1551, "step": 1840 }, { "epoch": 0.09851429788593642, "grad_norm": 0.3912264108657837, "learning_rate": 9.955156314905785e-05, "loss": 0.1549, "step": 1850 }, { "epoch": 0.09904680760423877, "grad_norm": 0.19006063044071198, "learning_rate": 9.954167492621079e-05, "loss": 0.156, "step": 1860 }, { "epoch": 0.09957931732254113, "grad_norm": 0.37275323271751404, "learning_rate": 9.953167958761552e-05, "loss": 0.1557, "step": 1870 }, { "epoch": 0.1001118270408435, "grad_norm": 0.17530041933059692, "learning_rate": 9.952157716426813e-05, "loss": 0.1551, "step": 1880 }, { "epoch": 0.10064433675914586, "grad_norm": 0.5232445597648621, "learning_rate": 9.951136768749685e-05, "loss": 0.1546, "step": 1890 }, { "epoch": 0.1011768464774482, "grad_norm": 0.42586958408355713, "learning_rate": 9.950105118896186e-05, "loss": 0.1551, "step": 1900 }, { "epoch": 0.10170935619575057, "grad_norm": 0.2536565065383911, "learning_rate": 9.949062770065525e-05, "loss": 0.155, "step": 1910 }, { "epoch": 0.10224186591405293, "grad_norm": 0.15420402586460114, "learning_rate": 9.948009725490082e-05, "loss": 0.154, "step": 1920 }, { "epoch": 0.1027743756323553, "grad_norm": 0.20187288522720337, "learning_rate": 9.946945988435414e-05, "loss": 0.1551, "step": 1930 }, { "epoch": 0.10330688535065764, "grad_norm": 0.17882299423217773, "learning_rate": 9.945871562200226e-05, "loss": 0.1548, "step": 1940 }, { "epoch": 0.10383939506896, "grad_norm": 0.18503925204277039, "learning_rate": 9.944786450116384e-05, "loss": 0.1544, "step": 1950 }, { "epoch": 0.10437190478726237, "grad_norm": 0.21693278849124908, "learning_rate": 9.943690655548876e-05, "loss": 0.154, "step": 1960 }, { "epoch": 0.10490441450556473, "grad_norm": 0.2536572515964508, "learning_rate": 9.942584181895831e-05, "loss": 0.154, "step": 1970 }, { "epoch": 0.10543692422386708, "grad_norm": 0.20351417362689972, "learning_rate": 9.941467032588483e-05, "loss": 0.1531, "step": 1980 }, { "epoch": 0.10596943394216944, "grad_norm": 0.14263711869716644, "learning_rate": 9.940339211091182e-05, "loss": 0.1541, "step": 1990 }, { "epoch": 0.1065019436604718, "grad_norm": 0.21371303498744965, "learning_rate": 9.939200720901367e-05, "loss": 0.1544, "step": 2000 }, { "epoch": 0.10703445337877417, "grad_norm": 0.18548338115215302, "learning_rate": 9.93805156554956e-05, "loss": 0.154, "step": 2010 }, { "epoch": 0.10756696309707652, "grad_norm": 0.14423610270023346, "learning_rate": 9.936891748599362e-05, "loss": 0.1555, "step": 2020 }, { "epoch": 0.10809947281537888, "grad_norm": 0.2253378927707672, "learning_rate": 9.935721273647429e-05, "loss": 0.1537, "step": 2030 }, { "epoch": 0.10863198253368124, "grad_norm": 0.17178262770175934, "learning_rate": 9.934540144323477e-05, "loss": 0.1533, "step": 2040 }, { "epoch": 0.1091644922519836, "grad_norm": 0.23457373678684235, "learning_rate": 9.933348364290253e-05, "loss": 0.1541, "step": 2050 }, { "epoch": 0.10969700197028595, "grad_norm": 0.3339991867542267, "learning_rate": 9.932145937243537e-05, "loss": 0.1526, "step": 2060 }, { "epoch": 0.11022951168858831, "grad_norm": 0.18367235362529755, "learning_rate": 9.930932866912128e-05, "loss": 0.1536, "step": 2070 }, { "epoch": 0.11076202140689068, "grad_norm": 0.1779884397983551, "learning_rate": 9.929709157057828e-05, "loss": 0.1522, "step": 2080 }, { "epoch": 0.11129453112519304, "grad_norm": 0.24092677235603333, "learning_rate": 9.928474811475426e-05, "loss": 0.1528, "step": 2090 }, { "epoch": 0.11182704084349539, "grad_norm": 0.16710165143013, "learning_rate": 9.927229833992706e-05, "loss": 0.1525, "step": 2100 }, { "epoch": 0.11235955056179775, "grad_norm": 0.3042786121368408, "learning_rate": 9.925974228470415e-05, "loss": 0.1545, "step": 2110 }, { "epoch": 0.11289206028010011, "grad_norm": 0.14976242184638977, "learning_rate": 9.924707998802259e-05, "loss": 0.1531, "step": 2120 }, { "epoch": 0.11342456999840247, "grad_norm": 0.1597498059272766, "learning_rate": 9.923431148914885e-05, "loss": 0.1523, "step": 2130 }, { "epoch": 0.11395707971670482, "grad_norm": 0.2503865957260132, "learning_rate": 9.922143682767886e-05, "loss": 0.1527, "step": 2140 }, { "epoch": 0.11448958943500719, "grad_norm": 0.240915447473526, "learning_rate": 9.920845604353768e-05, "loss": 0.1531, "step": 2150 }, { "epoch": 0.11502209915330955, "grad_norm": 0.28919148445129395, "learning_rate": 9.919536917697942e-05, "loss": 0.1527, "step": 2160 }, { "epoch": 0.11555460887161191, "grad_norm": 0.20423804223537445, "learning_rate": 9.91821762685873e-05, "loss": 0.153, "step": 2170 }, { "epoch": 0.11608711858991426, "grad_norm": 0.18328100442886353, "learning_rate": 9.916887735927326e-05, "loss": 0.1515, "step": 2180 }, { "epoch": 0.11661962830821662, "grad_norm": 0.22620701789855957, "learning_rate": 9.915547249027795e-05, "loss": 0.1514, "step": 2190 }, { "epoch": 0.11715213802651898, "grad_norm": 0.23197805881500244, "learning_rate": 9.914196170317074e-05, "loss": 0.1526, "step": 2200 }, { "epoch": 0.11768464774482135, "grad_norm": 0.320434033870697, "learning_rate": 9.912834503984929e-05, "loss": 0.1526, "step": 2210 }, { "epoch": 0.1182171574631237, "grad_norm": 0.16544243693351746, "learning_rate": 9.911462254253971e-05, "loss": 0.1523, "step": 2220 }, { "epoch": 0.11874966718142606, "grad_norm": 0.21110887825489044, "learning_rate": 9.910079425379626e-05, "loss": 0.1518, "step": 2230 }, { "epoch": 0.11928217689972842, "grad_norm": 0.16100363433361053, "learning_rate": 9.908686021650124e-05, "loss": 0.152, "step": 2240 }, { "epoch": 0.11981468661803078, "grad_norm": 0.14773668348789215, "learning_rate": 9.907282047386497e-05, "loss": 0.152, "step": 2250 }, { "epoch": 0.12034719633633313, "grad_norm": 0.14374825358390808, "learning_rate": 9.905867506942544e-05, "loss": 0.1511, "step": 2260 }, { "epoch": 0.1208797060546355, "grad_norm": 0.13494443893432617, "learning_rate": 9.904442404704843e-05, "loss": 0.1515, "step": 2270 }, { "epoch": 0.12141221577293786, "grad_norm": 0.18906742334365845, "learning_rate": 9.903006745092716e-05, "loss": 0.1519, "step": 2280 }, { "epoch": 0.12194472549124022, "grad_norm": 0.24630281329154968, "learning_rate": 9.901560532558229e-05, "loss": 0.1527, "step": 2290 }, { "epoch": 0.12247723520954257, "grad_norm": 0.19350433349609375, "learning_rate": 9.900103771586171e-05, "loss": 0.1509, "step": 2300 }, { "epoch": 0.12300974492784493, "grad_norm": 0.1677471250295639, "learning_rate": 9.898636466694042e-05, "loss": 0.1515, "step": 2310 }, { "epoch": 0.12354225464614729, "grad_norm": 0.30285850167274475, "learning_rate": 9.897158622432041e-05, "loss": 0.1517, "step": 2320 }, { "epoch": 0.12407476436444966, "grad_norm": 0.16860969364643097, "learning_rate": 9.895670243383048e-05, "loss": 0.1516, "step": 2330 }, { "epoch": 0.124607274082752, "grad_norm": 0.10707177966833115, "learning_rate": 9.894171334162614e-05, "loss": 0.1517, "step": 2340 }, { "epoch": 0.12513978380105437, "grad_norm": 0.16667041182518005, "learning_rate": 9.892661899418945e-05, "loss": 0.152, "step": 2350 }, { "epoch": 0.12567229351935671, "grad_norm": 0.21757569909095764, "learning_rate": 9.891141943832883e-05, "loss": 0.1519, "step": 2360 }, { "epoch": 0.1262048032376591, "grad_norm": 0.1555328369140625, "learning_rate": 9.889611472117902e-05, "loss": 0.1506, "step": 2370 }, { "epoch": 0.12673731295596144, "grad_norm": 0.24122075736522675, "learning_rate": 9.888070489020083e-05, "loss": 0.1519, "step": 2380 }, { "epoch": 0.12726982267426382, "grad_norm": 0.15034180879592896, "learning_rate": 9.886518999318104e-05, "loss": 0.1513, "step": 2390 }, { "epoch": 0.12780233239256616, "grad_norm": 0.1593770682811737, "learning_rate": 9.884957007823226e-05, "loss": 0.1508, "step": 2400 }, { "epoch": 0.1283348421108685, "grad_norm": 0.1536262482404709, "learning_rate": 9.883384519379273e-05, "loss": 0.1512, "step": 2410 }, { "epoch": 0.1288673518291709, "grad_norm": 0.1585126668214798, "learning_rate": 9.881801538862627e-05, "loss": 0.1512, "step": 2420 }, { "epoch": 0.12939986154747324, "grad_norm": 0.13629089295864105, "learning_rate": 9.880208071182203e-05, "loss": 0.1506, "step": 2430 }, { "epoch": 0.1299323712657756, "grad_norm": 0.18671053647994995, "learning_rate": 9.878604121279434e-05, "loss": 0.1513, "step": 2440 }, { "epoch": 0.13046488098407796, "grad_norm": 0.14679767191410065, "learning_rate": 9.876989694128263e-05, "loss": 0.1498, "step": 2450 }, { "epoch": 0.1309973907023803, "grad_norm": 0.21141186356544495, "learning_rate": 9.875364794735124e-05, "loss": 0.1511, "step": 2460 }, { "epoch": 0.1315299004206827, "grad_norm": 0.28956910967826843, "learning_rate": 9.873729428138924e-05, "loss": 0.1514, "step": 2470 }, { "epoch": 0.13206241013898504, "grad_norm": 0.20682266354560852, "learning_rate": 9.87208359941103e-05, "loss": 0.1504, "step": 2480 }, { "epoch": 0.13259491985728739, "grad_norm": 0.1642565280199051, "learning_rate": 9.870427313655256e-05, "loss": 0.1519, "step": 2490 }, { "epoch": 0.13312742957558976, "grad_norm": 0.2124072164297104, "learning_rate": 9.868760576007835e-05, "loss": 0.1512, "step": 2500 }, { "epoch": 0.1336599392938921, "grad_norm": 0.12310315668582916, "learning_rate": 9.867083391637422e-05, "loss": 0.1505, "step": 2510 }, { "epoch": 0.13419244901219446, "grad_norm": 0.11877293884754181, "learning_rate": 9.865395765745062e-05, "loss": 0.1504, "step": 2520 }, { "epoch": 0.13472495873049684, "grad_norm": 0.14998012781143188, "learning_rate": 9.863697703564183e-05, "loss": 0.1504, "step": 2530 }, { "epoch": 0.13525746844879918, "grad_norm": 0.17364120483398438, "learning_rate": 9.861989210360572e-05, "loss": 0.1506, "step": 2540 }, { "epoch": 0.13578997816710156, "grad_norm": 0.2578318119049072, "learning_rate": 9.860270291432367e-05, "loss": 0.15, "step": 2550 }, { "epoch": 0.1363224878854039, "grad_norm": 0.19344595074653625, "learning_rate": 9.858540952110036e-05, "loss": 0.1506, "step": 2560 }, { "epoch": 0.13685499760370626, "grad_norm": 0.12059523165225983, "learning_rate": 9.856801197756362e-05, "loss": 0.1504, "step": 2570 }, { "epoch": 0.13738750732200863, "grad_norm": 0.14271683990955353, "learning_rate": 9.855051033766424e-05, "loss": 0.1503, "step": 2580 }, { "epoch": 0.13792001704031098, "grad_norm": 0.1416383981704712, "learning_rate": 9.853290465567582e-05, "loss": 0.1499, "step": 2590 }, { "epoch": 0.13845252675861333, "grad_norm": 0.2926510274410248, "learning_rate": 9.851519498619462e-05, "loss": 0.1505, "step": 2600 }, { "epoch": 0.1389850364769157, "grad_norm": 0.169399231672287, "learning_rate": 9.849738138413936e-05, "loss": 0.1507, "step": 2610 }, { "epoch": 0.13951754619521806, "grad_norm": 0.2096475064754486, "learning_rate": 9.847946390475103e-05, "loss": 0.1501, "step": 2620 }, { "epoch": 0.14005005591352043, "grad_norm": 0.17753221094608307, "learning_rate": 9.84614426035928e-05, "loss": 0.1501, "step": 2630 }, { "epoch": 0.14058256563182278, "grad_norm": 0.13106787204742432, "learning_rate": 9.844331753654978e-05, "loss": 0.1492, "step": 2640 }, { "epoch": 0.14111507535012513, "grad_norm": 0.1869879961013794, "learning_rate": 9.842508875982885e-05, "loss": 0.1495, "step": 2650 }, { "epoch": 0.1416475850684275, "grad_norm": 0.15248249471187592, "learning_rate": 9.840675632995852e-05, "loss": 0.1489, "step": 2660 }, { "epoch": 0.14218009478672985, "grad_norm": 0.13266538083553314, "learning_rate": 9.838832030378871e-05, "loss": 0.1501, "step": 2670 }, { "epoch": 0.1427126045050322, "grad_norm": 0.1557317078113556, "learning_rate": 9.836978073849061e-05, "loss": 0.1505, "step": 2680 }, { "epoch": 0.14324511422333458, "grad_norm": 0.11426721513271332, "learning_rate": 9.835113769155653e-05, "loss": 0.1505, "step": 2690 }, { "epoch": 0.14377762394163693, "grad_norm": 0.2386896014213562, "learning_rate": 9.83323912207996e-05, "loss": 0.1504, "step": 2700 }, { "epoch": 0.1443101336599393, "grad_norm": 0.17772549390792847, "learning_rate": 9.831354138435373e-05, "loss": 0.1502, "step": 2710 }, { "epoch": 0.14484264337824165, "grad_norm": 0.13859009742736816, "learning_rate": 9.82945882406734e-05, "loss": 0.1492, "step": 2720 }, { "epoch": 0.145375153096544, "grad_norm": 0.16319668292999268, "learning_rate": 9.827553184853333e-05, "loss": 0.1495, "step": 2730 }, { "epoch": 0.14590766281484638, "grad_norm": 0.17237436771392822, "learning_rate": 9.82563722670286e-05, "loss": 0.1497, "step": 2740 }, { "epoch": 0.14644017253314873, "grad_norm": 0.22656778991222382, "learning_rate": 9.823710955557413e-05, "loss": 0.1494, "step": 2750 }, { "epoch": 0.14697268225145108, "grad_norm": 0.12643253803253174, "learning_rate": 9.821774377390474e-05, "loss": 0.1497, "step": 2760 }, { "epoch": 0.14750519196975345, "grad_norm": 0.1322176307439804, "learning_rate": 9.819827498207481e-05, "loss": 0.1494, "step": 2770 }, { "epoch": 0.1480377016880558, "grad_norm": 0.13178198039531708, "learning_rate": 9.817870324045824e-05, "loss": 0.1497, "step": 2780 }, { "epoch": 0.14857021140635818, "grad_norm": 0.16573889553546906, "learning_rate": 9.815902860974812e-05, "loss": 0.1484, "step": 2790 }, { "epoch": 0.14910272112466053, "grad_norm": 0.17168866097927094, "learning_rate": 9.813925115095663e-05, "loss": 0.1498, "step": 2800 }, { "epoch": 0.14963523084296287, "grad_norm": 0.11743076145648956, "learning_rate": 9.811937092541483e-05, "loss": 0.1489, "step": 2810 }, { "epoch": 0.15016774056126525, "grad_norm": 0.09591732919216156, "learning_rate": 9.809938799477247e-05, "loss": 0.1492, "step": 2820 }, { "epoch": 0.1507002502795676, "grad_norm": 0.2620985209941864, "learning_rate": 9.807930242099777e-05, "loss": 0.1484, "step": 2830 }, { "epoch": 0.15123275999786995, "grad_norm": 0.1720651537179947, "learning_rate": 9.805911426637723e-05, "loss": 0.1489, "step": 2840 }, { "epoch": 0.15176526971617232, "grad_norm": 0.12655815482139587, "learning_rate": 9.803882359351556e-05, "loss": 0.1489, "step": 2850 }, { "epoch": 0.15229777943447467, "grad_norm": 0.1679336279630661, "learning_rate": 9.801843046533527e-05, "loss": 0.1474, "step": 2860 }, { "epoch": 0.15283028915277705, "grad_norm": 0.16842088103294373, "learning_rate": 9.799793494507667e-05, "loss": 0.1487, "step": 2870 }, { "epoch": 0.1533627988710794, "grad_norm": 0.15344814956188202, "learning_rate": 9.797733709629755e-05, "loss": 0.1491, "step": 2880 }, { "epoch": 0.15389530858938175, "grad_norm": 0.14372903108596802, "learning_rate": 9.795663698287305e-05, "loss": 0.1486, "step": 2890 }, { "epoch": 0.15442781830768412, "grad_norm": 0.14276905357837677, "learning_rate": 9.793583466899541e-05, "loss": 0.1483, "step": 2900 }, { "epoch": 0.15496032802598647, "grad_norm": 0.21898868680000305, "learning_rate": 9.791493021917384e-05, "loss": 0.1473, "step": 2910 }, { "epoch": 0.15549283774428882, "grad_norm": 0.21863441169261932, "learning_rate": 9.789392369823423e-05, "loss": 0.1472, "step": 2920 }, { "epoch": 0.1560253474625912, "grad_norm": 0.14358623325824738, "learning_rate": 9.787281517131905e-05, "loss": 0.1479, "step": 2930 }, { "epoch": 0.15655785718089354, "grad_norm": 0.12543822824954987, "learning_rate": 9.785160470388706e-05, "loss": 0.1479, "step": 2940 }, { "epoch": 0.15709036689919592, "grad_norm": 0.14014865458011627, "learning_rate": 9.783029236171317e-05, "loss": 0.1474, "step": 2950 }, { "epoch": 0.15762287661749827, "grad_norm": 0.15217439830303192, "learning_rate": 9.78088782108882e-05, "loss": 0.1478, "step": 2960 }, { "epoch": 0.15815538633580062, "grad_norm": 0.10565731674432755, "learning_rate": 9.778736231781864e-05, "loss": 0.1472, "step": 2970 }, { "epoch": 0.158687896054103, "grad_norm": 0.13025479018688202, "learning_rate": 9.77657447492266e-05, "loss": 0.147, "step": 2980 }, { "epoch": 0.15922040577240534, "grad_norm": 0.26370614767074585, "learning_rate": 9.774402557214934e-05, "loss": 0.1468, "step": 2990 }, { "epoch": 0.1597529154907077, "grad_norm": 0.11994566768407822, "learning_rate": 9.772220485393935e-05, "loss": 0.1468, "step": 3000 }, { "epoch": 0.16028542520901007, "grad_norm": 0.13632826507091522, "learning_rate": 9.770028266226392e-05, "loss": 0.1465, "step": 3010 }, { "epoch": 0.16081793492731242, "grad_norm": 0.18415699899196625, "learning_rate": 9.767825906510508e-05, "loss": 0.1461, "step": 3020 }, { "epoch": 0.1613504446456148, "grad_norm": 0.1199774518609047, "learning_rate": 9.765613413075925e-05, "loss": 0.1462, "step": 3030 }, { "epoch": 0.16188295436391714, "grad_norm": 0.12944312393665314, "learning_rate": 9.763390792783718e-05, "loss": 0.1456, "step": 3040 }, { "epoch": 0.1624154640822195, "grad_norm": 0.12913690507411957, "learning_rate": 9.761158052526357e-05, "loss": 0.1461, "step": 3050 }, { "epoch": 0.16294797380052187, "grad_norm": 0.13733190298080444, "learning_rate": 9.758915199227704e-05, "loss": 0.1454, "step": 3060 }, { "epoch": 0.16348048351882422, "grad_norm": 0.13602448999881744, "learning_rate": 9.756662239842977e-05, "loss": 0.1453, "step": 3070 }, { "epoch": 0.16401299323712656, "grad_norm": 0.23941437900066376, "learning_rate": 9.754399181358735e-05, "loss": 0.1447, "step": 3080 }, { "epoch": 0.16454550295542894, "grad_norm": 0.17770028114318848, "learning_rate": 9.752126030792852e-05, "loss": 0.1448, "step": 3090 }, { "epoch": 0.1650780126737313, "grad_norm": 0.12279467284679413, "learning_rate": 9.749842795194502e-05, "loss": 0.1445, "step": 3100 }, { "epoch": 0.16561052239203367, "grad_norm": 0.16316959261894226, "learning_rate": 9.747549481644132e-05, "loss": 0.1436, "step": 3110 }, { "epoch": 0.16614303211033601, "grad_norm": 0.1606248915195465, "learning_rate": 9.74524609725344e-05, "loss": 0.1447, "step": 3120 }, { "epoch": 0.16667554182863836, "grad_norm": 0.14306576550006866, "learning_rate": 9.742932649165357e-05, "loss": 0.1448, "step": 3130 }, { "epoch": 0.16720805154694074, "grad_norm": 0.16349278390407562, "learning_rate": 9.740609144554018e-05, "loss": 0.1443, "step": 3140 }, { "epoch": 0.1677405612652431, "grad_norm": 0.26308995485305786, "learning_rate": 9.738275590624748e-05, "loss": 0.1436, "step": 3150 }, { "epoch": 0.16827307098354544, "grad_norm": 0.13254424929618835, "learning_rate": 9.735931994614034e-05, "loss": 0.1436, "step": 3160 }, { "epoch": 0.1688055807018478, "grad_norm": 0.14894609153270721, "learning_rate": 9.733578363789504e-05, "loss": 0.1428, "step": 3170 }, { "epoch": 0.16933809042015016, "grad_norm": 0.2038808912038803, "learning_rate": 9.731214705449902e-05, "loss": 0.1427, "step": 3180 }, { "epoch": 0.16987060013845254, "grad_norm": 0.1420915275812149, "learning_rate": 9.728841026925072e-05, "loss": 0.1431, "step": 3190 }, { "epoch": 0.1704031098567549, "grad_norm": 0.12903986871242523, "learning_rate": 9.726457335575931e-05, "loss": 0.1426, "step": 3200 }, { "epoch": 0.17093561957505723, "grad_norm": 0.15605418384075165, "learning_rate": 9.724063638794445e-05, "loss": 0.1435, "step": 3210 }, { "epoch": 0.1714681292933596, "grad_norm": 0.179864302277565, "learning_rate": 9.721659944003605e-05, "loss": 0.1418, "step": 3220 }, { "epoch": 0.17200063901166196, "grad_norm": 0.13642147183418274, "learning_rate": 9.719246258657408e-05, "loss": 0.1425, "step": 3230 }, { "epoch": 0.1725331487299643, "grad_norm": 0.19007375836372375, "learning_rate": 9.716822590240835e-05, "loss": 0.1429, "step": 3240 }, { "epoch": 0.17306565844826668, "grad_norm": 0.11586272716522217, "learning_rate": 9.714388946269824e-05, "loss": 0.1423, "step": 3250 }, { "epoch": 0.17359816816656903, "grad_norm": 0.09750824421644211, "learning_rate": 9.711945334291243e-05, "loss": 0.1421, "step": 3260 }, { "epoch": 0.1741306778848714, "grad_norm": 0.16159775853157043, "learning_rate": 9.709491761882881e-05, "loss": 0.1422, "step": 3270 }, { "epoch": 0.17466318760317376, "grad_norm": 0.1439363956451416, "learning_rate": 9.707028236653406e-05, "loss": 0.1428, "step": 3280 }, { "epoch": 0.1751956973214761, "grad_norm": 0.15214209258556366, "learning_rate": 9.704554766242351e-05, "loss": 0.1419, "step": 3290 }, { "epoch": 0.17572820703977848, "grad_norm": 0.14261415600776672, "learning_rate": 9.702071358320095e-05, "loss": 0.142, "step": 3300 }, { "epoch": 0.17626071675808083, "grad_norm": 0.17040428519248962, "learning_rate": 9.69957802058783e-05, "loss": 0.1426, "step": 3310 }, { "epoch": 0.17679322647638318, "grad_norm": 0.11275117844343185, "learning_rate": 9.697074760777542e-05, "loss": 0.1423, "step": 3320 }, { "epoch": 0.17732573619468556, "grad_norm": 0.11213172972202301, "learning_rate": 9.694561586651985e-05, "loss": 0.1416, "step": 3330 }, { "epoch": 0.1778582459129879, "grad_norm": 0.08868248015642166, "learning_rate": 9.692038506004659e-05, "loss": 0.1414, "step": 3340 }, { "epoch": 0.17839075563129028, "grad_norm": 0.09049142897129059, "learning_rate": 9.689505526659783e-05, "loss": 0.1409, "step": 3350 }, { "epoch": 0.17892326534959263, "grad_norm": 0.21919691562652588, "learning_rate": 9.686962656472278e-05, "loss": 0.1423, "step": 3360 }, { "epoch": 0.17945577506789498, "grad_norm": 0.11099066585302353, "learning_rate": 9.684409903327728e-05, "loss": 0.1417, "step": 3370 }, { "epoch": 0.17998828478619736, "grad_norm": 0.10436002165079117, "learning_rate": 9.681847275142371e-05, "loss": 0.1413, "step": 3380 }, { "epoch": 0.1805207945044997, "grad_norm": 0.11029750108718872, "learning_rate": 9.679274779863065e-05, "loss": 0.1407, "step": 3390 }, { "epoch": 0.18105330422280205, "grad_norm": 0.15080855786800385, "learning_rate": 9.67669242546727e-05, "loss": 0.1414, "step": 3400 }, { "epoch": 0.18158581394110443, "grad_norm": 0.11175508052110672, "learning_rate": 9.674100219963018e-05, "loss": 0.1407, "step": 3410 }, { "epoch": 0.18211832365940678, "grad_norm": 0.10869117826223373, "learning_rate": 9.671498171388889e-05, "loss": 0.1401, "step": 3420 }, { "epoch": 0.18265083337770915, "grad_norm": 0.14162185788154602, "learning_rate": 9.668886287813985e-05, "loss": 0.1406, "step": 3430 }, { "epoch": 0.1831833430960115, "grad_norm": 0.2033168226480484, "learning_rate": 9.666264577337908e-05, "loss": 0.1407, "step": 3440 }, { "epoch": 0.18371585281431385, "grad_norm": 3.439692497253418, "learning_rate": 9.663633048090744e-05, "loss": 0.1527, "step": 3450 }, { "epoch": 0.18424836253261623, "grad_norm": 3.6431403160095215, "learning_rate": 9.660991708233009e-05, "loss": 0.7171, "step": 3460 }, { "epoch": 0.18478087225091858, "grad_norm": 1.1392711400985718, "learning_rate": 9.658340565955654e-05, "loss": 0.3094, "step": 3470 }, { "epoch": 0.18531338196922092, "grad_norm": 0.24986179172992706, "learning_rate": 9.655679629480032e-05, "loss": 0.2164, "step": 3480 }, { "epoch": 0.1858458916875233, "grad_norm": 0.16256773471832275, "learning_rate": 9.653008907057855e-05, "loss": 0.1672, "step": 3490 }, { "epoch": 0.18637840140582565, "grad_norm": 0.11707092821598053, "learning_rate": 9.65032840697119e-05, "loss": 0.1552, "step": 3500 }, { "epoch": 0.18691091112412803, "grad_norm": 0.12210855633020401, "learning_rate": 9.647638137532428e-05, "loss": 0.1495, "step": 3510 }, { "epoch": 0.18744342084243037, "grad_norm": 0.10740665346384048, "learning_rate": 9.644938107084247e-05, "loss": 0.1468, "step": 3520 }, { "epoch": 0.18797593056073272, "grad_norm": 0.09589366614818573, "learning_rate": 9.642228323999603e-05, "loss": 0.1453, "step": 3530 }, { "epoch": 0.1885084402790351, "grad_norm": 0.12820713222026825, "learning_rate": 9.639508796681688e-05, "loss": 0.1439, "step": 3540 }, { "epoch": 0.18904094999733745, "grad_norm": 0.3186265528202057, "learning_rate": 9.636779533563915e-05, "loss": 0.1429, "step": 3550 }, { "epoch": 0.1895734597156398, "grad_norm": 0.1107301339507103, "learning_rate": 9.63404054310989e-05, "loss": 0.1427, "step": 3560 }, { "epoch": 0.19010596943394217, "grad_norm": 0.0950065553188324, "learning_rate": 9.631291833813383e-05, "loss": 0.1425, "step": 3570 }, { "epoch": 0.19063847915224452, "grad_norm": 0.1297433227300644, "learning_rate": 9.628533414198298e-05, "loss": 0.1419, "step": 3580 }, { "epoch": 0.1911709888705469, "grad_norm": 0.1097961962223053, "learning_rate": 9.625765292818658e-05, "loss": 0.1413, "step": 3590 }, { "epoch": 0.19170349858884925, "grad_norm": 0.11060044914484024, "learning_rate": 9.622987478258567e-05, "loss": 0.1413, "step": 3600 }, { "epoch": 0.1922360083071516, "grad_norm": 0.11111301183700562, "learning_rate": 9.620199979132191e-05, "loss": 0.1408, "step": 3610 }, { "epoch": 0.19276851802545397, "grad_norm": 0.08389998227357864, "learning_rate": 9.617402804083729e-05, "loss": 0.141, "step": 3620 }, { "epoch": 0.19330102774375632, "grad_norm": 0.1361977458000183, "learning_rate": 9.61459596178738e-05, "loss": 0.1409, "step": 3630 }, { "epoch": 0.19383353746205867, "grad_norm": 0.12257982045412064, "learning_rate": 9.61177946094733e-05, "loss": 0.1405, "step": 3640 }, { "epoch": 0.19436604718036105, "grad_norm": 0.09580480307340622, "learning_rate": 9.608953310297708e-05, "loss": 0.141, "step": 3650 }, { "epoch": 0.1948985568986634, "grad_norm": 0.09593943506479263, "learning_rate": 9.606117518602575e-05, "loss": 0.1404, "step": 3660 }, { "epoch": 0.19543106661696577, "grad_norm": 0.09266688674688339, "learning_rate": 9.603272094655886e-05, "loss": 0.1404, "step": 3670 }, { "epoch": 0.19596357633526812, "grad_norm": 0.1070714071393013, "learning_rate": 9.600417047281464e-05, "loss": 0.1398, "step": 3680 }, { "epoch": 0.19649608605357047, "grad_norm": 0.10331781953573227, "learning_rate": 9.597552385332982e-05, "loss": 0.1401, "step": 3690 }, { "epoch": 0.19702859577187284, "grad_norm": 0.09512060880661011, "learning_rate": 9.594678117693921e-05, "loss": 0.1394, "step": 3700 }, { "epoch": 0.1975611054901752, "grad_norm": 0.0843188613653183, "learning_rate": 9.591794253277551e-05, "loss": 0.14, "step": 3710 }, { "epoch": 0.19809361520847754, "grad_norm": 0.0754111111164093, "learning_rate": 9.588900801026907e-05, "loss": 0.1406, "step": 3720 }, { "epoch": 0.19862612492677992, "grad_norm": 0.09565232694149017, "learning_rate": 9.585997769914752e-05, "loss": 0.1399, "step": 3730 }, { "epoch": 0.19915863464508227, "grad_norm": 0.10425586253404617, "learning_rate": 9.583085168943555e-05, "loss": 0.1391, "step": 3740 }, { "epoch": 0.19969114436338464, "grad_norm": 0.1333099901676178, "learning_rate": 9.580163007145459e-05, "loss": 0.1402, "step": 3750 }, { "epoch": 0.200223654081687, "grad_norm": 0.12000375241041183, "learning_rate": 9.57723129358226e-05, "loss": 0.1401, "step": 3760 }, { "epoch": 0.20075616379998934, "grad_norm": 0.08403091132640839, "learning_rate": 9.574290037345375e-05, "loss": 0.1393, "step": 3770 }, { "epoch": 0.20128867351829172, "grad_norm": 0.08062135428190231, "learning_rate": 9.571339247555809e-05, "loss": 0.1396, "step": 3780 }, { "epoch": 0.20182118323659407, "grad_norm": 0.17963799834251404, "learning_rate": 9.568378933364131e-05, "loss": 0.14, "step": 3790 }, { "epoch": 0.2023536929548964, "grad_norm": 0.09086289256811142, "learning_rate": 9.565409103950451e-05, "loss": 0.1397, "step": 3800 }, { "epoch": 0.2028862026731988, "grad_norm": 0.10271194577217102, "learning_rate": 9.562429768524381e-05, "loss": 0.1394, "step": 3810 }, { "epoch": 0.20341871239150114, "grad_norm": 0.076598159968853, "learning_rate": 9.559440936325017e-05, "loss": 0.1395, "step": 3820 }, { "epoch": 0.20395122210980351, "grad_norm": 0.1196560189127922, "learning_rate": 9.556442616620899e-05, "loss": 0.1388, "step": 3830 }, { "epoch": 0.20448373182810586, "grad_norm": 0.082634337246418, "learning_rate": 9.553434818709992e-05, "loss": 0.1394, "step": 3840 }, { "epoch": 0.2050162415464082, "grad_norm": 0.13070203363895416, "learning_rate": 9.550417551919655e-05, "loss": 0.1393, "step": 3850 }, { "epoch": 0.2055487512647106, "grad_norm": 0.21860548853874207, "learning_rate": 9.547390825606606e-05, "loss": 0.1387, "step": 3860 }, { "epoch": 0.20608126098301294, "grad_norm": 0.08260785788297653, "learning_rate": 9.544354649156899e-05, "loss": 0.139, "step": 3870 }, { "epoch": 0.20661377070131529, "grad_norm": 0.0788755938410759, "learning_rate": 9.541309031985895e-05, "loss": 0.1392, "step": 3880 }, { "epoch": 0.20714628041961766, "grad_norm": 0.12903687357902527, "learning_rate": 9.538253983538232e-05, "loss": 0.1395, "step": 3890 }, { "epoch": 0.20767879013792, "grad_norm": 0.19277387857437134, "learning_rate": 9.535189513287792e-05, "loss": 0.1389, "step": 3900 }, { "epoch": 0.2082112998562224, "grad_norm": 0.1530824899673462, "learning_rate": 9.532115630737674e-05, "loss": 0.1388, "step": 3910 }, { "epoch": 0.20874380957452474, "grad_norm": 0.0937756597995758, "learning_rate": 9.52903234542017e-05, "loss": 0.1398, "step": 3920 }, { "epoch": 0.20927631929282708, "grad_norm": 0.12323369830846786, "learning_rate": 9.52593966689673e-05, "loss": 0.14, "step": 3930 }, { "epoch": 0.20980882901112946, "grad_norm": 0.14708684384822845, "learning_rate": 9.522837604757924e-05, "loss": 0.1388, "step": 3940 }, { "epoch": 0.2103413387294318, "grad_norm": 0.10080372542142868, "learning_rate": 9.519726168623433e-05, "loss": 0.1379, "step": 3950 }, { "epoch": 0.21087384844773416, "grad_norm": 0.11739426851272583, "learning_rate": 9.516605368141998e-05, "loss": 0.1388, "step": 3960 }, { "epoch": 0.21140635816603653, "grad_norm": 0.0870957151055336, "learning_rate": 9.513475212991406e-05, "loss": 0.1388, "step": 3970 }, { "epoch": 0.21193886788433888, "grad_norm": 0.12498774379491806, "learning_rate": 9.510335712878446e-05, "loss": 0.139, "step": 3980 }, { "epoch": 0.21247137760264126, "grad_norm": 0.13790611922740936, "learning_rate": 9.507186877538899e-05, "loss": 0.1391, "step": 3990 }, { "epoch": 0.2130038873209436, "grad_norm": 0.09463178366422653, "learning_rate": 9.504028716737481e-05, "loss": 0.1387, "step": 4000 }, { "epoch": 0.21353639703924596, "grad_norm": 0.0896778553724289, "learning_rate": 9.500861240267836e-05, "loss": 0.1384, "step": 4010 }, { "epoch": 0.21406890675754833, "grad_norm": 0.09591860324144363, "learning_rate": 9.49768445795249e-05, "loss": 0.1379, "step": 4020 }, { "epoch": 0.21460141647585068, "grad_norm": 0.12393760681152344, "learning_rate": 9.49449837964283e-05, "loss": 0.138, "step": 4030 }, { "epoch": 0.21513392619415303, "grad_norm": 0.1211247369647026, "learning_rate": 9.491303015219075e-05, "loss": 0.1381, "step": 4040 }, { "epoch": 0.2156664359124554, "grad_norm": 0.09435896575450897, "learning_rate": 9.488098374590232e-05, "loss": 0.1384, "step": 4050 }, { "epoch": 0.21619894563075776, "grad_norm": 0.2162541151046753, "learning_rate": 9.484884467694082e-05, "loss": 0.1386, "step": 4060 }, { "epoch": 0.21673145534906013, "grad_norm": 0.11500007659196854, "learning_rate": 9.481661304497136e-05, "loss": 0.139, "step": 4070 }, { "epoch": 0.21726396506736248, "grad_norm": 0.12248394638299942, "learning_rate": 9.478428894994612e-05, "loss": 0.1387, "step": 4080 }, { "epoch": 0.21779647478566483, "grad_norm": 0.08362865447998047, "learning_rate": 9.475187249210396e-05, "loss": 0.1386, "step": 4090 }, { "epoch": 0.2183289845039672, "grad_norm": 0.09139638394117355, "learning_rate": 9.471936377197025e-05, "loss": 0.1382, "step": 4100 }, { "epoch": 0.21886149422226955, "grad_norm": 0.08786854147911072, "learning_rate": 9.468676289035643e-05, "loss": 0.1377, "step": 4110 }, { "epoch": 0.2193940039405719, "grad_norm": 0.09356456995010376, "learning_rate": 9.465406994835972e-05, "loss": 0.1377, "step": 4120 }, { "epoch": 0.21992651365887428, "grad_norm": 0.07609741389751434, "learning_rate": 9.462128504736286e-05, "loss": 0.1382, "step": 4130 }, { "epoch": 0.22045902337717663, "grad_norm": 0.12431398779153824, "learning_rate": 9.458840828903368e-05, "loss": 0.1382, "step": 4140 }, { "epoch": 0.220991533095479, "grad_norm": 0.1060996949672699, "learning_rate": 9.4555439775325e-05, "loss": 0.1382, "step": 4150 }, { "epoch": 0.22152404281378135, "grad_norm": 0.07396227866411209, "learning_rate": 9.452237960847405e-05, "loss": 0.1379, "step": 4160 }, { "epoch": 0.2220565525320837, "grad_norm": 0.07973285764455795, "learning_rate": 9.448922789100238e-05, "loss": 0.1376, "step": 4170 }, { "epoch": 0.22258906225038608, "grad_norm": 0.08671050518751144, "learning_rate": 9.445598472571535e-05, "loss": 0.1387, "step": 4180 }, { "epoch": 0.22312157196868843, "grad_norm": 0.07978523522615433, "learning_rate": 9.442265021570198e-05, "loss": 0.1379, "step": 4190 }, { "epoch": 0.22365408168699077, "grad_norm": 0.07077804952859879, "learning_rate": 9.438922446433454e-05, "loss": 0.1382, "step": 4200 }, { "epoch": 0.22418659140529315, "grad_norm": 0.08330279588699341, "learning_rate": 9.435570757526823e-05, "loss": 0.1374, "step": 4210 }, { "epoch": 0.2247191011235955, "grad_norm": 0.1032426580786705, "learning_rate": 9.432209965244085e-05, "loss": 0.1382, "step": 4220 }, { "epoch": 0.22525161084189788, "grad_norm": 0.10053195804357529, "learning_rate": 9.428840080007255e-05, "loss": 0.1373, "step": 4230 }, { "epoch": 0.22578412056020022, "grad_norm": 0.08895772695541382, "learning_rate": 9.425461112266545e-05, "loss": 0.1379, "step": 4240 }, { "epoch": 0.22631663027850257, "grad_norm": 0.08644817024469376, "learning_rate": 9.422073072500328e-05, "loss": 0.1381, "step": 4250 }, { "epoch": 0.22684913999680495, "grad_norm": 0.07521601766347885, "learning_rate": 9.418675971215113e-05, "loss": 0.1377, "step": 4260 }, { "epoch": 0.2273816497151073, "grad_norm": 0.0699540451169014, "learning_rate": 9.415269818945513e-05, "loss": 0.1378, "step": 4270 }, { "epoch": 0.22791415943340965, "grad_norm": 0.16751086711883545, "learning_rate": 9.411854626254202e-05, "loss": 0.1371, "step": 4280 }, { "epoch": 0.22844666915171202, "grad_norm": 0.11966162919998169, "learning_rate": 9.408430403731891e-05, "loss": 0.1374, "step": 4290 }, { "epoch": 0.22897917887001437, "grad_norm": 0.0889548733830452, "learning_rate": 9.404997161997295e-05, "loss": 0.1376, "step": 4300 }, { "epoch": 0.22951168858831675, "grad_norm": 0.10110121965408325, "learning_rate": 9.4015549116971e-05, "loss": 0.1379, "step": 4310 }, { "epoch": 0.2300441983066191, "grad_norm": 0.10038761049509048, "learning_rate": 9.398103663505917e-05, "loss": 0.1368, "step": 4320 }, { "epoch": 0.23057670802492145, "grad_norm": 0.14014580845832825, "learning_rate": 9.394643428126272e-05, "loss": 0.1369, "step": 4330 }, { "epoch": 0.23110921774322382, "grad_norm": 0.07071101665496826, "learning_rate": 9.391174216288561e-05, "loss": 0.1375, "step": 4340 }, { "epoch": 0.23164172746152617, "grad_norm": 0.08393870294094086, "learning_rate": 9.387696038751006e-05, "loss": 0.1378, "step": 4350 }, { "epoch": 0.23217423717982852, "grad_norm": 0.18702152371406555, "learning_rate": 9.384208906299641e-05, "loss": 0.1379, "step": 4360 }, { "epoch": 0.2327067468981309, "grad_norm": 0.12593114376068115, "learning_rate": 9.380712829748266e-05, "loss": 0.1377, "step": 4370 }, { "epoch": 0.23323925661643324, "grad_norm": 0.1111498549580574, "learning_rate": 9.37720781993842e-05, "loss": 0.138, "step": 4380 }, { "epoch": 0.23377176633473562, "grad_norm": 0.11401405185461044, "learning_rate": 9.37369388773934e-05, "loss": 0.1373, "step": 4390 }, { "epoch": 0.23430427605303797, "grad_norm": 0.07457905262708664, "learning_rate": 9.370171044047937e-05, "loss": 0.1376, "step": 4400 }, { "epoch": 0.23483678577134032, "grad_norm": 0.07042038440704346, "learning_rate": 9.366639299788758e-05, "loss": 0.1379, "step": 4410 }, { "epoch": 0.2353692954896427, "grad_norm": 0.06665973365306854, "learning_rate": 9.363098665913941e-05, "loss": 0.1368, "step": 4420 }, { "epoch": 0.23590180520794504, "grad_norm": 0.06450683623552322, "learning_rate": 9.3595491534032e-05, "loss": 0.1376, "step": 4430 }, { "epoch": 0.2364343149262474, "grad_norm": 0.0732714980840683, "learning_rate": 9.355990773263782e-05, "loss": 0.137, "step": 4440 }, { "epoch": 0.23696682464454977, "grad_norm": 0.09271900355815887, "learning_rate": 9.352423536530432e-05, "loss": 0.1366, "step": 4450 }, { "epoch": 0.23749933436285212, "grad_norm": 0.35426032543182373, "learning_rate": 9.34884745426536e-05, "loss": 0.1383, "step": 4460 }, { "epoch": 0.2380318440811545, "grad_norm": 0.0786311998963356, "learning_rate": 9.3452625375582e-05, "loss": 0.1378, "step": 4470 }, { "epoch": 0.23856435379945684, "grad_norm": 0.07759775966405869, "learning_rate": 9.341668797525993e-05, "loss": 0.1373, "step": 4480 }, { "epoch": 0.2390968635177592, "grad_norm": 0.06604979932308197, "learning_rate": 9.338066245313134e-05, "loss": 0.1376, "step": 4490 }, { "epoch": 0.23962937323606157, "grad_norm": 0.06817334145307541, "learning_rate": 9.334454892091349e-05, "loss": 0.1368, "step": 4500 }, { "epoch": 0.24016188295436391, "grad_norm": 0.07450228929519653, "learning_rate": 9.330834749059654e-05, "loss": 0.1368, "step": 4510 }, { "epoch": 0.24069439267266626, "grad_norm": 0.09656868129968643, "learning_rate": 9.327205827444322e-05, "loss": 0.1365, "step": 4520 }, { "epoch": 0.24122690239096864, "grad_norm": 0.1612931787967682, "learning_rate": 9.323568138498855e-05, "loss": 0.1374, "step": 4530 }, { "epoch": 0.241759412109271, "grad_norm": 0.1097157672047615, "learning_rate": 9.319921693503935e-05, "loss": 0.1374, "step": 4540 }, { "epoch": 0.24229192182757336, "grad_norm": 0.07911382615566254, "learning_rate": 9.316266503767402e-05, "loss": 0.1375, "step": 4550 }, { "epoch": 0.2428244315458757, "grad_norm": 0.07367183268070221, "learning_rate": 9.31260258062421e-05, "loss": 0.1372, "step": 4560 }, { "epoch": 0.24335694126417806, "grad_norm": 0.0891132801771164, "learning_rate": 9.308929935436404e-05, "loss": 0.1362, "step": 4570 }, { "epoch": 0.24388945098248044, "grad_norm": 0.10352007299661636, "learning_rate": 9.305248579593064e-05, "loss": 0.1378, "step": 4580 }, { "epoch": 0.2444219607007828, "grad_norm": 0.10638166218996048, "learning_rate": 9.301558524510293e-05, "loss": 0.1367, "step": 4590 }, { "epoch": 0.24495447041908514, "grad_norm": 0.07406862825155258, "learning_rate": 9.297859781631166e-05, "loss": 0.1365, "step": 4600 }, { "epoch": 0.2454869801373875, "grad_norm": 0.10420636832714081, "learning_rate": 9.294152362425701e-05, "loss": 0.1372, "step": 4610 }, { "epoch": 0.24601948985568986, "grad_norm": 0.0823531523346901, "learning_rate": 9.290436278390821e-05, "loss": 0.1367, "step": 4620 }, { "epoch": 0.24655199957399224, "grad_norm": 0.08028628677129745, "learning_rate": 9.286711541050322e-05, "loss": 0.1371, "step": 4630 }, { "epoch": 0.24708450929229459, "grad_norm": 0.13905274868011475, "learning_rate": 9.282978161954825e-05, "loss": 0.1368, "step": 4640 }, { "epoch": 0.24761701901059693, "grad_norm": 0.07757926732301712, "learning_rate": 9.279236152681763e-05, "loss": 0.1369, "step": 4650 }, { "epoch": 0.2481495287288993, "grad_norm": 0.08740050345659256, "learning_rate": 9.275485524835319e-05, "loss": 0.1371, "step": 4660 }, { "epoch": 0.24868203844720166, "grad_norm": 0.07663418352603912, "learning_rate": 9.271726290046413e-05, "loss": 0.1375, "step": 4670 }, { "epoch": 0.249214548165504, "grad_norm": 0.11915243417024612, "learning_rate": 9.267958459972652e-05, "loss": 0.1365, "step": 4680 }, { "epoch": 0.24974705788380638, "grad_norm": 0.11448535323143005, "learning_rate": 9.264182046298294e-05, "loss": 0.137, "step": 4690 }, { "epoch": 0.25027956760210873, "grad_norm": 0.09888239204883575, "learning_rate": 9.260397060734219e-05, "loss": 0.1373, "step": 4700 }, { "epoch": 0.2508120773204111, "grad_norm": 0.12656264007091522, "learning_rate": 9.256603515017885e-05, "loss": 0.1364, "step": 4710 }, { "epoch": 0.25134458703871343, "grad_norm": 0.1620924174785614, "learning_rate": 9.252801420913304e-05, "loss": 0.136, "step": 4720 }, { "epoch": 0.2518770967570158, "grad_norm": 0.07459171861410141, "learning_rate": 9.24899079021099e-05, "loss": 0.1373, "step": 4730 }, { "epoch": 0.2524096064753182, "grad_norm": 0.09508336335420609, "learning_rate": 9.245171634727926e-05, "loss": 0.1363, "step": 4740 }, { "epoch": 0.25294211619362056, "grad_norm": 0.07613290101289749, "learning_rate": 9.241343966307543e-05, "loss": 0.1363, "step": 4750 }, { "epoch": 0.2534746259119229, "grad_norm": 0.08999643474817276, "learning_rate": 9.237507796819662e-05, "loss": 0.1365, "step": 4760 }, { "epoch": 0.25400713563022526, "grad_norm": 0.11467399448156357, "learning_rate": 9.233663138160464e-05, "loss": 0.1364, "step": 4770 }, { "epoch": 0.25453964534852763, "grad_norm": 0.0825829803943634, "learning_rate": 9.229810002252464e-05, "loss": 0.1367, "step": 4780 }, { "epoch": 0.25507215506682995, "grad_norm": 0.08100995421409607, "learning_rate": 9.225948401044457e-05, "loss": 0.137, "step": 4790 }, { "epoch": 0.25560466478513233, "grad_norm": 0.08392170816659927, "learning_rate": 9.222078346511502e-05, "loss": 0.1366, "step": 4800 }, { "epoch": 0.2561371745034347, "grad_norm": 0.061139799654483795, "learning_rate": 9.218199850654854e-05, "loss": 0.1368, "step": 4810 }, { "epoch": 0.256669684221737, "grad_norm": 0.1355183869600296, "learning_rate": 9.21431292550196e-05, "loss": 0.1363, "step": 4820 }, { "epoch": 0.2572021939400394, "grad_norm": 0.08287263661623001, "learning_rate": 9.210417583106401e-05, "loss": 0.1363, "step": 4830 }, { "epoch": 0.2577347036583418, "grad_norm": 0.0793054848909378, "learning_rate": 9.206513835547861e-05, "loss": 0.1362, "step": 4840 }, { "epoch": 0.2582672133766441, "grad_norm": 0.09595254063606262, "learning_rate": 9.202601694932087e-05, "loss": 0.136, "step": 4850 }, { "epoch": 0.2587997230949465, "grad_norm": 0.07301712781190872, "learning_rate": 9.198681173390858e-05, "loss": 0.1371, "step": 4860 }, { "epoch": 0.25933223281324885, "grad_norm": 0.11917870491743088, "learning_rate": 9.194752283081937e-05, "loss": 0.137, "step": 4870 }, { "epoch": 0.2598647425315512, "grad_norm": 0.07802341878414154, "learning_rate": 9.190815036189042e-05, "loss": 0.1363, "step": 4880 }, { "epoch": 0.26039725224985355, "grad_norm": 0.07218264043331146, "learning_rate": 9.186869444921808e-05, "loss": 0.136, "step": 4890 }, { "epoch": 0.2609297619681559, "grad_norm": 0.07441945374011993, "learning_rate": 9.182915521515745e-05, "loss": 0.137, "step": 4900 }, { "epoch": 0.2614622716864583, "grad_norm": 0.16663500666618347, "learning_rate": 9.178953278232193e-05, "loss": 0.1369, "step": 4910 }, { "epoch": 0.2619947814047606, "grad_norm": 0.09314275532960892, "learning_rate": 9.174982727358306e-05, "loss": 0.1359, "step": 4920 }, { "epoch": 0.262527291123063, "grad_norm": 0.07567309588193893, "learning_rate": 9.171003881206992e-05, "loss": 0.1364, "step": 4930 }, { "epoch": 0.2630598008413654, "grad_norm": 0.10356537252664566, "learning_rate": 9.167016752116883e-05, "loss": 0.1367, "step": 4940 }, { "epoch": 0.2635923105596677, "grad_norm": 0.07590640336275101, "learning_rate": 9.163021352452302e-05, "loss": 0.1359, "step": 4950 }, { "epoch": 0.2641248202779701, "grad_norm": 0.09072890132665634, "learning_rate": 9.159017694603214e-05, "loss": 0.1367, "step": 4960 }, { "epoch": 0.26465732999627245, "grad_norm": 0.10349312424659729, "learning_rate": 9.155005790985197e-05, "loss": 0.136, "step": 4970 }, { "epoch": 0.26518983971457477, "grad_norm": 0.08810363709926605, "learning_rate": 9.150985654039394e-05, "loss": 0.1362, "step": 4980 }, { "epoch": 0.26572234943287715, "grad_norm": 0.13400596380233765, "learning_rate": 9.14695729623249e-05, "loss": 0.1362, "step": 4990 }, { "epoch": 0.2662548591511795, "grad_norm": 0.12602917850017548, "learning_rate": 9.142920730056652e-05, "loss": 0.1353, "step": 5000 }, { "epoch": 0.26678736886948184, "grad_norm": 0.08865707367658615, "learning_rate": 9.138875968029512e-05, "loss": 0.1361, "step": 5010 }, { "epoch": 0.2673198785877842, "grad_norm": 0.11873424053192139, "learning_rate": 9.13482302269411e-05, "loss": 0.1371, "step": 5020 }, { "epoch": 0.2678523883060866, "grad_norm": 0.08005053550004959, "learning_rate": 9.13076190661887e-05, "loss": 0.1356, "step": 5030 }, { "epoch": 0.2683848980243889, "grad_norm": 0.14600218832492828, "learning_rate": 9.126692632397543e-05, "loss": 0.1359, "step": 5040 }, { "epoch": 0.2689174077426913, "grad_norm": 0.09237764775753021, "learning_rate": 9.122615212649189e-05, "loss": 0.136, "step": 5050 }, { "epoch": 0.26944991746099367, "grad_norm": 0.06887295097112656, "learning_rate": 9.118529660018125e-05, "loss": 0.1371, "step": 5060 }, { "epoch": 0.26998242717929605, "grad_norm": 0.07038972526788712, "learning_rate": 9.114435987173886e-05, "loss": 0.1365, "step": 5070 }, { "epoch": 0.27051493689759837, "grad_norm": 0.06756497174501419, "learning_rate": 9.110334206811195e-05, "loss": 0.1359, "step": 5080 }, { "epoch": 0.27104744661590074, "grad_norm": 0.07146366685628891, "learning_rate": 9.106224331649906e-05, "loss": 0.1362, "step": 5090 }, { "epoch": 0.2715799563342031, "grad_norm": 0.07654134929180145, "learning_rate": 9.102106374434984e-05, "loss": 0.1357, "step": 5100 }, { "epoch": 0.27211246605250544, "grad_norm": 0.11621958762407303, "learning_rate": 9.097980347936457e-05, "loss": 0.1362, "step": 5110 }, { "epoch": 0.2726449757708078, "grad_norm": 0.08447077125310898, "learning_rate": 9.093846264949368e-05, "loss": 0.1354, "step": 5120 }, { "epoch": 0.2731774854891102, "grad_norm": 0.06373301893472672, "learning_rate": 9.089704138293756e-05, "loss": 0.1357, "step": 5130 }, { "epoch": 0.2737099952074125, "grad_norm": 0.07651172578334808, "learning_rate": 9.085553980814592e-05, "loss": 0.1361, "step": 5140 }, { "epoch": 0.2742425049257149, "grad_norm": 0.08544403314590454, "learning_rate": 9.081395805381761e-05, "loss": 0.1356, "step": 5150 }, { "epoch": 0.27477501464401727, "grad_norm": 0.1044570654630661, "learning_rate": 9.077229624890002e-05, "loss": 0.1355, "step": 5160 }, { "epoch": 0.2753075243623196, "grad_norm": 0.09831110388040543, "learning_rate": 9.073055452258889e-05, "loss": 0.1364, "step": 5170 }, { "epoch": 0.27584003408062197, "grad_norm": 0.07489628344774246, "learning_rate": 9.068873300432772e-05, "loss": 0.1369, "step": 5180 }, { "epoch": 0.27637254379892434, "grad_norm": 0.06893607974052429, "learning_rate": 9.064683182380749e-05, "loss": 0.1359, "step": 5190 }, { "epoch": 0.27690505351722666, "grad_norm": 0.08887787163257599, "learning_rate": 9.060485111096617e-05, "loss": 0.1347, "step": 5200 }, { "epoch": 0.27743756323552904, "grad_norm": 0.074364572763443, "learning_rate": 9.056279099598845e-05, "loss": 0.1357, "step": 5210 }, { "epoch": 0.2779700729538314, "grad_norm": 0.15046152472496033, "learning_rate": 9.052065160930516e-05, "loss": 0.1348, "step": 5220 }, { "epoch": 0.2785025826721338, "grad_norm": 0.08475669473409653, "learning_rate": 9.0478433081593e-05, "loss": 0.1357, "step": 5230 }, { "epoch": 0.2790350923904361, "grad_norm": 0.09118683636188507, "learning_rate": 9.043613554377411e-05, "loss": 0.1354, "step": 5240 }, { "epoch": 0.2795676021087385, "grad_norm": 0.08327824622392654, "learning_rate": 9.03937591270156e-05, "loss": 0.135, "step": 5250 }, { "epoch": 0.28010011182704087, "grad_norm": 0.12977443635463715, "learning_rate": 9.035130396272922e-05, "loss": 0.1353, "step": 5260 }, { "epoch": 0.2806326215453432, "grad_norm": 0.07361641526222229, "learning_rate": 9.030877018257091e-05, "loss": 0.1363, "step": 5270 }, { "epoch": 0.28116513126364556, "grad_norm": 0.06867006421089172, "learning_rate": 9.02661579184404e-05, "loss": 0.1353, "step": 5280 }, { "epoch": 0.28169764098194794, "grad_norm": 0.09308381378650665, "learning_rate": 9.022346730248079e-05, "loss": 0.1357, "step": 5290 }, { "epoch": 0.28223015070025026, "grad_norm": 0.07800911366939545, "learning_rate": 9.01806984670782e-05, "loss": 0.1348, "step": 5300 }, { "epoch": 0.28276266041855264, "grad_norm": 0.06358273327350616, "learning_rate": 9.013785154486127e-05, "loss": 0.1352, "step": 5310 }, { "epoch": 0.283295170136855, "grad_norm": 0.06616450846195221, "learning_rate": 9.009492666870078e-05, "loss": 0.135, "step": 5320 }, { "epoch": 0.28382767985515733, "grad_norm": 0.11287859827280045, "learning_rate": 9.005192397170932e-05, "loss": 0.1363, "step": 5330 }, { "epoch": 0.2843601895734597, "grad_norm": 0.06982850283384323, "learning_rate": 9.000884358724073e-05, "loss": 0.1358, "step": 5340 }, { "epoch": 0.2848926992917621, "grad_norm": 0.0698726698756218, "learning_rate": 8.996568564888978e-05, "loss": 0.1349, "step": 5350 }, { "epoch": 0.2854252090100644, "grad_norm": 0.0825994461774826, "learning_rate": 8.99224502904918e-05, "loss": 0.1351, "step": 5360 }, { "epoch": 0.2859577187283668, "grad_norm": 0.10726054012775421, "learning_rate": 8.987913764612212e-05, "loss": 0.1359, "step": 5370 }, { "epoch": 0.28649022844666916, "grad_norm": 0.06900358200073242, "learning_rate": 8.983574785009578e-05, "loss": 0.1358, "step": 5380 }, { "epoch": 0.28702273816497154, "grad_norm": 0.062367282807826996, "learning_rate": 8.979228103696709e-05, "loss": 0.1359, "step": 5390 }, { "epoch": 0.28755524788327386, "grad_norm": 0.11140688508749008, "learning_rate": 8.974873734152915e-05, "loss": 0.1349, "step": 5400 }, { "epoch": 0.28808775760157623, "grad_norm": 0.07785354554653168, "learning_rate": 8.970511689881351e-05, "loss": 0.1357, "step": 5410 }, { "epoch": 0.2886202673198786, "grad_norm": 0.09785955399274826, "learning_rate": 8.96614198440897e-05, "loss": 0.1349, "step": 5420 }, { "epoch": 0.28915277703818093, "grad_norm": 0.07891997694969177, "learning_rate": 8.961764631286487e-05, "loss": 0.1357, "step": 5430 }, { "epoch": 0.2896852867564833, "grad_norm": 0.08100765943527222, "learning_rate": 8.957379644088325e-05, "loss": 0.1356, "step": 5440 }, { "epoch": 0.2902177964747857, "grad_norm": 0.08178524672985077, "learning_rate": 8.952987036412584e-05, "loss": 0.135, "step": 5450 }, { "epoch": 0.290750306193088, "grad_norm": 0.11689390987157822, "learning_rate": 8.948586821880997e-05, "loss": 0.1349, "step": 5460 }, { "epoch": 0.2912828159113904, "grad_norm": 0.06157712638378143, "learning_rate": 8.944179014138891e-05, "loss": 0.1352, "step": 5470 }, { "epoch": 0.29181532562969276, "grad_norm": 0.09353891015052795, "learning_rate": 8.939763626855129e-05, "loss": 0.1347, "step": 5480 }, { "epoch": 0.2923478353479951, "grad_norm": 0.09575408697128296, "learning_rate": 8.93534067372209e-05, "loss": 0.1351, "step": 5490 }, { "epoch": 0.29288034506629745, "grad_norm": 0.05233992263674736, "learning_rate": 8.930910168455603e-05, "loss": 0.1354, "step": 5500 }, { "epoch": 0.29341285478459983, "grad_norm": 0.1354876458644867, "learning_rate": 8.926472124794931e-05, "loss": 0.1348, "step": 5510 }, { "epoch": 0.29394536450290215, "grad_norm": 0.10183115303516388, "learning_rate": 8.922026556502699e-05, "loss": 0.135, "step": 5520 }, { "epoch": 0.2944778742212045, "grad_norm": 0.08267311006784439, "learning_rate": 8.917573477364876e-05, "loss": 0.1351, "step": 5530 }, { "epoch": 0.2950103839395069, "grad_norm": 0.07382847368717194, "learning_rate": 8.91311290119072e-05, "loss": 0.1354, "step": 5540 }, { "epoch": 0.2955428936578093, "grad_norm": 0.06079572066664696, "learning_rate": 8.908644841812739e-05, "loss": 0.1355, "step": 5550 }, { "epoch": 0.2960754033761116, "grad_norm": 0.09829048812389374, "learning_rate": 8.904169313086645e-05, "loss": 0.1353, "step": 5560 }, { "epoch": 0.296607913094414, "grad_norm": 0.09984841197729111, "learning_rate": 8.899686328891315e-05, "loss": 0.1348, "step": 5570 }, { "epoch": 0.29714042281271635, "grad_norm": 0.1072811409831047, "learning_rate": 8.895195903128739e-05, "loss": 0.1356, "step": 5580 }, { "epoch": 0.2976729325310187, "grad_norm": 0.0519433431327343, "learning_rate": 8.890698049723995e-05, "loss": 0.1345, "step": 5590 }, { "epoch": 0.29820544224932105, "grad_norm": 0.057259172201156616, "learning_rate": 8.886192782625189e-05, "loss": 0.1338, "step": 5600 }, { "epoch": 0.2987379519676234, "grad_norm": 0.102280393242836, "learning_rate": 8.881680115803412e-05, "loss": 0.1347, "step": 5610 }, { "epoch": 0.29927046168592575, "grad_norm": 0.07144474983215332, "learning_rate": 8.877160063252712e-05, "loss": 0.1353, "step": 5620 }, { "epoch": 0.2998029714042281, "grad_norm": 0.07198172062635422, "learning_rate": 8.87263263899003e-05, "loss": 0.1354, "step": 5630 }, { "epoch": 0.3003354811225305, "grad_norm": 0.08418303728103638, "learning_rate": 8.86809785705518e-05, "loss": 0.1353, "step": 5640 }, { "epoch": 0.3008679908408328, "grad_norm": 0.09237035363912582, "learning_rate": 8.86355573151078e-05, "loss": 0.1356, "step": 5650 }, { "epoch": 0.3014005005591352, "grad_norm": 0.12669327855110168, "learning_rate": 8.859006276442226e-05, "loss": 0.1343, "step": 5660 }, { "epoch": 0.3019330102774376, "grad_norm": 0.05533193424344063, "learning_rate": 8.854449505957645e-05, "loss": 0.1351, "step": 5670 }, { "epoch": 0.3024655199957399, "grad_norm": 0.06650611758232117, "learning_rate": 8.849885434187848e-05, "loss": 0.1345, "step": 5680 }, { "epoch": 0.30299802971404227, "grad_norm": 0.07978025078773499, "learning_rate": 8.845314075286286e-05, "loss": 0.1347, "step": 5690 }, { "epoch": 0.30353053943234465, "grad_norm": 0.06983333081007004, "learning_rate": 8.840735443429014e-05, "loss": 0.1345, "step": 5700 }, { "epoch": 0.304063049150647, "grad_norm": 0.08889699727296829, "learning_rate": 8.836149552814632e-05, "loss": 0.1341, "step": 5710 }, { "epoch": 0.30459555886894935, "grad_norm": 0.17295877635478973, "learning_rate": 8.831556417664255e-05, "loss": 0.1349, "step": 5720 }, { "epoch": 0.3051280685872517, "grad_norm": 0.08790881931781769, "learning_rate": 8.826956052221464e-05, "loss": 0.1356, "step": 5730 }, { "epoch": 0.3056605783055541, "grad_norm": 0.11530311405658722, "learning_rate": 8.822348470752263e-05, "loss": 0.1341, "step": 5740 }, { "epoch": 0.3061930880238564, "grad_norm": 0.07714807987213135, "learning_rate": 8.817733687545024e-05, "loss": 0.1349, "step": 5750 }, { "epoch": 0.3067255977421588, "grad_norm": 0.07843048125505447, "learning_rate": 8.813111716910463e-05, "loss": 0.135, "step": 5760 }, { "epoch": 0.30725810746046117, "grad_norm": 0.059752389788627625, "learning_rate": 8.808482573181583e-05, "loss": 0.134, "step": 5770 }, { "epoch": 0.3077906171787635, "grad_norm": 0.07728555053472519, "learning_rate": 8.803846270713622e-05, "loss": 0.1347, "step": 5780 }, { "epoch": 0.30832312689706587, "grad_norm": 0.11523959785699844, "learning_rate": 8.79920282388403e-05, "loss": 0.1345, "step": 5790 }, { "epoch": 0.30885563661536825, "grad_norm": 0.0835232064127922, "learning_rate": 8.794552247092404e-05, "loss": 0.1342, "step": 5800 }, { "epoch": 0.30938814633367057, "grad_norm": 0.08657065033912659, "learning_rate": 8.789894554760456e-05, "loss": 0.1351, "step": 5810 }, { "epoch": 0.30992065605197294, "grad_norm": 0.17280389368534088, "learning_rate": 8.78522976133196e-05, "loss": 0.1345, "step": 5820 }, { "epoch": 0.3104531657702753, "grad_norm": 0.07250665873289108, "learning_rate": 8.780557881272711e-05, "loss": 0.1339, "step": 5830 }, { "epoch": 0.31098567548857764, "grad_norm": 0.135615274310112, "learning_rate": 8.775878929070483e-05, "loss": 0.1348, "step": 5840 }, { "epoch": 0.31151818520688, "grad_norm": 0.09052561223506927, "learning_rate": 8.77119291923498e-05, "loss": 0.1342, "step": 5850 }, { "epoch": 0.3120506949251824, "grad_norm": 0.06174413859844208, "learning_rate": 8.766499866297791e-05, "loss": 0.1338, "step": 5860 }, { "epoch": 0.31258320464348477, "grad_norm": 0.08344202488660812, "learning_rate": 8.761799784812348e-05, "loss": 0.1343, "step": 5870 }, { "epoch": 0.3131157143617871, "grad_norm": 0.11124816536903381, "learning_rate": 8.757092689353876e-05, "loss": 0.1348, "step": 5880 }, { "epoch": 0.31364822408008947, "grad_norm": 0.19503851234912872, "learning_rate": 8.752378594519355e-05, "loss": 0.1351, "step": 5890 }, { "epoch": 0.31418073379839184, "grad_norm": 0.0828915536403656, "learning_rate": 8.747657514927463e-05, "loss": 0.135, "step": 5900 }, { "epoch": 0.31471324351669416, "grad_norm": 0.07251901179552078, "learning_rate": 8.742929465218548e-05, "loss": 0.1337, "step": 5910 }, { "epoch": 0.31524575323499654, "grad_norm": 0.05215257406234741, "learning_rate": 8.738194460054567e-05, "loss": 0.1335, "step": 5920 }, { "epoch": 0.3157782629532989, "grad_norm": 0.06828713417053223, "learning_rate": 8.733452514119048e-05, "loss": 0.1341, "step": 5930 }, { "epoch": 0.31631077267160124, "grad_norm": 0.05530816689133644, "learning_rate": 8.728703642117038e-05, "loss": 0.1342, "step": 5940 }, { "epoch": 0.3168432823899036, "grad_norm": 0.08321405947208405, "learning_rate": 8.723947858775068e-05, "loss": 0.1342, "step": 5950 }, { "epoch": 0.317375792108206, "grad_norm": 0.08132331073284149, "learning_rate": 8.7191851788411e-05, "loss": 0.1344, "step": 5960 }, { "epoch": 0.3179083018265083, "grad_norm": 0.062430258840322495, "learning_rate": 8.714415617084484e-05, "loss": 0.1346, "step": 5970 }, { "epoch": 0.3184408115448107, "grad_norm": 0.09331026673316956, "learning_rate": 8.709639188295906e-05, "loss": 0.1346, "step": 5980 }, { "epoch": 0.31897332126311306, "grad_norm": 0.05821016803383827, "learning_rate": 8.70485590728735e-05, "loss": 0.1347, "step": 5990 }, { "epoch": 0.3195058309814154, "grad_norm": 0.059810176491737366, "learning_rate": 8.700065788892053e-05, "loss": 0.1341, "step": 6000 }, { "epoch": 0.32003834069971776, "grad_norm": 0.07964300364255905, "learning_rate": 8.695268847964449e-05, "loss": 0.1348, "step": 6010 }, { "epoch": 0.32057085041802014, "grad_norm": 0.08967084437608719, "learning_rate": 8.690465099380131e-05, "loss": 0.1343, "step": 6020 }, { "epoch": 0.3211033601363225, "grad_norm": 0.05978870391845703, "learning_rate": 8.685654558035803e-05, "loss": 0.1335, "step": 6030 }, { "epoch": 0.32163586985462483, "grad_norm": 0.05357252061367035, "learning_rate": 8.680837238849237e-05, "loss": 0.134, "step": 6040 }, { "epoch": 0.3221683795729272, "grad_norm": 0.09251965582370758, "learning_rate": 8.676013156759219e-05, "loss": 0.1344, "step": 6050 }, { "epoch": 0.3227008892912296, "grad_norm": 0.06674574315547943, "learning_rate": 8.671182326725509e-05, "loss": 0.1346, "step": 6060 }, { "epoch": 0.3232333990095319, "grad_norm": 0.0678028017282486, "learning_rate": 8.666344763728793e-05, "loss": 0.1334, "step": 6070 }, { "epoch": 0.3237659087278343, "grad_norm": 0.08303205668926239, "learning_rate": 8.661500482770635e-05, "loss": 0.1333, "step": 6080 }, { "epoch": 0.32429841844613666, "grad_norm": 0.059111885726451874, "learning_rate": 8.656649498873435e-05, "loss": 0.1334, "step": 6090 }, { "epoch": 0.324830928164439, "grad_norm": 0.0759367048740387, "learning_rate": 8.651791827080373e-05, "loss": 0.1334, "step": 6100 }, { "epoch": 0.32536343788274136, "grad_norm": 0.05576184391975403, "learning_rate": 8.646927482455375e-05, "loss": 0.1347, "step": 6110 }, { "epoch": 0.32589594760104373, "grad_norm": 0.12025826424360275, "learning_rate": 8.642056480083058e-05, "loss": 0.1341, "step": 6120 }, { "epoch": 0.32642845731934605, "grad_norm": 0.055676814168691635, "learning_rate": 8.637178835068685e-05, "loss": 0.1342, "step": 6130 }, { "epoch": 0.32696096703764843, "grad_norm": 0.06213228031992912, "learning_rate": 8.632294562538114e-05, "loss": 0.1336, "step": 6140 }, { "epoch": 0.3274934767559508, "grad_norm": 0.05842900648713112, "learning_rate": 8.627403677637762e-05, "loss": 0.1339, "step": 6150 }, { "epoch": 0.32802598647425313, "grad_norm": 0.07621738314628601, "learning_rate": 8.62250619553455e-05, "loss": 0.1339, "step": 6160 }, { "epoch": 0.3285584961925555, "grad_norm": 0.06162210926413536, "learning_rate": 8.61760213141585e-05, "loss": 0.1345, "step": 6170 }, { "epoch": 0.3290910059108579, "grad_norm": 0.0727054551243782, "learning_rate": 8.612691500489453e-05, "loss": 0.1342, "step": 6180 }, { "epoch": 0.32962351562916026, "grad_norm": 0.05833178386092186, "learning_rate": 8.607774317983515e-05, "loss": 0.1337, "step": 6190 }, { "epoch": 0.3301560253474626, "grad_norm": 0.11485815793275833, "learning_rate": 8.602850599146502e-05, "loss": 0.1334, "step": 6200 }, { "epoch": 0.33068853506576495, "grad_norm": 0.0697018951177597, "learning_rate": 8.597920359247156e-05, "loss": 0.1335, "step": 6210 }, { "epoch": 0.33122104478406733, "grad_norm": 0.05111430957913399, "learning_rate": 8.592983613574435e-05, "loss": 0.1332, "step": 6220 }, { "epoch": 0.33175355450236965, "grad_norm": 0.06886550784111023, "learning_rate": 8.588040377437479e-05, "loss": 0.1338, "step": 6230 }, { "epoch": 0.33228606422067203, "grad_norm": 0.04854755103588104, "learning_rate": 8.58309066616555e-05, "loss": 0.1336, "step": 6240 }, { "epoch": 0.3328185739389744, "grad_norm": 0.0921018123626709, "learning_rate": 8.57813449510799e-05, "loss": 0.1338, "step": 6250 }, { "epoch": 0.3333510836572767, "grad_norm": 0.09607180953025818, "learning_rate": 8.573171879634177e-05, "loss": 0.1341, "step": 6260 }, { "epoch": 0.3338835933755791, "grad_norm": 0.16610988974571228, "learning_rate": 8.568202835133468e-05, "loss": 0.1343, "step": 6270 }, { "epoch": 0.3344161030938815, "grad_norm": 0.07573292404413223, "learning_rate": 8.563227377015162e-05, "loss": 0.1336, "step": 6280 }, { "epoch": 0.3349486128121838, "grad_norm": 0.10059863328933716, "learning_rate": 8.558245520708444e-05, "loss": 0.1338, "step": 6290 }, { "epoch": 0.3354811225304862, "grad_norm": 0.0501171350479126, "learning_rate": 8.553257281662342e-05, "loss": 0.1331, "step": 6300 }, { "epoch": 0.33601363224878855, "grad_norm": 0.07637584954500198, "learning_rate": 8.548262675345673e-05, "loss": 0.1336, "step": 6310 }, { "epoch": 0.3365461419670909, "grad_norm": 0.08559510856866837, "learning_rate": 8.543261717247006e-05, "loss": 0.1345, "step": 6320 }, { "epoch": 0.33707865168539325, "grad_norm": 0.07463840395212173, "learning_rate": 8.5382544228746e-05, "loss": 0.134, "step": 6330 }, { "epoch": 0.3376111614036956, "grad_norm": 0.06291361898183823, "learning_rate": 8.533240807756373e-05, "loss": 0.134, "step": 6340 }, { "epoch": 0.338143671121998, "grad_norm": 0.054589059203863144, "learning_rate": 8.52822088743983e-05, "loss": 0.1336, "step": 6350 }, { "epoch": 0.3386761808403003, "grad_norm": 0.08773118257522583, "learning_rate": 8.523194677492044e-05, "loss": 0.1332, "step": 6360 }, { "epoch": 0.3392086905586027, "grad_norm": 0.08630936592817307, "learning_rate": 8.518162193499581e-05, "loss": 0.1339, "step": 6370 }, { "epoch": 0.3397412002769051, "grad_norm": 0.0654667541384697, "learning_rate": 8.513123451068467e-05, "loss": 0.1339, "step": 6380 }, { "epoch": 0.3402737099952074, "grad_norm": 0.04769926890730858, "learning_rate": 8.508078465824138e-05, "loss": 0.1335, "step": 6390 }, { "epoch": 0.3408062197135098, "grad_norm": 0.10144821554422379, "learning_rate": 8.503027253411387e-05, "loss": 0.1328, "step": 6400 }, { "epoch": 0.34133872943181215, "grad_norm": 0.09670275449752808, "learning_rate": 8.497969829494319e-05, "loss": 0.1338, "step": 6410 }, { "epoch": 0.34187123915011447, "grad_norm": 0.08334879577159882, "learning_rate": 8.492906209756294e-05, "loss": 0.1328, "step": 6420 }, { "epoch": 0.34240374886841685, "grad_norm": 0.06717374920845032, "learning_rate": 8.487836409899905e-05, "loss": 0.134, "step": 6430 }, { "epoch": 0.3429362585867192, "grad_norm": 0.1657373309135437, "learning_rate": 8.482760445646885e-05, "loss": 0.1333, "step": 6440 }, { "epoch": 0.34346876830502154, "grad_norm": 0.07656820863485336, "learning_rate": 8.477678332738102e-05, "loss": 0.1331, "step": 6450 }, { "epoch": 0.3440012780233239, "grad_norm": 0.06148603931069374, "learning_rate": 8.472590086933479e-05, "loss": 0.1338, "step": 6460 }, { "epoch": 0.3445337877416263, "grad_norm": 0.0530422069132328, "learning_rate": 8.467495724011967e-05, "loss": 0.1335, "step": 6470 }, { "epoch": 0.3450662974599286, "grad_norm": 1.325517177581787, "learning_rate": 8.462395259771483e-05, "loss": 0.1388, "step": 6480 }, { "epoch": 0.345598807178231, "grad_norm": 4.959922790527344, "learning_rate": 8.457288710028862e-05, "loss": 0.2724, "step": 6490 }, { "epoch": 0.34613131689653337, "grad_norm": 0.29457658529281616, "learning_rate": 8.452176090619812e-05, "loss": 0.1983, "step": 6500 }, { "epoch": 0.34666382661483575, "grad_norm": 0.10155448317527771, "learning_rate": 8.447057417398866e-05, "loss": 0.1556, "step": 6510 }, { "epoch": 0.34719633633313807, "grad_norm": 0.08506519347429276, "learning_rate": 8.441932706239329e-05, "loss": 0.1438, "step": 6520 }, { "epoch": 0.34772884605144044, "grad_norm": 0.06791000813245773, "learning_rate": 8.436801973033227e-05, "loss": 0.1402, "step": 6530 }, { "epoch": 0.3482613557697428, "grad_norm": 0.0665493905544281, "learning_rate": 8.43166523369126e-05, "loss": 0.1381, "step": 6540 }, { "epoch": 0.34879386548804514, "grad_norm": 0.14150767028331757, "learning_rate": 8.42652250414276e-05, "loss": 0.136, "step": 6550 }, { "epoch": 0.3493263752063475, "grad_norm": 0.07686637341976166, "learning_rate": 8.421373800335632e-05, "loss": 0.1352, "step": 6560 }, { "epoch": 0.3498588849246499, "grad_norm": 0.06520914286375046, "learning_rate": 8.416219138236308e-05, "loss": 0.1349, "step": 6570 }, { "epoch": 0.3503913946429522, "grad_norm": 0.1157696321606636, "learning_rate": 8.411058533829688e-05, "loss": 0.1346, "step": 6580 }, { "epoch": 0.3509239043612546, "grad_norm": 0.10777822136878967, "learning_rate": 8.405892003119115e-05, "loss": 0.1354, "step": 6590 }, { "epoch": 0.35145641407955697, "grad_norm": 0.07395236939191818, "learning_rate": 8.4007195621263e-05, "loss": 0.1335, "step": 6600 }, { "epoch": 0.3519889237978593, "grad_norm": 0.0857616439461708, "learning_rate": 8.395541226891283e-05, "loss": 0.1339, "step": 6610 }, { "epoch": 0.35252143351616166, "grad_norm": 0.055322933942079544, "learning_rate": 8.390357013472386e-05, "loss": 0.1339, "step": 6620 }, { "epoch": 0.35305394323446404, "grad_norm": 0.06957754492759705, "learning_rate": 8.385166937946154e-05, "loss": 0.1337, "step": 6630 }, { "epoch": 0.35358645295276636, "grad_norm": 0.06771986186504364, "learning_rate": 8.379971016407313e-05, "loss": 0.1344, "step": 6640 }, { "epoch": 0.35411896267106874, "grad_norm": 0.08326587826013565, "learning_rate": 8.374769264968722e-05, "loss": 0.1333, "step": 6650 }, { "epoch": 0.3546514723893711, "grad_norm": 0.0682111382484436, "learning_rate": 8.369561699761317e-05, "loss": 0.1347, "step": 6660 }, { "epoch": 0.3551839821076735, "grad_norm": 0.10408024489879608, "learning_rate": 8.364348336934056e-05, "loss": 0.1335, "step": 6670 }, { "epoch": 0.3557164918259758, "grad_norm": 0.07545497268438339, "learning_rate": 8.359129192653883e-05, "loss": 0.1329, "step": 6680 }, { "epoch": 0.3562490015442782, "grad_norm": 0.06856414675712585, "learning_rate": 8.353904283105671e-05, "loss": 0.1333, "step": 6690 }, { "epoch": 0.35678151126258056, "grad_norm": 0.12046464532613754, "learning_rate": 8.34867362449217e-05, "loss": 0.1341, "step": 6700 }, { "epoch": 0.3573140209808829, "grad_norm": 0.08481092005968094, "learning_rate": 8.343437233033952e-05, "loss": 0.1331, "step": 6710 }, { "epoch": 0.35784653069918526, "grad_norm": 0.0763193815946579, "learning_rate": 8.338195124969377e-05, "loss": 0.133, "step": 6720 }, { "epoch": 0.35837904041748764, "grad_norm": 0.05080103129148483, "learning_rate": 8.332947316554527e-05, "loss": 0.1332, "step": 6730 }, { "epoch": 0.35891155013578996, "grad_norm": 0.09795154631137848, "learning_rate": 8.327693824063158e-05, "loss": 0.1339, "step": 6740 }, { "epoch": 0.35944405985409233, "grad_norm": 0.10069025307893753, "learning_rate": 8.322434663786662e-05, "loss": 0.1335, "step": 6750 }, { "epoch": 0.3599765695723947, "grad_norm": 0.08091656118631363, "learning_rate": 8.317169852034002e-05, "loss": 0.1336, "step": 6760 }, { "epoch": 0.36050907929069703, "grad_norm": 0.06075895577669144, "learning_rate": 8.31189940513166e-05, "loss": 0.1341, "step": 6770 }, { "epoch": 0.3610415890089994, "grad_norm": 0.051195014268159866, "learning_rate": 8.306623339423605e-05, "loss": 0.1338, "step": 6780 }, { "epoch": 0.3615740987273018, "grad_norm": 0.05651082843542099, "learning_rate": 8.301341671271222e-05, "loss": 0.1332, "step": 6790 }, { "epoch": 0.3621066084456041, "grad_norm": 0.05757668614387512, "learning_rate": 8.29605441705327e-05, "loss": 0.133, "step": 6800 }, { "epoch": 0.3626391181639065, "grad_norm": 0.05644191801548004, "learning_rate": 8.290761593165836e-05, "loss": 0.1333, "step": 6810 }, { "epoch": 0.36317162788220886, "grad_norm": 0.07972195744514465, "learning_rate": 8.285463216022276e-05, "loss": 0.133, "step": 6820 }, { "epoch": 0.36370413760051123, "grad_norm": 0.04617351293563843, "learning_rate": 8.280159302053163e-05, "loss": 0.1328, "step": 6830 }, { "epoch": 0.36423664731881356, "grad_norm": 0.09602131694555283, "learning_rate": 8.274849867706247e-05, "loss": 0.1331, "step": 6840 }, { "epoch": 0.36476915703711593, "grad_norm": 0.04789271950721741, "learning_rate": 8.269534929446392e-05, "loss": 0.133, "step": 6850 }, { "epoch": 0.3653016667554183, "grad_norm": 0.09267139434814453, "learning_rate": 8.26421450375553e-05, "loss": 0.1325, "step": 6860 }, { "epoch": 0.36583417647372063, "grad_norm": 0.058588556945323944, "learning_rate": 8.258888607132614e-05, "loss": 0.1336, "step": 6870 }, { "epoch": 0.366366686192023, "grad_norm": 0.052210818976163864, "learning_rate": 8.253557256093558e-05, "loss": 0.1328, "step": 6880 }, { "epoch": 0.3668991959103254, "grad_norm": 0.09430071711540222, "learning_rate": 8.248220467171195e-05, "loss": 0.1328, "step": 6890 }, { "epoch": 0.3674317056286277, "grad_norm": 0.07742954045534134, "learning_rate": 8.242878256915216e-05, "loss": 0.1328, "step": 6900 }, { "epoch": 0.3679642153469301, "grad_norm": 0.06042707711458206, "learning_rate": 8.237530641892128e-05, "loss": 0.133, "step": 6910 }, { "epoch": 0.36849672506523246, "grad_norm": 0.06480567157268524, "learning_rate": 8.232177638685194e-05, "loss": 0.1328, "step": 6920 }, { "epoch": 0.3690292347835348, "grad_norm": 0.047677043825387955, "learning_rate": 8.226819263894395e-05, "loss": 0.1331, "step": 6930 }, { "epoch": 0.36956174450183715, "grad_norm": 0.051471047103405, "learning_rate": 8.221455534136358e-05, "loss": 0.1324, "step": 6940 }, { "epoch": 0.37009425422013953, "grad_norm": 0.07004884630441666, "learning_rate": 8.216086466044323e-05, "loss": 0.1327, "step": 6950 }, { "epoch": 0.37062676393844185, "grad_norm": 0.07678276300430298, "learning_rate": 8.210712076268088e-05, "loss": 0.1327, "step": 6960 }, { "epoch": 0.3711592736567442, "grad_norm": 0.07195029407739639, "learning_rate": 8.205332381473942e-05, "loss": 0.1324, "step": 6970 }, { "epoch": 0.3716917833750466, "grad_norm": 0.061837486922740936, "learning_rate": 8.199947398344639e-05, "loss": 0.1325, "step": 6980 }, { "epoch": 0.372224293093349, "grad_norm": 0.1034204512834549, "learning_rate": 8.19455714357932e-05, "loss": 0.1326, "step": 6990 }, { "epoch": 0.3727568028116513, "grad_norm": 0.10331778973340988, "learning_rate": 8.189161633893481e-05, "loss": 0.1328, "step": 7000 }, { "epoch": 0.3732893125299537, "grad_norm": 0.06943188607692719, "learning_rate": 8.183760886018914e-05, "loss": 0.1323, "step": 7010 }, { "epoch": 0.37382182224825605, "grad_norm": 0.050394218415021896, "learning_rate": 8.178354916703654e-05, "loss": 0.1324, "step": 7020 }, { "epoch": 0.3743543319665584, "grad_norm": 0.06192854419350624, "learning_rate": 8.172943742711923e-05, "loss": 0.1323, "step": 7030 }, { "epoch": 0.37488684168486075, "grad_norm": 0.12752105295658112, "learning_rate": 8.16752738082409e-05, "loss": 0.1329, "step": 7040 }, { "epoch": 0.3754193514031631, "grad_norm": 0.07455851882696152, "learning_rate": 8.162105847836605e-05, "loss": 0.1331, "step": 7050 }, { "epoch": 0.37595186112146545, "grad_norm": 0.07023312151432037, "learning_rate": 8.156679160561963e-05, "loss": 0.1326, "step": 7060 }, { "epoch": 0.3764843708397678, "grad_norm": 0.057135872542858124, "learning_rate": 8.151247335828638e-05, "loss": 0.1334, "step": 7070 }, { "epoch": 0.3770168805580702, "grad_norm": 0.10991890728473663, "learning_rate": 8.145810390481033e-05, "loss": 0.1328, "step": 7080 }, { "epoch": 0.3775493902763725, "grad_norm": 0.06575486063957214, "learning_rate": 8.140368341379431e-05, "loss": 0.133, "step": 7090 }, { "epoch": 0.3780818999946749, "grad_norm": 0.06990350782871246, "learning_rate": 8.134921205399945e-05, "loss": 0.1321, "step": 7100 }, { "epoch": 0.3786144097129773, "grad_norm": 0.06953799724578857, "learning_rate": 8.129468999434464e-05, "loss": 0.132, "step": 7110 }, { "epoch": 0.3791469194312796, "grad_norm": 0.09842592477798462, "learning_rate": 8.124011740390591e-05, "loss": 0.1323, "step": 7120 }, { "epoch": 0.37967942914958197, "grad_norm": 0.08032160997390747, "learning_rate": 8.118549445191613e-05, "loss": 0.1324, "step": 7130 }, { "epoch": 0.38021193886788435, "grad_norm": 0.07145192474126816, "learning_rate": 8.113082130776417e-05, "loss": 0.1315, "step": 7140 }, { "epoch": 0.3807444485861867, "grad_norm": 0.05545572564005852, "learning_rate": 8.107609814099466e-05, "loss": 0.1327, "step": 7150 }, { "epoch": 0.38127695830448904, "grad_norm": 0.06006612256169319, "learning_rate": 8.102132512130738e-05, "loss": 0.1316, "step": 7160 }, { "epoch": 0.3818094680227914, "grad_norm": 0.08068816363811493, "learning_rate": 8.096650241855661e-05, "loss": 0.1319, "step": 7170 }, { "epoch": 0.3823419777410938, "grad_norm": 0.08527512848377228, "learning_rate": 8.091163020275077e-05, "loss": 0.1324, "step": 7180 }, { "epoch": 0.3828744874593961, "grad_norm": 0.06154448911547661, "learning_rate": 8.085670864405179e-05, "loss": 0.1327, "step": 7190 }, { "epoch": 0.3834069971776985, "grad_norm": 0.05169384926557541, "learning_rate": 8.080173791277463e-05, "loss": 0.132, "step": 7200 }, { "epoch": 0.38393950689600087, "grad_norm": 0.11166296899318695, "learning_rate": 8.074671817938674e-05, "loss": 0.1318, "step": 7210 }, { "epoch": 0.3844720166143032, "grad_norm": 0.05975338816642761, "learning_rate": 8.069164961450751e-05, "loss": 0.1313, "step": 7220 }, { "epoch": 0.38500452633260557, "grad_norm": 0.07280656695365906, "learning_rate": 8.063653238890779e-05, "loss": 0.1324, "step": 7230 }, { "epoch": 0.38553703605090794, "grad_norm": 0.050891146063804626, "learning_rate": 8.058136667350928e-05, "loss": 0.132, "step": 7240 }, { "epoch": 0.38606954576921027, "grad_norm": 0.10308956354856491, "learning_rate": 8.05261526393841e-05, "loss": 0.1323, "step": 7250 }, { "epoch": 0.38660205548751264, "grad_norm": 0.08276902139186859, "learning_rate": 8.04708904577542e-05, "loss": 0.1324, "step": 7260 }, { "epoch": 0.387134565205815, "grad_norm": 0.06150532513856888, "learning_rate": 8.041558029999081e-05, "loss": 0.1324, "step": 7270 }, { "epoch": 0.38766707492411734, "grad_norm": 0.08963697403669357, "learning_rate": 8.036022233761396e-05, "loss": 0.1332, "step": 7280 }, { "epoch": 0.3881995846424197, "grad_norm": 0.08556204289197922, "learning_rate": 8.030481674229192e-05, "loss": 0.1319, "step": 7290 }, { "epoch": 0.3887320943607221, "grad_norm": 0.0741380900144577, "learning_rate": 8.024936368584066e-05, "loss": 0.132, "step": 7300 }, { "epoch": 0.38926460407902447, "grad_norm": 0.060994237661361694, "learning_rate": 8.019386334022336e-05, "loss": 0.1328, "step": 7310 }, { "epoch": 0.3897971137973268, "grad_norm": 0.053207580000162125, "learning_rate": 8.013831587754984e-05, "loss": 0.1321, "step": 7320 }, { "epoch": 0.39032962351562916, "grad_norm": 0.08496523648500443, "learning_rate": 8.008272147007597e-05, "loss": 0.1317, "step": 7330 }, { "epoch": 0.39086213323393154, "grad_norm": 0.06788633018732071, "learning_rate": 8.002708029020329e-05, "loss": 0.1323, "step": 7340 }, { "epoch": 0.39139464295223386, "grad_norm": 0.05240168422460556, "learning_rate": 7.997139251047835e-05, "loss": 0.1323, "step": 7350 }, { "epoch": 0.39192715267053624, "grad_norm": 0.08682172000408173, "learning_rate": 7.991565830359218e-05, "loss": 0.1321, "step": 7360 }, { "epoch": 0.3924596623888386, "grad_norm": 0.05870863422751427, "learning_rate": 7.985987784237981e-05, "loss": 0.1317, "step": 7370 }, { "epoch": 0.39299217210714094, "grad_norm": 0.053884461522102356, "learning_rate": 7.980405129981971e-05, "loss": 0.1322, "step": 7380 }, { "epoch": 0.3935246818254433, "grad_norm": 0.051192574203014374, "learning_rate": 7.974817884903325e-05, "loss": 0.132, "step": 7390 }, { "epoch": 0.3940571915437457, "grad_norm": 0.07789867371320724, "learning_rate": 7.969226066328415e-05, "loss": 0.1322, "step": 7400 }, { "epoch": 0.394589701262048, "grad_norm": 0.12169856578111649, "learning_rate": 7.963629691597794e-05, "loss": 0.1331, "step": 7410 }, { "epoch": 0.3951222109803504, "grad_norm": 0.05751097947359085, "learning_rate": 7.95802877806615e-05, "loss": 0.1317, "step": 7420 }, { "epoch": 0.39565472069865276, "grad_norm": 0.0670279935002327, "learning_rate": 7.952423343102242e-05, "loss": 0.1321, "step": 7430 }, { "epoch": 0.3961872304169551, "grad_norm": 0.12209637463092804, "learning_rate": 7.946813404088849e-05, "loss": 0.1318, "step": 7440 }, { "epoch": 0.39671974013525746, "grad_norm": 0.06626468896865845, "learning_rate": 7.94119897842272e-05, "loss": 0.1318, "step": 7450 }, { "epoch": 0.39725224985355984, "grad_norm": 0.04306609556078911, "learning_rate": 7.935580083514516e-05, "loss": 0.1318, "step": 7460 }, { "epoch": 0.3977847595718622, "grad_norm": 0.07492338865995407, "learning_rate": 7.929956736788759e-05, "loss": 0.1318, "step": 7470 }, { "epoch": 0.39831726929016453, "grad_norm": 0.051630035042762756, "learning_rate": 7.924328955683774e-05, "loss": 0.1314, "step": 7480 }, { "epoch": 0.3988497790084669, "grad_norm": 0.06161106750369072, "learning_rate": 7.918696757651637e-05, "loss": 0.1319, "step": 7490 }, { "epoch": 0.3993822887267693, "grad_norm": 0.048934947699308395, "learning_rate": 7.913060160158125e-05, "loss": 0.1318, "step": 7500 }, { "epoch": 0.3999147984450716, "grad_norm": 0.08472836762666702, "learning_rate": 7.907419180682656e-05, "loss": 0.1324, "step": 7510 }, { "epoch": 0.400447308163374, "grad_norm": 0.07017608731985092, "learning_rate": 7.901773836718234e-05, "loss": 0.1315, "step": 7520 }, { "epoch": 0.40097981788167636, "grad_norm": 0.09098348021507263, "learning_rate": 7.8961241457714e-05, "loss": 0.1316, "step": 7530 }, { "epoch": 0.4015123275999787, "grad_norm": 0.07034831494092941, "learning_rate": 7.890470125362174e-05, "loss": 0.132, "step": 7540 }, { "epoch": 0.40204483731828106, "grad_norm": 0.08528514206409454, "learning_rate": 7.884811793024009e-05, "loss": 0.1317, "step": 7550 }, { "epoch": 0.40257734703658343, "grad_norm": 0.10862760245800018, "learning_rate": 7.879149166303719e-05, "loss": 0.1315, "step": 7560 }, { "epoch": 0.40310985675488575, "grad_norm": 0.04836263135075569, "learning_rate": 7.873482262761438e-05, "loss": 0.1317, "step": 7570 }, { "epoch": 0.40364236647318813, "grad_norm": 0.051307760179042816, "learning_rate": 7.867811099970568e-05, "loss": 0.1328, "step": 7580 }, { "epoch": 0.4041748761914905, "grad_norm": 0.05256601795554161, "learning_rate": 7.862135695517712e-05, "loss": 0.1321, "step": 7590 }, { "epoch": 0.4047073859097928, "grad_norm": 0.05649365857243538, "learning_rate": 7.856456067002633e-05, "loss": 0.1314, "step": 7600 }, { "epoch": 0.4052398956280952, "grad_norm": 0.04195050150156021, "learning_rate": 7.85077223203819e-05, "loss": 0.1327, "step": 7610 }, { "epoch": 0.4057724053463976, "grad_norm": 0.07042062282562256, "learning_rate": 7.845084208250286e-05, "loss": 0.1319, "step": 7620 }, { "epoch": 0.40630491506469996, "grad_norm": 0.048713624477386475, "learning_rate": 7.839392013277814e-05, "loss": 0.1315, "step": 7630 }, { "epoch": 0.4068374247830023, "grad_norm": 0.05016913264989853, "learning_rate": 7.833695664772605e-05, "loss": 0.132, "step": 7640 }, { "epoch": 0.40736993450130465, "grad_norm": 0.04809438809752464, "learning_rate": 7.827995180399364e-05, "loss": 0.1315, "step": 7650 }, { "epoch": 0.40790244421960703, "grad_norm": 0.0424528494477272, "learning_rate": 7.822290577835627e-05, "loss": 0.1312, "step": 7660 }, { "epoch": 0.40843495393790935, "grad_norm": 0.049090851098299026, "learning_rate": 7.8165818747717e-05, "loss": 0.1318, "step": 7670 }, { "epoch": 0.4089674636562117, "grad_norm": 0.09739360958337784, "learning_rate": 7.810869088910604e-05, "loss": 0.1314, "step": 7680 }, { "epoch": 0.4094999733745141, "grad_norm": 0.06400451064109802, "learning_rate": 7.805152237968019e-05, "loss": 0.1319, "step": 7690 }, { "epoch": 0.4100324830928164, "grad_norm": 0.09439321607351303, "learning_rate": 7.799431339672238e-05, "loss": 0.1315, "step": 7700 }, { "epoch": 0.4105649928111188, "grad_norm": 0.061424221843481064, "learning_rate": 7.793706411764095e-05, "loss": 0.132, "step": 7710 }, { "epoch": 0.4110975025294212, "grad_norm": 0.06444218754768372, "learning_rate": 7.787977471996928e-05, "loss": 0.1313, "step": 7720 }, { "epoch": 0.4116300122477235, "grad_norm": 0.052814047783613205, "learning_rate": 7.782244538136513e-05, "loss": 0.1316, "step": 7730 }, { "epoch": 0.4121625219660259, "grad_norm": 0.06464862823486328, "learning_rate": 7.776507627961012e-05, "loss": 0.1313, "step": 7740 }, { "epoch": 0.41269503168432825, "grad_norm": 0.05052724853157997, "learning_rate": 7.770766759260918e-05, "loss": 0.1317, "step": 7750 }, { "epoch": 0.41322754140263057, "grad_norm": 0.10346025973558426, "learning_rate": 7.765021949839e-05, "loss": 0.1319, "step": 7760 }, { "epoch": 0.41376005112093295, "grad_norm": 0.07890909165143967, "learning_rate": 7.759273217510246e-05, "loss": 0.1316, "step": 7770 }, { "epoch": 0.4142925608392353, "grad_norm": 0.04561850428581238, "learning_rate": 7.75352058010181e-05, "loss": 0.1317, "step": 7780 }, { "epoch": 0.4148250705575377, "grad_norm": 0.09326593577861786, "learning_rate": 7.747764055452957e-05, "loss": 0.1309, "step": 7790 }, { "epoch": 0.41535758027584, "grad_norm": 0.06307931989431381, "learning_rate": 7.742003661415007e-05, "loss": 0.1307, "step": 7800 }, { "epoch": 0.4158900899941424, "grad_norm": 0.07909877598285675, "learning_rate": 7.736239415851274e-05, "loss": 0.1312, "step": 7810 }, { "epoch": 0.4164225997124448, "grad_norm": 0.05338076129555702, "learning_rate": 7.730471336637024e-05, "loss": 0.1309, "step": 7820 }, { "epoch": 0.4169551094307471, "grad_norm": 0.08736453205347061, "learning_rate": 7.724699441659404e-05, "loss": 0.1321, "step": 7830 }, { "epoch": 0.41748761914904947, "grad_norm": 0.062187109142541885, "learning_rate": 7.718923748817397e-05, "loss": 0.132, "step": 7840 }, { "epoch": 0.41802012886735185, "grad_norm": 0.0855235755443573, "learning_rate": 7.713144276021768e-05, "loss": 0.1306, "step": 7850 }, { "epoch": 0.41855263858565417, "grad_norm": 0.04441085830330849, "learning_rate": 7.707361041194992e-05, "loss": 0.1313, "step": 7860 }, { "epoch": 0.41908514830395655, "grad_norm": 0.06373197585344315, "learning_rate": 7.70157406227122e-05, "loss": 0.1316, "step": 7870 }, { "epoch": 0.4196176580222589, "grad_norm": 0.05832177773118019, "learning_rate": 7.695783357196214e-05, "loss": 0.1312, "step": 7880 }, { "epoch": 0.42015016774056124, "grad_norm": 0.0553959384560585, "learning_rate": 7.689988943927285e-05, "loss": 0.1317, "step": 7890 }, { "epoch": 0.4206826774588636, "grad_norm": 0.07334991544485092, "learning_rate": 7.684190840433247e-05, "loss": 0.1312, "step": 7900 }, { "epoch": 0.421215187177166, "grad_norm": 0.08733541518449783, "learning_rate": 7.67838906469436e-05, "loss": 0.1317, "step": 7910 }, { "epoch": 0.4217476968954683, "grad_norm": 0.07919137924909592, "learning_rate": 7.672583634702262e-05, "loss": 0.131, "step": 7920 }, { "epoch": 0.4222802066137707, "grad_norm": 0.08723526448011398, "learning_rate": 7.666774568459938e-05, "loss": 0.1318, "step": 7930 }, { "epoch": 0.42281271633207307, "grad_norm": 0.053012095391750336, "learning_rate": 7.660961883981636e-05, "loss": 0.1317, "step": 7940 }, { "epoch": 0.42334522605037544, "grad_norm": 0.09206791967153549, "learning_rate": 7.65514559929283e-05, "loss": 0.1319, "step": 7950 }, { "epoch": 0.42387773576867777, "grad_norm": 0.06498973816633224, "learning_rate": 7.649325732430161e-05, "loss": 0.1311, "step": 7960 }, { "epoch": 0.42441024548698014, "grad_norm": 0.07660607993602753, "learning_rate": 7.643502301441373e-05, "loss": 0.1314, "step": 7970 }, { "epoch": 0.4249427552052825, "grad_norm": 0.08989237993955612, "learning_rate": 7.637675324385266e-05, "loss": 0.1304, "step": 7980 }, { "epoch": 0.42547526492358484, "grad_norm": 0.08027999103069305, "learning_rate": 7.631844819331633e-05, "loss": 0.1311, "step": 7990 }, { "epoch": 0.4260077746418872, "grad_norm": 0.05923927202820778, "learning_rate": 7.626010804361216e-05, "loss": 0.1303, "step": 8000 }, { "epoch": 0.4265402843601896, "grad_norm": 0.05849640443921089, "learning_rate": 7.62017329756563e-05, "loss": 0.1307, "step": 8010 }, { "epoch": 0.4270727940784919, "grad_norm": 0.05768370255827904, "learning_rate": 7.614332317047326e-05, "loss": 0.1315, "step": 8020 }, { "epoch": 0.4276053037967943, "grad_norm": 0.05652983486652374, "learning_rate": 7.608487880919525e-05, "loss": 0.1311, "step": 8030 }, { "epoch": 0.42813781351509667, "grad_norm": 0.0556759238243103, "learning_rate": 7.602640007306165e-05, "loss": 0.1316, "step": 8040 }, { "epoch": 0.428670323233399, "grad_norm": 0.04655342176556587, "learning_rate": 7.596788714341843e-05, "loss": 0.1313, "step": 8050 }, { "epoch": 0.42920283295170136, "grad_norm": 0.048768457025289536, "learning_rate": 7.590934020171758e-05, "loss": 0.1308, "step": 8060 }, { "epoch": 0.42973534267000374, "grad_norm": 0.05214981734752655, "learning_rate": 7.58507594295166e-05, "loss": 0.131, "step": 8070 }, { "epoch": 0.43026785238830606, "grad_norm": 0.060043588280677795, "learning_rate": 7.579214500847789e-05, "loss": 0.1315, "step": 8080 }, { "epoch": 0.43080036210660844, "grad_norm": 0.04958285391330719, "learning_rate": 7.573349712036815e-05, "loss": 0.1314, "step": 8090 }, { "epoch": 0.4313328718249108, "grad_norm": 0.05665591359138489, "learning_rate": 7.567481594705795e-05, "loss": 0.1314, "step": 8100 }, { "epoch": 0.4318653815432132, "grad_norm": 0.056042492389678955, "learning_rate": 7.561610167052095e-05, "loss": 0.131, "step": 8110 }, { "epoch": 0.4323978912615155, "grad_norm": 0.05700002983212471, "learning_rate": 7.555735447283364e-05, "loss": 0.1313, "step": 8120 }, { "epoch": 0.4329304009798179, "grad_norm": 0.05349269136786461, "learning_rate": 7.549857453617446e-05, "loss": 0.1313, "step": 8130 }, { "epoch": 0.43346291069812026, "grad_norm": 0.05427918955683708, "learning_rate": 7.543976204282342e-05, "loss": 0.131, "step": 8140 }, { "epoch": 0.4339954204164226, "grad_norm": 0.12502682209014893, "learning_rate": 7.538091717516149e-05, "loss": 0.1309, "step": 8150 }, { "epoch": 0.43452793013472496, "grad_norm": 0.06011335179209709, "learning_rate": 7.532204011567006e-05, "loss": 0.1315, "step": 8160 }, { "epoch": 0.43506043985302734, "grad_norm": 0.07122571021318436, "learning_rate": 7.526313104693031e-05, "loss": 0.1314, "step": 8170 }, { "epoch": 0.43559294957132966, "grad_norm": 0.04538768157362938, "learning_rate": 7.520419015162267e-05, "loss": 0.1315, "step": 8180 }, { "epoch": 0.43612545928963203, "grad_norm": 0.04720662534236908, "learning_rate": 7.514521761252635e-05, "loss": 0.131, "step": 8190 }, { "epoch": 0.4366579690079344, "grad_norm": 0.07761963456869125, "learning_rate": 7.508621361251858e-05, "loss": 0.1316, "step": 8200 }, { "epoch": 0.43719047872623673, "grad_norm": 0.08107470721006393, "learning_rate": 7.502717833457424e-05, "loss": 0.1308, "step": 8210 }, { "epoch": 0.4377229884445391, "grad_norm": 0.08958134800195694, "learning_rate": 7.496811196176513e-05, "loss": 0.1314, "step": 8220 }, { "epoch": 0.4382554981628415, "grad_norm": 0.04781255125999451, "learning_rate": 7.490901467725957e-05, "loss": 0.1306, "step": 8230 }, { "epoch": 0.4387880078811438, "grad_norm": 0.06295894831418991, "learning_rate": 7.484988666432165e-05, "loss": 0.1311, "step": 8240 }, { "epoch": 0.4393205175994462, "grad_norm": 0.06639114022254944, "learning_rate": 7.479072810631078e-05, "loss": 0.1311, "step": 8250 }, { "epoch": 0.43985302731774856, "grad_norm": 0.06550955027341843, "learning_rate": 7.473153918668112e-05, "loss": 0.1307, "step": 8260 }, { "epoch": 0.44038553703605093, "grad_norm": 0.06374099105596542, "learning_rate": 7.467232008898098e-05, "loss": 0.131, "step": 8270 }, { "epoch": 0.44091804675435325, "grad_norm": 0.055466748774051666, "learning_rate": 7.461307099685218e-05, "loss": 0.1306, "step": 8280 }, { "epoch": 0.44145055647265563, "grad_norm": 0.06467512249946594, "learning_rate": 7.455379209402964e-05, "loss": 0.1312, "step": 8290 }, { "epoch": 0.441983066190958, "grad_norm": 0.10842160880565643, "learning_rate": 7.44944835643407e-05, "loss": 0.131, "step": 8300 }, { "epoch": 0.44251557590926033, "grad_norm": 0.10142064094543457, "learning_rate": 7.443514559170456e-05, "loss": 0.1303, "step": 8310 }, { "epoch": 0.4430480856275627, "grad_norm": 0.07040092349052429, "learning_rate": 7.437577836013174e-05, "loss": 0.1314, "step": 8320 }, { "epoch": 0.4435805953458651, "grad_norm": 0.06632167845964432, "learning_rate": 7.431638205372348e-05, "loss": 0.1305, "step": 8330 }, { "epoch": 0.4441131050641674, "grad_norm": 0.0742000862956047, "learning_rate": 7.425695685667118e-05, "loss": 0.1313, "step": 8340 }, { "epoch": 0.4446456147824698, "grad_norm": 0.05238117650151253, "learning_rate": 7.419750295325587e-05, "loss": 0.131, "step": 8350 }, { "epoch": 0.44517812450077215, "grad_norm": 0.06212290748953819, "learning_rate": 7.413802052784756e-05, "loss": 0.131, "step": 8360 }, { "epoch": 0.4457106342190745, "grad_norm": 0.07771137356758118, "learning_rate": 7.407850976490469e-05, "loss": 0.1309, "step": 8370 }, { "epoch": 0.44624314393737685, "grad_norm": 0.0551883801817894, "learning_rate": 7.401897084897365e-05, "loss": 0.1301, "step": 8380 }, { "epoch": 0.44677565365567923, "grad_norm": 0.06460625678300858, "learning_rate": 7.395940396468808e-05, "loss": 0.1307, "step": 8390 }, { "epoch": 0.44730816337398155, "grad_norm": 0.08054537326097488, "learning_rate": 7.389980929676835e-05, "loss": 0.1305, "step": 8400 }, { "epoch": 0.4478406730922839, "grad_norm": 0.08456294983625412, "learning_rate": 7.384018703002098e-05, "loss": 0.1309, "step": 8410 }, { "epoch": 0.4483731828105863, "grad_norm": 0.06319648027420044, "learning_rate": 7.378053734933814e-05, "loss": 0.1304, "step": 8420 }, { "epoch": 0.4489056925288887, "grad_norm": 0.05323270335793495, "learning_rate": 7.372086043969694e-05, "loss": 0.1316, "step": 8430 }, { "epoch": 0.449438202247191, "grad_norm": 0.05555250123143196, "learning_rate": 7.366115648615898e-05, "loss": 0.1313, "step": 8440 }, { "epoch": 0.4499707119654934, "grad_norm": 0.08050252497196198, "learning_rate": 7.360142567386968e-05, "loss": 0.1303, "step": 8450 }, { "epoch": 0.45050322168379575, "grad_norm": 0.0804496631026268, "learning_rate": 7.354166818805776e-05, "loss": 0.1305, "step": 8460 }, { "epoch": 0.4510357314020981, "grad_norm": 0.09748142957687378, "learning_rate": 7.34818842140347e-05, "loss": 0.1308, "step": 8470 }, { "epoch": 0.45156824112040045, "grad_norm": 0.0667809545993805, "learning_rate": 7.34220739371941e-05, "loss": 0.1312, "step": 8480 }, { "epoch": 0.4521007508387028, "grad_norm": 0.08125482499599457, "learning_rate": 7.336223754301105e-05, "loss": 0.1311, "step": 8490 }, { "epoch": 0.45263326055700515, "grad_norm": 0.057649750262498856, "learning_rate": 7.330237521704177e-05, "loss": 0.1307, "step": 8500 }, { "epoch": 0.4531657702753075, "grad_norm": 0.06427519768476486, "learning_rate": 7.324248714492279e-05, "loss": 0.131, "step": 8510 }, { "epoch": 0.4536982799936099, "grad_norm": 0.05290444567799568, "learning_rate": 7.31825735123705e-05, "loss": 0.1304, "step": 8520 }, { "epoch": 0.4542307897119122, "grad_norm": 0.056924887001514435, "learning_rate": 7.312263450518061e-05, "loss": 0.1296, "step": 8530 }, { "epoch": 0.4547632994302146, "grad_norm": 0.06351561844348907, "learning_rate": 7.306267030922745e-05, "loss": 0.1306, "step": 8540 }, { "epoch": 0.455295809148517, "grad_norm": 0.08165629208087921, "learning_rate": 7.300268111046348e-05, "loss": 0.1307, "step": 8550 }, { "epoch": 0.4558283188668193, "grad_norm": 0.059766896069049835, "learning_rate": 7.294266709491873e-05, "loss": 0.1305, "step": 8560 }, { "epoch": 0.45636082858512167, "grad_norm": 0.05260982736945152, "learning_rate": 7.288262844870013e-05, "loss": 0.1301, "step": 8570 }, { "epoch": 0.45689333830342405, "grad_norm": 0.06455428898334503, "learning_rate": 7.282256535799106e-05, "loss": 0.1304, "step": 8580 }, { "epoch": 0.4574258480217264, "grad_norm": 0.05693411827087402, "learning_rate": 7.276247800905063e-05, "loss": 0.1304, "step": 8590 }, { "epoch": 0.45795835774002874, "grad_norm": 0.05784597992897034, "learning_rate": 7.270236658821322e-05, "loss": 0.1308, "step": 8600 }, { "epoch": 0.4584908674583311, "grad_norm": 0.10032429546117783, "learning_rate": 7.264223128188789e-05, "loss": 0.1308, "step": 8610 }, { "epoch": 0.4590233771766335, "grad_norm": 0.0799618735909462, "learning_rate": 7.258207227655768e-05, "loss": 0.1314, "step": 8620 }, { "epoch": 0.4595558868949358, "grad_norm": 0.08555562049150467, "learning_rate": 7.25218897587792e-05, "loss": 0.1301, "step": 8630 }, { "epoch": 0.4600883966132382, "grad_norm": 0.06158687174320221, "learning_rate": 7.246168391518196e-05, "loss": 0.1302, "step": 8640 }, { "epoch": 0.46062090633154057, "grad_norm": 0.06019744649529457, "learning_rate": 7.240145493246776e-05, "loss": 0.1304, "step": 8650 }, { "epoch": 0.4611534160498429, "grad_norm": 0.06112377345561981, "learning_rate": 7.234120299741021e-05, "loss": 0.1301, "step": 8660 }, { "epoch": 0.46168592576814527, "grad_norm": 0.04358561709523201, "learning_rate": 7.228092829685406e-05, "loss": 0.1299, "step": 8670 }, { "epoch": 0.46221843548644764, "grad_norm": 0.04648636281490326, "learning_rate": 7.22206310177147e-05, "loss": 0.1296, "step": 8680 }, { "epoch": 0.46275094520474996, "grad_norm": 0.040558718144893646, "learning_rate": 7.216031134697747e-05, "loss": 0.1307, "step": 8690 }, { "epoch": 0.46328345492305234, "grad_norm": 0.04816916212439537, "learning_rate": 7.209996947169719e-05, "loss": 0.1307, "step": 8700 }, { "epoch": 0.4638159646413547, "grad_norm": 0.08434905111789703, "learning_rate": 7.203960557899758e-05, "loss": 0.1304, "step": 8710 }, { "epoch": 0.46434847435965704, "grad_norm": 0.04249223694205284, "learning_rate": 7.197921985607055e-05, "loss": 0.1307, "step": 8720 }, { "epoch": 0.4648809840779594, "grad_norm": 0.04334559664130211, "learning_rate": 7.191881249017574e-05, "loss": 0.1301, "step": 8730 }, { "epoch": 0.4654134937962618, "grad_norm": 0.06121005490422249, "learning_rate": 7.185838366863995e-05, "loss": 0.1307, "step": 8740 }, { "epoch": 0.46594600351456417, "grad_norm": 0.05099225789308548, "learning_rate": 7.179793357885645e-05, "loss": 0.1305, "step": 8750 }, { "epoch": 0.4664785132328665, "grad_norm": 0.04724998399615288, "learning_rate": 7.173746240828451e-05, "loss": 0.1295, "step": 8760 }, { "epoch": 0.46701102295116886, "grad_norm": 0.04000856354832649, "learning_rate": 7.167697034444874e-05, "loss": 0.13, "step": 8770 }, { "epoch": 0.46754353266947124, "grad_norm": 0.09295206516981125, "learning_rate": 7.161645757493858e-05, "loss": 0.1301, "step": 8780 }, { "epoch": 0.46807604238777356, "grad_norm": 0.05277612432837486, "learning_rate": 7.155592428740765e-05, "loss": 0.1303, "step": 8790 }, { "epoch": 0.46860855210607594, "grad_norm": 0.05306980386376381, "learning_rate": 7.14953706695732e-05, "loss": 0.1297, "step": 8800 }, { "epoch": 0.4691410618243783, "grad_norm": 0.06097976118326187, "learning_rate": 7.14347969092155e-05, "loss": 0.1308, "step": 8810 }, { "epoch": 0.46967357154268063, "grad_norm": 0.059332527220249176, "learning_rate": 7.137420319417738e-05, "loss": 0.1296, "step": 8820 }, { "epoch": 0.470206081260983, "grad_norm": 0.09293901175260544, "learning_rate": 7.131358971236344e-05, "loss": 0.1296, "step": 8830 }, { "epoch": 0.4707385909792854, "grad_norm": 0.046720948070287704, "learning_rate": 7.125295665173964e-05, "loss": 0.1304, "step": 8840 }, { "epoch": 0.4712711006975877, "grad_norm": 0.06865198165178299, "learning_rate": 7.119230420033259e-05, "loss": 0.1306, "step": 8850 }, { "epoch": 0.4718036104158901, "grad_norm": 0.11196744441986084, "learning_rate": 7.113163254622915e-05, "loss": 0.1301, "step": 8860 }, { "epoch": 0.47233612013419246, "grad_norm": 0.056259218603372574, "learning_rate": 7.107094187757559e-05, "loss": 0.1298, "step": 8870 }, { "epoch": 0.4728686298524948, "grad_norm": 0.06268846988677979, "learning_rate": 7.101023238257725e-05, "loss": 0.1303, "step": 8880 }, { "epoch": 0.47340113957079716, "grad_norm": 0.05749877542257309, "learning_rate": 7.094950424949784e-05, "loss": 0.1305, "step": 8890 }, { "epoch": 0.47393364928909953, "grad_norm": 0.05980097874999046, "learning_rate": 7.088875766665879e-05, "loss": 0.1299, "step": 8900 }, { "epoch": 0.4744661590074019, "grad_norm": 0.048347923904657364, "learning_rate": 7.082799282243881e-05, "loss": 0.1302, "step": 8910 }, { "epoch": 0.47499866872570423, "grad_norm": 0.0524616502225399, "learning_rate": 7.076720990527324e-05, "loss": 0.1301, "step": 8920 }, { "epoch": 0.4755311784440066, "grad_norm": 0.06477531045675278, "learning_rate": 7.070640910365344e-05, "loss": 0.1306, "step": 8930 }, { "epoch": 0.476063688162309, "grad_norm": 0.05950429290533066, "learning_rate": 7.064559060612625e-05, "loss": 0.13, "step": 8940 }, { "epoch": 0.4765961978806113, "grad_norm": 0.0458899661898613, "learning_rate": 7.058475460129337e-05, "loss": 0.1299, "step": 8950 }, { "epoch": 0.4771287075989137, "grad_norm": 0.04977622628211975, "learning_rate": 7.05239012778108e-05, "loss": 0.1297, "step": 8960 }, { "epoch": 0.47766121731721606, "grad_norm": 0.052012983709573746, "learning_rate": 7.046303082438823e-05, "loss": 0.1304, "step": 8970 }, { "epoch": 0.4781937270355184, "grad_norm": 0.09166349470615387, "learning_rate": 7.040214342978851e-05, "loss": 0.1303, "step": 8980 }, { "epoch": 0.47872623675382076, "grad_norm": 0.057922665029764175, "learning_rate": 7.034123928282699e-05, "loss": 0.1292, "step": 8990 }, { "epoch": 0.47925874647212313, "grad_norm": 0.05284808203577995, "learning_rate": 7.028031857237098e-05, "loss": 0.1299, "step": 9000 }, { "epoch": 0.47979125619042545, "grad_norm": 0.05781892314553261, "learning_rate": 7.021938148733918e-05, "loss": 0.1304, "step": 9010 }, { "epoch": 0.48032376590872783, "grad_norm": 0.04390615597367287, "learning_rate": 7.0158428216701e-05, "loss": 0.1295, "step": 9020 }, { "epoch": 0.4808562756270302, "grad_norm": 0.06015874817967415, "learning_rate": 7.009745894947612e-05, "loss": 0.1299, "step": 9030 }, { "epoch": 0.4813887853453325, "grad_norm": 0.17922475934028625, "learning_rate": 7.003647387473378e-05, "loss": 0.1299, "step": 9040 }, { "epoch": 0.4819212950636349, "grad_norm": 0.07132676243782043, "learning_rate": 6.997547318159225e-05, "loss": 0.1304, "step": 9050 }, { "epoch": 0.4824538047819373, "grad_norm": 0.06266484409570694, "learning_rate": 6.991445705921825e-05, "loss": 0.1302, "step": 9060 }, { "epoch": 0.48298631450023966, "grad_norm": 0.04912625625729561, "learning_rate": 6.985342569682632e-05, "loss": 0.1299, "step": 9070 }, { "epoch": 0.483518824218542, "grad_norm": 0.05088292434811592, "learning_rate": 6.979237928367827e-05, "loss": 0.1298, "step": 9080 }, { "epoch": 0.48405133393684435, "grad_norm": 0.09333918988704681, "learning_rate": 6.973131800908262e-05, "loss": 0.13, "step": 9090 }, { "epoch": 0.48458384365514673, "grad_norm": 0.05258602276444435, "learning_rate": 6.967024206239392e-05, "loss": 0.1292, "step": 9100 }, { "epoch": 0.48511635337344905, "grad_norm": 0.05117359384894371, "learning_rate": 6.960915163301222e-05, "loss": 0.1298, "step": 9110 }, { "epoch": 0.4856488630917514, "grad_norm": 0.0650695338845253, "learning_rate": 6.954804691038255e-05, "loss": 0.1302, "step": 9120 }, { "epoch": 0.4861813728100538, "grad_norm": 0.07531211525201797, "learning_rate": 6.948692808399417e-05, "loss": 0.129, "step": 9130 }, { "epoch": 0.4867138825283561, "grad_norm": 0.0522490069270134, "learning_rate": 6.942579534338018e-05, "loss": 0.1302, "step": 9140 }, { "epoch": 0.4872463922466585, "grad_norm": 0.0909682959318161, "learning_rate": 6.93646488781167e-05, "loss": 0.1288, "step": 9150 }, { "epoch": 0.4877789019649609, "grad_norm": 0.0672360509634018, "learning_rate": 6.930348887782257e-05, "loss": 0.1298, "step": 9160 }, { "epoch": 0.4883114116832632, "grad_norm": 0.050222091376781464, "learning_rate": 6.924231553215845e-05, "loss": 0.1291, "step": 9170 }, { "epoch": 0.4888439214015656, "grad_norm": 0.0731450617313385, "learning_rate": 6.918112903082648e-05, "loss": 0.1295, "step": 9180 }, { "epoch": 0.48937643111986795, "grad_norm": 0.044536300003528595, "learning_rate": 6.911992956356958e-05, "loss": 0.1296, "step": 9190 }, { "epoch": 0.48990894083817027, "grad_norm": 0.10119880735874176, "learning_rate": 6.905871732017083e-05, "loss": 0.1297, "step": 9200 }, { "epoch": 0.49044145055647265, "grad_norm": 0.04427400976419449, "learning_rate": 6.8997492490453e-05, "loss": 0.1296, "step": 9210 }, { "epoch": 0.490973960274775, "grad_norm": 0.05631903558969498, "learning_rate": 6.893625526427785e-05, "loss": 0.1294, "step": 9220 }, { "epoch": 0.4915064699930774, "grad_norm": 0.05250485986471176, "learning_rate": 6.88750058315456e-05, "loss": 0.1288, "step": 9230 }, { "epoch": 0.4920389797113797, "grad_norm": 0.04813829064369202, "learning_rate": 6.881374438219426e-05, "loss": 0.1299, "step": 9240 }, { "epoch": 0.4925714894296821, "grad_norm": 0.10428118705749512, "learning_rate": 6.875247110619923e-05, "loss": 0.1293, "step": 9250 }, { "epoch": 0.4931039991479845, "grad_norm": 0.05188250541687012, "learning_rate": 6.869118619357244e-05, "loss": 0.1298, "step": 9260 }, { "epoch": 0.4936365088662868, "grad_norm": 0.06389789283275604, "learning_rate": 6.862988983436205e-05, "loss": 0.1297, "step": 9270 }, { "epoch": 0.49416901858458917, "grad_norm": 0.05871303752064705, "learning_rate": 6.856858221865158e-05, "loss": 0.1296, "step": 9280 }, { "epoch": 0.49470152830289155, "grad_norm": 0.09698927402496338, "learning_rate": 6.850726353655956e-05, "loss": 0.13, "step": 9290 }, { "epoch": 0.49523403802119387, "grad_norm": 0.0667075663805008, "learning_rate": 6.844593397823881e-05, "loss": 0.1294, "step": 9300 }, { "epoch": 0.49576654773949624, "grad_norm": 0.05773301422595978, "learning_rate": 6.838459373387583e-05, "loss": 0.1294, "step": 9310 }, { "epoch": 0.4962990574577986, "grad_norm": 0.06608272343873978, "learning_rate": 6.83232429936903e-05, "loss": 0.1293, "step": 9320 }, { "epoch": 0.49683156717610094, "grad_norm": 0.057207848876714706, "learning_rate": 6.826188194793447e-05, "loss": 0.1291, "step": 9330 }, { "epoch": 0.4973640768944033, "grad_norm": 0.03619164600968361, "learning_rate": 6.82005107868925e-05, "loss": 0.1294, "step": 9340 }, { "epoch": 0.4978965866127057, "grad_norm": 0.046284269541502, "learning_rate": 6.813912970087994e-05, "loss": 0.1294, "step": 9350 }, { "epoch": 0.498429096331008, "grad_norm": 0.0476924329996109, "learning_rate": 6.807773888024314e-05, "loss": 0.1288, "step": 9360 }, { "epoch": 0.4989616060493104, "grad_norm": 0.06622269749641418, "learning_rate": 6.801633851535857e-05, "loss": 0.1288, "step": 9370 }, { "epoch": 0.49949411576761277, "grad_norm": 0.042118556797504425, "learning_rate": 6.795492879663237e-05, "loss": 0.1285, "step": 9380 }, { "epoch": 0.5000266254859151, "grad_norm": 0.044616151601076126, "learning_rate": 6.789350991449966e-05, "loss": 0.1282, "step": 9390 }, { "epoch": 0.5005591352042175, "grad_norm": 0.053620822727680206, "learning_rate": 6.783208205942399e-05, "loss": 0.1288, "step": 9400 }, { "epoch": 0.5010916449225198, "grad_norm": 0.05040338635444641, "learning_rate": 6.777064542189668e-05, "loss": 0.1294, "step": 9410 }, { "epoch": 0.5016241546408222, "grad_norm": 0.07730654627084732, "learning_rate": 6.770920019243636e-05, "loss": 0.1291, "step": 9420 }, { "epoch": 0.5021566643591245, "grad_norm": 0.04234246164560318, "learning_rate": 6.764774656158825e-05, "loss": 0.1291, "step": 9430 }, { "epoch": 0.5026891740774269, "grad_norm": 0.04666012525558472, "learning_rate": 6.758628471992365e-05, "loss": 0.1286, "step": 9440 }, { "epoch": 0.5032216837957293, "grad_norm": 0.043177202343940735, "learning_rate": 6.752481485803933e-05, "loss": 0.1287, "step": 9450 }, { "epoch": 0.5037541935140316, "grad_norm": 0.05593249201774597, "learning_rate": 6.746333716655691e-05, "loss": 0.1296, "step": 9460 }, { "epoch": 0.504286703232334, "grad_norm": 0.050741735845804214, "learning_rate": 6.740185183612227e-05, "loss": 0.1286, "step": 9470 }, { "epoch": 0.5048192129506364, "grad_norm": 0.08293752372264862, "learning_rate": 6.734035905740504e-05, "loss": 0.1289, "step": 9480 }, { "epoch": 0.5053517226689387, "grad_norm": 0.11741827428340912, "learning_rate": 6.727885902109785e-05, "loss": 0.1286, "step": 9490 }, { "epoch": 0.5058842323872411, "grad_norm": 0.05878937989473343, "learning_rate": 6.7217351917916e-05, "loss": 0.1288, "step": 9500 }, { "epoch": 0.5064167421055434, "grad_norm": 0.04729843512177467, "learning_rate": 6.715583793859652e-05, "loss": 0.1291, "step": 9510 }, { "epoch": 0.5069492518238458, "grad_norm": 0.04623175784945488, "learning_rate": 6.709431727389789e-05, "loss": 0.1275, "step": 9520 }, { "epoch": 0.5074817615421482, "grad_norm": 0.047292426228523254, "learning_rate": 6.703279011459927e-05, "loss": 0.1285, "step": 9530 }, { "epoch": 0.5080142712604505, "grad_norm": 0.04683827981352806, "learning_rate": 6.697125665149993e-05, "loss": 0.1283, "step": 9540 }, { "epoch": 0.5085467809787528, "grad_norm": 0.08465840667486191, "learning_rate": 6.69097170754188e-05, "loss": 0.1289, "step": 9550 }, { "epoch": 0.5090792906970553, "grad_norm": 0.04499583691358566, "learning_rate": 6.684817157719364e-05, "loss": 0.1295, "step": 9560 }, { "epoch": 0.5096118004153576, "grad_norm": 0.05609264224767685, "learning_rate": 6.678662034768063e-05, "loss": 0.1291, "step": 9570 }, { "epoch": 0.5101443101336599, "grad_norm": 0.04982760548591614, "learning_rate": 6.672506357775375e-05, "loss": 0.1287, "step": 9580 }, { "epoch": 0.5106768198519623, "grad_norm": 0.04551566392183304, "learning_rate": 6.666350145830413e-05, "loss": 0.1287, "step": 9590 }, { "epoch": 0.5112093295702647, "grad_norm": 0.06523692607879639, "learning_rate": 6.660193418023947e-05, "loss": 0.1289, "step": 9600 }, { "epoch": 0.511741839288567, "grad_norm": 0.09148914366960526, "learning_rate": 6.654036193448349e-05, "loss": 0.1285, "step": 9610 }, { "epoch": 0.5122743490068694, "grad_norm": 0.040613338351249695, "learning_rate": 6.647878491197535e-05, "loss": 0.1281, "step": 9620 }, { "epoch": 0.5128068587251717, "grad_norm": 0.06947502493858337, "learning_rate": 6.641720330366894e-05, "loss": 0.1281, "step": 9630 }, { "epoch": 0.513339368443474, "grad_norm": 0.060511503368616104, "learning_rate": 6.635561730053245e-05, "loss": 0.1285, "step": 9640 }, { "epoch": 0.5138718781617765, "grad_norm": 0.06579563021659851, "learning_rate": 6.629402709354766e-05, "loss": 0.1282, "step": 9650 }, { "epoch": 0.5144043878800788, "grad_norm": 0.055754803121089935, "learning_rate": 6.62324328737094e-05, "loss": 0.1284, "step": 9660 }, { "epoch": 0.5149368975983811, "grad_norm": 0.05002092942595482, "learning_rate": 6.617083483202493e-05, "loss": 0.1295, "step": 9670 }, { "epoch": 0.5154694073166836, "grad_norm": 0.03860372677445412, "learning_rate": 6.610923315951336e-05, "loss": 0.1284, "step": 9680 }, { "epoch": 0.5160019170349859, "grad_norm": 0.08359445631504059, "learning_rate": 6.604762804720508e-05, "loss": 0.1285, "step": 9690 }, { "epoch": 0.5165344267532882, "grad_norm": 0.05430614575743675, "learning_rate": 6.598601968614115e-05, "loss": 0.1283, "step": 9700 }, { "epoch": 0.5170669364715906, "grad_norm": 0.05833563208580017, "learning_rate": 6.592440826737266e-05, "loss": 0.1289, "step": 9710 }, { "epoch": 0.517599446189893, "grad_norm": 0.05940975248813629, "learning_rate": 6.586279398196023e-05, "loss": 0.1284, "step": 9720 }, { "epoch": 0.5181319559081953, "grad_norm": 0.0453868992626667, "learning_rate": 6.580117702097332e-05, "loss": 0.1288, "step": 9730 }, { "epoch": 0.5186644656264977, "grad_norm": 0.042583536356687546, "learning_rate": 6.57395575754898e-05, "loss": 0.1293, "step": 9740 }, { "epoch": 0.5191969753448, "grad_norm": 0.05306556820869446, "learning_rate": 6.567793583659507e-05, "loss": 0.128, "step": 9750 }, { "epoch": 0.5197294850631023, "grad_norm": 0.04358596354722977, "learning_rate": 6.561631199538179e-05, "loss": 0.129, "step": 9760 }, { "epoch": 0.5202619947814048, "grad_norm": 0.11662445962429047, "learning_rate": 6.555468624294907e-05, "loss": 0.1276, "step": 9770 }, { "epoch": 0.5207945044997071, "grad_norm": 0.050507139414548874, "learning_rate": 6.549305877040199e-05, "loss": 0.1291, "step": 9780 }, { "epoch": 0.5213270142180095, "grad_norm": 0.059976451098918915, "learning_rate": 6.543142976885088e-05, "loss": 0.1279, "step": 9790 }, { "epoch": 0.5218595239363119, "grad_norm": 0.04601925238966942, "learning_rate": 6.536979942941091e-05, "loss": 0.1288, "step": 9800 }, { "epoch": 0.5223920336546142, "grad_norm": 0.05751890689134598, "learning_rate": 6.530816794320134e-05, "loss": 0.1283, "step": 9810 }, { "epoch": 0.5229245433729166, "grad_norm": 0.05591721832752228, "learning_rate": 6.524653550134501e-05, "loss": 0.1287, "step": 9820 }, { "epoch": 0.5234570530912189, "grad_norm": 0.05766240507364273, "learning_rate": 6.518490229496772e-05, "loss": 0.1285, "step": 9830 }, { "epoch": 0.5239895628095212, "grad_norm": 0.054135777056217194, "learning_rate": 6.512326851519762e-05, "loss": 0.1287, "step": 9840 }, { "epoch": 0.5245220725278237, "grad_norm": 0.04491560161113739, "learning_rate": 6.506163435316468e-05, "loss": 0.1276, "step": 9850 }, { "epoch": 0.525054582246126, "grad_norm": 0.044994186609983444, "learning_rate": 6.5e-05, "loss": 0.1286, "step": 9860 }, { "epoch": 0.5255870919644283, "grad_norm": 0.0446479506790638, "learning_rate": 6.493836564683533e-05, "loss": 0.1286, "step": 9870 }, { "epoch": 0.5261196016827308, "grad_norm": 0.06419171392917633, "learning_rate": 6.48767314848024e-05, "loss": 0.1283, "step": 9880 }, { "epoch": 0.5266521114010331, "grad_norm": 0.041707735508680344, "learning_rate": 6.481509770503229e-05, "loss": 0.1275, "step": 9890 }, { "epoch": 0.5271846211193354, "grad_norm": 0.08214934170246124, "learning_rate": 6.475346449865499e-05, "loss": 0.1287, "step": 9900 }, { "epoch": 0.5277171308376378, "grad_norm": 0.09313659369945526, "learning_rate": 6.469183205679865e-05, "loss": 0.1277, "step": 9910 }, { "epoch": 0.5282496405559401, "grad_norm": 0.05460633337497711, "learning_rate": 6.46302005705891e-05, "loss": 0.1287, "step": 9920 }, { "epoch": 0.5287821502742425, "grad_norm": 0.0486149825155735, "learning_rate": 6.456857023114913e-05, "loss": 0.1276, "step": 9930 }, { "epoch": 0.5293146599925449, "grad_norm": 0.04761586710810661, "learning_rate": 6.450694122959801e-05, "loss": 0.1287, "step": 9940 }, { "epoch": 0.5298471697108472, "grad_norm": 0.04752049222588539, "learning_rate": 6.444531375705092e-05, "loss": 0.1285, "step": 9950 }, { "epoch": 0.5303796794291495, "grad_norm": 0.06729278713464737, "learning_rate": 6.438368800461821e-05, "loss": 0.1286, "step": 9960 }, { "epoch": 0.530912189147452, "grad_norm": 0.04480813071131706, "learning_rate": 6.432206416340492e-05, "loss": 0.1284, "step": 9970 }, { "epoch": 0.5314446988657543, "grad_norm": 0.040219422429800034, "learning_rate": 6.426044242451022e-05, "loss": 0.1287, "step": 9980 }, { "epoch": 0.5319772085840566, "grad_norm": 0.03565455228090286, "learning_rate": 6.419882297902667e-05, "loss": 0.1284, "step": 9990 }, { "epoch": 0.532509718302359, "grad_norm": 0.05310383439064026, "learning_rate": 6.413720601803979e-05, "loss": 0.1285, "step": 10000 }, { "epoch": 0.5330422280206614, "grad_norm": 0.07043947279453278, "learning_rate": 6.407559173262735e-05, "loss": 0.1283, "step": 10010 }, { "epoch": 0.5335747377389637, "grad_norm": 0.04902435466647148, "learning_rate": 6.401398031385886e-05, "loss": 0.1276, "step": 10020 }, { "epoch": 0.5341072474572661, "grad_norm": 0.05668781325221062, "learning_rate": 6.395237195279491e-05, "loss": 0.1283, "step": 10030 }, { "epoch": 0.5346397571755684, "grad_norm": 0.05145740881562233, "learning_rate": 6.389076684048664e-05, "loss": 0.1264, "step": 10040 }, { "epoch": 0.5351722668938708, "grad_norm": 0.1139606162905693, "learning_rate": 6.382916516797508e-05, "loss": 0.1283, "step": 10050 }, { "epoch": 0.5357047766121732, "grad_norm": 0.05299168825149536, "learning_rate": 6.376756712629059e-05, "loss": 0.1281, "step": 10060 }, { "epoch": 0.5362372863304755, "grad_norm": 0.06942315399646759, "learning_rate": 6.370597290645234e-05, "loss": 0.1281, "step": 10070 }, { "epoch": 0.5367697960487778, "grad_norm": 0.07276537269353867, "learning_rate": 6.364438269946755e-05, "loss": 0.1284, "step": 10080 }, { "epoch": 0.5373023057670803, "grad_norm": 0.043881241232156754, "learning_rate": 6.358279669633106e-05, "loss": 0.1275, "step": 10090 }, { "epoch": 0.5378348154853826, "grad_norm": 0.047917068004608154, "learning_rate": 6.352121508802467e-05, "loss": 0.1282, "step": 10100 }, { "epoch": 0.538367325203685, "grad_norm": 0.04417307674884796, "learning_rate": 6.345963806551651e-05, "loss": 0.1281, "step": 10110 }, { "epoch": 0.5388998349219873, "grad_norm": 0.053708259016275406, "learning_rate": 6.339806581976055e-05, "loss": 0.1286, "step": 10120 }, { "epoch": 0.5394323446402897, "grad_norm": 0.05327571928501129, "learning_rate": 6.333649854169587e-05, "loss": 0.1284, "step": 10130 }, { "epoch": 0.5399648543585921, "grad_norm": 0.062333524227142334, "learning_rate": 6.327493642224624e-05, "loss": 0.1281, "step": 10140 }, { "epoch": 0.5404973640768944, "grad_norm": 0.04436059668660164, "learning_rate": 6.321337965231937e-05, "loss": 0.1278, "step": 10150 }, { "epoch": 0.5410298737951967, "grad_norm": 0.07489614933729172, "learning_rate": 6.315182842280638e-05, "loss": 0.1284, "step": 10160 }, { "epoch": 0.5415623835134992, "grad_norm": 0.06262974441051483, "learning_rate": 6.309028292458122e-05, "loss": 0.1269, "step": 10170 }, { "epoch": 0.5420948932318015, "grad_norm": 0.08940589427947998, "learning_rate": 6.302874334850006e-05, "loss": 0.128, "step": 10180 }, { "epoch": 0.5426274029501038, "grad_norm": 0.039577096700668335, "learning_rate": 6.296720988540075e-05, "loss": 0.1273, "step": 10190 }, { "epoch": 0.5431599126684062, "grad_norm": 0.05988942086696625, "learning_rate": 6.290568272610211e-05, "loss": 0.1276, "step": 10200 }, { "epoch": 0.5436924223867086, "grad_norm": 0.047797802835702896, "learning_rate": 6.284416206140348e-05, "loss": 0.1278, "step": 10210 }, { "epoch": 0.5442249321050109, "grad_norm": 0.05901528522372246, "learning_rate": 6.278264808208402e-05, "loss": 0.1283, "step": 10220 }, { "epoch": 0.5447574418233133, "grad_norm": 0.10273321717977524, "learning_rate": 6.272114097890213e-05, "loss": 0.1279, "step": 10230 }, { "epoch": 0.5452899515416156, "grad_norm": 0.07229287177324295, "learning_rate": 6.265964094259498e-05, "loss": 0.1283, "step": 10240 }, { "epoch": 0.545822461259918, "grad_norm": 0.04700973257422447, "learning_rate": 6.259814816387775e-05, "loss": 0.1276, "step": 10250 }, { "epoch": 0.5463549709782204, "grad_norm": 0.10428871214389801, "learning_rate": 6.25366628334431e-05, "loss": 0.1285, "step": 10260 }, { "epoch": 0.5468874806965227, "grad_norm": 0.048143282532691956, "learning_rate": 6.247518514196067e-05, "loss": 0.1275, "step": 10270 }, { "epoch": 0.547419990414825, "grad_norm": 0.054553814232349396, "learning_rate": 6.241371528007634e-05, "loss": 0.1281, "step": 10280 }, { "epoch": 0.5479525001331275, "grad_norm": 0.07188910245895386, "learning_rate": 6.235225343841174e-05, "loss": 0.1276, "step": 10290 }, { "epoch": 0.5484850098514298, "grad_norm": 0.05446217581629753, "learning_rate": 6.229079980756365e-05, "loss": 0.1283, "step": 10300 }, { "epoch": 0.5490175195697321, "grad_norm": 0.041187744587659836, "learning_rate": 6.222935457810333e-05, "loss": 0.1277, "step": 10310 }, { "epoch": 0.5495500292880345, "grad_norm": 0.05523503199219704, "learning_rate": 6.216791794057601e-05, "loss": 0.1276, "step": 10320 }, { "epoch": 0.5500825390063369, "grad_norm": 0.044267792254686356, "learning_rate": 6.210649008550033e-05, "loss": 0.1279, "step": 10330 }, { "epoch": 0.5506150487246392, "grad_norm": 0.04887842759490013, "learning_rate": 6.204507120336764e-05, "loss": 0.128, "step": 10340 }, { "epoch": 0.5511475584429416, "grad_norm": 0.04334214702248573, "learning_rate": 6.198366148464143e-05, "loss": 0.1276, "step": 10350 }, { "epoch": 0.5516800681612439, "grad_norm": 0.05289037153124809, "learning_rate": 6.192226111975687e-05, "loss": 0.1275, "step": 10360 }, { "epoch": 0.5522125778795463, "grad_norm": 0.05000938102602959, "learning_rate": 6.186087029912005e-05, "loss": 0.1277, "step": 10370 }, { "epoch": 0.5527450875978487, "grad_norm": 0.05687737837433815, "learning_rate": 6.179948921310749e-05, "loss": 0.1282, "step": 10380 }, { "epoch": 0.553277597316151, "grad_norm": 0.040263786911964417, "learning_rate": 6.173811805206553e-05, "loss": 0.1275, "step": 10390 }, { "epoch": 0.5538101070344533, "grad_norm": 0.056092556565999985, "learning_rate": 6.16767570063097e-05, "loss": 0.1272, "step": 10400 }, { "epoch": 0.5543426167527558, "grad_norm": 0.04456920921802521, "learning_rate": 6.161540626612419e-05, "loss": 0.1271, "step": 10410 }, { "epoch": 0.5548751264710581, "grad_norm": 0.05201718211174011, "learning_rate": 6.15540660217612e-05, "loss": 0.1273, "step": 10420 }, { "epoch": 0.5554076361893605, "grad_norm": 0.045165225863456726, "learning_rate": 6.149273646344044e-05, "loss": 0.1271, "step": 10430 }, { "epoch": 0.5559401459076628, "grad_norm": 0.07475852966308594, "learning_rate": 6.14314177813484e-05, "loss": 0.128, "step": 10440 }, { "epoch": 0.5564726556259652, "grad_norm": 0.08229029923677444, "learning_rate": 6.137011016563797e-05, "loss": 0.1277, "step": 10450 }, { "epoch": 0.5570051653442676, "grad_norm": 0.09118565171957016, "learning_rate": 6.130881380642755e-05, "loss": 0.1277, "step": 10460 }, { "epoch": 0.5575376750625699, "grad_norm": 0.04762515053153038, "learning_rate": 6.124752889380079e-05, "loss": 0.1275, "step": 10470 }, { "epoch": 0.5580701847808722, "grad_norm": 0.03888937830924988, "learning_rate": 6.118625561780574e-05, "loss": 0.1275, "step": 10480 }, { "epoch": 0.5586026944991747, "grad_norm": 0.04357834532856941, "learning_rate": 6.112499416845443e-05, "loss": 0.1279, "step": 10490 }, { "epoch": 0.559135204217477, "grad_norm": 0.06639399379491806, "learning_rate": 6.106374473572216e-05, "loss": 0.128, "step": 10500 }, { "epoch": 0.5596677139357793, "grad_norm": 0.051041729748249054, "learning_rate": 6.100250750954699e-05, "loss": 0.128, "step": 10510 }, { "epoch": 0.5602002236540817, "grad_norm": 0.08065960556268692, "learning_rate": 6.094128267982916e-05, "loss": 0.1275, "step": 10520 }, { "epoch": 0.560732733372384, "grad_norm": 0.04977899789810181, "learning_rate": 6.0880070436430424e-05, "loss": 0.1283, "step": 10530 }, { "epoch": 0.5612652430906864, "grad_norm": 0.04065399989485741, "learning_rate": 6.081887096917351e-05, "loss": 0.1272, "step": 10540 }, { "epoch": 0.5617977528089888, "grad_norm": 0.05486559495329857, "learning_rate": 6.075768446784154e-05, "loss": 0.1273, "step": 10550 }, { "epoch": 0.5623302625272911, "grad_norm": 0.053763121366500854, "learning_rate": 6.0696511122177436e-05, "loss": 0.1275, "step": 10560 }, { "epoch": 0.5628627722455934, "grad_norm": 0.049751050770282745, "learning_rate": 6.063535112188329e-05, "loss": 0.1263, "step": 10570 }, { "epoch": 0.5633952819638959, "grad_norm": 0.036564771085977554, "learning_rate": 6.057420465661982e-05, "loss": 0.128, "step": 10580 }, { "epoch": 0.5639277916821982, "grad_norm": 0.05281112715601921, "learning_rate": 6.051307191600581e-05, "loss": 0.1269, "step": 10590 }, { "epoch": 0.5644603014005005, "grad_norm": 0.053066980093717575, "learning_rate": 6.045195308961746e-05, "loss": 0.1269, "step": 10600 }, { "epoch": 0.564992811118803, "grad_norm": 0.05914291366934776, "learning_rate": 6.039084836698779e-05, "loss": 0.1275, "step": 10610 }, { "epoch": 0.5655253208371053, "grad_norm": 0.06061727926135063, "learning_rate": 6.032975793760609e-05, "loss": 0.1274, "step": 10620 }, { "epoch": 0.5660578305554076, "grad_norm": 0.06171563267707825, "learning_rate": 6.026868199091737e-05, "loss": 0.1273, "step": 10630 }, { "epoch": 0.56659034027371, "grad_norm": 0.07542983442544937, "learning_rate": 6.020762071632172e-05, "loss": 0.1276, "step": 10640 }, { "epoch": 0.5671228499920123, "grad_norm": 0.04028952494263649, "learning_rate": 6.014657430317368e-05, "loss": 0.1267, "step": 10650 }, { "epoch": 0.5676553597103147, "grad_norm": 0.05201804265379906, "learning_rate": 6.0085542940781755e-05, "loss": 0.1273, "step": 10660 }, { "epoch": 0.5681878694286171, "grad_norm": 0.047102462500333786, "learning_rate": 6.0024526818407745e-05, "loss": 0.1267, "step": 10670 }, { "epoch": 0.5687203791469194, "grad_norm": 0.06168229877948761, "learning_rate": 5.996352612526623e-05, "loss": 0.1272, "step": 10680 }, { "epoch": 0.5692528888652217, "grad_norm": 0.05583483725786209, "learning_rate": 5.9902541050523886e-05, "loss": 0.1278, "step": 10690 }, { "epoch": 0.5697853985835242, "grad_norm": 0.06403730064630508, "learning_rate": 5.9841571783299e-05, "loss": 0.1273, "step": 10700 }, { "epoch": 0.5703179083018265, "grad_norm": 0.042866677045822144, "learning_rate": 5.9780618512660834e-05, "loss": 0.1275, "step": 10710 }, { "epoch": 0.5708504180201288, "grad_norm": 0.05416185408830643, "learning_rate": 5.971968142762903e-05, "loss": 0.1276, "step": 10720 }, { "epoch": 0.5713829277384312, "grad_norm": 0.04284673184156418, "learning_rate": 5.9658760717173e-05, "loss": 0.1276, "step": 10730 }, { "epoch": 0.5719154374567336, "grad_norm": 0.050528384745121, "learning_rate": 5.959785657021149e-05, "loss": 0.1272, "step": 10740 }, { "epoch": 0.572447947175036, "grad_norm": 0.0541527085006237, "learning_rate": 5.953696917561178e-05, "loss": 0.1272, "step": 10750 }, { "epoch": 0.5729804568933383, "grad_norm": 0.04789347946643829, "learning_rate": 5.947609872218922e-05, "loss": 0.1276, "step": 10760 }, { "epoch": 0.5735129666116406, "grad_norm": 0.04346901550889015, "learning_rate": 5.9415245398706645e-05, "loss": 0.1273, "step": 10770 }, { "epoch": 0.5740454763299431, "grad_norm": 0.03955162316560745, "learning_rate": 5.9354409393873756e-05, "loss": 0.1271, "step": 10780 }, { "epoch": 0.5745779860482454, "grad_norm": 0.05987564101815224, "learning_rate": 5.929359089634657e-05, "loss": 0.1269, "step": 10790 }, { "epoch": 0.5751104957665477, "grad_norm": 0.0456010103225708, "learning_rate": 5.923279009472678e-05, "loss": 0.1269, "step": 10800 }, { "epoch": 0.5756430054848501, "grad_norm": 0.062419842928647995, "learning_rate": 5.9172007177561194e-05, "loss": 0.1274, "step": 10810 }, { "epoch": 0.5761755152031525, "grad_norm": 0.0384056381881237, "learning_rate": 5.911124233334122e-05, "loss": 0.1274, "step": 10820 }, { "epoch": 0.5767080249214548, "grad_norm": 0.06525867432355881, "learning_rate": 5.905049575050218e-05, "loss": 0.1271, "step": 10830 }, { "epoch": 0.5772405346397572, "grad_norm": 0.05695752054452896, "learning_rate": 5.8989767617422744e-05, "loss": 0.1279, "step": 10840 }, { "epoch": 0.5777730443580595, "grad_norm": 0.05444275960326195, "learning_rate": 5.8929058122424406e-05, "loss": 0.1273, "step": 10850 }, { "epoch": 0.5783055540763619, "grad_norm": 0.051563095301389694, "learning_rate": 5.886836745377087e-05, "loss": 0.1273, "step": 10860 }, { "epoch": 0.5788380637946643, "grad_norm": 0.07469698041677475, "learning_rate": 5.8807695799667416e-05, "loss": 0.1272, "step": 10870 }, { "epoch": 0.5793705735129666, "grad_norm": 0.049753960222005844, "learning_rate": 5.874704334826038e-05, "loss": 0.1276, "step": 10880 }, { "epoch": 0.5799030832312689, "grad_norm": 0.04683075100183487, "learning_rate": 5.8686410287636575e-05, "loss": 0.1273, "step": 10890 }, { "epoch": 0.5804355929495714, "grad_norm": 0.04198311269283295, "learning_rate": 5.862579680582263e-05, "loss": 0.1277, "step": 10900 }, { "epoch": 0.5809681026678737, "grad_norm": 0.045857448130846024, "learning_rate": 5.8565203090784484e-05, "loss": 0.1271, "step": 10910 }, { "epoch": 0.581500612386176, "grad_norm": 0.040586717426776886, "learning_rate": 5.8504629330426816e-05, "loss": 0.1274, "step": 10920 }, { "epoch": 0.5820331221044784, "grad_norm": 0.050801508128643036, "learning_rate": 5.844407571259235e-05, "loss": 0.127, "step": 10930 }, { "epoch": 0.5825656318227808, "grad_norm": 0.050907645374536514, "learning_rate": 5.8383542425061424e-05, "loss": 0.1272, "step": 10940 }, { "epoch": 0.5830981415410831, "grad_norm": 0.08759643882513046, "learning_rate": 5.8323029655551266e-05, "loss": 0.1267, "step": 10950 }, { "epoch": 0.5836306512593855, "grad_norm": 0.05688736215233803, "learning_rate": 5.8262537591715493e-05, "loss": 0.1275, "step": 10960 }, { "epoch": 0.5841631609776878, "grad_norm": 0.05095606669783592, "learning_rate": 5.820206642114355e-05, "loss": 0.1271, "step": 10970 }, { "epoch": 0.5846956706959902, "grad_norm": 0.07440601289272308, "learning_rate": 5.814161633136006e-05, "loss": 0.1275, "step": 10980 }, { "epoch": 0.5852281804142926, "grad_norm": 0.07147916406393051, "learning_rate": 5.808118750982427e-05, "loss": 0.1276, "step": 10990 }, { "epoch": 0.5857606901325949, "grad_norm": 0.06379908323287964, "learning_rate": 5.802078014392946e-05, "loss": 0.1269, "step": 11000 }, { "epoch": 0.5862931998508972, "grad_norm": 0.09271499514579773, "learning_rate": 5.796039442100243e-05, "loss": 0.126, "step": 11010 }, { "epoch": 0.5868257095691997, "grad_norm": 0.04959186539053917, "learning_rate": 5.7900030528302804e-05, "loss": 0.1277, "step": 11020 }, { "epoch": 0.587358219287502, "grad_norm": 0.03943556919693947, "learning_rate": 5.783968865302254e-05, "loss": 0.1264, "step": 11030 }, { "epoch": 0.5878907290058043, "grad_norm": 0.04361870139837265, "learning_rate": 5.777936898228531e-05, "loss": 0.1276, "step": 11040 }, { "epoch": 0.5884232387241067, "grad_norm": 0.0441637746989727, "learning_rate": 5.771907170314593e-05, "loss": 0.1274, "step": 11050 }, { "epoch": 0.588955748442409, "grad_norm": 0.048377875238657, "learning_rate": 5.7658797002589796e-05, "loss": 0.1271, "step": 11060 }, { "epoch": 0.5894882581607115, "grad_norm": 0.04017746075987816, "learning_rate": 5.759854506753224e-05, "loss": 0.1271, "step": 11070 }, { "epoch": 0.5900207678790138, "grad_norm": 0.08903171867132187, "learning_rate": 5.753831608481803e-05, "loss": 0.1265, "step": 11080 }, { "epoch": 0.5905532775973161, "grad_norm": 0.05974121764302254, "learning_rate": 5.747811024122081e-05, "loss": 0.127, "step": 11090 }, { "epoch": 0.5910857873156186, "grad_norm": 0.04399004206061363, "learning_rate": 5.741792772344232e-05, "loss": 0.127, "step": 11100 }, { "epoch": 0.5916182970339209, "grad_norm": 0.042719513177871704, "learning_rate": 5.7357768718112114e-05, "loss": 0.1271, "step": 11110 }, { "epoch": 0.5921508067522232, "grad_norm": 0.04649467021226883, "learning_rate": 5.729763341178678e-05, "loss": 0.1275, "step": 11120 }, { "epoch": 0.5926833164705256, "grad_norm": 0.061461612582206726, "learning_rate": 5.723752199094938e-05, "loss": 0.127, "step": 11130 }, { "epoch": 0.593215826188828, "grad_norm": 0.09266576170921326, "learning_rate": 5.717743464200895e-05, "loss": 0.1276, "step": 11140 }, { "epoch": 0.5937483359071303, "grad_norm": 0.039458803832530975, "learning_rate": 5.711737155129987e-05, "loss": 0.1273, "step": 11150 }, { "epoch": 0.5942808456254327, "grad_norm": 0.0349600687623024, "learning_rate": 5.7057332905081275e-05, "loss": 0.127, "step": 11160 }, { "epoch": 0.594813355343735, "grad_norm": 0.07518605887889862, "learning_rate": 5.699731888953653e-05, "loss": 0.1257, "step": 11170 }, { "epoch": 0.5953458650620373, "grad_norm": 0.056736767292022705, "learning_rate": 5.6937329690772554e-05, "loss": 0.1275, "step": 11180 }, { "epoch": 0.5958783747803398, "grad_norm": 0.0569731779396534, "learning_rate": 5.687736549481939e-05, "loss": 0.1269, "step": 11190 }, { "epoch": 0.5964108844986421, "grad_norm": 0.06685450673103333, "learning_rate": 5.681742648762949e-05, "loss": 0.1275, "step": 11200 }, { "epoch": 0.5969433942169444, "grad_norm": 0.05476146563887596, "learning_rate": 5.675751285507722e-05, "loss": 0.1266, "step": 11210 }, { "epoch": 0.5974759039352469, "grad_norm": 0.04533839598298073, "learning_rate": 5.6697624782958235e-05, "loss": 0.1273, "step": 11220 }, { "epoch": 0.5980084136535492, "grad_norm": 0.04754569008946419, "learning_rate": 5.6637762456988943e-05, "loss": 0.126, "step": 11230 }, { "epoch": 0.5985409233718515, "grad_norm": 0.06229964643716812, "learning_rate": 5.657792606280592e-05, "loss": 0.126, "step": 11240 }, { "epoch": 0.5990734330901539, "grad_norm": 0.057759564369916916, "learning_rate": 5.65181157859653e-05, "loss": 0.1274, "step": 11250 }, { "epoch": 0.5996059428084562, "grad_norm": 0.05225152522325516, "learning_rate": 5.6458331811942235e-05, "loss": 0.1264, "step": 11260 }, { "epoch": 0.6001384525267586, "grad_norm": 0.0388801135122776, "learning_rate": 5.639857432613034e-05, "loss": 0.127, "step": 11270 }, { "epoch": 0.600670962245061, "grad_norm": 0.044373005628585815, "learning_rate": 5.633884351384103e-05, "loss": 0.1274, "step": 11280 }, { "epoch": 0.6012034719633633, "grad_norm": 0.07331310957670212, "learning_rate": 5.627913956030306e-05, "loss": 0.1272, "step": 11290 }, { "epoch": 0.6017359816816656, "grad_norm": 0.05038286745548248, "learning_rate": 5.621946265066187e-05, "loss": 0.1273, "step": 11300 }, { "epoch": 0.6022684913999681, "grad_norm": 0.058994751423597336, "learning_rate": 5.615981296997901e-05, "loss": 0.1273, "step": 11310 }, { "epoch": 0.6028010011182704, "grad_norm": 0.036481309682130814, "learning_rate": 5.610019070323167e-05, "loss": 0.1271, "step": 11320 }, { "epoch": 0.6033335108365727, "grad_norm": 0.03966144099831581, "learning_rate": 5.604059603531193e-05, "loss": 0.127, "step": 11330 }, { "epoch": 0.6038660205548751, "grad_norm": 0.055649157613515854, "learning_rate": 5.5981029151026345e-05, "loss": 0.1277, "step": 11340 }, { "epoch": 0.6043985302731775, "grad_norm": 0.05199277400970459, "learning_rate": 5.59214902350953e-05, "loss": 0.1267, "step": 11350 }, { "epoch": 0.6049310399914798, "grad_norm": 0.04140447452664375, "learning_rate": 5.586197947215246e-05, "loss": 0.1263, "step": 11360 }, { "epoch": 0.6054635497097822, "grad_norm": 0.03502384573221207, "learning_rate": 5.580249704674414e-05, "loss": 0.1268, "step": 11370 }, { "epoch": 0.6059960594280845, "grad_norm": 0.04866914451122284, "learning_rate": 5.574304314332881e-05, "loss": 0.1258, "step": 11380 }, { "epoch": 0.606528569146387, "grad_norm": 0.039416830986738205, "learning_rate": 5.568361794627652e-05, "loss": 0.1266, "step": 11390 }, { "epoch": 0.6070610788646893, "grad_norm": 0.04279816523194313, "learning_rate": 5.562422163986827e-05, "loss": 0.1265, "step": 11400 }, { "epoch": 0.6075935885829916, "grad_norm": 0.08062811195850372, "learning_rate": 5.556485440829544e-05, "loss": 0.1266, "step": 11410 }, { "epoch": 0.608126098301294, "grad_norm": 0.042986951768398285, "learning_rate": 5.550551643565931e-05, "loss": 0.1269, "step": 11420 }, { "epoch": 0.6086586080195964, "grad_norm": 0.056119028478860855, "learning_rate": 5.544620790597037e-05, "loss": 0.1273, "step": 11430 }, { "epoch": 0.6091911177378987, "grad_norm": 0.0498378686606884, "learning_rate": 5.5386929003147835e-05, "loss": 0.1273, "step": 11440 }, { "epoch": 0.6097236274562011, "grad_norm": 0.04148755222558975, "learning_rate": 5.5327679911019034e-05, "loss": 0.1265, "step": 11450 }, { "epoch": 0.6102561371745034, "grad_norm": 0.04891781508922577, "learning_rate": 5.5268460813318866e-05, "loss": 0.127, "step": 11460 }, { "epoch": 0.6107886468928058, "grad_norm": 0.07420381903648376, "learning_rate": 5.520927189368923e-05, "loss": 0.1268, "step": 11470 }, { "epoch": 0.6113211566111082, "grad_norm": 0.12143438309431076, "learning_rate": 5.5150113335678365e-05, "loss": 0.1273, "step": 11480 }, { "epoch": 0.6118536663294105, "grad_norm": 0.045520998537540436, "learning_rate": 5.509098532274044e-05, "loss": 0.127, "step": 11490 }, { "epoch": 0.6123861760477128, "grad_norm": 0.05857592076063156, "learning_rate": 5.503188803823487e-05, "loss": 0.1269, "step": 11500 }, { "epoch": 0.6129186857660153, "grad_norm": 0.04991764947772026, "learning_rate": 5.497282166542579e-05, "loss": 0.1267, "step": 11510 }, { "epoch": 0.6134511954843176, "grad_norm": 0.059020016342401505, "learning_rate": 5.4913786387481426e-05, "loss": 0.127, "step": 11520 }, { "epoch": 0.6139837052026199, "grad_norm": 0.06219782307744026, "learning_rate": 5.485478238747367e-05, "loss": 0.1268, "step": 11530 }, { "epoch": 0.6145162149209223, "grad_norm": 0.060054097324609756, "learning_rate": 5.4795809848377323e-05, "loss": 0.1266, "step": 11540 }, { "epoch": 0.6150487246392247, "grad_norm": 0.05419805273413658, "learning_rate": 5.473686895306971e-05, "loss": 0.1269, "step": 11550 }, { "epoch": 0.615581234357527, "grad_norm": 0.07475223392248154, "learning_rate": 5.4677959884329944e-05, "loss": 0.1264, "step": 11560 }, { "epoch": 0.6161137440758294, "grad_norm": 0.052985042333602905, "learning_rate": 5.4619082824838506e-05, "loss": 0.1266, "step": 11570 }, { "epoch": 0.6166462537941317, "grad_norm": 0.062309183180332184, "learning_rate": 5.45602379571766e-05, "loss": 0.1276, "step": 11580 }, { "epoch": 0.6171787635124341, "grad_norm": 0.0704023614525795, "learning_rate": 5.450142546382555e-05, "loss": 0.126, "step": 11590 }, { "epoch": 0.6177112732307365, "grad_norm": 0.04271765798330307, "learning_rate": 5.444264552716636e-05, "loss": 0.1267, "step": 11600 }, { "epoch": 0.6182437829490388, "grad_norm": 0.03997405245900154, "learning_rate": 5.438389832947903e-05, "loss": 0.1267, "step": 11610 }, { "epoch": 0.6187762926673411, "grad_norm": 0.06841737031936646, "learning_rate": 5.432518405294208e-05, "loss": 0.1259, "step": 11620 }, { "epoch": 0.6193088023856436, "grad_norm": 0.07736402004957199, "learning_rate": 5.426650287963186e-05, "loss": 0.1277, "step": 11630 }, { "epoch": 0.6198413121039459, "grad_norm": 0.05138285458087921, "learning_rate": 5.4207854991522125e-05, "loss": 0.1268, "step": 11640 }, { "epoch": 0.6203738218222482, "grad_norm": 0.055199526250362396, "learning_rate": 5.4149240570483394e-05, "loss": 0.1258, "step": 11650 }, { "epoch": 0.6209063315405506, "grad_norm": 0.0480291023850441, "learning_rate": 5.409065979828243e-05, "loss": 0.1267, "step": 11660 }, { "epoch": 0.621438841258853, "grad_norm": 0.0840907022356987, "learning_rate": 5.403211285658158e-05, "loss": 0.1265, "step": 11670 }, { "epoch": 0.6219713509771553, "grad_norm": 0.042071383446455, "learning_rate": 5.397359992693835e-05, "loss": 0.1262, "step": 11680 }, { "epoch": 0.6225038606954577, "grad_norm": 0.0721912607550621, "learning_rate": 5.3915121190804755e-05, "loss": 0.1268, "step": 11690 }, { "epoch": 0.62303637041376, "grad_norm": 0.058796901255846024, "learning_rate": 5.385667682952675e-05, "loss": 0.1263, "step": 11700 }, { "epoch": 0.6235688801320625, "grad_norm": 0.04958143085241318, "learning_rate": 5.3798267024343706e-05, "loss": 0.1266, "step": 11710 }, { "epoch": 0.6241013898503648, "grad_norm": 0.07576627284288406, "learning_rate": 5.373989195638785e-05, "loss": 0.1268, "step": 11720 }, { "epoch": 0.6246338995686671, "grad_norm": 0.050743598490953445, "learning_rate": 5.368155180668366e-05, "loss": 0.1262, "step": 11730 }, { "epoch": 0.6251664092869695, "grad_norm": 0.046020250767469406, "learning_rate": 5.3623246756147346e-05, "loss": 0.1262, "step": 11740 }, { "epoch": 0.6256989190052719, "grad_norm": 0.04735419899225235, "learning_rate": 5.356497698558628e-05, "loss": 0.1265, "step": 11750 }, { "epoch": 0.6262314287235742, "grad_norm": 0.058625295758247375, "learning_rate": 5.3506742675698384e-05, "loss": 0.1267, "step": 11760 }, { "epoch": 0.6267639384418766, "grad_norm": 0.05440155416727066, "learning_rate": 5.34485440070717e-05, "loss": 0.1265, "step": 11770 }, { "epoch": 0.6272964481601789, "grad_norm": 0.06765516102313995, "learning_rate": 5.3390381160183645e-05, "loss": 0.1273, "step": 11780 }, { "epoch": 0.6278289578784813, "grad_norm": 0.042375244200229645, "learning_rate": 5.333225431540062e-05, "loss": 0.1269, "step": 11790 }, { "epoch": 0.6283614675967837, "grad_norm": 0.04143916070461273, "learning_rate": 5.327416365297737e-05, "loss": 0.1267, "step": 11800 }, { "epoch": 0.628893977315086, "grad_norm": 0.03681569918990135, "learning_rate": 5.321610935305643e-05, "loss": 0.1265, "step": 11810 }, { "epoch": 0.6294264870333883, "grad_norm": 0.04781223088502884, "learning_rate": 5.315809159566753e-05, "loss": 0.1262, "step": 11820 }, { "epoch": 0.6299589967516908, "grad_norm": 0.05528895556926727, "learning_rate": 5.3100110560727155e-05, "loss": 0.1261, "step": 11830 }, { "epoch": 0.6304915064699931, "grad_norm": 0.050820931792259216, "learning_rate": 5.3042166428037867e-05, "loss": 0.1265, "step": 11840 }, { "epoch": 0.6310240161882954, "grad_norm": 0.049191396683454514, "learning_rate": 5.2984259377287795e-05, "loss": 0.1266, "step": 11850 }, { "epoch": 0.6315565259065978, "grad_norm": 0.04525256156921387, "learning_rate": 5.2926389588050095e-05, "loss": 0.127, "step": 11860 }, { "epoch": 0.6320890356249002, "grad_norm": 0.03086530603468418, "learning_rate": 5.2868557239782335e-05, "loss": 0.1268, "step": 11870 }, { "epoch": 0.6326215453432025, "grad_norm": 0.06164710223674774, "learning_rate": 5.2810762511826017e-05, "loss": 0.1265, "step": 11880 }, { "epoch": 0.6331540550615049, "grad_norm": 0.10141383111476898, "learning_rate": 5.275300558340596e-05, "loss": 0.1264, "step": 11890 }, { "epoch": 0.6336865647798072, "grad_norm": 0.043847665190696716, "learning_rate": 5.269528663362976e-05, "loss": 0.1266, "step": 11900 }, { "epoch": 0.6342190744981095, "grad_norm": 0.04844609647989273, "learning_rate": 5.2637605841487246e-05, "loss": 0.1264, "step": 11910 }, { "epoch": 0.634751584216412, "grad_norm": 0.04017659276723862, "learning_rate": 5.257996338584994e-05, "loss": 0.1266, "step": 11920 }, { "epoch": 0.6352840939347143, "grad_norm": 0.04086530581116676, "learning_rate": 5.2522359445470434e-05, "loss": 0.1264, "step": 11930 }, { "epoch": 0.6358166036530166, "grad_norm": 0.0746808871626854, "learning_rate": 5.246479419898191e-05, "loss": 0.1265, "step": 11940 }, { "epoch": 0.636349113371319, "grad_norm": 0.047690387815237045, "learning_rate": 5.2407267824897556e-05, "loss": 0.1261, "step": 11950 }, { "epoch": 0.6368816230896214, "grad_norm": 0.07566772401332855, "learning_rate": 5.234978050161002e-05, "loss": 0.1263, "step": 11960 }, { "epoch": 0.6374141328079237, "grad_norm": 0.07059159129858017, "learning_rate": 5.229233240739082e-05, "loss": 0.1263, "step": 11970 }, { "epoch": 0.6379466425262261, "grad_norm": 0.06677578389644623, "learning_rate": 5.223492372038989e-05, "loss": 0.1273, "step": 11980 }, { "epoch": 0.6384791522445284, "grad_norm": 0.09771794825792313, "learning_rate": 5.217755461863487e-05, "loss": 0.1268, "step": 11990 }, { "epoch": 0.6390116619628308, "grad_norm": 0.0393114909529686, "learning_rate": 5.212022528003072e-05, "loss": 0.1264, "step": 12000 }, { "epoch": 0.6395441716811332, "grad_norm": 0.05768098682165146, "learning_rate": 5.2062935882359054e-05, "loss": 0.1267, "step": 12010 }, { "epoch": 0.6400766813994355, "grad_norm": 0.04626571014523506, "learning_rate": 5.2005686603277625e-05, "loss": 0.1272, "step": 12020 }, { "epoch": 0.640609191117738, "grad_norm": 0.06112409383058548, "learning_rate": 5.1948477620319805e-05, "loss": 0.1265, "step": 12030 }, { "epoch": 0.6411417008360403, "grad_norm": 0.04936950281262398, "learning_rate": 5.1891309110893974e-05, "loss": 0.1268, "step": 12040 }, { "epoch": 0.6416742105543426, "grad_norm": 0.053032536059617996, "learning_rate": 5.183418125228301e-05, "loss": 0.1269, "step": 12050 }, { "epoch": 0.642206720272645, "grad_norm": 0.053800616413354874, "learning_rate": 5.177709422164374e-05, "loss": 0.1269, "step": 12060 }, { "epoch": 0.6427392299909473, "grad_norm": 0.03961695730686188, "learning_rate": 5.1720048196006376e-05, "loss": 0.1265, "step": 12070 }, { "epoch": 0.6432717397092497, "grad_norm": 0.040204983204603195, "learning_rate": 5.166304335227396e-05, "loss": 0.1262, "step": 12080 }, { "epoch": 0.6438042494275521, "grad_norm": 0.11371159553527832, "learning_rate": 5.160607986722186e-05, "loss": 0.1263, "step": 12090 }, { "epoch": 0.6443367591458544, "grad_norm": 0.037854380905628204, "learning_rate": 5.154915791749715e-05, "loss": 0.1265, "step": 12100 }, { "epoch": 0.6448692688641567, "grad_norm": 0.07416236400604248, "learning_rate": 5.1492277679618104e-05, "loss": 0.1261, "step": 12110 }, { "epoch": 0.6454017785824592, "grad_norm": 0.0411413200199604, "learning_rate": 5.1435439329973664e-05, "loss": 0.1262, "step": 12120 }, { "epoch": 0.6459342883007615, "grad_norm": 0.06878205388784409, "learning_rate": 5.1378643044822884e-05, "loss": 0.1268, "step": 12130 }, { "epoch": 0.6464667980190638, "grad_norm": 0.05869507044553757, "learning_rate": 5.132188900029433e-05, "loss": 0.1258, "step": 12140 }, { "epoch": 0.6469993077373662, "grad_norm": 0.049447815865278244, "learning_rate": 5.126517737238563e-05, "loss": 0.1266, "step": 12150 }, { "epoch": 0.6475318174556686, "grad_norm": 0.09829236567020416, "learning_rate": 5.120850833696282e-05, "loss": 0.1262, "step": 12160 }, { "epoch": 0.6480643271739709, "grad_norm": 0.03944886848330498, "learning_rate": 5.115188206975992e-05, "loss": 0.127, "step": 12170 }, { "epoch": 0.6485968368922733, "grad_norm": 0.038028497248888016, "learning_rate": 5.109529874637824e-05, "loss": 0.1266, "step": 12180 }, { "epoch": 0.6491293466105756, "grad_norm": 0.05182207375764847, "learning_rate": 5.103875854228601e-05, "loss": 0.1259, "step": 12190 }, { "epoch": 0.649661856328878, "grad_norm": 0.04164310172200203, "learning_rate": 5.098226163281767e-05, "loss": 0.1256, "step": 12200 }, { "epoch": 0.6501943660471804, "grad_norm": 0.04075628146529198, "learning_rate": 5.0925808193173454e-05, "loss": 0.1267, "step": 12210 }, { "epoch": 0.6507268757654827, "grad_norm": 0.05309925228357315, "learning_rate": 5.0869398398418744e-05, "loss": 0.1257, "step": 12220 }, { "epoch": 0.651259385483785, "grad_norm": 0.05489126220345497, "learning_rate": 5.081303242348363e-05, "loss": 0.1262, "step": 12230 }, { "epoch": 0.6517918952020875, "grad_norm": 0.06438528001308441, "learning_rate": 5.075671044316228e-05, "loss": 0.1268, "step": 12240 }, { "epoch": 0.6523244049203898, "grad_norm": 0.045175325125455856, "learning_rate": 5.070043263211242e-05, "loss": 0.1262, "step": 12250 }, { "epoch": 0.6528569146386921, "grad_norm": 0.05378909409046173, "learning_rate": 5.064419916485485e-05, "loss": 0.1264, "step": 12260 }, { "epoch": 0.6533894243569945, "grad_norm": 0.05675683543086052, "learning_rate": 5.058801021577282e-05, "loss": 0.1266, "step": 12270 }, { "epoch": 0.6539219340752969, "grad_norm": 0.0350642092525959, "learning_rate": 5.053186595911152e-05, "loss": 0.1269, "step": 12280 }, { "epoch": 0.6544544437935992, "grad_norm": 0.05776926130056381, "learning_rate": 5.0475766568977586e-05, "loss": 0.1264, "step": 12290 }, { "epoch": 0.6549869535119016, "grad_norm": 0.053256552666425705, "learning_rate": 5.041971221933851e-05, "loss": 0.127, "step": 12300 }, { "epoch": 0.6555194632302039, "grad_norm": 0.03956317901611328, "learning_rate": 5.0363703084022065e-05, "loss": 0.1265, "step": 12310 }, { "epoch": 0.6560519729485063, "grad_norm": 0.033014725893735886, "learning_rate": 5.0307739336715864e-05, "loss": 0.1261, "step": 12320 }, { "epoch": 0.6565844826668087, "grad_norm": 0.040162548422813416, "learning_rate": 5.0251821150966746e-05, "loss": 0.1266, "step": 12330 }, { "epoch": 0.657116992385111, "grad_norm": 0.08668463677167892, "learning_rate": 5.0195948700180294e-05, "loss": 0.1256, "step": 12340 }, { "epoch": 0.6576495021034134, "grad_norm": 0.05517444759607315, "learning_rate": 5.0140122157620185e-05, "loss": 0.1262, "step": 12350 }, { "epoch": 0.6581820118217158, "grad_norm": 0.05813097953796387, "learning_rate": 5.008434169640781e-05, "loss": 0.1258, "step": 12360 }, { "epoch": 0.6587145215400181, "grad_norm": 0.036499012261629105, "learning_rate": 5.002860748952165e-05, "loss": 0.1261, "step": 12370 }, { "epoch": 0.6592470312583205, "grad_norm": 0.03472182899713516, "learning_rate": 4.997291970979672e-05, "loss": 0.1268, "step": 12380 }, { "epoch": 0.6597795409766228, "grad_norm": 0.06570050120353699, "learning_rate": 4.9917278529924036e-05, "loss": 0.1265, "step": 12390 }, { "epoch": 0.6603120506949252, "grad_norm": 0.0551844947040081, "learning_rate": 4.9861684122450166e-05, "loss": 0.1262, "step": 12400 }, { "epoch": 0.6608445604132276, "grad_norm": 0.05402039363980293, "learning_rate": 4.9806136659776625e-05, "loss": 0.1264, "step": 12410 }, { "epoch": 0.6613770701315299, "grad_norm": 0.05242108181118965, "learning_rate": 4.975063631415934e-05, "loss": 0.1262, "step": 12420 }, { "epoch": 0.6619095798498322, "grad_norm": 0.04954907298088074, "learning_rate": 4.96951832577081e-05, "loss": 0.1259, "step": 12430 }, { "epoch": 0.6624420895681347, "grad_norm": 0.031282830983400345, "learning_rate": 4.963977766238604e-05, "loss": 0.1259, "step": 12440 }, { "epoch": 0.662974599286437, "grad_norm": 0.037177179008722305, "learning_rate": 4.95844197000092e-05, "loss": 0.1263, "step": 12450 }, { "epoch": 0.6635071090047393, "grad_norm": 0.059044573456048965, "learning_rate": 4.95291095422458e-05, "loss": 0.1251, "step": 12460 }, { "epoch": 0.6640396187230417, "grad_norm": 0.04457508400082588, "learning_rate": 4.94738473606159e-05, "loss": 0.127, "step": 12470 }, { "epoch": 0.6645721284413441, "grad_norm": 0.04706263169646263, "learning_rate": 4.941863332649072e-05, "loss": 0.1256, "step": 12480 }, { "epoch": 0.6651046381596464, "grad_norm": 0.04623222351074219, "learning_rate": 4.936346761109223e-05, "loss": 0.1256, "step": 12490 }, { "epoch": 0.6656371478779488, "grad_norm": 0.04978486895561218, "learning_rate": 4.9308350385492494e-05, "loss": 0.126, "step": 12500 }, { "epoch": 0.6661696575962511, "grad_norm": 0.055559538304805756, "learning_rate": 4.925328182061326e-05, "loss": 0.126, "step": 12510 }, { "epoch": 0.6667021673145535, "grad_norm": 0.04878619685769081, "learning_rate": 4.9198262087225375e-05, "loss": 0.1262, "step": 12520 }, { "epoch": 0.6672346770328559, "grad_norm": 0.045161280781030655, "learning_rate": 4.9143291355948225e-05, "loss": 0.1259, "step": 12530 }, { "epoch": 0.6677671867511582, "grad_norm": 0.08460939675569534, "learning_rate": 4.9088369797249234e-05, "loss": 0.1261, "step": 12540 }, { "epoch": 0.6682996964694605, "grad_norm": 0.05501072108745575, "learning_rate": 4.903349758144339e-05, "loss": 0.1265, "step": 12550 }, { "epoch": 0.668832206187763, "grad_norm": 0.04381651058793068, "learning_rate": 4.897867487869262e-05, "loss": 0.1259, "step": 12560 }, { "epoch": 0.6693647159060653, "grad_norm": 0.051561057567596436, "learning_rate": 4.8923901859005335e-05, "loss": 0.1261, "step": 12570 }, { "epoch": 0.6698972256243676, "grad_norm": 0.03529192507266998, "learning_rate": 4.886917869223585e-05, "loss": 0.1261, "step": 12580 }, { "epoch": 0.67042973534267, "grad_norm": 0.047314297407865524, "learning_rate": 4.881450554808389e-05, "loss": 0.1266, "step": 12590 }, { "epoch": 0.6709622450609724, "grad_norm": 0.06020704656839371, "learning_rate": 4.875988259609407e-05, "loss": 0.1258, "step": 12600 }, { "epoch": 0.6714947547792747, "grad_norm": 0.040109023451805115, "learning_rate": 4.870531000565537e-05, "loss": 0.1261, "step": 12610 }, { "epoch": 0.6720272644975771, "grad_norm": 0.03262796998023987, "learning_rate": 4.865078794600053e-05, "loss": 0.1264, "step": 12620 }, { "epoch": 0.6725597742158794, "grad_norm": 0.039773985743522644, "learning_rate": 4.859631658620569e-05, "loss": 0.1265, "step": 12630 }, { "epoch": 0.6730922839341817, "grad_norm": 0.046503521502017975, "learning_rate": 4.854189609518969e-05, "loss": 0.1259, "step": 12640 }, { "epoch": 0.6736247936524842, "grad_norm": 0.04692930728197098, "learning_rate": 4.848752664171362e-05, "loss": 0.1258, "step": 12650 }, { "epoch": 0.6741573033707865, "grad_norm": 0.04684825614094734, "learning_rate": 4.843320839438035e-05, "loss": 0.1266, "step": 12660 }, { "epoch": 0.6746898130890889, "grad_norm": 0.051075540482997894, "learning_rate": 4.837894152163395e-05, "loss": 0.1257, "step": 12670 }, { "epoch": 0.6752223228073913, "grad_norm": 0.08315866440534592, "learning_rate": 4.832472619175913e-05, "loss": 0.1256, "step": 12680 }, { "epoch": 0.6757548325256936, "grad_norm": 0.05411198362708092, "learning_rate": 4.827056257288079e-05, "loss": 0.1265, "step": 12690 }, { "epoch": 0.676287342243996, "grad_norm": 0.045681897550821304, "learning_rate": 4.821645083296347e-05, "loss": 0.1264, "step": 12700 }, { "epoch": 0.6768198519622983, "grad_norm": 0.03494982793927193, "learning_rate": 4.8162391139810845e-05, "loss": 0.1265, "step": 12710 }, { "epoch": 0.6773523616806006, "grad_norm": 0.04865271970629692, "learning_rate": 4.8108383661065185e-05, "loss": 0.126, "step": 12720 }, { "epoch": 0.6778848713989031, "grad_norm": 0.055251702666282654, "learning_rate": 4.805442856420682e-05, "loss": 0.1264, "step": 12730 }, { "epoch": 0.6784173811172054, "grad_norm": 0.051663871854543686, "learning_rate": 4.800052601655362e-05, "loss": 0.1255, "step": 12740 }, { "epoch": 0.6789498908355077, "grad_norm": 0.05289029702544212, "learning_rate": 4.794667618526057e-05, "loss": 0.126, "step": 12750 }, { "epoch": 0.6794824005538102, "grad_norm": 0.05994449183344841, "learning_rate": 4.7892879237319136e-05, "loss": 0.1261, "step": 12760 }, { "epoch": 0.6800149102721125, "grad_norm": 0.03531305119395256, "learning_rate": 4.783913533955675e-05, "loss": 0.1265, "step": 12770 }, { "epoch": 0.6805474199904148, "grad_norm": 0.04480816796422005, "learning_rate": 4.7785444658636427e-05, "loss": 0.126, "step": 12780 }, { "epoch": 0.6810799297087172, "grad_norm": 0.045987531542778015, "learning_rate": 4.773180736105607e-05, "loss": 0.1268, "step": 12790 }, { "epoch": 0.6816124394270195, "grad_norm": 0.06958389282226562, "learning_rate": 4.767822361314805e-05, "loss": 0.1259, "step": 12800 }, { "epoch": 0.6821449491453219, "grad_norm": 0.03225488215684891, "learning_rate": 4.762469358107873e-05, "loss": 0.1257, "step": 12810 }, { "epoch": 0.6826774588636243, "grad_norm": 0.05594348534941673, "learning_rate": 4.757121743084784e-05, "loss": 0.1261, "step": 12820 }, { "epoch": 0.6832099685819266, "grad_norm": 0.06453056633472443, "learning_rate": 4.751779532828806e-05, "loss": 0.1263, "step": 12830 }, { "epoch": 0.6837424783002289, "grad_norm": 0.05000981315970421, "learning_rate": 4.746442743906442e-05, "loss": 0.1257, "step": 12840 }, { "epoch": 0.6842749880185314, "grad_norm": 0.06121028959751129, "learning_rate": 4.741111392867386e-05, "loss": 0.1258, "step": 12850 }, { "epoch": 0.6848074977368337, "grad_norm": 0.04783171787858009, "learning_rate": 4.7357854962444686e-05, "loss": 0.1259, "step": 12860 }, { "epoch": 0.685340007455136, "grad_norm": 0.0399705208837986, "learning_rate": 4.7304650705536084e-05, "loss": 0.1261, "step": 12870 }, { "epoch": 0.6858725171734384, "grad_norm": 0.03981216251850128, "learning_rate": 4.7251501322937534e-05, "loss": 0.126, "step": 12880 }, { "epoch": 0.6864050268917408, "grad_norm": 0.034330256283283234, "learning_rate": 4.7198406979468366e-05, "loss": 0.126, "step": 12890 }, { "epoch": 0.6869375366100431, "grad_norm": 0.06301886588335037, "learning_rate": 4.7145367839777237e-05, "loss": 0.1255, "step": 12900 }, { "epoch": 0.6874700463283455, "grad_norm": 0.07015033811330795, "learning_rate": 4.709238406834164e-05, "loss": 0.1257, "step": 12910 }, { "epoch": 0.6880025560466478, "grad_norm": 0.04845889285206795, "learning_rate": 4.703945582946729e-05, "loss": 0.1266, "step": 12920 }, { "epoch": 0.6885350657649502, "grad_norm": 0.043622374534606934, "learning_rate": 4.69865832872878e-05, "loss": 0.1255, "step": 12930 }, { "epoch": 0.6890675754832526, "grad_norm": 0.046708934009075165, "learning_rate": 4.6933766605763955e-05, "loss": 0.1262, "step": 12940 }, { "epoch": 0.6896000852015549, "grad_norm": 0.09181608259677887, "learning_rate": 4.688100594868341e-05, "loss": 0.1262, "step": 12950 }, { "epoch": 0.6901325949198572, "grad_norm": 0.0670885518193245, "learning_rate": 4.682830147965999e-05, "loss": 0.1262, "step": 12960 }, { "epoch": 0.6906651046381597, "grad_norm": 0.05586402490735054, "learning_rate": 4.6775653362133356e-05, "loss": 0.126, "step": 12970 }, { "epoch": 0.691197614356462, "grad_norm": 0.0520888976752758, "learning_rate": 4.6723061759368405e-05, "loss": 0.1256, "step": 12980 }, { "epoch": 0.6917301240747644, "grad_norm": 0.04255915433168411, "learning_rate": 4.667052683445474e-05, "loss": 0.1256, "step": 12990 }, { "epoch": 0.6922626337930667, "grad_norm": 0.04210617393255234, "learning_rate": 4.661804875030623e-05, "loss": 0.1259, "step": 13000 }, { "epoch": 0.6927951435113691, "grad_norm": 0.049725860357284546, "learning_rate": 4.656562766966047e-05, "loss": 0.1259, "step": 13010 }, { "epoch": 0.6933276532296715, "grad_norm": 0.04117880016565323, "learning_rate": 4.6513263755078305e-05, "loss": 0.1252, "step": 13020 }, { "epoch": 0.6938601629479738, "grad_norm": 0.06293977797031403, "learning_rate": 4.6460957168943286e-05, "loss": 0.1257, "step": 13030 }, { "epoch": 0.6943926726662761, "grad_norm": 0.08014130592346191, "learning_rate": 4.640870807346116e-05, "loss": 0.1259, "step": 13040 }, { "epoch": 0.6949251823845786, "grad_norm": 0.05291053652763367, "learning_rate": 4.6356516630659444e-05, "loss": 0.1261, "step": 13050 }, { "epoch": 0.6954576921028809, "grad_norm": 0.061003703624010086, "learning_rate": 4.630438300238684e-05, "loss": 0.1256, "step": 13060 }, { "epoch": 0.6959902018211832, "grad_norm": 0.06733989715576172, "learning_rate": 4.625230735031276e-05, "loss": 0.1259, "step": 13070 }, { "epoch": 0.6965227115394856, "grad_norm": 0.07360579818487167, "learning_rate": 4.620028983592687e-05, "loss": 0.1261, "step": 13080 }, { "epoch": 0.697055221257788, "grad_norm": 0.0536913201212883, "learning_rate": 4.6148330620538474e-05, "loss": 0.1263, "step": 13090 }, { "epoch": 0.6975877309760903, "grad_norm": 0.04993463680148125, "learning_rate": 4.609642986527615e-05, "loss": 0.1262, "step": 13100 }, { "epoch": 0.6981202406943927, "grad_norm": 0.04666028916835785, "learning_rate": 4.6044587731087155e-05, "loss": 0.1258, "step": 13110 }, { "epoch": 0.698652750412695, "grad_norm": 0.07630308717489243, "learning_rate": 4.599280437873699e-05, "loss": 0.1259, "step": 13120 }, { "epoch": 0.6991852601309974, "grad_norm": 0.07336148619651794, "learning_rate": 4.594107996880884e-05, "loss": 0.1261, "step": 13130 }, { "epoch": 0.6997177698492998, "grad_norm": 0.0583728589117527, "learning_rate": 4.588941466170312e-05, "loss": 0.1263, "step": 13140 }, { "epoch": 0.7002502795676021, "grad_norm": 0.04657367989420891, "learning_rate": 4.5837808617636935e-05, "loss": 0.1254, "step": 13150 }, { "epoch": 0.7007827892859044, "grad_norm": 0.03507756069302559, "learning_rate": 4.5786261996643664e-05, "loss": 0.1255, "step": 13160 }, { "epoch": 0.7013152990042069, "grad_norm": 0.10152143239974976, "learning_rate": 4.57347749585724e-05, "loss": 0.1261, "step": 13170 }, { "epoch": 0.7018478087225092, "grad_norm": 0.04690668731927872, "learning_rate": 4.568334766308741e-05, "loss": 0.126, "step": 13180 }, { "epoch": 0.7023803184408115, "grad_norm": 0.050610288977622986, "learning_rate": 4.563198026966776e-05, "loss": 0.1259, "step": 13190 }, { "epoch": 0.7029128281591139, "grad_norm": 0.07565128803253174, "learning_rate": 4.558067293760672e-05, "loss": 0.1253, "step": 13200 }, { "epoch": 0.7034453378774163, "grad_norm": 0.03581630438566208, "learning_rate": 4.552942582601134e-05, "loss": 0.125, "step": 13210 }, { "epoch": 0.7039778475957186, "grad_norm": 0.06287883222103119, "learning_rate": 4.547823909380188e-05, "loss": 0.1264, "step": 13220 }, { "epoch": 0.704510357314021, "grad_norm": 0.03954106569290161, "learning_rate": 4.542711289971139e-05, "loss": 0.1258, "step": 13230 }, { "epoch": 0.7050428670323233, "grad_norm": 0.07557252049446106, "learning_rate": 4.537604740228517e-05, "loss": 0.125, "step": 13240 }, { "epoch": 0.7055753767506256, "grad_norm": 0.07294019311666489, "learning_rate": 4.532504275988033e-05, "loss": 0.1259, "step": 13250 }, { "epoch": 0.7061078864689281, "grad_norm": 0.06293601542711258, "learning_rate": 4.527409913066522e-05, "loss": 0.1259, "step": 13260 }, { "epoch": 0.7066403961872304, "grad_norm": 0.043173741549253464, "learning_rate": 4.5223216672619e-05, "loss": 0.1255, "step": 13270 }, { "epoch": 0.7071729059055327, "grad_norm": 0.05080621689558029, "learning_rate": 4.517239554353116e-05, "loss": 0.1256, "step": 13280 }, { "epoch": 0.7077054156238352, "grad_norm": 0.04947923868894577, "learning_rate": 4.512163590100097e-05, "loss": 0.1257, "step": 13290 }, { "epoch": 0.7082379253421375, "grad_norm": 0.054971180856227875, "learning_rate": 4.507093790243704e-05, "loss": 0.1259, "step": 13300 }, { "epoch": 0.7087704350604399, "grad_norm": 0.0491788424551487, "learning_rate": 4.5020301705056825e-05, "loss": 0.1251, "step": 13310 }, { "epoch": 0.7093029447787422, "grad_norm": 0.06562227010726929, "learning_rate": 4.496972746588614e-05, "loss": 0.1256, "step": 13320 }, { "epoch": 0.7098354544970445, "grad_norm": 0.05146334320306778, "learning_rate": 4.4919215341758614e-05, "loss": 0.1259, "step": 13330 }, { "epoch": 0.710367964215347, "grad_norm": 0.04213017597794533, "learning_rate": 4.486876548931533e-05, "loss": 0.1258, "step": 13340 }, { "epoch": 0.7109004739336493, "grad_norm": 0.07794417440891266, "learning_rate": 4.481837806500419e-05, "loss": 0.1255, "step": 13350 }, { "epoch": 0.7114329836519516, "grad_norm": 0.06125866621732712, "learning_rate": 4.4768053225079565e-05, "loss": 0.1255, "step": 13360 }, { "epoch": 0.711965493370254, "grad_norm": 0.056374140083789825, "learning_rate": 4.471779112560168e-05, "loss": 0.1262, "step": 13370 }, { "epoch": 0.7124980030885564, "grad_norm": 0.05176498368382454, "learning_rate": 4.466759192243627e-05, "loss": 0.1264, "step": 13380 }, { "epoch": 0.7130305128068587, "grad_norm": 0.1045917272567749, "learning_rate": 4.461745577125399e-05, "loss": 0.1253, "step": 13390 }, { "epoch": 0.7135630225251611, "grad_norm": 0.03207787126302719, "learning_rate": 4.456738282752996e-05, "loss": 0.125, "step": 13400 }, { "epoch": 0.7140955322434634, "grad_norm": 0.06704405695199966, "learning_rate": 4.451737324654328e-05, "loss": 0.1253, "step": 13410 }, { "epoch": 0.7146280419617658, "grad_norm": 0.0693150982260704, "learning_rate": 4.4467427183376596e-05, "loss": 0.1257, "step": 13420 }, { "epoch": 0.7151605516800682, "grad_norm": 0.04728610813617706, "learning_rate": 4.441754479291557e-05, "loss": 0.125, "step": 13430 }, { "epoch": 0.7156930613983705, "grad_norm": 0.05448344349861145, "learning_rate": 4.43677262298484e-05, "loss": 0.1261, "step": 13440 }, { "epoch": 0.7162255711166728, "grad_norm": 0.036701589822769165, "learning_rate": 4.431797164866533e-05, "loss": 0.1259, "step": 13450 }, { "epoch": 0.7167580808349753, "grad_norm": 0.043069060891866684, "learning_rate": 4.426828120365824e-05, "loss": 0.1254, "step": 13460 }, { "epoch": 0.7172905905532776, "grad_norm": 0.07068092375993729, "learning_rate": 4.421865504892011e-05, "loss": 0.1251, "step": 13470 }, { "epoch": 0.7178231002715799, "grad_norm": 0.045086752623319626, "learning_rate": 4.416909333834451e-05, "loss": 0.1259, "step": 13480 }, { "epoch": 0.7183556099898823, "grad_norm": 0.03668762743473053, "learning_rate": 4.4119596225625216e-05, "loss": 0.1254, "step": 13490 }, { "epoch": 0.7188881197081847, "grad_norm": 0.05897703021764755, "learning_rate": 4.4070163864255644e-05, "loss": 0.1256, "step": 13500 }, { "epoch": 0.719420629426487, "grad_norm": 0.047495052218437195, "learning_rate": 4.4020796407528455e-05, "loss": 0.126, "step": 13510 }, { "epoch": 0.7199531391447894, "grad_norm": 0.06927572190761566, "learning_rate": 4.397149400853498e-05, "loss": 0.1256, "step": 13520 }, { "epoch": 0.7204856488630917, "grad_norm": 0.03571341931819916, "learning_rate": 4.3922256820164856e-05, "loss": 0.1257, "step": 13530 }, { "epoch": 0.7210181585813941, "grad_norm": 0.04303283616900444, "learning_rate": 4.3873084995105475e-05, "loss": 0.1255, "step": 13540 }, { "epoch": 0.7215506682996965, "grad_norm": 0.05867360904812813, "learning_rate": 4.382397868584151e-05, "loss": 0.1257, "step": 13550 }, { "epoch": 0.7220831780179988, "grad_norm": 0.05930043384432793, "learning_rate": 4.377493804465452e-05, "loss": 0.1254, "step": 13560 }, { "epoch": 0.7226156877363011, "grad_norm": 0.04837455227971077, "learning_rate": 4.372596322362237e-05, "loss": 0.1256, "step": 13570 }, { "epoch": 0.7231481974546036, "grad_norm": 0.04174095019698143, "learning_rate": 4.3677054374618844e-05, "loss": 0.1255, "step": 13580 }, { "epoch": 0.7236807071729059, "grad_norm": 0.0439835861325264, "learning_rate": 4.3628211649313164e-05, "loss": 0.1256, "step": 13590 }, { "epoch": 0.7242132168912082, "grad_norm": 0.06301723420619965, "learning_rate": 4.357943519916942e-05, "loss": 0.1263, "step": 13600 }, { "epoch": 0.7247457266095106, "grad_norm": 0.06303390860557556, "learning_rate": 4.353072517544624e-05, "loss": 0.1254, "step": 13610 }, { "epoch": 0.725278236327813, "grad_norm": 0.06305810809135437, "learning_rate": 4.348208172919626e-05, "loss": 0.1252, "step": 13620 }, { "epoch": 0.7258107460461154, "grad_norm": 0.08300595730543137, "learning_rate": 4.343350501126566e-05, "loss": 0.1258, "step": 13630 }, { "epoch": 0.7263432557644177, "grad_norm": 0.054570749402046204, "learning_rate": 4.338499517229365e-05, "loss": 0.1255, "step": 13640 }, { "epoch": 0.72687576548272, "grad_norm": 0.06060722470283508, "learning_rate": 4.333655236271207e-05, "loss": 0.1259, "step": 13650 }, { "epoch": 0.7274082752010225, "grad_norm": 0.04300956055521965, "learning_rate": 4.328817673274491e-05, "loss": 0.1263, "step": 13660 }, { "epoch": 0.7279407849193248, "grad_norm": 0.07667776197195053, "learning_rate": 4.3239868432407804e-05, "loss": 0.1256, "step": 13670 }, { "epoch": 0.7284732946376271, "grad_norm": 0.04549676924943924, "learning_rate": 4.3191627611507625e-05, "loss": 0.126, "step": 13680 }, { "epoch": 0.7290058043559295, "grad_norm": 0.03738940879702568, "learning_rate": 4.314345441964197e-05, "loss": 0.1258, "step": 13690 }, { "epoch": 0.7295383140742319, "grad_norm": 0.04415878280997276, "learning_rate": 4.3095349006198704e-05, "loss": 0.1253, "step": 13700 }, { "epoch": 0.7300708237925342, "grad_norm": 0.04898412898182869, "learning_rate": 4.304731152035552e-05, "loss": 0.1254, "step": 13710 }, { "epoch": 0.7306033335108366, "grad_norm": 0.06354010105133057, "learning_rate": 4.299934211107947e-05, "loss": 0.1259, "step": 13720 }, { "epoch": 0.7311358432291389, "grad_norm": 0.055742815136909485, "learning_rate": 4.295144092712648e-05, "loss": 0.1253, "step": 13730 }, { "epoch": 0.7316683529474413, "grad_norm": 0.06040511652827263, "learning_rate": 4.290360811704094e-05, "loss": 0.1254, "step": 13740 }, { "epoch": 0.7322008626657437, "grad_norm": 0.04668520390987396, "learning_rate": 4.2855843829155166e-05, "loss": 0.1251, "step": 13750 }, { "epoch": 0.732733372384046, "grad_norm": 0.045908767729997635, "learning_rate": 4.280814821158899e-05, "loss": 0.1257, "step": 13760 }, { "epoch": 0.7332658821023483, "grad_norm": 0.06110945716500282, "learning_rate": 4.276052141224931e-05, "loss": 0.1253, "step": 13770 }, { "epoch": 0.7337983918206508, "grad_norm": 0.05198313668370247, "learning_rate": 4.271296357882962e-05, "loss": 0.1254, "step": 13780 }, { "epoch": 0.7343309015389531, "grad_norm": 0.08895553648471832, "learning_rate": 4.266547485880954e-05, "loss": 0.1245, "step": 13790 }, { "epoch": 0.7348634112572554, "grad_norm": 0.0400872640311718, "learning_rate": 4.261805539945433e-05, "loss": 0.1258, "step": 13800 }, { "epoch": 0.7353959209755578, "grad_norm": 0.05898161605000496, "learning_rate": 4.257070534781452e-05, "loss": 0.1257, "step": 13810 }, { "epoch": 0.7359284306938602, "grad_norm": 0.05569084361195564, "learning_rate": 4.2523424850725366e-05, "loss": 0.1256, "step": 13820 }, { "epoch": 0.7364609404121625, "grad_norm": 0.03815682604908943, "learning_rate": 4.2476214054806464e-05, "loss": 0.1258, "step": 13830 }, { "epoch": 0.7369934501304649, "grad_norm": 0.05617569014430046, "learning_rate": 4.242907310646124e-05, "loss": 0.1256, "step": 13840 }, { "epoch": 0.7375259598487672, "grad_norm": 0.036379266530275345, "learning_rate": 4.238200215187653e-05, "loss": 0.125, "step": 13850 }, { "epoch": 0.7380584695670696, "grad_norm": 0.03899050131440163, "learning_rate": 4.233500133702209e-05, "loss": 0.1252, "step": 13860 }, { "epoch": 0.738590979285372, "grad_norm": 0.08941038697957993, "learning_rate": 4.2288070807650195e-05, "loss": 0.1258, "step": 13870 }, { "epoch": 0.7391234890036743, "grad_norm": 0.04613060876727104, "learning_rate": 4.2241210709295157e-05, "loss": 0.1257, "step": 13880 }, { "epoch": 0.7396559987219766, "grad_norm": 0.05115320160984993, "learning_rate": 4.219442118727289e-05, "loss": 0.1253, "step": 13890 }, { "epoch": 0.7401885084402791, "grad_norm": 0.050107911229133606, "learning_rate": 4.214770238668041e-05, "loss": 0.1249, "step": 13900 }, { "epoch": 0.7407210181585814, "grad_norm": 0.05156391113996506, "learning_rate": 4.210105445239544e-05, "loss": 0.1247, "step": 13910 }, { "epoch": 0.7412535278768837, "grad_norm": 0.03768635913729668, "learning_rate": 4.205447752907594e-05, "loss": 0.1255, "step": 13920 }, { "epoch": 0.7417860375951861, "grad_norm": 0.0679803118109703, "learning_rate": 4.20079717611597e-05, "loss": 0.1251, "step": 13930 }, { "epoch": 0.7423185473134885, "grad_norm": 0.04139627143740654, "learning_rate": 4.196153729286377e-05, "loss": 0.1254, "step": 13940 }, { "epoch": 0.7428510570317909, "grad_norm": 0.07428357750177383, "learning_rate": 4.191517426818419e-05, "loss": 0.1261, "step": 13950 }, { "epoch": 0.7433835667500932, "grad_norm": 0.04479588195681572, "learning_rate": 4.186888283089537e-05, "loss": 0.1251, "step": 13960 }, { "epoch": 0.7439160764683955, "grad_norm": 0.06392911076545715, "learning_rate": 4.182266312454977e-05, "loss": 0.1256, "step": 13970 }, { "epoch": 0.744448586186698, "grad_norm": 0.054266393184661865, "learning_rate": 4.177651529247739e-05, "loss": 0.125, "step": 13980 }, { "epoch": 0.7449810959050003, "grad_norm": 0.06033441051840782, "learning_rate": 4.173043947778536e-05, "loss": 0.1253, "step": 13990 }, { "epoch": 0.7455136056233026, "grad_norm": 0.04095017537474632, "learning_rate": 4.1684435823357454e-05, "loss": 0.1255, "step": 14000 }, { "epoch": 0.746046115341605, "grad_norm": 0.037291690707206726, "learning_rate": 4.163850447185369e-05, "loss": 0.1245, "step": 14010 }, { "epoch": 0.7465786250599074, "grad_norm": 0.06799422949552536, "learning_rate": 4.159264556570986e-05, "loss": 0.1255, "step": 14020 }, { "epoch": 0.7471111347782097, "grad_norm": 0.044954586774110794, "learning_rate": 4.1546859247137124e-05, "loss": 0.1264, "step": 14030 }, { "epoch": 0.7476436444965121, "grad_norm": 0.041422173380851746, "learning_rate": 4.1501145658121525e-05, "loss": 0.1254, "step": 14040 }, { "epoch": 0.7481761542148144, "grad_norm": 0.09260525554418564, "learning_rate": 4.145550494042356e-05, "loss": 0.1244, "step": 14050 }, { "epoch": 0.7487086639331167, "grad_norm": 0.03994472324848175, "learning_rate": 4.140993723557775e-05, "loss": 0.1262, "step": 14060 }, { "epoch": 0.7492411736514192, "grad_norm": 0.04395360127091408, "learning_rate": 4.136444268489221e-05, "loss": 0.1263, "step": 14070 }, { "epoch": 0.7497736833697215, "grad_norm": 0.0409519337117672, "learning_rate": 4.1319021429448204e-05, "loss": 0.126, "step": 14080 }, { "epoch": 0.7503061930880238, "grad_norm": 0.048877742141485214, "learning_rate": 4.1273673610099675e-05, "loss": 0.1253, "step": 14090 }, { "epoch": 0.7508387028063263, "grad_norm": 0.04159548133611679, "learning_rate": 4.122839936747289e-05, "loss": 0.1255, "step": 14100 }, { "epoch": 0.7513712125246286, "grad_norm": 0.036307524889707565, "learning_rate": 4.118319884196587e-05, "loss": 0.1249, "step": 14110 }, { "epoch": 0.7519037222429309, "grad_norm": 0.039279136806726456, "learning_rate": 4.1138072173748116e-05, "loss": 0.125, "step": 14120 }, { "epoch": 0.7524362319612333, "grad_norm": 0.05518367886543274, "learning_rate": 4.109301950276003e-05, "loss": 0.1256, "step": 14130 }, { "epoch": 0.7529687416795356, "grad_norm": 0.043891094624996185, "learning_rate": 4.104804096871259e-05, "loss": 0.1257, "step": 14140 }, { "epoch": 0.753501251397838, "grad_norm": 0.045587554574012756, "learning_rate": 4.1003136711086875e-05, "loss": 0.1263, "step": 14150 }, { "epoch": 0.7540337611161404, "grad_norm": 0.05378378927707672, "learning_rate": 4.0958306869133555e-05, "loss": 0.1253, "step": 14160 }, { "epoch": 0.7545662708344427, "grad_norm": 0.05503176152706146, "learning_rate": 4.091355158187261e-05, "loss": 0.1258, "step": 14170 }, { "epoch": 0.755098780552745, "grad_norm": 0.10129349678754807, "learning_rate": 4.0868870988092795e-05, "loss": 0.1244, "step": 14180 }, { "epoch": 0.7556312902710475, "grad_norm": 0.05240345746278763, "learning_rate": 4.082426522635125e-05, "loss": 0.1257, "step": 14190 }, { "epoch": 0.7561637999893498, "grad_norm": 0.05487096309661865, "learning_rate": 4.077973443497303e-05, "loss": 0.1258, "step": 14200 }, { "epoch": 0.7566963097076521, "grad_norm": 0.04014230892062187, "learning_rate": 4.073527875205071e-05, "loss": 0.1246, "step": 14210 }, { "epoch": 0.7572288194259545, "grad_norm": 0.04955144226551056, "learning_rate": 4.0690898315443955e-05, "loss": 0.1252, "step": 14220 }, { "epoch": 0.7577613291442569, "grad_norm": 0.05915694311261177, "learning_rate": 4.064659326277911e-05, "loss": 0.1255, "step": 14230 }, { "epoch": 0.7582938388625592, "grad_norm": 0.07433107495307922, "learning_rate": 4.0602363731448696e-05, "loss": 0.1247, "step": 14240 }, { "epoch": 0.7588263485808616, "grad_norm": 0.041825130581855774, "learning_rate": 4.0558209858611093e-05, "loss": 0.1253, "step": 14250 }, { "epoch": 0.7593588582991639, "grad_norm": 0.07135327905416489, "learning_rate": 4.051413178119002e-05, "loss": 0.1253, "step": 14260 }, { "epoch": 0.7598913680174664, "grad_norm": 0.0779680609703064, "learning_rate": 4.0470129635874176e-05, "loss": 0.1242, "step": 14270 }, { "epoch": 0.7604238777357687, "grad_norm": 0.03307312726974487, "learning_rate": 4.042620355911677e-05, "loss": 0.1255, "step": 14280 }, { "epoch": 0.760956387454071, "grad_norm": 0.037016890943050385, "learning_rate": 4.0382353687135136e-05, "loss": 0.1249, "step": 14290 }, { "epoch": 0.7614888971723734, "grad_norm": 0.08262995630502701, "learning_rate": 4.0338580155910284e-05, "loss": 0.1249, "step": 14300 }, { "epoch": 0.7620214068906758, "grad_norm": 0.05580204352736473, "learning_rate": 4.029488310118648e-05, "loss": 0.1251, "step": 14310 }, { "epoch": 0.7625539166089781, "grad_norm": 0.0543997660279274, "learning_rate": 4.025126265847084e-05, "loss": 0.1261, "step": 14320 }, { "epoch": 0.7630864263272805, "grad_norm": 0.06115228682756424, "learning_rate": 4.02077189630329e-05, "loss": 0.1251, "step": 14330 }, { "epoch": 0.7636189360455828, "grad_norm": 0.046210747212171555, "learning_rate": 4.016425214990421e-05, "loss": 0.1255, "step": 14340 }, { "epoch": 0.7641514457638852, "grad_norm": 0.0543675497174263, "learning_rate": 4.0120862353877884e-05, "loss": 0.1258, "step": 14350 }, { "epoch": 0.7646839554821876, "grad_norm": 0.06712432205677032, "learning_rate": 4.007754970950821e-05, "loss": 0.1256, "step": 14360 }, { "epoch": 0.7652164652004899, "grad_norm": 0.050090424716472626, "learning_rate": 4.0034314351110216e-05, "loss": 0.1257, "step": 14370 }, { "epoch": 0.7657489749187922, "grad_norm": 0.039436932653188705, "learning_rate": 3.999115641275929e-05, "loss": 0.1259, "step": 14380 }, { "epoch": 0.7662814846370947, "grad_norm": 0.03885102644562721, "learning_rate": 3.994807602829068e-05, "loss": 0.125, "step": 14390 }, { "epoch": 0.766813994355397, "grad_norm": 0.03700343519449234, "learning_rate": 3.990507333129922e-05, "loss": 0.1254, "step": 14400 }, { "epoch": 0.7673465040736993, "grad_norm": 0.06743155419826508, "learning_rate": 3.986214845513874e-05, "loss": 0.1252, "step": 14410 }, { "epoch": 0.7678790137920017, "grad_norm": 0.05197859928011894, "learning_rate": 3.9819301532921807e-05, "loss": 0.1244, "step": 14420 }, { "epoch": 0.7684115235103041, "grad_norm": 0.09455039352178574, "learning_rate": 3.9776532697519206e-05, "loss": 0.1254, "step": 14430 }, { "epoch": 0.7689440332286064, "grad_norm": 0.04639993980526924, "learning_rate": 3.97338420815596e-05, "loss": 0.1252, "step": 14440 }, { "epoch": 0.7694765429469088, "grad_norm": 0.03305187448859215, "learning_rate": 3.969122981742909e-05, "loss": 0.1244, "step": 14450 }, { "epoch": 0.7700090526652111, "grad_norm": 0.06983647495508194, "learning_rate": 3.9648696037270786e-05, "loss": 0.1259, "step": 14460 }, { "epoch": 0.7705415623835135, "grad_norm": 0.07447967678308487, "learning_rate": 3.960624087298439e-05, "loss": 0.1251, "step": 14470 }, { "epoch": 0.7710740721018159, "grad_norm": 0.03923282399773598, "learning_rate": 3.956386445622589e-05, "loss": 0.1254, "step": 14480 }, { "epoch": 0.7716065818201182, "grad_norm": 0.05779058113694191, "learning_rate": 3.9521566918406984e-05, "loss": 0.1248, "step": 14490 }, { "epoch": 0.7721390915384205, "grad_norm": 0.043516259640455246, "learning_rate": 3.947934839069485e-05, "loss": 0.1248, "step": 14500 }, { "epoch": 0.772671601256723, "grad_norm": 0.05518548563122749, "learning_rate": 3.943720900401157e-05, "loss": 0.1261, "step": 14510 }, { "epoch": 0.7732041109750253, "grad_norm": 0.045164406299591064, "learning_rate": 3.939514888903383e-05, "loss": 0.1251, "step": 14520 }, { "epoch": 0.7737366206933276, "grad_norm": 0.042605891823768616, "learning_rate": 3.935316817619252e-05, "loss": 0.1251, "step": 14530 }, { "epoch": 0.77426913041163, "grad_norm": 0.05655062943696976, "learning_rate": 3.931126699567228e-05, "loss": 0.1258, "step": 14540 }, { "epoch": 0.7748016401299324, "grad_norm": 0.06695695966482162, "learning_rate": 3.926944547741112e-05, "loss": 0.1257, "step": 14550 }, { "epoch": 0.7753341498482347, "grad_norm": 0.0719684287905693, "learning_rate": 3.922770375109997e-05, "loss": 0.1256, "step": 14560 }, { "epoch": 0.7758666595665371, "grad_norm": 0.043789032846689224, "learning_rate": 3.918604194618241e-05, "loss": 0.1254, "step": 14570 }, { "epoch": 0.7763991692848394, "grad_norm": 0.03638778626918793, "learning_rate": 3.9144460191854075e-05, "loss": 0.1247, "step": 14580 }, { "epoch": 0.7769316790031419, "grad_norm": 0.0492616705596447, "learning_rate": 3.910295861706244e-05, "loss": 0.1248, "step": 14590 }, { "epoch": 0.7774641887214442, "grad_norm": 0.051167041063308716, "learning_rate": 3.906153735050632e-05, "loss": 0.1255, "step": 14600 }, { "epoch": 0.7779966984397465, "grad_norm": 0.09880778193473816, "learning_rate": 3.9020196520635454e-05, "loss": 0.1256, "step": 14610 }, { "epoch": 0.7785292081580489, "grad_norm": 0.06614736467599869, "learning_rate": 3.897893625565016e-05, "loss": 0.1248, "step": 14620 }, { "epoch": 0.7790617178763513, "grad_norm": 0.05390491709113121, "learning_rate": 3.893775668350095e-05, "loss": 0.125, "step": 14630 }, { "epoch": 0.7795942275946536, "grad_norm": 0.05455655977129936, "learning_rate": 3.8896657931888056e-05, "loss": 0.1248, "step": 14640 }, { "epoch": 0.780126737312956, "grad_norm": 0.06813376396894455, "learning_rate": 3.8855640128261135e-05, "loss": 0.1254, "step": 14650 }, { "epoch": 0.7806592470312583, "grad_norm": 0.03827499598264694, "learning_rate": 3.8814703399818756e-05, "loss": 0.1255, "step": 14660 }, { "epoch": 0.7811917567495607, "grad_norm": 0.07484028488397598, "learning_rate": 3.877384787350812e-05, "loss": 0.1258, "step": 14670 }, { "epoch": 0.7817242664678631, "grad_norm": 0.04092638939619064, "learning_rate": 3.873307367602458e-05, "loss": 0.1256, "step": 14680 }, { "epoch": 0.7822567761861654, "grad_norm": 0.044872015714645386, "learning_rate": 3.869238093381131e-05, "loss": 0.125, "step": 14690 }, { "epoch": 0.7827892859044677, "grad_norm": 0.053619783371686935, "learning_rate": 3.8651769773058894e-05, "loss": 0.1244, "step": 14700 }, { "epoch": 0.7833217956227702, "grad_norm": 0.053385183215141296, "learning_rate": 3.861124031970487e-05, "loss": 0.1252, "step": 14710 }, { "epoch": 0.7838543053410725, "grad_norm": 0.039729390293359756, "learning_rate": 3.857079269943348e-05, "loss": 0.1252, "step": 14720 }, { "epoch": 0.7843868150593748, "grad_norm": 0.055133990943431854, "learning_rate": 3.853042703767511e-05, "loss": 0.1255, "step": 14730 }, { "epoch": 0.7849193247776772, "grad_norm": 0.0439545176923275, "learning_rate": 3.849014345960605e-05, "loss": 0.1254, "step": 14740 }, { "epoch": 0.7854518344959796, "grad_norm": 0.04726070538163185, "learning_rate": 3.844994209014805e-05, "loss": 0.1243, "step": 14750 }, { "epoch": 0.7859843442142819, "grad_norm": 0.11086293309926987, "learning_rate": 3.840982305396787e-05, "loss": 0.1254, "step": 14760 }, { "epoch": 0.7865168539325843, "grad_norm": 0.05439605191349983, "learning_rate": 3.8369786475476986e-05, "loss": 0.1247, "step": 14770 }, { "epoch": 0.7870493636508866, "grad_norm": 0.0793689638376236, "learning_rate": 3.832983247883116e-05, "loss": 0.125, "step": 14780 }, { "epoch": 0.787581873369189, "grad_norm": 0.04240609332919121, "learning_rate": 3.8289961187930076e-05, "loss": 0.1255, "step": 14790 }, { "epoch": 0.7881143830874914, "grad_norm": 0.04460853338241577, "learning_rate": 3.825017272641693e-05, "loss": 0.1258, "step": 14800 }, { "epoch": 0.7886468928057937, "grad_norm": 0.04268253594636917, "learning_rate": 3.821046721767806e-05, "loss": 0.1256, "step": 14810 }, { "epoch": 0.789179402524096, "grad_norm": 0.07220305502414703, "learning_rate": 3.817084478484256e-05, "loss": 0.1258, "step": 14820 }, { "epoch": 0.7897119122423985, "grad_norm": 0.05979606509208679, "learning_rate": 3.8131305550781906e-05, "loss": 0.1262, "step": 14830 }, { "epoch": 0.7902444219607008, "grad_norm": 0.06392871588468552, "learning_rate": 3.8091849638109575e-05, "loss": 0.1244, "step": 14840 }, { "epoch": 0.7907769316790031, "grad_norm": 0.042751483619213104, "learning_rate": 3.8052477169180634e-05, "loss": 0.125, "step": 14850 }, { "epoch": 0.7913094413973055, "grad_norm": 0.03440069034695625, "learning_rate": 3.801318826609144e-05, "loss": 0.1255, "step": 14860 }, { "epoch": 0.7918419511156078, "grad_norm": 0.061454493552446365, "learning_rate": 3.797398305067914e-05, "loss": 0.1251, "step": 14870 }, { "epoch": 0.7923744608339102, "grad_norm": 0.058559708297252655, "learning_rate": 3.7934861644521405e-05, "loss": 0.125, "step": 14880 }, { "epoch": 0.7929069705522126, "grad_norm": 0.0407247468829155, "learning_rate": 3.789582416893599e-05, "loss": 0.1254, "step": 14890 }, { "epoch": 0.7934394802705149, "grad_norm": 0.05672033876180649, "learning_rate": 3.78568707449804e-05, "loss": 0.1253, "step": 14900 }, { "epoch": 0.7939719899888174, "grad_norm": 0.056891556829214096, "learning_rate": 3.781800149345146e-05, "loss": 0.1244, "step": 14910 }, { "epoch": 0.7945044997071197, "grad_norm": 0.05665665119886398, "learning_rate": 3.7779216534885e-05, "loss": 0.1249, "step": 14920 }, { "epoch": 0.795037009425422, "grad_norm": 0.03597261756658554, "learning_rate": 3.774051598955541e-05, "loss": 0.1239, "step": 14930 }, { "epoch": 0.7955695191437244, "grad_norm": 0.06507623195648193, "learning_rate": 3.770189997747536e-05, "loss": 0.1246, "step": 14940 }, { "epoch": 0.7961020288620267, "grad_norm": 0.05575447157025337, "learning_rate": 3.7663368618395365e-05, "loss": 0.1251, "step": 14950 }, { "epoch": 0.7966345385803291, "grad_norm": 0.07422123104333878, "learning_rate": 3.7624922031803403e-05, "loss": 0.1248, "step": 14960 }, { "epoch": 0.7971670482986315, "grad_norm": 0.03994056582450867, "learning_rate": 3.758656033692457e-05, "loss": 0.1254, "step": 14970 }, { "epoch": 0.7976995580169338, "grad_norm": 0.06453961879014969, "learning_rate": 3.754828365272072e-05, "loss": 0.1248, "step": 14980 }, { "epoch": 0.7982320677352361, "grad_norm": 0.03173014149069786, "learning_rate": 3.751009209789011e-05, "loss": 0.1246, "step": 14990 }, { "epoch": 0.7987645774535386, "grad_norm": 0.05219841003417969, "learning_rate": 3.747198579086695e-05, "loss": 0.125, "step": 15000 }, { "epoch": 0.7992970871718409, "grad_norm": 0.04533257335424423, "learning_rate": 3.7433964849821145e-05, "loss": 0.1247, "step": 15010 }, { "epoch": 0.7998295968901432, "grad_norm": 0.05037694424390793, "learning_rate": 3.7396029392657835e-05, "loss": 0.1254, "step": 15020 }, { "epoch": 0.8003621066084456, "grad_norm": 0.047749314457178116, "learning_rate": 3.7358179537017066e-05, "loss": 0.1251, "step": 15030 }, { "epoch": 0.800894616326748, "grad_norm": 0.06332427263259888, "learning_rate": 3.732041540027348e-05, "loss": 0.1246, "step": 15040 }, { "epoch": 0.8014271260450503, "grad_norm": 0.04189267009496689, "learning_rate": 3.728273709953586e-05, "loss": 0.1248, "step": 15050 }, { "epoch": 0.8019596357633527, "grad_norm": 0.04982787370681763, "learning_rate": 3.724514475164681e-05, "loss": 0.1251, "step": 15060 }, { "epoch": 0.802492145481655, "grad_norm": 0.032108910381793976, "learning_rate": 3.720763847318239e-05, "loss": 0.125, "step": 15070 }, { "epoch": 0.8030246551999574, "grad_norm": 0.03504796326160431, "learning_rate": 3.717021838045175e-05, "loss": 0.1247, "step": 15080 }, { "epoch": 0.8035571649182598, "grad_norm": 0.05086008459329605, "learning_rate": 3.713288458949679e-05, "loss": 0.1253, "step": 15090 }, { "epoch": 0.8040896746365621, "grad_norm": 0.04598323255777359, "learning_rate": 3.709563721609178e-05, "loss": 0.1246, "step": 15100 }, { "epoch": 0.8046221843548644, "grad_norm": 0.034302182495594025, "learning_rate": 3.705847637574299e-05, "loss": 0.1256, "step": 15110 }, { "epoch": 0.8051546940731669, "grad_norm": 0.04984142258763313, "learning_rate": 3.7021402183688334e-05, "loss": 0.1249, "step": 15120 }, { "epoch": 0.8056872037914692, "grad_norm": 0.040011048316955566, "learning_rate": 3.698441475489707e-05, "loss": 0.1245, "step": 15130 }, { "epoch": 0.8062197135097715, "grad_norm": 0.0699247419834137, "learning_rate": 3.694751420406937e-05, "loss": 0.1244, "step": 15140 }, { "epoch": 0.8067522232280739, "grad_norm": 0.035953816026449203, "learning_rate": 3.6910700645635975e-05, "loss": 0.1251, "step": 15150 }, { "epoch": 0.8072847329463763, "grad_norm": 0.09570103138685226, "learning_rate": 3.68739741937579e-05, "loss": 0.1245, "step": 15160 }, { "epoch": 0.8078172426646786, "grad_norm": 0.051727280020713806, "learning_rate": 3.683733496232599e-05, "loss": 0.1257, "step": 15170 }, { "epoch": 0.808349752382981, "grad_norm": 0.05984990671277046, "learning_rate": 3.680078306496066e-05, "loss": 0.1251, "step": 15180 }, { "epoch": 0.8088822621012833, "grad_norm": 0.050374679267406464, "learning_rate": 3.676431861501146e-05, "loss": 0.1245, "step": 15190 }, { "epoch": 0.8094147718195857, "grad_norm": 0.08578687161207199, "learning_rate": 3.672794172555677e-05, "loss": 0.1253, "step": 15200 }, { "epoch": 0.8099472815378881, "grad_norm": 0.0373394675552845, "learning_rate": 3.6691652509403475e-05, "loss": 0.1244, "step": 15210 }, { "epoch": 0.8104797912561904, "grad_norm": 0.03546525537967682, "learning_rate": 3.6655451079086525e-05, "loss": 0.1244, "step": 15220 }, { "epoch": 0.8110123009744928, "grad_norm": 0.04470152407884598, "learning_rate": 3.661933754686867e-05, "loss": 0.1251, "step": 15230 }, { "epoch": 0.8115448106927952, "grad_norm": 0.05563315749168396, "learning_rate": 3.6583312024740076e-05, "loss": 0.1254, "step": 15240 }, { "epoch": 0.8120773204110975, "grad_norm": 0.04946048930287361, "learning_rate": 3.654737462441801e-05, "loss": 0.1246, "step": 15250 }, { "epoch": 0.8126098301293999, "grad_norm": 0.05127432197332382, "learning_rate": 3.651152545734643e-05, "loss": 0.1247, "step": 15260 }, { "epoch": 0.8131423398477022, "grad_norm": 0.05603098124265671, "learning_rate": 3.6475764634695674e-05, "loss": 0.1246, "step": 15270 }, { "epoch": 0.8136748495660046, "grad_norm": 0.047840967774391174, "learning_rate": 3.644009226736217e-05, "loss": 0.1248, "step": 15280 }, { "epoch": 0.814207359284307, "grad_norm": 0.04910242184996605, "learning_rate": 3.6404508465968e-05, "loss": 0.1252, "step": 15290 }, { "epoch": 0.8147398690026093, "grad_norm": 0.07945267856121063, "learning_rate": 3.6369013340860606e-05, "loss": 0.1249, "step": 15300 }, { "epoch": 0.8152723787209116, "grad_norm": 0.054363641887903214, "learning_rate": 3.633360700211243e-05, "loss": 0.125, "step": 15310 }, { "epoch": 0.8158048884392141, "grad_norm": 0.051253627985715866, "learning_rate": 3.629828955952062e-05, "loss": 0.1252, "step": 15320 }, { "epoch": 0.8163373981575164, "grad_norm": 0.049010276794433594, "learning_rate": 3.62630611226066e-05, "loss": 0.124, "step": 15330 }, { "epoch": 0.8168699078758187, "grad_norm": 0.05660669878125191, "learning_rate": 3.62279218006158e-05, "loss": 0.1245, "step": 15340 }, { "epoch": 0.8174024175941211, "grad_norm": 0.038386616855859756, "learning_rate": 3.619287170251734e-05, "loss": 0.125, "step": 15350 }, { "epoch": 0.8179349273124235, "grad_norm": 0.06046159192919731, "learning_rate": 3.6157910937003597e-05, "loss": 0.1245, "step": 15360 }, { "epoch": 0.8184674370307258, "grad_norm": 0.06180752068758011, "learning_rate": 3.612303961248995e-05, "loss": 0.1246, "step": 15370 }, { "epoch": 0.8189999467490282, "grad_norm": 0.041465550661087036, "learning_rate": 3.60882578371144e-05, "loss": 0.1247, "step": 15380 }, { "epoch": 0.8195324564673305, "grad_norm": 0.0629926398396492, "learning_rate": 3.6053565718737265e-05, "loss": 0.1248, "step": 15390 }, { "epoch": 0.8200649661856328, "grad_norm": 0.05619725584983826, "learning_rate": 3.601896336494083e-05, "loss": 0.1251, "step": 15400 }, { "epoch": 0.8205974759039353, "grad_norm": 0.06445404887199402, "learning_rate": 3.598445088302901e-05, "loss": 0.1244, "step": 15410 }, { "epoch": 0.8211299856222376, "grad_norm": 0.04841604083776474, "learning_rate": 3.595002838002704e-05, "loss": 0.1249, "step": 15420 }, { "epoch": 0.8216624953405399, "grad_norm": 0.04060814529657364, "learning_rate": 3.591569596268108e-05, "loss": 0.1251, "step": 15430 }, { "epoch": 0.8221950050588424, "grad_norm": 0.03796577826142311, "learning_rate": 3.5881453737457984e-05, "loss": 0.1246, "step": 15440 }, { "epoch": 0.8227275147771447, "grad_norm": 0.033980198204517365, "learning_rate": 3.5847301810544856e-05, "loss": 0.1252, "step": 15450 }, { "epoch": 0.823260024495447, "grad_norm": 0.05901845172047615, "learning_rate": 3.581324028784886e-05, "loss": 0.1244, "step": 15460 }, { "epoch": 0.8237925342137494, "grad_norm": 0.05464969575405121, "learning_rate": 3.577926927499673e-05, "loss": 0.1249, "step": 15470 }, { "epoch": 0.8243250439320517, "grad_norm": 0.05274730920791626, "learning_rate": 3.574538887733456e-05, "loss": 0.1253, "step": 15480 }, { "epoch": 0.8248575536503541, "grad_norm": 0.0523492768406868, "learning_rate": 3.5711599199927446e-05, "loss": 0.1246, "step": 15490 }, { "epoch": 0.8253900633686565, "grad_norm": 0.03017192892730236, "learning_rate": 3.5677900347559146e-05, "loss": 0.1246, "step": 15500 }, { "epoch": 0.8259225730869588, "grad_norm": 0.058320943266153336, "learning_rate": 3.564429242473178e-05, "loss": 0.1253, "step": 15510 }, { "epoch": 0.8264550828052611, "grad_norm": 0.05324307456612587, "learning_rate": 3.5610775535665465e-05, "loss": 0.1247, "step": 15520 }, { "epoch": 0.8269875925235636, "grad_norm": 0.05818801745772362, "learning_rate": 3.557734978429801e-05, "loss": 0.125, "step": 15530 }, { "epoch": 0.8275201022418659, "grad_norm": 0.06262166053056717, "learning_rate": 3.554401527428465e-05, "loss": 0.1246, "step": 15540 }, { "epoch": 0.8280526119601683, "grad_norm": 0.04980841279029846, "learning_rate": 3.551077210899763e-05, "loss": 0.1244, "step": 15550 }, { "epoch": 0.8285851216784706, "grad_norm": 0.03848971053957939, "learning_rate": 3.547762039152594e-05, "loss": 0.1247, "step": 15560 }, { "epoch": 0.829117631396773, "grad_norm": 0.04948917403817177, "learning_rate": 3.5444560224675e-05, "loss": 0.126, "step": 15570 }, { "epoch": 0.8296501411150754, "grad_norm": 0.04893777146935463, "learning_rate": 3.541159171096631e-05, "loss": 0.1252, "step": 15580 }, { "epoch": 0.8301826508333777, "grad_norm": 0.0531187430024147, "learning_rate": 3.537871495263716e-05, "loss": 0.1242, "step": 15590 }, { "epoch": 0.83071516055168, "grad_norm": 0.04317576438188553, "learning_rate": 3.534593005164027e-05, "loss": 0.1241, "step": 15600 }, { "epoch": 0.8312476702699825, "grad_norm": 0.03508533909916878, "learning_rate": 3.531323710964356e-05, "loss": 0.1251, "step": 15610 }, { "epoch": 0.8317801799882848, "grad_norm": 0.04336007684469223, "learning_rate": 3.528063622802974e-05, "loss": 0.1255, "step": 15620 }, { "epoch": 0.8323126897065871, "grad_norm": 0.05976368486881256, "learning_rate": 3.5248127507896045e-05, "loss": 0.1243, "step": 15630 }, { "epoch": 0.8328451994248895, "grad_norm": 0.03515305742621422, "learning_rate": 3.52157110500539e-05, "loss": 0.1256, "step": 15640 }, { "epoch": 0.8333777091431919, "grad_norm": 0.08611953258514404, "learning_rate": 3.518338695502864e-05, "loss": 0.1243, "step": 15650 }, { "epoch": 0.8339102188614942, "grad_norm": 0.041929975152015686, "learning_rate": 3.515115532305918e-05, "loss": 0.1246, "step": 15660 }, { "epoch": 0.8344427285797966, "grad_norm": 0.03388476371765137, "learning_rate": 3.511901625409768e-05, "loss": 0.1246, "step": 15670 }, { "epoch": 0.8349752382980989, "grad_norm": 0.04702109470963478, "learning_rate": 3.5086969847809256e-05, "loss": 0.1241, "step": 15680 }, { "epoch": 0.8355077480164013, "grad_norm": 0.04313468933105469, "learning_rate": 3.50550162035717e-05, "loss": 0.125, "step": 15690 }, { "epoch": 0.8360402577347037, "grad_norm": 0.04187025874853134, "learning_rate": 3.502315542047512e-05, "loss": 0.1244, "step": 15700 }, { "epoch": 0.836572767453006, "grad_norm": 0.0742115005850792, "learning_rate": 3.4991387597321654e-05, "loss": 0.1247, "step": 15710 }, { "epoch": 0.8371052771713083, "grad_norm": 0.047620195895433426, "learning_rate": 3.495971283262519e-05, "loss": 0.1247, "step": 15720 }, { "epoch": 0.8376377868896108, "grad_norm": 0.06339036673307419, "learning_rate": 3.492813122461101e-05, "loss": 0.1249, "step": 15730 }, { "epoch": 0.8381702966079131, "grad_norm": 0.03321847692131996, "learning_rate": 3.489664287121553e-05, "loss": 0.1249, "step": 15740 }, { "epoch": 0.8387028063262154, "grad_norm": 0.06577350944280624, "learning_rate": 3.486524787008595e-05, "loss": 0.1241, "step": 15750 }, { "epoch": 0.8392353160445178, "grad_norm": 0.04866393655538559, "learning_rate": 3.4833946318580026e-05, "loss": 0.1244, "step": 15760 }, { "epoch": 0.8397678257628202, "grad_norm": 0.045106563717126846, "learning_rate": 3.4802738313765685e-05, "loss": 0.1246, "step": 15770 }, { "epoch": 0.8403003354811225, "grad_norm": 0.06193890795111656, "learning_rate": 3.477162395242076e-05, "loss": 0.1251, "step": 15780 }, { "epoch": 0.8408328451994249, "grad_norm": 0.06153490021824837, "learning_rate": 3.4740603331032706e-05, "loss": 0.125, "step": 15790 }, { "epoch": 0.8413653549177272, "grad_norm": 0.05703847110271454, "learning_rate": 3.470967654579828e-05, "loss": 0.1251, "step": 15800 }, { "epoch": 0.8418978646360296, "grad_norm": 0.03664189949631691, "learning_rate": 3.467884369262325e-05, "loss": 0.1249, "step": 15810 }, { "epoch": 0.842430374354332, "grad_norm": 0.037624064832925797, "learning_rate": 3.46481048671221e-05, "loss": 0.1245, "step": 15820 }, { "epoch": 0.8429628840726343, "grad_norm": 0.04117140918970108, "learning_rate": 3.4617460164617684e-05, "loss": 0.1252, "step": 15830 }, { "epoch": 0.8434953937909366, "grad_norm": 0.03222690895199776, "learning_rate": 3.4586909680141047e-05, "loss": 0.1245, "step": 15840 }, { "epoch": 0.8440279035092391, "grad_norm": 0.057400964200496674, "learning_rate": 3.455645350843102e-05, "loss": 0.1248, "step": 15850 }, { "epoch": 0.8445604132275414, "grad_norm": 0.04511050879955292, "learning_rate": 3.452609174393395e-05, "loss": 0.1248, "step": 15860 }, { "epoch": 0.8450929229458438, "grad_norm": 0.03972748667001724, "learning_rate": 3.4495824480803455e-05, "loss": 0.1247, "step": 15870 }, { "epoch": 0.8456254326641461, "grad_norm": 0.08818963170051575, "learning_rate": 3.446565181290007e-05, "loss": 0.125, "step": 15880 }, { "epoch": 0.8461579423824485, "grad_norm": 0.07608040422201157, "learning_rate": 3.4435573833791016e-05, "loss": 0.1246, "step": 15890 }, { "epoch": 0.8466904521007509, "grad_norm": 0.055682647973299026, "learning_rate": 3.4405590636749836e-05, "loss": 0.1255, "step": 15900 }, { "epoch": 0.8472229618190532, "grad_norm": 0.0459553599357605, "learning_rate": 3.437570231475618e-05, "loss": 0.1247, "step": 15910 }, { "epoch": 0.8477554715373555, "grad_norm": 0.07165340334177017, "learning_rate": 3.43459089604955e-05, "loss": 0.1245, "step": 15920 }, { "epoch": 0.848287981255658, "grad_norm": 0.04513763263821602, "learning_rate": 3.43162106663587e-05, "loss": 0.1249, "step": 15930 }, { "epoch": 0.8488204909739603, "grad_norm": 0.07213608175516129, "learning_rate": 3.428660752444193e-05, "loss": 0.1242, "step": 15940 }, { "epoch": 0.8493530006922626, "grad_norm": 0.030396446585655212, "learning_rate": 3.425709962654625e-05, "loss": 0.1252, "step": 15950 }, { "epoch": 0.849885510410565, "grad_norm": 0.030090300366282463, "learning_rate": 3.4227687064177385e-05, "loss": 0.1247, "step": 15960 }, { "epoch": 0.8504180201288674, "grad_norm": 0.05374327301979065, "learning_rate": 3.419836992854541e-05, "loss": 0.1243, "step": 15970 }, { "epoch": 0.8509505298471697, "grad_norm": 0.04545629397034645, "learning_rate": 3.416914831056446e-05, "loss": 0.1245, "step": 15980 }, { "epoch": 0.8514830395654721, "grad_norm": 0.055011678487062454, "learning_rate": 3.414002230085248e-05, "loss": 0.1249, "step": 15990 }, { "epoch": 0.8520155492837744, "grad_norm": 0.05431196093559265, "learning_rate": 3.411099198973092e-05, "loss": 0.1241, "step": 16000 }, { "epoch": 0.8525480590020768, "grad_norm": 0.05473232641816139, "learning_rate": 3.4082057467224484e-05, "loss": 0.1253, "step": 16010 }, { "epoch": 0.8530805687203792, "grad_norm": 0.07439985126256943, "learning_rate": 3.40532188230608e-05, "loss": 0.1245, "step": 16020 }, { "epoch": 0.8536130784386815, "grad_norm": 0.07038458436727524, "learning_rate": 3.402447614667018e-05, "loss": 0.125, "step": 16030 }, { "epoch": 0.8541455881569838, "grad_norm": 0.061058055609464645, "learning_rate": 3.3995829527185354e-05, "loss": 0.1249, "step": 16040 }, { "epoch": 0.8546780978752863, "grad_norm": 0.057675547897815704, "learning_rate": 3.396727905344115e-05, "loss": 0.124, "step": 16050 }, { "epoch": 0.8552106075935886, "grad_norm": 0.038779694586992264, "learning_rate": 3.3938824813974254e-05, "loss": 0.1242, "step": 16060 }, { "epoch": 0.8557431173118909, "grad_norm": 0.044711895287036896, "learning_rate": 3.391046689702292e-05, "loss": 0.1241, "step": 16070 }, { "epoch": 0.8562756270301933, "grad_norm": 0.0402277447283268, "learning_rate": 3.388220539052671e-05, "loss": 0.1241, "step": 16080 }, { "epoch": 0.8568081367484957, "grad_norm": 0.07318955659866333, "learning_rate": 3.3854040382126196e-05, "loss": 0.125, "step": 16090 }, { "epoch": 0.857340646466798, "grad_norm": 0.05128632113337517, "learning_rate": 3.382597195916271e-05, "loss": 0.1252, "step": 16100 }, { "epoch": 0.8578731561851004, "grad_norm": 0.04426991939544678, "learning_rate": 3.379800020867808e-05, "loss": 0.124, "step": 16110 }, { "epoch": 0.8584056659034027, "grad_norm": 0.09766895323991776, "learning_rate": 3.377012521741433e-05, "loss": 0.1246, "step": 16120 }, { "epoch": 0.858938175621705, "grad_norm": 0.04723978415131569, "learning_rate": 3.3742347071813424e-05, "loss": 0.1255, "step": 16130 }, { "epoch": 0.8594706853400075, "grad_norm": 0.0812908411026001, "learning_rate": 3.3714665858017015e-05, "loss": 0.1239, "step": 16140 }, { "epoch": 0.8600031950583098, "grad_norm": 0.0860326737165451, "learning_rate": 3.3687081661866164e-05, "loss": 0.1241, "step": 16150 }, { "epoch": 0.8605357047766121, "grad_norm": 0.04998904466629028, "learning_rate": 3.365959456890109e-05, "loss": 0.1247, "step": 16160 }, { "epoch": 0.8610682144949146, "grad_norm": 0.035985738039016724, "learning_rate": 3.3632204664360836e-05, "loss": 0.1238, "step": 16170 }, { "epoch": 0.8616007242132169, "grad_norm": 0.09907463192939758, "learning_rate": 3.3604912033183126e-05, "loss": 0.1244, "step": 16180 }, { "epoch": 0.8621332339315193, "grad_norm": 0.07233595103025436, "learning_rate": 3.357771676000397e-05, "loss": 0.1253, "step": 16190 }, { "epoch": 0.8626657436498216, "grad_norm": 0.038175683468580246, "learning_rate": 3.355061892915752e-05, "loss": 0.1245, "step": 16200 }, { "epoch": 0.863198253368124, "grad_norm": 0.03324522450566292, "learning_rate": 3.352361862467572e-05, "loss": 0.1242, "step": 16210 }, { "epoch": 0.8637307630864264, "grad_norm": 0.03613545373082161, "learning_rate": 3.349671593028809e-05, "loss": 0.1244, "step": 16220 }, { "epoch": 0.8642632728047287, "grad_norm": 0.10772500932216644, "learning_rate": 3.346991092942146e-05, "loss": 0.1247, "step": 16230 }, { "epoch": 0.864795782523031, "grad_norm": 0.05393153801560402, "learning_rate": 3.3443203705199686e-05, "loss": 0.1247, "step": 16240 }, { "epoch": 0.8653282922413335, "grad_norm": 0.04021570831537247, "learning_rate": 3.3416594340443444e-05, "loss": 0.1248, "step": 16250 }, { "epoch": 0.8658608019596358, "grad_norm": 0.04227181524038315, "learning_rate": 3.339008291766991e-05, "loss": 0.1245, "step": 16260 }, { "epoch": 0.8663933116779381, "grad_norm": 0.044721730053424835, "learning_rate": 3.3363669519092563e-05, "loss": 0.1247, "step": 16270 }, { "epoch": 0.8669258213962405, "grad_norm": 0.1196049377322197, "learning_rate": 3.33373542266209e-05, "loss": 0.1251, "step": 16280 }, { "epoch": 0.8674583311145428, "grad_norm": 0.03756421059370041, "learning_rate": 3.331113712186016e-05, "loss": 0.1242, "step": 16290 }, { "epoch": 0.8679908408328452, "grad_norm": 0.03522124141454697, "learning_rate": 3.328501828611112e-05, "loss": 0.1245, "step": 16300 }, { "epoch": 0.8685233505511476, "grad_norm": 0.04906485602259636, "learning_rate": 3.325899780036982e-05, "loss": 0.125, "step": 16310 }, { "epoch": 0.8690558602694499, "grad_norm": 0.046862684190273285, "learning_rate": 3.3233075745327286e-05, "loss": 0.124, "step": 16320 }, { "epoch": 0.8695883699877522, "grad_norm": 0.1074092835187912, "learning_rate": 3.320725220136934e-05, "loss": 0.1245, "step": 16330 }, { "epoch": 0.8701208797060547, "grad_norm": 0.05268271267414093, "learning_rate": 3.3181527248576294e-05, "loss": 0.1243, "step": 16340 }, { "epoch": 0.870653389424357, "grad_norm": 0.049087993800640106, "learning_rate": 3.3155900966722727e-05, "loss": 0.1242, "step": 16350 }, { "epoch": 0.8711858991426593, "grad_norm": 0.060601964592933655, "learning_rate": 3.313037343527722e-05, "loss": 0.1247, "step": 16360 }, { "epoch": 0.8717184088609617, "grad_norm": 0.05477839335799217, "learning_rate": 3.310494473340215e-05, "loss": 0.1254, "step": 16370 }, { "epoch": 0.8722509185792641, "grad_norm": 0.055110715329647064, "learning_rate": 3.3079614939953416e-05, "loss": 0.1246, "step": 16380 }, { "epoch": 0.8727834282975664, "grad_norm": 0.0602547712624073, "learning_rate": 3.305438413348016e-05, "loss": 0.125, "step": 16390 }, { "epoch": 0.8733159380158688, "grad_norm": 0.05673711746931076, "learning_rate": 3.3029252392224584e-05, "loss": 0.1245, "step": 16400 }, { "epoch": 0.8738484477341711, "grad_norm": 0.05631018802523613, "learning_rate": 3.30042197941217e-05, "loss": 0.125, "step": 16410 }, { "epoch": 0.8743809574524735, "grad_norm": 0.047678008675575256, "learning_rate": 3.297928641679906e-05, "loss": 0.1242, "step": 16420 }, { "epoch": 0.8749134671707759, "grad_norm": 0.05217251926660538, "learning_rate": 3.2954452337576504e-05, "loss": 0.1245, "step": 16430 }, { "epoch": 0.8754459768890782, "grad_norm": 0.05652473866939545, "learning_rate": 3.2929717633465954e-05, "loss": 0.1243, "step": 16440 }, { "epoch": 0.8759784866073805, "grad_norm": 0.03848657384514809, "learning_rate": 3.2905082381171184e-05, "loss": 0.1243, "step": 16450 }, { "epoch": 0.876510996325683, "grad_norm": 0.047618966549634933, "learning_rate": 3.2880546657087554e-05, "loss": 0.1246, "step": 16460 }, { "epoch": 0.8770435060439853, "grad_norm": 0.06333454698324203, "learning_rate": 3.2856110537301756e-05, "loss": 0.1244, "step": 16470 }, { "epoch": 0.8775760157622876, "grad_norm": 0.04720817133784294, "learning_rate": 3.283177409759164e-05, "loss": 0.1239, "step": 16480 }, { "epoch": 0.87810852548059, "grad_norm": 0.03655124083161354, "learning_rate": 3.280753741342592e-05, "loss": 0.1248, "step": 16490 }, { "epoch": 0.8786410351988924, "grad_norm": 0.05196612700819969, "learning_rate": 3.278340055996396e-05, "loss": 0.1245, "step": 16500 }, { "epoch": 0.8791735449171948, "grad_norm": 0.039216578006744385, "learning_rate": 3.275936361205555e-05, "loss": 0.1248, "step": 16510 }, { "epoch": 0.8797060546354971, "grad_norm": 0.055273279547691345, "learning_rate": 3.2735426644240665e-05, "loss": 0.1248, "step": 16520 }, { "epoch": 0.8802385643537994, "grad_norm": 0.05333053693175316, "learning_rate": 3.2711589730749266e-05, "loss": 0.1242, "step": 16530 }, { "epoch": 0.8807710740721019, "grad_norm": 0.062082525342702866, "learning_rate": 3.268785294550098e-05, "loss": 0.1251, "step": 16540 }, { "epoch": 0.8813035837904042, "grad_norm": 0.03454854339361191, "learning_rate": 3.266421636210497e-05, "loss": 0.1241, "step": 16550 }, { "epoch": 0.8818360935087065, "grad_norm": 0.036358997225761414, "learning_rate": 3.264068005385965e-05, "loss": 0.1246, "step": 16560 }, { "epoch": 0.8823686032270089, "grad_norm": 0.03956957161426544, "learning_rate": 3.261724409375252e-05, "loss": 0.1241, "step": 16570 }, { "epoch": 0.8829011129453113, "grad_norm": 0.07209271937608719, "learning_rate": 3.259390855445982e-05, "loss": 0.125, "step": 16580 }, { "epoch": 0.8834336226636136, "grad_norm": 0.06704261153936386, "learning_rate": 3.257067350834644e-05, "loss": 0.1247, "step": 16590 }, { "epoch": 0.883966132381916, "grad_norm": 0.06499594449996948, "learning_rate": 3.25475390274656e-05, "loss": 0.1253, "step": 16600 }, { "epoch": 0.8844986421002183, "grad_norm": 0.03783570975065231, "learning_rate": 3.2524505183558684e-05, "loss": 0.1246, "step": 16610 }, { "epoch": 0.8850311518185207, "grad_norm": 0.04036329314112663, "learning_rate": 3.250157204805498e-05, "loss": 0.1249, "step": 16620 }, { "epoch": 0.8855636615368231, "grad_norm": 0.04968998581171036, "learning_rate": 3.247873969207148e-05, "loss": 0.125, "step": 16630 }, { "epoch": 0.8860961712551254, "grad_norm": 0.045320551842451096, "learning_rate": 3.245600818641265e-05, "loss": 0.1244, "step": 16640 }, { "epoch": 0.8866286809734277, "grad_norm": 0.06106564775109291, "learning_rate": 3.243337760157022e-05, "loss": 0.1247, "step": 16650 }, { "epoch": 0.8871611906917302, "grad_norm": 0.04613622650504112, "learning_rate": 3.241084800772296e-05, "loss": 0.1245, "step": 16660 }, { "epoch": 0.8876937004100325, "grad_norm": 0.05316569283604622, "learning_rate": 3.238841947473642e-05, "loss": 0.1236, "step": 16670 }, { "epoch": 0.8882262101283348, "grad_norm": 0.0546153299510479, "learning_rate": 3.236609207216283e-05, "loss": 0.1245, "step": 16680 }, { "epoch": 0.8887587198466372, "grad_norm": 0.06547331809997559, "learning_rate": 3.2343865869240746e-05, "loss": 0.1243, "step": 16690 }, { "epoch": 0.8892912295649396, "grad_norm": 0.055185478180646896, "learning_rate": 3.2321740934894925e-05, "loss": 0.1245, "step": 16700 }, { "epoch": 0.8898237392832419, "grad_norm": 0.046210877597332, "learning_rate": 3.2299717337736076e-05, "loss": 0.1242, "step": 16710 }, { "epoch": 0.8903562490015443, "grad_norm": 0.04753991216421127, "learning_rate": 3.2277795146060645e-05, "loss": 0.1246, "step": 16720 }, { "epoch": 0.8908887587198466, "grad_norm": 0.05761198326945305, "learning_rate": 3.2255974427850666e-05, "loss": 0.124, "step": 16730 }, { "epoch": 0.891421268438149, "grad_norm": 0.061156004667282104, "learning_rate": 3.223425525077342e-05, "loss": 0.1244, "step": 16740 }, { "epoch": 0.8919537781564514, "grad_norm": 0.07980604469776154, "learning_rate": 3.2212637682181354e-05, "loss": 0.1244, "step": 16750 }, { "epoch": 0.8924862878747537, "grad_norm": 0.04657996818423271, "learning_rate": 3.219112178911181e-05, "loss": 0.1248, "step": 16760 }, { "epoch": 0.893018797593056, "grad_norm": 0.040127284824848175, "learning_rate": 3.216970763828683e-05, "loss": 0.1245, "step": 16770 }, { "epoch": 0.8935513073113585, "grad_norm": 0.04287361726164818, "learning_rate": 3.2148395296112945e-05, "loss": 0.1248, "step": 16780 }, { "epoch": 0.8940838170296608, "grad_norm": 0.0566687285900116, "learning_rate": 3.212718482868096e-05, "loss": 0.1241, "step": 16790 }, { "epoch": 0.8946163267479631, "grad_norm": 0.036797747015953064, "learning_rate": 3.210607630176578e-05, "loss": 0.1252, "step": 16800 }, { "epoch": 0.8951488364662655, "grad_norm": 0.049759261310100555, "learning_rate": 3.208506978082617e-05, "loss": 0.1245, "step": 16810 }, { "epoch": 0.8956813461845679, "grad_norm": 0.061853665858507156, "learning_rate": 3.2064165331004594e-05, "loss": 0.1252, "step": 16820 }, { "epoch": 0.8962138559028703, "grad_norm": 0.05073931813240051, "learning_rate": 3.2043363017126956e-05, "loss": 0.1251, "step": 16830 }, { "epoch": 0.8967463656211726, "grad_norm": 0.06072097271680832, "learning_rate": 3.202266290370245e-05, "loss": 0.1239, "step": 16840 }, { "epoch": 0.8972788753394749, "grad_norm": 0.05999981239438057, "learning_rate": 3.2002065054923325e-05, "loss": 0.1246, "step": 16850 }, { "epoch": 0.8978113850577774, "grad_norm": 0.048532549291849136, "learning_rate": 3.198156953466472e-05, "loss": 0.1242, "step": 16860 }, { "epoch": 0.8983438947760797, "grad_norm": 0.05031272768974304, "learning_rate": 3.196117640648444e-05, "loss": 0.1247, "step": 16870 }, { "epoch": 0.898876404494382, "grad_norm": 0.05845622345805168, "learning_rate": 3.1940885733622754e-05, "loss": 0.1239, "step": 16880 }, { "epoch": 0.8994089142126844, "grad_norm": 0.05167698487639427, "learning_rate": 3.192069757900224e-05, "loss": 0.1245, "step": 16890 }, { "epoch": 0.8999414239309868, "grad_norm": 0.04188617318868637, "learning_rate": 3.190061200522753e-05, "loss": 0.1246, "step": 16900 }, { "epoch": 0.9004739336492891, "grad_norm": 0.0383358858525753, "learning_rate": 3.188062907458516e-05, "loss": 0.124, "step": 16910 }, { "epoch": 0.9010064433675915, "grad_norm": 0.0524710975587368, "learning_rate": 3.186074884904336e-05, "loss": 0.1244, "step": 16920 }, { "epoch": 0.9015389530858938, "grad_norm": 0.034921254962682724, "learning_rate": 3.184097139025189e-05, "loss": 0.1246, "step": 16930 }, { "epoch": 0.9020714628041961, "grad_norm": 0.052057795226573944, "learning_rate": 3.1821296759541764e-05, "loss": 0.124, "step": 16940 }, { "epoch": 0.9026039725224986, "grad_norm": 0.06420300155878067, "learning_rate": 3.1801725017925195e-05, "loss": 0.124, "step": 16950 }, { "epoch": 0.9031364822408009, "grad_norm": 0.03623140975832939, "learning_rate": 3.178225622609528e-05, "loss": 0.1235, "step": 16960 }, { "epoch": 0.9036689919591032, "grad_norm": 0.04458535462617874, "learning_rate": 3.1762890444425875e-05, "loss": 0.1249, "step": 16970 }, { "epoch": 0.9042015016774057, "grad_norm": 0.03834957256913185, "learning_rate": 3.174362773297141e-05, "loss": 0.1246, "step": 16980 }, { "epoch": 0.904734011395708, "grad_norm": 0.04682791605591774, "learning_rate": 3.1724468151466665e-05, "loss": 0.1245, "step": 16990 }, { "epoch": 0.9052665211140103, "grad_norm": 0.0558556504547596, "learning_rate": 3.170541175932662e-05, "loss": 0.1244, "step": 17000 }, { "epoch": 0.9057990308323127, "grad_norm": 0.046385906636714935, "learning_rate": 3.168645861564627e-05, "loss": 0.1241, "step": 17010 }, { "epoch": 0.906331540550615, "grad_norm": 0.04825804755091667, "learning_rate": 3.166760877920041e-05, "loss": 0.1246, "step": 17020 }, { "epoch": 0.9068640502689174, "grad_norm": 0.0882338434457779, "learning_rate": 3.164886230844348e-05, "loss": 0.125, "step": 17030 }, { "epoch": 0.9073965599872198, "grad_norm": 0.08609329909086227, "learning_rate": 3.163021926150939e-05, "loss": 0.1242, "step": 17040 }, { "epoch": 0.9079290697055221, "grad_norm": 0.05161284655332565, "learning_rate": 3.1611679696211294e-05, "loss": 0.125, "step": 17050 }, { "epoch": 0.9084615794238244, "grad_norm": 0.04110497981309891, "learning_rate": 3.159324367004148e-05, "loss": 0.1253, "step": 17060 }, { "epoch": 0.9089940891421269, "grad_norm": 0.07606612145900726, "learning_rate": 3.157491124017115e-05, "loss": 0.1236, "step": 17070 }, { "epoch": 0.9095265988604292, "grad_norm": 0.04594139754772186, "learning_rate": 3.1556682463450214e-05, "loss": 0.1234, "step": 17080 }, { "epoch": 0.9100591085787315, "grad_norm": 0.039515670388936996, "learning_rate": 3.15385573964072e-05, "loss": 0.1243, "step": 17090 }, { "epoch": 0.910591618297034, "grad_norm": 0.04435297101736069, "learning_rate": 3.152053609524897e-05, "loss": 0.1245, "step": 17100 }, { "epoch": 0.9111241280153363, "grad_norm": 0.03617672622203827, "learning_rate": 3.150261861586065e-05, "loss": 0.1243, "step": 17110 }, { "epoch": 0.9116566377336386, "grad_norm": 0.053447507321834564, "learning_rate": 3.148480501380538e-05, "loss": 0.1251, "step": 17120 }, { "epoch": 0.912189147451941, "grad_norm": 0.041530635207891464, "learning_rate": 3.1467095344324174e-05, "loss": 0.1242, "step": 17130 }, { "epoch": 0.9127216571702433, "grad_norm": 0.1386973112821579, "learning_rate": 3.144948966233577e-05, "loss": 0.1244, "step": 17140 }, { "epoch": 0.9132541668885458, "grad_norm": 0.06289440393447876, "learning_rate": 3.143198802243638e-05, "loss": 0.1246, "step": 17150 }, { "epoch": 0.9137866766068481, "grad_norm": 0.03927746042609215, "learning_rate": 3.141459047889964e-05, "loss": 0.1242, "step": 17160 }, { "epoch": 0.9143191863251504, "grad_norm": 0.03453196585178375, "learning_rate": 3.1397297085676336e-05, "loss": 0.1243, "step": 17170 }, { "epoch": 0.9148516960434528, "grad_norm": 0.04992485046386719, "learning_rate": 3.138010789639429e-05, "loss": 0.1242, "step": 17180 }, { "epoch": 0.9153842057617552, "grad_norm": 0.04788126423954964, "learning_rate": 3.136302296435818e-05, "loss": 0.1246, "step": 17190 }, { "epoch": 0.9159167154800575, "grad_norm": 0.05929577723145485, "learning_rate": 3.1346042342549376e-05, "loss": 0.1245, "step": 17200 }, { "epoch": 0.9164492251983599, "grad_norm": 0.08015090227127075, "learning_rate": 3.132916608362578e-05, "loss": 0.1244, "step": 17210 }, { "epoch": 0.9169817349166622, "grad_norm": 0.06038287281990051, "learning_rate": 3.131239423992165e-05, "loss": 0.1245, "step": 17220 }, { "epoch": 0.9175142446349646, "grad_norm": 0.04849204048514366, "learning_rate": 3.129572686344745e-05, "loss": 0.1244, "step": 17230 }, { "epoch": 0.918046754353267, "grad_norm": 0.0919271856546402, "learning_rate": 3.1279164005889696e-05, "loss": 0.1249, "step": 17240 }, { "epoch": 0.9185792640715693, "grad_norm": 0.039166927337646484, "learning_rate": 3.126270571861076e-05, "loss": 0.1246, "step": 17250 }, { "epoch": 0.9191117737898716, "grad_norm": 0.05480289086699486, "learning_rate": 3.1246352052648764e-05, "loss": 0.124, "step": 17260 }, { "epoch": 0.9196442835081741, "grad_norm": 0.0434199757874012, "learning_rate": 3.1230103058717373e-05, "loss": 0.1245, "step": 17270 }, { "epoch": 0.9201767932264764, "grad_norm": 0.03532974794507027, "learning_rate": 3.121395878720567e-05, "loss": 0.1242, "step": 17280 }, { "epoch": 0.9207093029447787, "grad_norm": 0.09806732088327408, "learning_rate": 3.119791928817798e-05, "loss": 0.1251, "step": 17290 }, { "epoch": 0.9212418126630811, "grad_norm": 0.0625268891453743, "learning_rate": 3.1181984611373735e-05, "loss": 0.1239, "step": 17300 }, { "epoch": 0.9217743223813835, "grad_norm": 0.04700905457139015, "learning_rate": 3.116615480620727e-05, "loss": 0.1247, "step": 17310 }, { "epoch": 0.9223068320996858, "grad_norm": 0.0478329174220562, "learning_rate": 3.1150429921767754e-05, "loss": 0.1239, "step": 17320 }, { "epoch": 0.9228393418179882, "grad_norm": 0.041897084563970566, "learning_rate": 3.113481000681897e-05, "loss": 0.1249, "step": 17330 }, { "epoch": 0.9233718515362905, "grad_norm": 0.04760069027543068, "learning_rate": 3.111929510979918e-05, "loss": 0.124, "step": 17340 }, { "epoch": 0.9239043612545929, "grad_norm": 0.06711754202842712, "learning_rate": 3.110388527882099e-05, "loss": 0.1239, "step": 17350 }, { "epoch": 0.9244368709728953, "grad_norm": 0.034946054220199585, "learning_rate": 3.108858056167117e-05, "loss": 0.1246, "step": 17360 }, { "epoch": 0.9249693806911976, "grad_norm": 0.03772689029574394, "learning_rate": 3.107338100581056e-05, "loss": 0.1245, "step": 17370 }, { "epoch": 0.9255018904094999, "grad_norm": 0.04687857627868652, "learning_rate": 3.105828665837386e-05, "loss": 0.1249, "step": 17380 }, { "epoch": 0.9260344001278024, "grad_norm": 0.04124782606959343, "learning_rate": 3.104329756616952e-05, "loss": 0.1243, "step": 17390 }, { "epoch": 0.9265669098461047, "grad_norm": 0.052532244473695755, "learning_rate": 3.1028413775679595e-05, "loss": 0.1245, "step": 17400 }, { "epoch": 0.927099419564407, "grad_norm": 0.0953177809715271, "learning_rate": 3.101363533305958e-05, "loss": 0.1241, "step": 17410 }, { "epoch": 0.9276319292827094, "grad_norm": 0.05009876564145088, "learning_rate": 3.099896228413829e-05, "loss": 0.1238, "step": 17420 }, { "epoch": 0.9281644390010118, "grad_norm": 0.08097761869430542, "learning_rate": 3.098439467441771e-05, "loss": 0.1249, "step": 17430 }, { "epoch": 0.9286969487193141, "grad_norm": 0.047098558396101, "learning_rate": 3.0969932549072835e-05, "loss": 0.1233, "step": 17440 }, { "epoch": 0.9292294584376165, "grad_norm": 0.047214169055223465, "learning_rate": 3.0955575952951575e-05, "loss": 0.1245, "step": 17450 }, { "epoch": 0.9297619681559188, "grad_norm": 0.048827920109033585, "learning_rate": 3.0941324930574554e-05, "loss": 0.1241, "step": 17460 }, { "epoch": 0.9302944778742213, "grad_norm": 0.07658734172582626, "learning_rate": 3.0927179526135044e-05, "loss": 0.1237, "step": 17470 }, { "epoch": 0.9308269875925236, "grad_norm": 0.034373264759778976, "learning_rate": 3.091313978349875e-05, "loss": 0.1252, "step": 17480 }, { "epoch": 0.9313594973108259, "grad_norm": 0.05511806905269623, "learning_rate": 3.089920574620375e-05, "loss": 0.1248, "step": 17490 }, { "epoch": 0.9318920070291283, "grad_norm": 0.04354240372776985, "learning_rate": 3.0885377457460294e-05, "loss": 0.125, "step": 17500 }, { "epoch": 0.9324245167474307, "grad_norm": 0.0334496833384037, "learning_rate": 3.0871654960150706e-05, "loss": 0.1239, "step": 17510 }, { "epoch": 0.932957026465733, "grad_norm": 0.043775349855422974, "learning_rate": 3.085803829682928e-05, "loss": 0.124, "step": 17520 }, { "epoch": 0.9334895361840354, "grad_norm": 0.03564087674021721, "learning_rate": 3.0844527509722045e-05, "loss": 0.1238, "step": 17530 }, { "epoch": 0.9340220459023377, "grad_norm": 0.03102003037929535, "learning_rate": 3.083112264072676e-05, "loss": 0.1248, "step": 17540 }, { "epoch": 0.93455455562064, "grad_norm": 0.03893466666340828, "learning_rate": 3.0817823731412704e-05, "loss": 0.1242, "step": 17550 }, { "epoch": 0.9350870653389425, "grad_norm": 0.05906695872545242, "learning_rate": 3.0804630823020575e-05, "loss": 0.1235, "step": 17560 }, { "epoch": 0.9356195750572448, "grad_norm": 0.07867705076932907, "learning_rate": 3.079154395646233e-05, "loss": 0.1246, "step": 17570 }, { "epoch": 0.9361520847755471, "grad_norm": 0.047349728643894196, "learning_rate": 3.077856317232114e-05, "loss": 0.1238, "step": 17580 }, { "epoch": 0.9366845944938496, "grad_norm": 0.04795532301068306, "learning_rate": 3.0765688510851144e-05, "loss": 0.1243, "step": 17590 }, { "epoch": 0.9372171042121519, "grad_norm": 0.06184261292219162, "learning_rate": 3.075292001197743e-05, "loss": 0.1252, "step": 17600 }, { "epoch": 0.9377496139304542, "grad_norm": 0.05014181509613991, "learning_rate": 3.074025771529585e-05, "loss": 0.1248, "step": 17610 }, { "epoch": 0.9382821236487566, "grad_norm": 0.039317913353443146, "learning_rate": 3.0727701660072925e-05, "loss": 0.1239, "step": 17620 }, { "epoch": 0.938814633367059, "grad_norm": 0.10161686688661575, "learning_rate": 3.0715251885245734e-05, "loss": 0.1232, "step": 17630 }, { "epoch": 0.9393471430853613, "grad_norm": 0.0897764042019844, "learning_rate": 3.070290842942173e-05, "loss": 0.1244, "step": 17640 }, { "epoch": 0.9398796528036637, "grad_norm": 0.056616149842739105, "learning_rate": 3.0690671330878704e-05, "loss": 0.1242, "step": 17650 }, { "epoch": 0.940412162521966, "grad_norm": 0.04649018496274948, "learning_rate": 3.0678540627564614e-05, "loss": 0.1241, "step": 17660 }, { "epoch": 0.9409446722402683, "grad_norm": 0.06587915867567062, "learning_rate": 3.066651635709746e-05, "loss": 0.1239, "step": 17670 }, { "epoch": 0.9414771819585708, "grad_norm": 0.04945458844304085, "learning_rate": 3.065459855676523e-05, "loss": 0.124, "step": 17680 }, { "epoch": 0.9420096916768731, "grad_norm": 0.0448901429772377, "learning_rate": 3.06427872635257e-05, "loss": 0.1234, "step": 17690 }, { "epoch": 0.9425422013951754, "grad_norm": 0.04893770441412926, "learning_rate": 3.063108251400638e-05, "loss": 0.1249, "step": 17700 }, { "epoch": 0.9430747111134778, "grad_norm": 0.050453029572963715, "learning_rate": 3.06194843445044e-05, "loss": 0.125, "step": 17710 }, { "epoch": 0.9436072208317802, "grad_norm": 0.07537666708230972, "learning_rate": 3.060799279098633e-05, "loss": 0.1249, "step": 17720 }, { "epoch": 0.9441397305500825, "grad_norm": 0.04989492520689964, "learning_rate": 3.059660788908817e-05, "loss": 0.1246, "step": 17730 }, { "epoch": 0.9446722402683849, "grad_norm": 0.06518174707889557, "learning_rate": 3.058532967411516e-05, "loss": 0.1251, "step": 17740 }, { "epoch": 0.9452047499866872, "grad_norm": 0.046202413737773895, "learning_rate": 3.057415818104169e-05, "loss": 0.1238, "step": 17750 }, { "epoch": 0.9457372597049896, "grad_norm": 0.05071398615837097, "learning_rate": 3.056309344451123e-05, "loss": 0.1243, "step": 17760 }, { "epoch": 0.946269769423292, "grad_norm": 0.08481655269861221, "learning_rate": 3.0552135498836165e-05, "loss": 0.1247, "step": 17770 }, { "epoch": 0.9468022791415943, "grad_norm": 0.05741250142455101, "learning_rate": 3.0541284377997724e-05, "loss": 0.1251, "step": 17780 }, { "epoch": 0.9473347888598967, "grad_norm": 0.060524262487888336, "learning_rate": 3.053054011564587e-05, "loss": 0.1239, "step": 17790 }, { "epoch": 0.9478672985781991, "grad_norm": 0.056961771100759506, "learning_rate": 3.051990274509917e-05, "loss": 0.1249, "step": 17800 }, { "epoch": 0.9483998082965014, "grad_norm": 0.053151581436395645, "learning_rate": 3.050937229934475e-05, "loss": 0.1247, "step": 17810 }, { "epoch": 0.9489323180148038, "grad_norm": 0.04388433322310448, "learning_rate": 3.049894881103813e-05, "loss": 0.1241, "step": 17820 }, { "epoch": 0.9494648277331061, "grad_norm": 0.05028518661856651, "learning_rate": 3.0488632312503152e-05, "loss": 0.1233, "step": 17830 }, { "epoch": 0.9499973374514085, "grad_norm": 0.06016720086336136, "learning_rate": 3.0478422835731874e-05, "loss": 0.1246, "step": 17840 }, { "epoch": 0.9505298471697109, "grad_norm": 0.05909838154911995, "learning_rate": 3.0468320412384498e-05, "loss": 0.1246, "step": 17850 }, { "epoch": 0.9510623568880132, "grad_norm": 0.06923595815896988, "learning_rate": 3.0458325073789212e-05, "loss": 0.1242, "step": 17860 }, { "epoch": 0.9515948666063155, "grad_norm": 0.041968394070863724, "learning_rate": 3.0448436850942146e-05, "loss": 0.1243, "step": 17870 }, { "epoch": 0.952127376324618, "grad_norm": 0.038111716508865356, "learning_rate": 3.0438655774507256e-05, "loss": 0.1241, "step": 17880 }, { "epoch": 0.9526598860429203, "grad_norm": 0.09092561900615692, "learning_rate": 3.0428981874816235e-05, "loss": 0.1237, "step": 17890 }, { "epoch": 0.9531923957612226, "grad_norm": 0.031990304589271545, "learning_rate": 3.0419415181868416e-05, "loss": 0.1241, "step": 17900 }, { "epoch": 0.953724905479525, "grad_norm": 0.05379229038953781, "learning_rate": 3.0409955725330652e-05, "loss": 0.1236, "step": 17910 }, { "epoch": 0.9542574151978274, "grad_norm": 0.05789874866604805, "learning_rate": 3.0400603534537282e-05, "loss": 0.1236, "step": 17920 }, { "epoch": 0.9547899249161297, "grad_norm": 0.045958537608385086, "learning_rate": 3.0391358638489997e-05, "loss": 0.1241, "step": 17930 }, { "epoch": 0.9553224346344321, "grad_norm": 0.06445147842168808, "learning_rate": 3.0382221065857753e-05, "loss": 0.1239, "step": 17940 }, { "epoch": 0.9558549443527344, "grad_norm": 0.04746498540043831, "learning_rate": 3.0373190844976695e-05, "loss": 0.1239, "step": 17950 }, { "epoch": 0.9563874540710368, "grad_norm": 0.037877731025218964, "learning_rate": 3.0364268003850065e-05, "loss": 0.1242, "step": 17960 }, { "epoch": 0.9569199637893392, "grad_norm": 0.07475633174180984, "learning_rate": 3.0355452570148126e-05, "loss": 0.1241, "step": 17970 }, { "epoch": 0.9574524735076415, "grad_norm": 0.061180293560028076, "learning_rate": 3.0346744571208034e-05, "loss": 0.1241, "step": 17980 }, { "epoch": 0.9579849832259438, "grad_norm": 0.05577493831515312, "learning_rate": 3.033814403403381e-05, "loss": 0.1236, "step": 17990 }, { "epoch": 0.9585174929442463, "grad_norm": 0.04320796579122543, "learning_rate": 3.0329650985296228e-05, "loss": 0.1236, "step": 18000 }, { "epoch": 0.9590500026625486, "grad_norm": 0.07489881664514542, "learning_rate": 3.032126545133271e-05, "loss": 0.1249, "step": 18010 }, { "epoch": 0.9595825123808509, "grad_norm": 0.050032421946525574, "learning_rate": 3.0312987458147298e-05, "loss": 0.1245, "step": 18020 }, { "epoch": 0.9601150220991533, "grad_norm": 0.035527851432561874, "learning_rate": 3.030481703141053e-05, "loss": 0.1239, "step": 18030 }, { "epoch": 0.9606475318174557, "grad_norm": 0.0719723030924797, "learning_rate": 3.0296754196459377e-05, "loss": 0.1234, "step": 18040 }, { "epoch": 0.961180041535758, "grad_norm": 0.07368568331003189, "learning_rate": 3.028879897829716e-05, "loss": 0.1244, "step": 18050 }, { "epoch": 0.9617125512540604, "grad_norm": 0.08034256100654602, "learning_rate": 3.028095140159347e-05, "loss": 0.1249, "step": 18060 }, { "epoch": 0.9622450609723627, "grad_norm": 0.06598822772502899, "learning_rate": 3.0273211490684106e-05, "loss": 0.1243, "step": 18070 }, { "epoch": 0.962777570690665, "grad_norm": 0.048927973955869675, "learning_rate": 3.0265579269570976e-05, "loss": 0.125, "step": 18080 }, { "epoch": 0.9633100804089675, "grad_norm": 0.035660270601511, "learning_rate": 3.025805476192205e-05, "loss": 0.1239, "step": 18090 }, { "epoch": 0.9638425901272698, "grad_norm": 0.055542632937431335, "learning_rate": 3.025063799107126e-05, "loss": 0.1237, "step": 18100 }, { "epoch": 0.9643750998455722, "grad_norm": 0.06694008409976959, "learning_rate": 3.0243328980018447e-05, "loss": 0.1232, "step": 18110 }, { "epoch": 0.9649076095638746, "grad_norm": 0.04520373046398163, "learning_rate": 3.0236127751429284e-05, "loss": 0.1245, "step": 18120 }, { "epoch": 0.9654401192821769, "grad_norm": 0.050599873065948486, "learning_rate": 3.022903432763519e-05, "loss": 0.1236, "step": 18130 }, { "epoch": 0.9659726290004793, "grad_norm": 0.10253104567527771, "learning_rate": 3.02220487306333e-05, "loss": 0.1239, "step": 18140 }, { "epoch": 0.9665051387187816, "grad_norm": 0.0527169369161129, "learning_rate": 3.021517098208635e-05, "loss": 0.1243, "step": 18150 }, { "epoch": 0.967037648437084, "grad_norm": 0.06449782848358154, "learning_rate": 3.0208401103322637e-05, "loss": 0.1237, "step": 18160 }, { "epoch": 0.9675701581553864, "grad_norm": 0.061875950545072556, "learning_rate": 3.0201739115335952e-05, "loss": 0.1238, "step": 18170 }, { "epoch": 0.9681026678736887, "grad_norm": 0.037850238382816315, "learning_rate": 3.0195185038785507e-05, "loss": 0.1249, "step": 18180 }, { "epoch": 0.968635177591991, "grad_norm": 0.04714973270893097, "learning_rate": 3.0188738893995878e-05, "loss": 0.1244, "step": 18190 }, { "epoch": 0.9691676873102935, "grad_norm": 0.09865976870059967, "learning_rate": 3.0182400700956943e-05, "loss": 0.1248, "step": 18200 }, { "epoch": 0.9697001970285958, "grad_norm": 0.03763122111558914, "learning_rate": 3.0176170479323794e-05, "loss": 0.1242, "step": 18210 }, { "epoch": 0.9702327067468981, "grad_norm": 0.038522519171237946, "learning_rate": 3.017004824841672e-05, "loss": 0.1245, "step": 18220 }, { "epoch": 0.9707652164652005, "grad_norm": 0.06458954513072968, "learning_rate": 3.0164034027221112e-05, "loss": 0.1235, "step": 18230 }, { "epoch": 0.9712977261835029, "grad_norm": 0.04066864028573036, "learning_rate": 3.015812783438743e-05, "loss": 0.1247, "step": 18240 }, { "epoch": 0.9718302359018052, "grad_norm": 0.0487934835255146, "learning_rate": 3.0152329688231107e-05, "loss": 0.125, "step": 18250 }, { "epoch": 0.9723627456201076, "grad_norm": 0.0468660444021225, "learning_rate": 3.014663960673254e-05, "loss": 0.1238, "step": 18260 }, { "epoch": 0.9728952553384099, "grad_norm": 0.05677594989538193, "learning_rate": 3.014105760753701e-05, "loss": 0.1236, "step": 18270 }, { "epoch": 0.9734277650567122, "grad_norm": 0.042526934295892715, "learning_rate": 3.0135583707954613e-05, "loss": 0.1234, "step": 18280 }, { "epoch": 0.9739602747750147, "grad_norm": 0.06067803502082825, "learning_rate": 3.0130217924960234e-05, "loss": 0.1248, "step": 18290 }, { "epoch": 0.974492784493317, "grad_norm": 0.04792502894997597, "learning_rate": 3.012496027519348e-05, "loss": 0.1242, "step": 18300 }, { "epoch": 0.9750252942116193, "grad_norm": 0.05716263875365257, "learning_rate": 3.011981077495863e-05, "loss": 0.1241, "step": 18310 }, { "epoch": 0.9755578039299218, "grad_norm": 0.04997260496020317, "learning_rate": 3.011476944022458e-05, "loss": 0.124, "step": 18320 }, { "epoch": 0.9760903136482241, "grad_norm": 0.09484563767910004, "learning_rate": 3.010983628662481e-05, "loss": 0.1242, "step": 18330 }, { "epoch": 0.9766228233665264, "grad_norm": 0.0866529643535614, "learning_rate": 3.010501132945731e-05, "loss": 0.1242, "step": 18340 }, { "epoch": 0.9771553330848288, "grad_norm": 0.0486617274582386, "learning_rate": 3.0100294583684557e-05, "loss": 0.1238, "step": 18350 }, { "epoch": 0.9776878428031311, "grad_norm": 0.032813165336847305, "learning_rate": 3.0095686063933453e-05, "loss": 0.1243, "step": 18360 }, { "epoch": 0.9782203525214335, "grad_norm": 0.07168902456760406, "learning_rate": 3.009118578449529e-05, "loss": 0.1249, "step": 18370 }, { "epoch": 0.9787528622397359, "grad_norm": 0.06777796894311905, "learning_rate": 3.0086793759325693e-05, "loss": 0.1246, "step": 18380 }, { "epoch": 0.9792853719580382, "grad_norm": 0.0522090420126915, "learning_rate": 3.0082510002044588e-05, "loss": 0.1238, "step": 18390 }, { "epoch": 0.9798178816763405, "grad_norm": 0.07006611675024033, "learning_rate": 3.0078334525936163e-05, "loss": 0.1243, "step": 18400 }, { "epoch": 0.980350391394643, "grad_norm": 0.06946975737810135, "learning_rate": 3.0074267343948805e-05, "loss": 0.1244, "step": 18410 }, { "epoch": 0.9808829011129453, "grad_norm": 0.057238370180130005, "learning_rate": 3.0070308468695084e-05, "loss": 0.1234, "step": 18420 }, { "epoch": 0.9814154108312477, "grad_norm": 0.06210003048181534, "learning_rate": 3.0066457912451707e-05, "loss": 0.1237, "step": 18430 }, { "epoch": 0.98194792054955, "grad_norm": 0.0649491548538208, "learning_rate": 3.006271568715947e-05, "loss": 0.1233, "step": 18440 }, { "epoch": 0.9824804302678524, "grad_norm": 0.038359202444553375, "learning_rate": 3.0059081804423232e-05, "loss": 0.1236, "step": 18450 }, { "epoch": 0.9830129399861548, "grad_norm": 0.05497262626886368, "learning_rate": 3.0055556275511883e-05, "loss": 0.1239, "step": 18460 }, { "epoch": 0.9835454497044571, "grad_norm": 0.038170114159584045, "learning_rate": 3.005213911135828e-05, "loss": 0.1242, "step": 18470 }, { "epoch": 0.9840779594227594, "grad_norm": 0.04138299450278282, "learning_rate": 3.004883032255925e-05, "loss": 0.1246, "step": 18480 }, { "epoch": 0.9846104691410619, "grad_norm": 0.0714152529835701, "learning_rate": 3.004562991937555e-05, "loss": 0.1239, "step": 18490 }, { "epoch": 0.9851429788593642, "grad_norm": 0.08420536667108536, "learning_rate": 3.0042537911731818e-05, "loss": 0.1242, "step": 18500 }, { "epoch": 0.9856754885776665, "grad_norm": 0.036863774061203, "learning_rate": 3.0039554309216533e-05, "loss": 0.1245, "step": 18510 }, { "epoch": 0.986207998295969, "grad_norm": 0.059715207666158676, "learning_rate": 3.003667912108204e-05, "loss": 0.1249, "step": 18520 }, { "epoch": 0.9867405080142713, "grad_norm": 0.039495982229709625, "learning_rate": 3.0033912356244453e-05, "loss": 0.1241, "step": 18530 }, { "epoch": 0.9872730177325736, "grad_norm": 0.0337030254304409, "learning_rate": 3.0031254023283678e-05, "loss": 0.1244, "step": 18540 }, { "epoch": 0.987805527450876, "grad_norm": 0.04630092531442642, "learning_rate": 3.0028704130443352e-05, "loss": 0.1237, "step": 18550 }, { "epoch": 0.9883380371691783, "grad_norm": 0.03899266943335533, "learning_rate": 3.0026262685630846e-05, "loss": 0.124, "step": 18560 }, { "epoch": 0.9888705468874807, "grad_norm": 0.04212348535656929, "learning_rate": 3.002392969641723e-05, "loss": 0.1234, "step": 18570 }, { "epoch": 0.9894030566057831, "grad_norm": 0.03455604612827301, "learning_rate": 3.0021705170037227e-05, "loss": 0.1241, "step": 18580 }, { "epoch": 0.9899355663240854, "grad_norm": 0.08309216052293777, "learning_rate": 3.0019589113389234e-05, "loss": 0.1238, "step": 18590 }, { "epoch": 0.9904680760423877, "grad_norm": 0.09158316254615784, "learning_rate": 3.0017581533035255e-05, "loss": 0.1238, "step": 18600 }, { "epoch": 0.9910005857606902, "grad_norm": 0.043327104300260544, "learning_rate": 3.0015682435200926e-05, "loss": 0.1249, "step": 18610 }, { "epoch": 0.9915330954789925, "grad_norm": 0.05239805579185486, "learning_rate": 3.001389182577545e-05, "loss": 0.1243, "step": 18620 }, { "epoch": 0.9920656051972948, "grad_norm": 0.07973194122314453, "learning_rate": 3.0012209710311613e-05, "loss": 0.1239, "step": 18630 }, { "epoch": 0.9925981149155972, "grad_norm": 0.04951346665620804, "learning_rate": 3.001063609402576e-05, "loss": 0.1229, "step": 18640 }, { "epoch": 0.9931306246338996, "grad_norm": 0.038230083882808685, "learning_rate": 3.0009170981797758e-05, "loss": 0.1236, "step": 18650 }, { "epoch": 0.9936631343522019, "grad_norm": 0.05738453194499016, "learning_rate": 3.0007814378171008e-05, "loss": 0.1242, "step": 18660 }, { "epoch": 0.9941956440705043, "grad_norm": 0.060486044734716415, "learning_rate": 3.0006566287352423e-05, "loss": 0.1242, "step": 18670 }, { "epoch": 0.9947281537888066, "grad_norm": 0.05927567929029465, "learning_rate": 3.0005426713212397e-05, "loss": 0.1243, "step": 18680 }, { "epoch": 0.995260663507109, "grad_norm": 0.10084035247564316, "learning_rate": 3.000439565928482e-05, "loss": 0.1243, "step": 18690 }, { "epoch": 0.9957931732254114, "grad_norm": 0.04450371488928795, "learning_rate": 3.0003473128767058e-05, "loss": 0.1244, "step": 18700 }, { "epoch": 0.9963256829437137, "grad_norm": 0.059074439108371735, "learning_rate": 3.000265912451991e-05, "loss": 0.1232, "step": 18710 }, { "epoch": 0.996858192662016, "grad_norm": 0.04628562554717064, "learning_rate": 3.0001953649067676e-05, "loss": 0.1247, "step": 18720 }, { "epoch": 0.9973907023803185, "grad_norm": 0.0613800473511219, "learning_rate": 3.000135670459806e-05, "loss": 0.1241, "step": 18730 }, { "epoch": 0.9979232120986208, "grad_norm": 0.060382645577192307, "learning_rate": 3.000086829296223e-05, "loss": 0.1247, "step": 18740 }, { "epoch": 0.9984557218169232, "grad_norm": 0.03309040144085884, "learning_rate": 3.0000488415674777e-05, "loss": 0.1242, "step": 18750 }, { "epoch": 0.9989882315352255, "grad_norm": 0.047177914530038834, "learning_rate": 3.0000217073913716e-05, "loss": 0.124, "step": 18760 }, { "epoch": 0.9995207412535279, "grad_norm": 0.03689567372202873, "learning_rate": 3.00000542685205e-05, "loss": 0.1245, "step": 18770 }, { "epoch": 1.0, "step": 18779, "total_flos": 5.994863411375112e+18, "train_loss": 0.031714059101823636, "train_runtime": 3465.0177, "train_samples_per_second": 1387.365, "train_steps_per_second": 5.42 } ], "logging_steps": 10, "max_steps": 18779, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.994863411375112e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }