{ "best_metric": 0.7098406553268433, "best_model_checkpoint": "model/checkpoints/run1-java-codegen/checkpoint-20000", "epoch": 4.999818820886328, "eval_steps": 1000, "global_step": 34495, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014494329093742073, "grad_norm": 4.100641250610352, "learning_rate": 4.998550514567329e-05, "loss": 1.327, "step": 10 }, { "epoch": 0.0028988658187484146, "grad_norm": 1.0939269065856934, "learning_rate": 4.997101029134658e-05, "loss": 1.1957, "step": 20 }, { "epoch": 0.004348298728122622, "grad_norm": 1.4986281394958496, "learning_rate": 4.995651543701986e-05, "loss": 1.178, "step": 30 }, { "epoch": 0.005797731637496829, "grad_norm": 1.2242189645767212, "learning_rate": 4.9942020582693147e-05, "loss": 1.2268, "step": 40 }, { "epoch": 0.007247164546871037, "grad_norm": 1.2362180948257446, "learning_rate": 4.992752572836643e-05, "loss": 1.0823, "step": 50 }, { "epoch": 0.008696597456245243, "grad_norm": 1.1525092124938965, "learning_rate": 4.9913030874039715e-05, "loss": 1.1256, "step": 60 }, { "epoch": 0.010146030365619452, "grad_norm": 0.8720236420631409, "learning_rate": 4.9898536019713006e-05, "loss": 1.1147, "step": 70 }, { "epoch": 0.011595463274993658, "grad_norm": 1.3506882190704346, "learning_rate": 4.988404116538629e-05, "loss": 1.2389, "step": 80 }, { "epoch": 0.013044896184367867, "grad_norm": 1.2381508350372314, "learning_rate": 4.9869546311059575e-05, "loss": 1.0555, "step": 90 }, { "epoch": 0.014494329093742073, "grad_norm": 3.484632730484009, "learning_rate": 4.985505145673286e-05, "loss": 1.1662, "step": 100 }, { "epoch": 0.015943762003116282, "grad_norm": 3.0248196125030518, "learning_rate": 4.9840556602406144e-05, "loss": 1.1566, "step": 110 }, { "epoch": 0.017393194912490487, "grad_norm": 1.1476988792419434, "learning_rate": 4.9826061748079435e-05, "loss": 1.0685, "step": 120 }, { "epoch": 0.018842627821864695, "grad_norm": 3.148444652557373, "learning_rate": 4.981156689375272e-05, "loss": 1.2396, "step": 130 }, { "epoch": 0.020292060731238903, "grad_norm": 1.3316197395324707, "learning_rate": 4.9797072039426004e-05, "loss": 1.1839, "step": 140 }, { "epoch": 0.021741493640613112, "grad_norm": 1.0662773847579956, "learning_rate": 4.9782577185099295e-05, "loss": 1.1634, "step": 150 }, { "epoch": 0.023190926549987317, "grad_norm": 1.1283774375915527, "learning_rate": 4.976808233077258e-05, "loss": 1.114, "step": 160 }, { "epoch": 0.024640359459361525, "grad_norm": 3.6623597145080566, "learning_rate": 4.9753587476445864e-05, "loss": 1.0846, "step": 170 }, { "epoch": 0.026089792368735733, "grad_norm": 1.061867356300354, "learning_rate": 4.973909262211915e-05, "loss": 1.0521, "step": 180 }, { "epoch": 0.02753922527810994, "grad_norm": 3.880845308303833, "learning_rate": 4.972459776779244e-05, "loss": 1.0697, "step": 190 }, { "epoch": 0.028988658187484147, "grad_norm": 1.021384835243225, "learning_rate": 4.9710102913465723e-05, "loss": 1.0681, "step": 200 }, { "epoch": 0.030438091096858355, "grad_norm": 4.2998433113098145, "learning_rate": 4.969560805913901e-05, "loss": 1.0457, "step": 210 }, { "epoch": 0.031887524006232564, "grad_norm": 0.8809024691581726, "learning_rate": 4.96811132048123e-05, "loss": 1.0517, "step": 220 }, { "epoch": 0.03333695691560677, "grad_norm": 4.0385847091674805, "learning_rate": 4.9666618350485577e-05, "loss": 1.1193, "step": 230 }, { "epoch": 0.03478638982498097, "grad_norm": 2.3646724224090576, "learning_rate": 4.965212349615887e-05, "loss": 1.0611, "step": 240 }, { "epoch": 0.03623582273435518, "grad_norm": 1.0366450548171997, "learning_rate": 4.963762864183215e-05, "loss": 1.019, "step": 250 }, { "epoch": 0.03768525564372939, "grad_norm": 1.8329641819000244, "learning_rate": 4.9623133787505436e-05, "loss": 1.0415, "step": 260 }, { "epoch": 0.0391346885531036, "grad_norm": 0.9972900152206421, "learning_rate": 4.960863893317873e-05, "loss": 0.9843, "step": 270 }, { "epoch": 0.04058412146247781, "grad_norm": 0.8438522815704346, "learning_rate": 4.959414407885201e-05, "loss": 1.0857, "step": 280 }, { "epoch": 0.042033554371852015, "grad_norm": 2.5039472579956055, "learning_rate": 4.9579649224525296e-05, "loss": 1.0238, "step": 290 }, { "epoch": 0.043482987281226224, "grad_norm": 1.199118733406067, "learning_rate": 4.956515437019858e-05, "loss": 1.0512, "step": 300 }, { "epoch": 0.044932420190600425, "grad_norm": 1.1583256721496582, "learning_rate": 4.9550659515871865e-05, "loss": 1.1624, "step": 310 }, { "epoch": 0.04638185309997463, "grad_norm": 1.0587997436523438, "learning_rate": 4.9536164661545156e-05, "loss": 1.0383, "step": 320 }, { "epoch": 0.04783128600934884, "grad_norm": 3.125576972961426, "learning_rate": 4.952166980721844e-05, "loss": 0.9672, "step": 330 }, { "epoch": 0.04928071891872305, "grad_norm": 1.0660935640335083, "learning_rate": 4.9507174952891725e-05, "loss": 1.0317, "step": 340 }, { "epoch": 0.05073015182809726, "grad_norm": 1.2960877418518066, "learning_rate": 4.9492680098565016e-05, "loss": 1.0432, "step": 350 }, { "epoch": 0.05217958473747147, "grad_norm": 4.807547092437744, "learning_rate": 4.9478185244238294e-05, "loss": 1.1153, "step": 360 }, { "epoch": 0.05362901764684567, "grad_norm": 0.9906821250915527, "learning_rate": 4.9463690389911585e-05, "loss": 0.9973, "step": 370 }, { "epoch": 0.05507845055621988, "grad_norm": 2.1681807041168213, "learning_rate": 4.944919553558487e-05, "loss": 1.0127, "step": 380 }, { "epoch": 0.056527883465594085, "grad_norm": 1.2156059741973877, "learning_rate": 4.9434700681258153e-05, "loss": 1.0341, "step": 390 }, { "epoch": 0.057977316374968293, "grad_norm": 2.7744154930114746, "learning_rate": 4.9420205826931445e-05, "loss": 0.9894, "step": 400 }, { "epoch": 0.0594267492843425, "grad_norm": 0.9824851751327515, "learning_rate": 4.940571097260473e-05, "loss": 0.9473, "step": 410 }, { "epoch": 0.06087618219371671, "grad_norm": 2.0799906253814697, "learning_rate": 4.939121611827801e-05, "loss": 1.0758, "step": 420 }, { "epoch": 0.06232561510309092, "grad_norm": 1.282225251197815, "learning_rate": 4.93767212639513e-05, "loss": 1.0126, "step": 430 }, { "epoch": 0.06377504801246513, "grad_norm": 3.6338846683502197, "learning_rate": 4.936222640962458e-05, "loss": 1.0053, "step": 440 }, { "epoch": 0.06522448092183933, "grad_norm": 2.238694667816162, "learning_rate": 4.934773155529787e-05, "loss": 0.9902, "step": 450 }, { "epoch": 0.06667391383121354, "grad_norm": 0.8595529794692993, "learning_rate": 4.933323670097116e-05, "loss": 0.9713, "step": 460 }, { "epoch": 0.06812334674058775, "grad_norm": 1.2215890884399414, "learning_rate": 4.931874184664444e-05, "loss": 1.0144, "step": 470 }, { "epoch": 0.06957277964996195, "grad_norm": 1.1760045289993286, "learning_rate": 4.930424699231773e-05, "loss": 1.1197, "step": 480 }, { "epoch": 0.07102221255933616, "grad_norm": 2.8960888385772705, "learning_rate": 4.928975213799101e-05, "loss": 1.0625, "step": 490 }, { "epoch": 0.07247164546871036, "grad_norm": 1.0947250127792358, "learning_rate": 4.92752572836643e-05, "loss": 1.0667, "step": 500 }, { "epoch": 0.07392107837808458, "grad_norm": 0.8826558589935303, "learning_rate": 4.9260762429337586e-05, "loss": 1.081, "step": 510 }, { "epoch": 0.07537051128745878, "grad_norm": 1.2271273136138916, "learning_rate": 4.924626757501087e-05, "loss": 0.9745, "step": 520 }, { "epoch": 0.076819944196833, "grad_norm": 0.9710911512374878, "learning_rate": 4.923177272068416e-05, "loss": 0.9851, "step": 530 }, { "epoch": 0.0782693771062072, "grad_norm": 0.9668734669685364, "learning_rate": 4.9217277866357446e-05, "loss": 1.0127, "step": 540 }, { "epoch": 0.0797188100155814, "grad_norm": 0.9408063888549805, "learning_rate": 4.920278301203073e-05, "loss": 0.8172, "step": 550 }, { "epoch": 0.08116824292495561, "grad_norm": 0.99382084608078, "learning_rate": 4.9188288157704015e-05, "loss": 0.9216, "step": 560 }, { "epoch": 0.08261767583432982, "grad_norm": 1.1238948106765747, "learning_rate": 4.9173793303377306e-05, "loss": 1.0077, "step": 570 }, { "epoch": 0.08406710874370403, "grad_norm": 1.1847305297851562, "learning_rate": 4.915929844905059e-05, "loss": 1.0102, "step": 580 }, { "epoch": 0.08551654165307823, "grad_norm": 1.0952173471450806, "learning_rate": 4.9144803594723874e-05, "loss": 1.0433, "step": 590 }, { "epoch": 0.08696597456245245, "grad_norm": 2.061677932739258, "learning_rate": 4.9130308740397166e-05, "loss": 0.9256, "step": 600 }, { "epoch": 0.08841540747182665, "grad_norm": 2.83744478225708, "learning_rate": 4.911581388607045e-05, "loss": 1.0611, "step": 610 }, { "epoch": 0.08986484038120085, "grad_norm": 2.6311850547790527, "learning_rate": 4.9101319031743734e-05, "loss": 1.0714, "step": 620 }, { "epoch": 0.09131427329057507, "grad_norm": 3.2638437747955322, "learning_rate": 4.908682417741702e-05, "loss": 1.0245, "step": 630 }, { "epoch": 0.09276370619994927, "grad_norm": 2.02360200881958, "learning_rate": 4.90723293230903e-05, "loss": 0.9304, "step": 640 }, { "epoch": 0.09421313910932348, "grad_norm": 2.3670814037323, "learning_rate": 4.9057834468763594e-05, "loss": 1.0079, "step": 650 }, { "epoch": 0.09566257201869768, "grad_norm": 0.9333920478820801, "learning_rate": 4.904333961443688e-05, "loss": 0.9725, "step": 660 }, { "epoch": 0.09711200492807189, "grad_norm": 1.0716252326965332, "learning_rate": 4.902884476011016e-05, "loss": 1.028, "step": 670 }, { "epoch": 0.0985614378374461, "grad_norm": 3.501131534576416, "learning_rate": 4.901434990578345e-05, "loss": 0.9943, "step": 680 }, { "epoch": 0.1000108707468203, "grad_norm": 3.1148428916931152, "learning_rate": 4.899985505145673e-05, "loss": 0.9784, "step": 690 }, { "epoch": 0.10146030365619452, "grad_norm": 2.3271002769470215, "learning_rate": 4.898536019713002e-05, "loss": 0.9682, "step": 700 }, { "epoch": 0.10290973656556872, "grad_norm": 4.113440036773682, "learning_rate": 4.897086534280331e-05, "loss": 0.9687, "step": 710 }, { "epoch": 0.10435916947494293, "grad_norm": 1.9874829053878784, "learning_rate": 4.895637048847659e-05, "loss": 1.0132, "step": 720 }, { "epoch": 0.10580860238431714, "grad_norm": 1.2082017660140991, "learning_rate": 4.894187563414988e-05, "loss": 0.9529, "step": 730 }, { "epoch": 0.10725803529369134, "grad_norm": 1.016812801361084, "learning_rate": 4.892738077982317e-05, "loss": 0.9425, "step": 740 }, { "epoch": 0.10870746820306555, "grad_norm": 1.1642169952392578, "learning_rate": 4.891288592549645e-05, "loss": 1.0083, "step": 750 }, { "epoch": 0.11015690111243975, "grad_norm": 1.112574815750122, "learning_rate": 4.8898391071169736e-05, "loss": 1.0205, "step": 760 }, { "epoch": 0.11160633402181397, "grad_norm": 1.2382400035858154, "learning_rate": 4.888389621684302e-05, "loss": 1.0995, "step": 770 }, { "epoch": 0.11305576693118817, "grad_norm": 1.1367257833480835, "learning_rate": 4.886940136251631e-05, "loss": 1.0161, "step": 780 }, { "epoch": 0.11450519984056239, "grad_norm": 0.9074719548225403, "learning_rate": 4.8854906508189596e-05, "loss": 0.8662, "step": 790 }, { "epoch": 0.11595463274993659, "grad_norm": 2.2612428665161133, "learning_rate": 4.884041165386288e-05, "loss": 1.0303, "step": 800 }, { "epoch": 0.11740406565931079, "grad_norm": 0.9624481797218323, "learning_rate": 4.8825916799536164e-05, "loss": 1.0381, "step": 810 }, { "epoch": 0.118853498568685, "grad_norm": 1.8881802558898926, "learning_rate": 4.881142194520945e-05, "loss": 0.9832, "step": 820 }, { "epoch": 0.1203029314780592, "grad_norm": 1.0378659963607788, "learning_rate": 4.879692709088274e-05, "loss": 0.898, "step": 830 }, { "epoch": 0.12175236438743342, "grad_norm": 2.750006675720215, "learning_rate": 4.8782432236556024e-05, "loss": 0.9369, "step": 840 }, { "epoch": 0.12320179729680762, "grad_norm": 1.0392038822174072, "learning_rate": 4.876793738222931e-05, "loss": 0.949, "step": 850 }, { "epoch": 0.12465123020618184, "grad_norm": 1.1465096473693848, "learning_rate": 4.87534425279026e-05, "loss": 0.9241, "step": 860 }, { "epoch": 0.12610066311555604, "grad_norm": 1.13377845287323, "learning_rate": 4.8738947673575884e-05, "loss": 0.9059, "step": 870 }, { "epoch": 0.12755009602493025, "grad_norm": 0.9846686720848083, "learning_rate": 4.872445281924917e-05, "loss": 0.9339, "step": 880 }, { "epoch": 0.12899952893430444, "grad_norm": 1.0960410833358765, "learning_rate": 4.870995796492245e-05, "loss": 0.9495, "step": 890 }, { "epoch": 0.13044896184367866, "grad_norm": 0.8573481440544128, "learning_rate": 4.869546311059574e-05, "loss": 1.0344, "step": 900 }, { "epoch": 0.13189839475305287, "grad_norm": 1.264652967453003, "learning_rate": 4.868096825626903e-05, "loss": 0.9459, "step": 910 }, { "epoch": 0.1333478276624271, "grad_norm": 2.9564871788024902, "learning_rate": 4.866647340194231e-05, "loss": 1.0021, "step": 920 }, { "epoch": 0.13479726057180128, "grad_norm": 2.6998257637023926, "learning_rate": 4.8651978547615604e-05, "loss": 0.9497, "step": 930 }, { "epoch": 0.1362466934811755, "grad_norm": 1.1182892322540283, "learning_rate": 4.863748369328888e-05, "loss": 0.9073, "step": 940 }, { "epoch": 0.1376961263905497, "grad_norm": 3.7214620113372803, "learning_rate": 4.862298883896217e-05, "loss": 1.0357, "step": 950 }, { "epoch": 0.1391455592999239, "grad_norm": 2.3611812591552734, "learning_rate": 4.860849398463546e-05, "loss": 0.9799, "step": 960 }, { "epoch": 0.1405949922092981, "grad_norm": 2.17946195602417, "learning_rate": 4.859399913030874e-05, "loss": 0.9733, "step": 970 }, { "epoch": 0.14204442511867232, "grad_norm": 2.5610668659210205, "learning_rate": 4.857950427598203e-05, "loss": 0.9406, "step": 980 }, { "epoch": 0.14349385802804654, "grad_norm": 2.09188175201416, "learning_rate": 4.8565009421655317e-05, "loss": 0.9563, "step": 990 }, { "epoch": 0.14494329093742073, "grad_norm": 2.210683584213257, "learning_rate": 4.85505145673286e-05, "loss": 0.9265, "step": 1000 }, { "epoch": 0.14494329093742073, "eval_loss": 0.9329102039337158, "eval_runtime": 670.2535, "eval_samples_per_second": 51.466, "eval_steps_per_second": 2.574, "eval_token_accuracy": 0.00042011859825789107, "step": 1000 }, { "epoch": 0.14639272384679494, "grad_norm": 1.2653387784957886, "learning_rate": 4.8536019713001885e-05, "loss": 0.9509, "step": 1010 }, { "epoch": 0.14784215675616916, "grad_norm": 0.9553664922714233, "learning_rate": 4.852152485867517e-05, "loss": 0.982, "step": 1020 }, { "epoch": 0.14929158966554334, "grad_norm": 0.8428664803504944, "learning_rate": 4.850703000434846e-05, "loss": 0.8896, "step": 1030 }, { "epoch": 0.15074102257491756, "grad_norm": 2.9214038848876953, "learning_rate": 4.8492535150021745e-05, "loss": 0.8753, "step": 1040 }, { "epoch": 0.15219045548429178, "grad_norm": 0.9408260583877563, "learning_rate": 4.847804029569503e-05, "loss": 1.047, "step": 1050 }, { "epoch": 0.153639888393666, "grad_norm": 1.9852826595306396, "learning_rate": 4.846354544136832e-05, "loss": 0.9524, "step": 1060 }, { "epoch": 0.15508932130304018, "grad_norm": 3.4830455780029297, "learning_rate": 4.84490505870416e-05, "loss": 0.9501, "step": 1070 }, { "epoch": 0.1565387542124144, "grad_norm": 2.317713975906372, "learning_rate": 4.843455573271489e-05, "loss": 0.8766, "step": 1080 }, { "epoch": 0.1579881871217886, "grad_norm": 3.330193281173706, "learning_rate": 4.8420060878388174e-05, "loss": 0.9837, "step": 1090 }, { "epoch": 0.1594376200311628, "grad_norm": 2.919408082962036, "learning_rate": 4.840556602406146e-05, "loss": 0.9576, "step": 1100 }, { "epoch": 0.160887052940537, "grad_norm": 2.2638754844665527, "learning_rate": 4.839107116973475e-05, "loss": 0.9298, "step": 1110 }, { "epoch": 0.16233648584991123, "grad_norm": 1.0544981956481934, "learning_rate": 4.8376576315408034e-05, "loss": 0.9947, "step": 1120 }, { "epoch": 0.16378591875928544, "grad_norm": 1.3924440145492554, "learning_rate": 4.836208146108132e-05, "loss": 0.8487, "step": 1130 }, { "epoch": 0.16523535166865963, "grad_norm": 3.719411611557007, "learning_rate": 4.83475866067546e-05, "loss": 0.834, "step": 1140 }, { "epoch": 0.16668478457803385, "grad_norm": 3.1443965435028076, "learning_rate": 4.833309175242789e-05, "loss": 1.0022, "step": 1150 }, { "epoch": 0.16813421748740806, "grad_norm": 0.8373526334762573, "learning_rate": 4.831859689810118e-05, "loss": 0.9773, "step": 1160 }, { "epoch": 0.16958365039678225, "grad_norm": 2.951531171798706, "learning_rate": 4.830410204377446e-05, "loss": 0.977, "step": 1170 }, { "epoch": 0.17103308330615646, "grad_norm": 2.2842185497283936, "learning_rate": 4.8289607189447747e-05, "loss": 0.9501, "step": 1180 }, { "epoch": 0.17248251621553068, "grad_norm": 2.8369762897491455, "learning_rate": 4.827511233512104e-05, "loss": 0.9134, "step": 1190 }, { "epoch": 0.1739319491249049, "grad_norm": 2.4819037914276123, "learning_rate": 4.8260617480794315e-05, "loss": 0.9153, "step": 1200 }, { "epoch": 0.17538138203427908, "grad_norm": 1.0048160552978516, "learning_rate": 4.8246122626467606e-05, "loss": 0.9296, "step": 1210 }, { "epoch": 0.1768308149436533, "grad_norm": 3.0000860691070557, "learning_rate": 4.823162777214089e-05, "loss": 1.0214, "step": 1220 }, { "epoch": 0.1782802478530275, "grad_norm": 2.2292511463165283, "learning_rate": 4.8217132917814175e-05, "loss": 0.9293, "step": 1230 }, { "epoch": 0.1797296807624017, "grad_norm": 1.6593018770217896, "learning_rate": 4.8202638063487466e-05, "loss": 0.936, "step": 1240 }, { "epoch": 0.18117911367177592, "grad_norm": 2.7024037837982178, "learning_rate": 4.818814320916075e-05, "loss": 0.905, "step": 1250 }, { "epoch": 0.18262854658115013, "grad_norm": 0.9400494694709778, "learning_rate": 4.817364835483404e-05, "loss": 0.9595, "step": 1260 }, { "epoch": 0.18407797949052435, "grad_norm": 1.0066282749176025, "learning_rate": 4.815915350050732e-05, "loss": 0.8789, "step": 1270 }, { "epoch": 0.18552741239989853, "grad_norm": 1.9175103902816772, "learning_rate": 4.814465864618061e-05, "loss": 0.8903, "step": 1280 }, { "epoch": 0.18697684530927275, "grad_norm": 1.1838349103927612, "learning_rate": 4.8130163791853895e-05, "loss": 0.9542, "step": 1290 }, { "epoch": 0.18842627821864696, "grad_norm": 4.920762062072754, "learning_rate": 4.811566893752718e-05, "loss": 0.971, "step": 1300 }, { "epoch": 0.18987571112802115, "grad_norm": 1.5060116052627563, "learning_rate": 4.810117408320047e-05, "loss": 0.8748, "step": 1310 }, { "epoch": 0.19132514403739537, "grad_norm": 2.7574806213378906, "learning_rate": 4.8086679228873755e-05, "loss": 0.8522, "step": 1320 }, { "epoch": 0.19277457694676958, "grad_norm": 3.3912112712860107, "learning_rate": 4.807218437454704e-05, "loss": 0.9123, "step": 1330 }, { "epoch": 0.19422400985614377, "grad_norm": 0.9661535620689392, "learning_rate": 4.805768952022032e-05, "loss": 0.9062, "step": 1340 }, { "epoch": 0.19567344276551799, "grad_norm": 2.8521437644958496, "learning_rate": 4.804319466589361e-05, "loss": 0.9922, "step": 1350 }, { "epoch": 0.1971228756748922, "grad_norm": 2.469609498977661, "learning_rate": 4.80286998115669e-05, "loss": 1.0353, "step": 1360 }, { "epoch": 0.19857230858426642, "grad_norm": 2.4669108390808105, "learning_rate": 4.801420495724018e-05, "loss": 0.9082, "step": 1370 }, { "epoch": 0.2000217414936406, "grad_norm": 2.641343593597412, "learning_rate": 4.799971010291347e-05, "loss": 0.9272, "step": 1380 }, { "epoch": 0.20147117440301482, "grad_norm": 1.0589393377304077, "learning_rate": 4.798521524858676e-05, "loss": 0.9126, "step": 1390 }, { "epoch": 0.20292060731238903, "grad_norm": 4.2109880447387695, "learning_rate": 4.7970720394260036e-05, "loss": 1.0255, "step": 1400 }, { "epoch": 0.20437004022176322, "grad_norm": 2.466830015182495, "learning_rate": 4.795622553993333e-05, "loss": 0.9984, "step": 1410 }, { "epoch": 0.20581947313113744, "grad_norm": 0.8373948931694031, "learning_rate": 4.794173068560661e-05, "loss": 0.8425, "step": 1420 }, { "epoch": 0.20726890604051165, "grad_norm": 1.0101238489151, "learning_rate": 4.7927235831279896e-05, "loss": 0.897, "step": 1430 }, { "epoch": 0.20871833894988587, "grad_norm": 2.0855133533477783, "learning_rate": 4.791274097695319e-05, "loss": 0.9181, "step": 1440 }, { "epoch": 0.21016777185926006, "grad_norm": 0.9278097152709961, "learning_rate": 4.789824612262647e-05, "loss": 0.9341, "step": 1450 }, { "epoch": 0.21161720476863427, "grad_norm": 1.599377989768982, "learning_rate": 4.7883751268299756e-05, "loss": 0.8105, "step": 1460 }, { "epoch": 0.21306663767800849, "grad_norm": 1.1618376970291138, "learning_rate": 4.786925641397304e-05, "loss": 0.913, "step": 1470 }, { "epoch": 0.21451607058738267, "grad_norm": 1.2751816511154175, "learning_rate": 4.7854761559646325e-05, "loss": 0.8783, "step": 1480 }, { "epoch": 0.2159655034967569, "grad_norm": 0.7834726572036743, "learning_rate": 4.7840266705319616e-05, "loss": 0.8906, "step": 1490 }, { "epoch": 0.2174149364061311, "grad_norm": 0.9681240320205688, "learning_rate": 4.78257718509929e-05, "loss": 1.0063, "step": 1500 }, { "epoch": 0.21886436931550532, "grad_norm": 2.6840054988861084, "learning_rate": 4.7811276996666185e-05, "loss": 0.9857, "step": 1510 }, { "epoch": 0.2203138022248795, "grad_norm": 0.952912449836731, "learning_rate": 4.779678214233947e-05, "loss": 1.0314, "step": 1520 }, { "epoch": 0.22176323513425372, "grad_norm": 2.4693140983581543, "learning_rate": 4.778228728801275e-05, "loss": 0.9979, "step": 1530 }, { "epoch": 0.22321266804362794, "grad_norm": 2.79858136177063, "learning_rate": 4.7767792433686044e-05, "loss": 0.9388, "step": 1540 }, { "epoch": 0.22466210095300213, "grad_norm": 2.951786994934082, "learning_rate": 4.775329757935933e-05, "loss": 0.9021, "step": 1550 }, { "epoch": 0.22611153386237634, "grad_norm": 1.213524341583252, "learning_rate": 4.773880272503261e-05, "loss": 1.0483, "step": 1560 }, { "epoch": 0.22756096677175056, "grad_norm": 3.919523000717163, "learning_rate": 4.7724307870705904e-05, "loss": 1.0111, "step": 1570 }, { "epoch": 0.22901039968112477, "grad_norm": 1.069823145866394, "learning_rate": 4.770981301637919e-05, "loss": 0.8561, "step": 1580 }, { "epoch": 0.23045983259049896, "grad_norm": 1.9359710216522217, "learning_rate": 4.769531816205247e-05, "loss": 0.9442, "step": 1590 }, { "epoch": 0.23190926549987317, "grad_norm": 0.9056136608123779, "learning_rate": 4.768082330772576e-05, "loss": 0.8722, "step": 1600 }, { "epoch": 0.2333586984092474, "grad_norm": 1.7711377143859863, "learning_rate": 4.766632845339904e-05, "loss": 0.8642, "step": 1610 }, { "epoch": 0.23480813131862158, "grad_norm": 0.7478885650634766, "learning_rate": 4.765183359907233e-05, "loss": 0.9124, "step": 1620 }, { "epoch": 0.2362575642279958, "grad_norm": 0.8888253569602966, "learning_rate": 4.763733874474562e-05, "loss": 0.9537, "step": 1630 }, { "epoch": 0.23770699713737, "grad_norm": 0.7752460837364197, "learning_rate": 4.762284389041891e-05, "loss": 0.8851, "step": 1640 }, { "epoch": 0.23915643004674422, "grad_norm": 1.0801783800125122, "learning_rate": 4.7608349036092186e-05, "loss": 0.8669, "step": 1650 }, { "epoch": 0.2406058629561184, "grad_norm": 1.8434480428695679, "learning_rate": 4.759385418176548e-05, "loss": 1.0102, "step": 1660 }, { "epoch": 0.24205529586549263, "grad_norm": 2.921001434326172, "learning_rate": 4.757935932743876e-05, "loss": 0.8252, "step": 1670 }, { "epoch": 0.24350472877486684, "grad_norm": 1.2697440385818481, "learning_rate": 4.7564864473112046e-05, "loss": 0.851, "step": 1680 }, { "epoch": 0.24495416168424103, "grad_norm": 2.131409168243408, "learning_rate": 4.755036961878534e-05, "loss": 0.9035, "step": 1690 }, { "epoch": 0.24640359459361524, "grad_norm": 2.086275339126587, "learning_rate": 4.753587476445862e-05, "loss": 0.8721, "step": 1700 }, { "epoch": 0.24785302750298946, "grad_norm": 1.0655896663665771, "learning_rate": 4.7521379910131906e-05, "loss": 0.9176, "step": 1710 }, { "epoch": 0.24930246041236367, "grad_norm": 0.9161379933357239, "learning_rate": 4.750688505580519e-05, "loss": 0.8358, "step": 1720 }, { "epoch": 0.25075189332173786, "grad_norm": 1.0103297233581543, "learning_rate": 4.7492390201478474e-05, "loss": 0.9097, "step": 1730 }, { "epoch": 0.2522013262311121, "grad_norm": 0.9535181522369385, "learning_rate": 4.7477895347151765e-05, "loss": 0.9068, "step": 1740 }, { "epoch": 0.2536507591404863, "grad_norm": 3.1974833011627197, "learning_rate": 4.746340049282505e-05, "loss": 0.9513, "step": 1750 }, { "epoch": 0.2551001920498605, "grad_norm": 3.5244507789611816, "learning_rate": 4.7448905638498334e-05, "loss": 0.9102, "step": 1760 }, { "epoch": 0.2565496249592347, "grad_norm": 0.7904402017593384, "learning_rate": 4.7434410784171625e-05, "loss": 0.9212, "step": 1770 }, { "epoch": 0.2579990578686089, "grad_norm": 2.1184966564178467, "learning_rate": 4.74199159298449e-05, "loss": 0.8912, "step": 1780 }, { "epoch": 0.2594484907779831, "grad_norm": 4.049259185791016, "learning_rate": 4.7405421075518194e-05, "loss": 1.0043, "step": 1790 }, { "epoch": 0.2608979236873573, "grad_norm": 0.9909255504608154, "learning_rate": 4.739092622119148e-05, "loss": 0.967, "step": 1800 }, { "epoch": 0.26234735659673153, "grad_norm": 0.9145278930664062, "learning_rate": 4.737643136686476e-05, "loss": 0.8499, "step": 1810 }, { "epoch": 0.26379678950610574, "grad_norm": 4.189142227172852, "learning_rate": 4.7361936512538054e-05, "loss": 0.9368, "step": 1820 }, { "epoch": 0.26524622241547996, "grad_norm": 3.4056777954101562, "learning_rate": 4.734744165821134e-05, "loss": 0.8985, "step": 1830 }, { "epoch": 0.2666956553248542, "grad_norm": 0.9040047526359558, "learning_rate": 4.733294680388462e-05, "loss": 0.8604, "step": 1840 }, { "epoch": 0.26814508823422833, "grad_norm": 2.2302513122558594, "learning_rate": 4.731845194955791e-05, "loss": 0.951, "step": 1850 }, { "epoch": 0.26959452114360255, "grad_norm": 1.6290664672851562, "learning_rate": 4.730395709523119e-05, "loss": 0.8684, "step": 1860 }, { "epoch": 0.27104395405297677, "grad_norm": 3.178881883621216, "learning_rate": 4.728946224090448e-05, "loss": 0.939, "step": 1870 }, { "epoch": 0.272493386962351, "grad_norm": 0.8622704744338989, "learning_rate": 4.727496738657777e-05, "loss": 0.907, "step": 1880 }, { "epoch": 0.2739428198717252, "grad_norm": 0.8100652694702148, "learning_rate": 4.726047253225105e-05, "loss": 0.9619, "step": 1890 }, { "epoch": 0.2753922527810994, "grad_norm": 3.0906217098236084, "learning_rate": 4.724597767792434e-05, "loss": 0.9168, "step": 1900 }, { "epoch": 0.2768416856904736, "grad_norm": 0.9899945259094238, "learning_rate": 4.723148282359762e-05, "loss": 0.9018, "step": 1910 }, { "epoch": 0.2782911185998478, "grad_norm": 0.9530486464500427, "learning_rate": 4.721698796927091e-05, "loss": 0.9945, "step": 1920 }, { "epoch": 0.279740551509222, "grad_norm": 0.9075252413749695, "learning_rate": 4.7202493114944195e-05, "loss": 0.8555, "step": 1930 }, { "epoch": 0.2811899844185962, "grad_norm": 2.1643764972686768, "learning_rate": 4.718799826061748e-05, "loss": 0.9005, "step": 1940 }, { "epoch": 0.28263941732797043, "grad_norm": 1.0276823043823242, "learning_rate": 4.717350340629077e-05, "loss": 1.0093, "step": 1950 }, { "epoch": 0.28408885023734465, "grad_norm": 3.145967483520508, "learning_rate": 4.7159008551964055e-05, "loss": 0.8591, "step": 1960 }, { "epoch": 0.28553828314671886, "grad_norm": 0.9500845074653625, "learning_rate": 4.7144513697637346e-05, "loss": 0.9213, "step": 1970 }, { "epoch": 0.2869877160560931, "grad_norm": 0.709665834903717, "learning_rate": 4.7130018843310624e-05, "loss": 0.8349, "step": 1980 }, { "epoch": 0.28843714896546724, "grad_norm": 1.674027442932129, "learning_rate": 4.711552398898391e-05, "loss": 0.9234, "step": 1990 }, { "epoch": 0.28988658187484145, "grad_norm": 1.1925066709518433, "learning_rate": 4.71010291346572e-05, "loss": 0.9271, "step": 2000 }, { "epoch": 0.28988658187484145, "eval_loss": 0.8844457864761353, "eval_runtime": 669.6132, "eval_samples_per_second": 51.515, "eval_steps_per_second": 2.576, "eval_token_accuracy": 0.0004413497065925165, "step": 2000 }, { "epoch": 0.29133601478421567, "grad_norm": 1.0759925842285156, "learning_rate": 4.7086534280330484e-05, "loss": 0.9389, "step": 2010 }, { "epoch": 0.2927854476935899, "grad_norm": 3.2743873596191406, "learning_rate": 4.7072039426003775e-05, "loss": 0.909, "step": 2020 }, { "epoch": 0.2942348806029641, "grad_norm": 2.5346035957336426, "learning_rate": 4.705754457167706e-05, "loss": 0.9818, "step": 2030 }, { "epoch": 0.2956843135123383, "grad_norm": 0.8546256422996521, "learning_rate": 4.7043049717350344e-05, "loss": 0.9858, "step": 2040 }, { "epoch": 0.29713374642171253, "grad_norm": 1.074825644493103, "learning_rate": 4.702855486302363e-05, "loss": 0.8267, "step": 2050 }, { "epoch": 0.2985831793310867, "grad_norm": 0.9701589941978455, "learning_rate": 4.701406000869691e-05, "loss": 0.8692, "step": 2060 }, { "epoch": 0.3000326122404609, "grad_norm": 0.9195166826248169, "learning_rate": 4.6999565154370204e-05, "loss": 0.9654, "step": 2070 }, { "epoch": 0.3014820451498351, "grad_norm": 0.9517616629600525, "learning_rate": 4.698507030004349e-05, "loss": 0.8702, "step": 2080 }, { "epoch": 0.30293147805920934, "grad_norm": 0.7250176072120667, "learning_rate": 4.697057544571677e-05, "loss": 0.9022, "step": 2090 }, { "epoch": 0.30438091096858355, "grad_norm": 0.8878209590911865, "learning_rate": 4.6956080591390063e-05, "loss": 0.961, "step": 2100 }, { "epoch": 0.30583034387795777, "grad_norm": 0.8246135115623474, "learning_rate": 4.694158573706334e-05, "loss": 0.8893, "step": 2110 }, { "epoch": 0.307279776787332, "grad_norm": 2.2007503509521484, "learning_rate": 4.692709088273663e-05, "loss": 0.9578, "step": 2120 }, { "epoch": 0.30872920969670614, "grad_norm": 0.8744785189628601, "learning_rate": 4.6912596028409916e-05, "loss": 0.8567, "step": 2130 }, { "epoch": 0.31017864260608036, "grad_norm": 0.8183798789978027, "learning_rate": 4.68981011740832e-05, "loss": 0.9111, "step": 2140 }, { "epoch": 0.3116280755154546, "grad_norm": 3.1583478450775146, "learning_rate": 4.688360631975649e-05, "loss": 0.9575, "step": 2150 }, { "epoch": 0.3130775084248288, "grad_norm": 2.7086269855499268, "learning_rate": 4.6869111465429776e-05, "loss": 0.9788, "step": 2160 }, { "epoch": 0.314526941334203, "grad_norm": 0.8873903751373291, "learning_rate": 4.685461661110306e-05, "loss": 0.8857, "step": 2170 }, { "epoch": 0.3159763742435772, "grad_norm": 1.2673507928848267, "learning_rate": 4.6840121756776345e-05, "loss": 0.8733, "step": 2180 }, { "epoch": 0.31742580715295143, "grad_norm": 2.983605146408081, "learning_rate": 4.682562690244963e-05, "loss": 1.0298, "step": 2190 }, { "epoch": 0.3188752400623256, "grad_norm": 0.7594058513641357, "learning_rate": 4.681113204812292e-05, "loss": 0.912, "step": 2200 }, { "epoch": 0.3203246729716998, "grad_norm": 2.4812521934509277, "learning_rate": 4.6796637193796205e-05, "loss": 0.9988, "step": 2210 }, { "epoch": 0.321774105881074, "grad_norm": 2.502842903137207, "learning_rate": 4.678214233946949e-05, "loss": 0.8589, "step": 2220 }, { "epoch": 0.32322353879044824, "grad_norm": 1.1463567018508911, "learning_rate": 4.676764748514278e-05, "loss": 0.9046, "step": 2230 }, { "epoch": 0.32467297169982245, "grad_norm": 1.1552772521972656, "learning_rate": 4.675315263081606e-05, "loss": 0.8777, "step": 2240 }, { "epoch": 0.32612240460919667, "grad_norm": 0.7134237289428711, "learning_rate": 4.673865777648935e-05, "loss": 0.9582, "step": 2250 }, { "epoch": 0.3275718375185709, "grad_norm": 1.2815346717834473, "learning_rate": 4.6724162922162633e-05, "loss": 0.9574, "step": 2260 }, { "epoch": 0.32902127042794505, "grad_norm": 2.6346404552459717, "learning_rate": 4.670966806783592e-05, "loss": 0.8783, "step": 2270 }, { "epoch": 0.33047070333731926, "grad_norm": 2.8299264907836914, "learning_rate": 4.669517321350921e-05, "loss": 0.8363, "step": 2280 }, { "epoch": 0.3319201362466935, "grad_norm": 1.1581934690475464, "learning_rate": 4.668067835918249e-05, "loss": 0.8875, "step": 2290 }, { "epoch": 0.3333695691560677, "grad_norm": 0.801040768623352, "learning_rate": 4.666618350485578e-05, "loss": 0.9725, "step": 2300 }, { "epoch": 0.3348190020654419, "grad_norm": 4.302339553833008, "learning_rate": 4.665168865052906e-05, "loss": 0.8756, "step": 2310 }, { "epoch": 0.3362684349748161, "grad_norm": 1.6098670959472656, "learning_rate": 4.6637193796202346e-05, "loss": 0.9409, "step": 2320 }, { "epoch": 0.33771786788419034, "grad_norm": 0.9708408117294312, "learning_rate": 4.662269894187564e-05, "loss": 0.8269, "step": 2330 }, { "epoch": 0.3391673007935645, "grad_norm": 2.4792208671569824, "learning_rate": 4.660820408754892e-05, "loss": 0.7968, "step": 2340 }, { "epoch": 0.3406167337029387, "grad_norm": 1.9270927906036377, "learning_rate": 4.659370923322221e-05, "loss": 0.8857, "step": 2350 }, { "epoch": 0.3420661666123129, "grad_norm": 0.823274552822113, "learning_rate": 4.65792143788955e-05, "loss": 0.9024, "step": 2360 }, { "epoch": 0.34351559952168714, "grad_norm": 1.060691475868225, "learning_rate": 4.656471952456878e-05, "loss": 0.8604, "step": 2370 }, { "epoch": 0.34496503243106136, "grad_norm": 0.875577986240387, "learning_rate": 4.6550224670242066e-05, "loss": 0.8587, "step": 2380 }, { "epoch": 0.3464144653404356, "grad_norm": 1.1254234313964844, "learning_rate": 4.653572981591535e-05, "loss": 0.841, "step": 2390 }, { "epoch": 0.3478638982498098, "grad_norm": 0.9796624779701233, "learning_rate": 4.652123496158864e-05, "loss": 0.8953, "step": 2400 }, { "epoch": 0.34931333115918395, "grad_norm": 1.004135012626648, "learning_rate": 4.6506740107261926e-05, "loss": 0.9329, "step": 2410 }, { "epoch": 0.35076276406855816, "grad_norm": 1.0762351751327515, "learning_rate": 4.649224525293521e-05, "loss": 0.8285, "step": 2420 }, { "epoch": 0.3522121969779324, "grad_norm": 2.8664536476135254, "learning_rate": 4.6477750398608495e-05, "loss": 0.894, "step": 2430 }, { "epoch": 0.3536616298873066, "grad_norm": 2.1905500888824463, "learning_rate": 4.646325554428178e-05, "loss": 1.0367, "step": 2440 }, { "epoch": 0.3551110627966808, "grad_norm": 2.4113802909851074, "learning_rate": 4.644876068995507e-05, "loss": 0.915, "step": 2450 }, { "epoch": 0.356560495706055, "grad_norm": 2.86832594871521, "learning_rate": 4.6434265835628355e-05, "loss": 0.9524, "step": 2460 }, { "epoch": 0.35800992861542924, "grad_norm": 2.5679378509521484, "learning_rate": 4.641977098130164e-05, "loss": 0.8456, "step": 2470 }, { "epoch": 0.3594593615248034, "grad_norm": 2.207540988922119, "learning_rate": 4.640527612697493e-05, "loss": 0.9101, "step": 2480 }, { "epoch": 0.3609087944341776, "grad_norm": 0.9119837880134583, "learning_rate": 4.639078127264821e-05, "loss": 0.8385, "step": 2490 }, { "epoch": 0.36235822734355183, "grad_norm": 2.577519655227661, "learning_rate": 4.63762864183215e-05, "loss": 0.9289, "step": 2500 }, { "epoch": 0.36380766025292605, "grad_norm": 0.8448528051376343, "learning_rate": 4.636179156399478e-05, "loss": 0.8761, "step": 2510 }, { "epoch": 0.36525709316230026, "grad_norm": 1.085668683052063, "learning_rate": 4.634729670966807e-05, "loss": 0.9201, "step": 2520 }, { "epoch": 0.3667065260716745, "grad_norm": 3.674574851989746, "learning_rate": 4.633280185534136e-05, "loss": 0.8622, "step": 2530 }, { "epoch": 0.3681559589810487, "grad_norm": 0.8500022292137146, "learning_rate": 4.631830700101464e-05, "loss": 0.8821, "step": 2540 }, { "epoch": 0.36960539189042285, "grad_norm": 0.8118327856063843, "learning_rate": 4.630381214668793e-05, "loss": 0.8382, "step": 2550 }, { "epoch": 0.37105482479979707, "grad_norm": 0.8166295289993286, "learning_rate": 4.628931729236121e-05, "loss": 0.8343, "step": 2560 }, { "epoch": 0.3725042577091713, "grad_norm": 0.8547616004943848, "learning_rate": 4.6274822438034496e-05, "loss": 0.8319, "step": 2570 }, { "epoch": 0.3739536906185455, "grad_norm": 2.3333303928375244, "learning_rate": 4.626032758370779e-05, "loss": 0.9359, "step": 2580 }, { "epoch": 0.3754031235279197, "grad_norm": 2.314495086669922, "learning_rate": 4.624583272938107e-05, "loss": 0.9749, "step": 2590 }, { "epoch": 0.37685255643729393, "grad_norm": 3.219776153564453, "learning_rate": 4.6231337875054356e-05, "loss": 0.8707, "step": 2600 }, { "epoch": 0.37830198934666814, "grad_norm": 1.0046021938323975, "learning_rate": 4.621684302072765e-05, "loss": 0.963, "step": 2610 }, { "epoch": 0.3797514222560423, "grad_norm": 3.0080223083496094, "learning_rate": 4.6202348166400925e-05, "loss": 0.7975, "step": 2620 }, { "epoch": 0.3812008551654165, "grad_norm": 0.7294064164161682, "learning_rate": 4.6187853312074216e-05, "loss": 0.9354, "step": 2630 }, { "epoch": 0.38265028807479073, "grad_norm": 2.269174575805664, "learning_rate": 4.61733584577475e-05, "loss": 0.952, "step": 2640 }, { "epoch": 0.38409972098416495, "grad_norm": 2.4099864959716797, "learning_rate": 4.6158863603420784e-05, "loss": 0.9258, "step": 2650 }, { "epoch": 0.38554915389353916, "grad_norm": 0.9502733945846558, "learning_rate": 4.6144368749094076e-05, "loss": 0.9672, "step": 2660 }, { "epoch": 0.3869985868029134, "grad_norm": 2.0401251316070557, "learning_rate": 4.612987389476736e-05, "loss": 0.8961, "step": 2670 }, { "epoch": 0.38844801971228754, "grad_norm": 0.6880651116371155, "learning_rate": 4.611537904044065e-05, "loss": 0.8395, "step": 2680 }, { "epoch": 0.38989745262166176, "grad_norm": 2.322988986968994, "learning_rate": 4.610088418611393e-05, "loss": 0.9236, "step": 2690 }, { "epoch": 0.39134688553103597, "grad_norm": 2.6459531784057617, "learning_rate": 4.608638933178721e-05, "loss": 0.9201, "step": 2700 }, { "epoch": 0.3927963184404102, "grad_norm": 3.0082294940948486, "learning_rate": 4.6071894477460504e-05, "loss": 0.9289, "step": 2710 }, { "epoch": 0.3942457513497844, "grad_norm": 2.3234775066375732, "learning_rate": 4.605739962313379e-05, "loss": 0.8907, "step": 2720 }, { "epoch": 0.3956951842591586, "grad_norm": 2.548450469970703, "learning_rate": 4.604290476880708e-05, "loss": 0.8939, "step": 2730 }, { "epoch": 0.39714461716853283, "grad_norm": 0.8990277051925659, "learning_rate": 4.6028409914480364e-05, "loss": 0.8258, "step": 2740 }, { "epoch": 0.398594050077907, "grad_norm": 2.6693661212921143, "learning_rate": 4.601391506015365e-05, "loss": 0.8105, "step": 2750 }, { "epoch": 0.4000434829872812, "grad_norm": 1.8534784317016602, "learning_rate": 4.599942020582693e-05, "loss": 0.8123, "step": 2760 }, { "epoch": 0.4014929158966554, "grad_norm": 0.9559533596038818, "learning_rate": 4.598492535150022e-05, "loss": 0.8422, "step": 2770 }, { "epoch": 0.40294234880602964, "grad_norm": 3.0623579025268555, "learning_rate": 4.597043049717351e-05, "loss": 0.9334, "step": 2780 }, { "epoch": 0.40439178171540385, "grad_norm": 2.8755178451538086, "learning_rate": 4.595593564284679e-05, "loss": 0.8892, "step": 2790 }, { "epoch": 0.40584121462477807, "grad_norm": 1.9824632406234741, "learning_rate": 4.594144078852008e-05, "loss": 0.8169, "step": 2800 }, { "epoch": 0.4072906475341523, "grad_norm": 2.5154964923858643, "learning_rate": 4.592694593419337e-05, "loss": 0.8304, "step": 2810 }, { "epoch": 0.40874008044352644, "grad_norm": 1.6365853548049927, "learning_rate": 4.5912451079866646e-05, "loss": 0.8779, "step": 2820 }, { "epoch": 0.41018951335290066, "grad_norm": 0.7808021903038025, "learning_rate": 4.589795622553994e-05, "loss": 0.8335, "step": 2830 }, { "epoch": 0.4116389462622749, "grad_norm": 0.9925370812416077, "learning_rate": 4.588346137121322e-05, "loss": 0.7861, "step": 2840 }, { "epoch": 0.4130883791716491, "grad_norm": 2.8942930698394775, "learning_rate": 4.5868966516886506e-05, "loss": 0.9344, "step": 2850 }, { "epoch": 0.4145378120810233, "grad_norm": 1.0374455451965332, "learning_rate": 4.58544716625598e-05, "loss": 0.8743, "step": 2860 }, { "epoch": 0.4159872449903975, "grad_norm": 1.9524415731430054, "learning_rate": 4.583997680823308e-05, "loss": 0.8777, "step": 2870 }, { "epoch": 0.41743667789977174, "grad_norm": 0.8949989080429077, "learning_rate": 4.5825481953906365e-05, "loss": 0.8909, "step": 2880 }, { "epoch": 0.4188861108091459, "grad_norm": 2.0267937183380127, "learning_rate": 4.581098709957965e-05, "loss": 0.9716, "step": 2890 }, { "epoch": 0.4203355437185201, "grad_norm": 0.943326473236084, "learning_rate": 4.5796492245252934e-05, "loss": 0.9009, "step": 2900 }, { "epoch": 0.4217849766278943, "grad_norm": 0.7954492568969727, "learning_rate": 4.5781997390926225e-05, "loss": 0.9105, "step": 2910 }, { "epoch": 0.42323440953726854, "grad_norm": 4.5275349617004395, "learning_rate": 4.576750253659951e-05, "loss": 0.8551, "step": 2920 }, { "epoch": 0.42468384244664276, "grad_norm": 0.8888694047927856, "learning_rate": 4.5753007682272794e-05, "loss": 0.7458, "step": 2930 }, { "epoch": 0.42613327535601697, "grad_norm": 2.544055938720703, "learning_rate": 4.5738512827946085e-05, "loss": 0.9631, "step": 2940 }, { "epoch": 0.4275827082653912, "grad_norm": 0.7478300929069519, "learning_rate": 4.572401797361936e-05, "loss": 0.8596, "step": 2950 }, { "epoch": 0.42903214117476535, "grad_norm": 3.5203044414520264, "learning_rate": 4.5709523119292654e-05, "loss": 0.8521, "step": 2960 }, { "epoch": 0.43048157408413956, "grad_norm": 0.8133640289306641, "learning_rate": 4.569502826496594e-05, "loss": 0.8216, "step": 2970 }, { "epoch": 0.4319310069935138, "grad_norm": 1.3965954780578613, "learning_rate": 4.568053341063922e-05, "loss": 0.8722, "step": 2980 }, { "epoch": 0.433380439902888, "grad_norm": 0.6984473466873169, "learning_rate": 4.5666038556312514e-05, "loss": 0.9287, "step": 2990 }, { "epoch": 0.4348298728122622, "grad_norm": 0.7165631055831909, "learning_rate": 4.56515437019858e-05, "loss": 0.881, "step": 3000 }, { "epoch": 0.4348298728122622, "eval_loss": 0.8530751466751099, "eval_runtime": 671.8627, "eval_samples_per_second": 51.342, "eval_steps_per_second": 2.567, "eval_token_accuracy": 0.00042285290766462313, "step": 3000 }, { "epoch": 0.4362793057216364, "grad_norm": 0.9275263547897339, "learning_rate": 4.563704884765908e-05, "loss": 0.9308, "step": 3010 }, { "epoch": 0.43772873863101064, "grad_norm": 0.8296828269958496, "learning_rate": 4.562255399333237e-05, "loss": 0.9581, "step": 3020 }, { "epoch": 0.4391781715403848, "grad_norm": 0.8571820855140686, "learning_rate": 4.560805913900565e-05, "loss": 0.8346, "step": 3030 }, { "epoch": 0.440627604449759, "grad_norm": 0.910743772983551, "learning_rate": 4.559356428467894e-05, "loss": 0.9596, "step": 3040 }, { "epoch": 0.44207703735913323, "grad_norm": 1.1356425285339355, "learning_rate": 4.557906943035223e-05, "loss": 0.8551, "step": 3050 }, { "epoch": 0.44352647026850744, "grad_norm": 0.9998161792755127, "learning_rate": 4.556457457602552e-05, "loss": 0.8586, "step": 3060 }, { "epoch": 0.44497590317788166, "grad_norm": 1.193593978881836, "learning_rate": 4.55500797216988e-05, "loss": 0.877, "step": 3070 }, { "epoch": 0.4464253360872559, "grad_norm": 0.7462995052337646, "learning_rate": 4.553558486737208e-05, "loss": 0.8667, "step": 3080 }, { "epoch": 0.4478747689966301, "grad_norm": 0.9373787045478821, "learning_rate": 4.552109001304537e-05, "loss": 0.8609, "step": 3090 }, { "epoch": 0.44932420190600425, "grad_norm": 3.4775943756103516, "learning_rate": 4.5506595158718655e-05, "loss": 0.9172, "step": 3100 }, { "epoch": 0.45077363481537847, "grad_norm": 1.0909534692764282, "learning_rate": 4.5492100304391946e-05, "loss": 0.9085, "step": 3110 }, { "epoch": 0.4522230677247527, "grad_norm": 2.1074111461639404, "learning_rate": 4.547760545006523e-05, "loss": 0.8658, "step": 3120 }, { "epoch": 0.4536725006341269, "grad_norm": 0.9462234377861023, "learning_rate": 4.5463110595738515e-05, "loss": 0.7572, "step": 3130 }, { "epoch": 0.4551219335435011, "grad_norm": 1.8323560953140259, "learning_rate": 4.54486157414118e-05, "loss": 0.8097, "step": 3140 }, { "epoch": 0.4565713664528753, "grad_norm": 0.9515965580940247, "learning_rate": 4.5434120887085084e-05, "loss": 0.8501, "step": 3150 }, { "epoch": 0.45802079936224954, "grad_norm": 4.722071170806885, "learning_rate": 4.5419626032758375e-05, "loss": 0.9871, "step": 3160 }, { "epoch": 0.4594702322716237, "grad_norm": 1.0577269792556763, "learning_rate": 4.540513117843166e-05, "loss": 0.7459, "step": 3170 }, { "epoch": 0.4609196651809979, "grad_norm": 1.0412189960479736, "learning_rate": 4.5390636324104944e-05, "loss": 0.8492, "step": 3180 }, { "epoch": 0.46236909809037213, "grad_norm": 2.957418441772461, "learning_rate": 4.5376141469778235e-05, "loss": 0.8368, "step": 3190 }, { "epoch": 0.46381853099974635, "grad_norm": 1.0263798236846924, "learning_rate": 4.536164661545152e-05, "loss": 0.8358, "step": 3200 }, { "epoch": 0.46526796390912056, "grad_norm": 0.8341432213783264, "learning_rate": 4.5347151761124803e-05, "loss": 0.8912, "step": 3210 }, { "epoch": 0.4667173968184948, "grad_norm": 2.897122621536255, "learning_rate": 4.533265690679809e-05, "loss": 0.8843, "step": 3220 }, { "epoch": 0.468166829727869, "grad_norm": 2.811288356781006, "learning_rate": 4.531816205247137e-05, "loss": 0.8503, "step": 3230 }, { "epoch": 0.46961626263724315, "grad_norm": 2.7167201042175293, "learning_rate": 4.530366719814466e-05, "loss": 0.8325, "step": 3240 }, { "epoch": 0.47106569554661737, "grad_norm": 2.2229905128479004, "learning_rate": 4.528917234381795e-05, "loss": 0.8333, "step": 3250 }, { "epoch": 0.4725151284559916, "grad_norm": 0.8708949685096741, "learning_rate": 4.527467748949123e-05, "loss": 0.8631, "step": 3260 }, { "epoch": 0.4739645613653658, "grad_norm": 2.3800301551818848, "learning_rate": 4.5260182635164516e-05, "loss": 0.8038, "step": 3270 }, { "epoch": 0.47541399427474, "grad_norm": 3.5404298305511475, "learning_rate": 4.52456877808378e-05, "loss": 0.8799, "step": 3280 }, { "epoch": 0.47686342718411423, "grad_norm": 3.8659508228302, "learning_rate": 4.523119292651109e-05, "loss": 0.8644, "step": 3290 }, { "epoch": 0.47831286009348845, "grad_norm": 0.8231411576271057, "learning_rate": 4.5216698072184376e-05, "loss": 0.7729, "step": 3300 }, { "epoch": 0.4797622930028626, "grad_norm": 0.8375280499458313, "learning_rate": 4.520220321785766e-05, "loss": 0.8589, "step": 3310 }, { "epoch": 0.4812117259122368, "grad_norm": 1.1756319999694824, "learning_rate": 4.518770836353095e-05, "loss": 0.7879, "step": 3320 }, { "epoch": 0.48266115882161104, "grad_norm": 1.7768633365631104, "learning_rate": 4.517321350920423e-05, "loss": 0.8617, "step": 3330 }, { "epoch": 0.48411059173098525, "grad_norm": 2.3392810821533203, "learning_rate": 4.515871865487752e-05, "loss": 0.8479, "step": 3340 }, { "epoch": 0.48556002464035947, "grad_norm": 0.9335256218910217, "learning_rate": 4.5144223800550805e-05, "loss": 0.7817, "step": 3350 }, { "epoch": 0.4870094575497337, "grad_norm": 2.646064281463623, "learning_rate": 4.512972894622409e-05, "loss": 0.8844, "step": 3360 }, { "epoch": 0.4884588904591079, "grad_norm": 1.0953303575515747, "learning_rate": 4.511523409189738e-05, "loss": 0.8304, "step": 3370 }, { "epoch": 0.48990832336848206, "grad_norm": 2.2010998725891113, "learning_rate": 4.5100739237570665e-05, "loss": 0.7681, "step": 3380 }, { "epoch": 0.4913577562778563, "grad_norm": 2.8716251850128174, "learning_rate": 4.5086244383243956e-05, "loss": 0.9036, "step": 3390 }, { "epoch": 0.4928071891872305, "grad_norm": 0.7964257597923279, "learning_rate": 4.5071749528917233e-05, "loss": 0.9171, "step": 3400 }, { "epoch": 0.4942566220966047, "grad_norm": 1.0154571533203125, "learning_rate": 4.505725467459052e-05, "loss": 0.8455, "step": 3410 }, { "epoch": 0.4957060550059789, "grad_norm": 0.8615840673446655, "learning_rate": 4.504275982026381e-05, "loss": 0.8727, "step": 3420 }, { "epoch": 0.49715548791535313, "grad_norm": 1.9749999046325684, "learning_rate": 4.502826496593709e-05, "loss": 0.8145, "step": 3430 }, { "epoch": 0.49860492082472735, "grad_norm": 0.8778107762336731, "learning_rate": 4.5013770111610384e-05, "loss": 0.8998, "step": 3440 }, { "epoch": 0.5000543537341016, "grad_norm": 2.565814733505249, "learning_rate": 4.499927525728367e-05, "loss": 0.8783, "step": 3450 }, { "epoch": 0.5015037866434757, "grad_norm": 1.1715037822723389, "learning_rate": 4.498478040295695e-05, "loss": 0.8143, "step": 3460 }, { "epoch": 0.50295321955285, "grad_norm": 2.346769094467163, "learning_rate": 4.497028554863024e-05, "loss": 0.8483, "step": 3470 }, { "epoch": 0.5044026524622242, "grad_norm": 1.9950371980667114, "learning_rate": 4.495579069430352e-05, "loss": 0.762, "step": 3480 }, { "epoch": 0.5058520853715983, "grad_norm": 2.0283892154693604, "learning_rate": 4.494129583997681e-05, "loss": 0.8049, "step": 3490 }, { "epoch": 0.5073015182809726, "grad_norm": 0.9947206377983093, "learning_rate": 4.49268009856501e-05, "loss": 0.9395, "step": 3500 }, { "epoch": 0.5087509511903467, "grad_norm": 0.7881497144699097, "learning_rate": 4.491230613132338e-05, "loss": 0.8563, "step": 3510 }, { "epoch": 0.510200384099721, "grad_norm": 2.30930495262146, "learning_rate": 4.489781127699667e-05, "loss": 0.9173, "step": 3520 }, { "epoch": 0.5116498170090952, "grad_norm": 3.175978899002075, "learning_rate": 4.488331642266995e-05, "loss": 0.8632, "step": 3530 }, { "epoch": 0.5130992499184694, "grad_norm": 0.7730461955070496, "learning_rate": 4.486882156834324e-05, "loss": 0.7952, "step": 3540 }, { "epoch": 0.5145486828278436, "grad_norm": 1.061704158782959, "learning_rate": 4.4854326714016526e-05, "loss": 0.7262, "step": 3550 }, { "epoch": 0.5159981157372178, "grad_norm": 3.158407211303711, "learning_rate": 4.483983185968981e-05, "loss": 0.7429, "step": 3560 }, { "epoch": 0.517447548646592, "grad_norm": 2.246033191680908, "learning_rate": 4.48253370053631e-05, "loss": 0.8934, "step": 3570 }, { "epoch": 0.5188969815559662, "grad_norm": 2.165231466293335, "learning_rate": 4.4810842151036386e-05, "loss": 0.8407, "step": 3580 }, { "epoch": 0.5203464144653405, "grad_norm": 3.290911912918091, "learning_rate": 4.479634729670967e-05, "loss": 0.8164, "step": 3590 }, { "epoch": 0.5217958473747146, "grad_norm": 1.2274982929229736, "learning_rate": 4.4781852442382954e-05, "loss": 0.8467, "step": 3600 }, { "epoch": 0.5232452802840889, "grad_norm": 0.940759539604187, "learning_rate": 4.476735758805624e-05, "loss": 0.7047, "step": 3610 }, { "epoch": 0.5246947131934631, "grad_norm": 2.014238119125366, "learning_rate": 4.475286273372953e-05, "loss": 0.8128, "step": 3620 }, { "epoch": 0.5261441461028372, "grad_norm": 0.9501071572303772, "learning_rate": 4.4738367879402814e-05, "loss": 0.9025, "step": 3630 }, { "epoch": 0.5275935790122115, "grad_norm": 0.9888691902160645, "learning_rate": 4.47238730250761e-05, "loss": 0.7364, "step": 3640 }, { "epoch": 0.5290430119215856, "grad_norm": 3.5915961265563965, "learning_rate": 4.470937817074939e-05, "loss": 0.8577, "step": 3650 }, { "epoch": 0.5304924448309599, "grad_norm": 0.958191990852356, "learning_rate": 4.469488331642267e-05, "loss": 0.9072, "step": 3660 }, { "epoch": 0.5319418777403341, "grad_norm": 4.15883207321167, "learning_rate": 4.468038846209596e-05, "loss": 0.9426, "step": 3670 }, { "epoch": 0.5333913106497084, "grad_norm": 1.091619849205017, "learning_rate": 4.466589360776924e-05, "loss": 0.7791, "step": 3680 }, { "epoch": 0.5348407435590825, "grad_norm": 3.2313733100891113, "learning_rate": 4.465139875344253e-05, "loss": 0.7986, "step": 3690 }, { "epoch": 0.5362901764684567, "grad_norm": 0.9143542051315308, "learning_rate": 4.463690389911582e-05, "loss": 0.7882, "step": 3700 }, { "epoch": 0.5377396093778309, "grad_norm": 1.09394109249115, "learning_rate": 4.46224090447891e-05, "loss": 0.7717, "step": 3710 }, { "epoch": 0.5391890422872051, "grad_norm": 1.5948755741119385, "learning_rate": 4.460791419046239e-05, "loss": 0.8436, "step": 3720 }, { "epoch": 0.5406384751965794, "grad_norm": 2.321056604385376, "learning_rate": 4.459341933613567e-05, "loss": 0.8353, "step": 3730 }, { "epoch": 0.5420879081059535, "grad_norm": 2.6569488048553467, "learning_rate": 4.4578924481808956e-05, "loss": 0.834, "step": 3740 }, { "epoch": 0.5435373410153278, "grad_norm": 3.0700886249542236, "learning_rate": 4.456442962748225e-05, "loss": 0.8203, "step": 3750 }, { "epoch": 0.544986773924702, "grad_norm": 2.120776414871216, "learning_rate": 4.454993477315553e-05, "loss": 0.8139, "step": 3760 }, { "epoch": 0.5464362068340761, "grad_norm": 2.4611003398895264, "learning_rate": 4.453543991882882e-05, "loss": 0.8524, "step": 3770 }, { "epoch": 0.5478856397434504, "grad_norm": 1.0382717847824097, "learning_rate": 4.452094506450211e-05, "loss": 0.8474, "step": 3780 }, { "epoch": 0.5493350726528246, "grad_norm": 0.7796250581741333, "learning_rate": 4.4506450210175384e-05, "loss": 0.8561, "step": 3790 }, { "epoch": 0.5507845055621988, "grad_norm": 2.735966920852661, "learning_rate": 4.4491955355848676e-05, "loss": 0.7416, "step": 3800 }, { "epoch": 0.552233938471573, "grad_norm": 0.8775273561477661, "learning_rate": 4.447746050152196e-05, "loss": 0.868, "step": 3810 }, { "epoch": 0.5536833713809473, "grad_norm": 0.9166693687438965, "learning_rate": 4.446296564719525e-05, "loss": 0.895, "step": 3820 }, { "epoch": 0.5551328042903214, "grad_norm": 0.9439873099327087, "learning_rate": 4.4448470792868535e-05, "loss": 0.8011, "step": 3830 }, { "epoch": 0.5565822371996956, "grad_norm": 0.7854294776916504, "learning_rate": 4.443397593854182e-05, "loss": 0.8141, "step": 3840 }, { "epoch": 0.5580316701090698, "grad_norm": 0.9581212401390076, "learning_rate": 4.4419481084215104e-05, "loss": 0.7875, "step": 3850 }, { "epoch": 0.559481103018444, "grad_norm": 1.0400795936584473, "learning_rate": 4.440498622988839e-05, "loss": 0.8467, "step": 3860 }, { "epoch": 0.5609305359278183, "grad_norm": 0.912345826625824, "learning_rate": 4.439049137556168e-05, "loss": 0.8752, "step": 3870 }, { "epoch": 0.5623799688371924, "grad_norm": 0.8866713047027588, "learning_rate": 4.4375996521234964e-05, "loss": 0.8622, "step": 3880 }, { "epoch": 0.5638294017465667, "grad_norm": 1.6618248224258423, "learning_rate": 4.436150166690825e-05, "loss": 0.8189, "step": 3890 }, { "epoch": 0.5652788346559409, "grad_norm": 0.9341676235198975, "learning_rate": 4.434700681258154e-05, "loss": 0.7565, "step": 3900 }, { "epoch": 0.566728267565315, "grad_norm": 2.9719181060791016, "learning_rate": 4.4332511958254824e-05, "loss": 0.7841, "step": 3910 }, { "epoch": 0.5681777004746893, "grad_norm": 1.0437769889831543, "learning_rate": 4.431801710392811e-05, "loss": 0.8613, "step": 3920 }, { "epoch": 0.5696271333840635, "grad_norm": 1.6077215671539307, "learning_rate": 4.430352224960139e-05, "loss": 0.8608, "step": 3930 }, { "epoch": 0.5710765662934377, "grad_norm": 2.5882461071014404, "learning_rate": 4.428902739527468e-05, "loss": 0.8888, "step": 3940 }, { "epoch": 0.5725259992028119, "grad_norm": 2.8269803524017334, "learning_rate": 4.427453254094797e-05, "loss": 0.9575, "step": 3950 }, { "epoch": 0.5739754321121862, "grad_norm": 1.8316258192062378, "learning_rate": 4.426003768662125e-05, "loss": 0.9177, "step": 3960 }, { "epoch": 0.5754248650215603, "grad_norm": 2.2243731021881104, "learning_rate": 4.424554283229454e-05, "loss": 0.9245, "step": 3970 }, { "epoch": 0.5768742979309345, "grad_norm": 2.549422264099121, "learning_rate": 4.423104797796782e-05, "loss": 0.8799, "step": 3980 }, { "epoch": 0.5783237308403087, "grad_norm": 0.6303164958953857, "learning_rate": 4.4216553123641105e-05, "loss": 0.8572, "step": 3990 }, { "epoch": 0.5797731637496829, "grad_norm": 1.8965942859649658, "learning_rate": 4.4202058269314397e-05, "loss": 0.7797, "step": 4000 }, { "epoch": 0.5797731637496829, "eval_loss": 0.8338169455528259, "eval_runtime": 672.2843, "eval_samples_per_second": 51.31, "eval_steps_per_second": 2.566, "eval_token_accuracy": 0.00039567065532711027, "step": 4000 }, { "epoch": 0.5812225966590572, "grad_norm": 2.4498796463012695, "learning_rate": 4.418756341498768e-05, "loss": 0.8155, "step": 4010 }, { "epoch": 0.5826720295684313, "grad_norm": 0.7214322686195374, "learning_rate": 4.4173068560660965e-05, "loss": 0.902, "step": 4020 }, { "epoch": 0.5841214624778056, "grad_norm": 1.9820255041122437, "learning_rate": 4.4158573706334256e-05, "loss": 0.8595, "step": 4030 }, { "epoch": 0.5855708953871798, "grad_norm": 2.444005250930786, "learning_rate": 4.414407885200754e-05, "loss": 0.7991, "step": 4040 }, { "epoch": 0.5870203282965539, "grad_norm": 1.8259989023208618, "learning_rate": 4.4129583997680825e-05, "loss": 0.8107, "step": 4050 }, { "epoch": 0.5884697612059282, "grad_norm": 1.1818252801895142, "learning_rate": 4.411508914335411e-05, "loss": 0.916, "step": 4060 }, { "epoch": 0.5899191941153024, "grad_norm": 1.647369146347046, "learning_rate": 4.4100594289027394e-05, "loss": 0.7957, "step": 4070 }, { "epoch": 0.5913686270246766, "grad_norm": 3.159400463104248, "learning_rate": 4.4086099434700685e-05, "loss": 0.9447, "step": 4080 }, { "epoch": 0.5928180599340508, "grad_norm": 1.5535900592803955, "learning_rate": 4.407160458037397e-05, "loss": 0.8162, "step": 4090 }, { "epoch": 0.5942674928434251, "grad_norm": 3.949704647064209, "learning_rate": 4.405710972604726e-05, "loss": 0.832, "step": 4100 }, { "epoch": 0.5957169257527992, "grad_norm": 0.761550784111023, "learning_rate": 4.404261487172054e-05, "loss": 0.8827, "step": 4110 }, { "epoch": 0.5971663586621734, "grad_norm": 1.7703503370285034, "learning_rate": 4.402812001739382e-05, "loss": 0.8713, "step": 4120 }, { "epoch": 0.5986157915715477, "grad_norm": 0.7429376840591431, "learning_rate": 4.4013625163067114e-05, "loss": 0.7604, "step": 4130 }, { "epoch": 0.6000652244809218, "grad_norm": 0.8260897397994995, "learning_rate": 4.39991303087404e-05, "loss": 0.8617, "step": 4140 }, { "epoch": 0.6015146573902961, "grad_norm": 2.519921064376831, "learning_rate": 4.398463545441369e-05, "loss": 0.88, "step": 4150 }, { "epoch": 0.6029640902996702, "grad_norm": 0.6744734048843384, "learning_rate": 4.3970140600086973e-05, "loss": 0.7893, "step": 4160 }, { "epoch": 0.6044135232090445, "grad_norm": 0.9345059394836426, "learning_rate": 4.395564574576026e-05, "loss": 0.8327, "step": 4170 }, { "epoch": 0.6058629561184187, "grad_norm": 3.128801107406616, "learning_rate": 4.394115089143354e-05, "loss": 0.8599, "step": 4180 }, { "epoch": 0.6073123890277928, "grad_norm": 0.9045581817626953, "learning_rate": 4.3926656037106827e-05, "loss": 0.8531, "step": 4190 }, { "epoch": 0.6087618219371671, "grad_norm": 1.3482164144515991, "learning_rate": 4.391216118278012e-05, "loss": 0.8507, "step": 4200 }, { "epoch": 0.6102112548465413, "grad_norm": 2.7452120780944824, "learning_rate": 4.38976663284534e-05, "loss": 0.7958, "step": 4210 }, { "epoch": 0.6116606877559155, "grad_norm": 0.7711092829704285, "learning_rate": 4.3883171474126686e-05, "loss": 0.9046, "step": 4220 }, { "epoch": 0.6131101206652897, "grad_norm": 1.0835328102111816, "learning_rate": 4.386867661979998e-05, "loss": 0.7562, "step": 4230 }, { "epoch": 0.614559553574664, "grad_norm": 1.0015699863433838, "learning_rate": 4.3854181765473255e-05, "loss": 0.7945, "step": 4240 }, { "epoch": 0.6160089864840381, "grad_norm": 0.9216394424438477, "learning_rate": 4.3839686911146546e-05, "loss": 0.896, "step": 4250 }, { "epoch": 0.6174584193934123, "grad_norm": 2.684931755065918, "learning_rate": 4.382519205681983e-05, "loss": 0.8881, "step": 4260 }, { "epoch": 0.6189078523027866, "grad_norm": 2.537541627883911, "learning_rate": 4.3810697202493115e-05, "loss": 0.9242, "step": 4270 }, { "epoch": 0.6203572852121607, "grad_norm": 1.0086060762405396, "learning_rate": 4.3796202348166406e-05, "loss": 0.8116, "step": 4280 }, { "epoch": 0.621806718121535, "grad_norm": 1.0704823732376099, "learning_rate": 4.378170749383969e-05, "loss": 0.8074, "step": 4290 }, { "epoch": 0.6232561510309091, "grad_norm": 0.9033405184745789, "learning_rate": 4.3767212639512975e-05, "loss": 0.7903, "step": 4300 }, { "epoch": 0.6247055839402834, "grad_norm": 0.925373375415802, "learning_rate": 4.375271778518626e-05, "loss": 0.7961, "step": 4310 }, { "epoch": 0.6261550168496576, "grad_norm": 2.409839391708374, "learning_rate": 4.3738222930859544e-05, "loss": 0.9425, "step": 4320 }, { "epoch": 0.6276044497590317, "grad_norm": 0.8671131134033203, "learning_rate": 4.3723728076532835e-05, "loss": 0.9119, "step": 4330 }, { "epoch": 0.629053882668406, "grad_norm": 1.9147675037384033, "learning_rate": 4.370923322220612e-05, "loss": 0.7476, "step": 4340 }, { "epoch": 0.6305033155777802, "grad_norm": 2.5024831295013428, "learning_rate": 4.36947383678794e-05, "loss": 0.8095, "step": 4350 }, { "epoch": 0.6319527484871544, "grad_norm": 1.1699477434158325, "learning_rate": 4.3680243513552695e-05, "loss": 0.7649, "step": 4360 }, { "epoch": 0.6334021813965286, "grad_norm": 1.053463101387024, "learning_rate": 4.366574865922597e-05, "loss": 0.7738, "step": 4370 }, { "epoch": 0.6348516143059029, "grad_norm": 0.9840993285179138, "learning_rate": 4.365125380489926e-05, "loss": 0.8444, "step": 4380 }, { "epoch": 0.636301047215277, "grad_norm": 1.8738470077514648, "learning_rate": 4.363675895057255e-05, "loss": 0.8309, "step": 4390 }, { "epoch": 0.6377504801246512, "grad_norm": 0.9873073697090149, "learning_rate": 4.362226409624583e-05, "loss": 0.9108, "step": 4400 }, { "epoch": 0.6391999130340255, "grad_norm": 1.9426926374435425, "learning_rate": 4.360776924191912e-05, "loss": 0.8223, "step": 4410 }, { "epoch": 0.6406493459433996, "grad_norm": 0.967570424079895, "learning_rate": 4.359327438759241e-05, "loss": 0.8562, "step": 4420 }, { "epoch": 0.6420987788527739, "grad_norm": 1.1149927377700806, "learning_rate": 4.357877953326569e-05, "loss": 0.962, "step": 4430 }, { "epoch": 0.643548211762148, "grad_norm": 0.8680489659309387, "learning_rate": 4.3564284678938976e-05, "loss": 0.8571, "step": 4440 }, { "epoch": 0.6449976446715223, "grad_norm": 1.7880040407180786, "learning_rate": 4.354978982461226e-05, "loss": 0.82, "step": 4450 }, { "epoch": 0.6464470775808965, "grad_norm": 1.0027462244033813, "learning_rate": 4.353529497028555e-05, "loss": 0.7613, "step": 4460 }, { "epoch": 0.6478965104902706, "grad_norm": 1.6337950229644775, "learning_rate": 4.3520800115958836e-05, "loss": 0.7093, "step": 4470 }, { "epoch": 0.6493459433996449, "grad_norm": 2.350299596786499, "learning_rate": 4.350630526163213e-05, "loss": 0.849, "step": 4480 }, { "epoch": 0.6507953763090191, "grad_norm": 1.187408685684204, "learning_rate": 4.349181040730541e-05, "loss": 0.8921, "step": 4490 }, { "epoch": 0.6522448092183933, "grad_norm": 1.5397257804870605, "learning_rate": 4.347731555297869e-05, "loss": 0.8206, "step": 4500 }, { "epoch": 0.6536942421277675, "grad_norm": 1.072008728981018, "learning_rate": 4.346282069865198e-05, "loss": 0.8675, "step": 4510 }, { "epoch": 0.6551436750371418, "grad_norm": 0.9316320419311523, "learning_rate": 4.3448325844325265e-05, "loss": 0.752, "step": 4520 }, { "epoch": 0.6565931079465159, "grad_norm": 0.7438766360282898, "learning_rate": 4.3433830989998556e-05, "loss": 0.7891, "step": 4530 }, { "epoch": 0.6580425408558901, "grad_norm": 0.84394371509552, "learning_rate": 4.341933613567184e-05, "loss": 0.8332, "step": 4540 }, { "epoch": 0.6594919737652644, "grad_norm": 0.9269441366195679, "learning_rate": 4.3404841281345124e-05, "loss": 0.9001, "step": 4550 }, { "epoch": 0.6609414066746385, "grad_norm": 0.8739952445030212, "learning_rate": 4.339034642701841e-05, "loss": 0.8035, "step": 4560 }, { "epoch": 0.6623908395840128, "grad_norm": 1.9341847896575928, "learning_rate": 4.337585157269169e-05, "loss": 0.8768, "step": 4570 }, { "epoch": 0.663840272493387, "grad_norm": 0.8902730345726013, "learning_rate": 4.3361356718364984e-05, "loss": 0.8206, "step": 4580 }, { "epoch": 0.6652897054027612, "grad_norm": 0.8367321491241455, "learning_rate": 4.334686186403827e-05, "loss": 0.8538, "step": 4590 }, { "epoch": 0.6667391383121354, "grad_norm": 2.862443685531616, "learning_rate": 4.333236700971155e-05, "loss": 0.8864, "step": 4600 }, { "epoch": 0.6681885712215095, "grad_norm": 0.8810304999351501, "learning_rate": 4.3317872155384844e-05, "loss": 0.961, "step": 4610 }, { "epoch": 0.6696380041308838, "grad_norm": 0.9625788927078247, "learning_rate": 4.330337730105813e-05, "loss": 0.8163, "step": 4620 }, { "epoch": 0.671087437040258, "grad_norm": 0.772083580493927, "learning_rate": 4.328888244673141e-05, "loss": 0.8537, "step": 4630 }, { "epoch": 0.6725368699496322, "grad_norm": 1.8077799081802368, "learning_rate": 4.32743875924047e-05, "loss": 0.9016, "step": 4640 }, { "epoch": 0.6739863028590064, "grad_norm": 0.7941009998321533, "learning_rate": 4.325989273807798e-05, "loss": 0.8011, "step": 4650 }, { "epoch": 0.6754357357683807, "grad_norm": 2.0097315311431885, "learning_rate": 4.324539788375127e-05, "loss": 0.862, "step": 4660 }, { "epoch": 0.6768851686777548, "grad_norm": 2.0535378456115723, "learning_rate": 4.323090302942456e-05, "loss": 0.7873, "step": 4670 }, { "epoch": 0.678334601587129, "grad_norm": 1.0298711061477661, "learning_rate": 4.321640817509784e-05, "loss": 0.8405, "step": 4680 }, { "epoch": 0.6797840344965033, "grad_norm": 0.7229492664337158, "learning_rate": 4.3201913320771126e-05, "loss": 0.8394, "step": 4690 }, { "epoch": 0.6812334674058774, "grad_norm": 0.8889229893684387, "learning_rate": 4.318741846644441e-05, "loss": 0.8787, "step": 4700 }, { "epoch": 0.6826829003152517, "grad_norm": 2.534590482711792, "learning_rate": 4.31729236121177e-05, "loss": 0.7816, "step": 4710 }, { "epoch": 0.6841323332246259, "grad_norm": 0.8015549182891846, "learning_rate": 4.3158428757790986e-05, "loss": 0.7606, "step": 4720 }, { "epoch": 0.6855817661340001, "grad_norm": 1.200669288635254, "learning_rate": 4.314393390346427e-05, "loss": 0.9176, "step": 4730 }, { "epoch": 0.6870311990433743, "grad_norm": 1.7741879224777222, "learning_rate": 4.312943904913756e-05, "loss": 0.8293, "step": 4740 }, { "epoch": 0.6884806319527484, "grad_norm": 2.136448621749878, "learning_rate": 4.3114944194810846e-05, "loss": 0.877, "step": 4750 }, { "epoch": 0.6899300648621227, "grad_norm": 1.725224256515503, "learning_rate": 4.310044934048413e-05, "loss": 0.7985, "step": 4760 }, { "epoch": 0.6913794977714969, "grad_norm": 0.8180555701255798, "learning_rate": 4.3085954486157414e-05, "loss": 0.8777, "step": 4770 }, { "epoch": 0.6928289306808711, "grad_norm": 0.5975582003593445, "learning_rate": 4.30714596318307e-05, "loss": 0.7879, "step": 4780 }, { "epoch": 0.6942783635902453, "grad_norm": 0.9511466026306152, "learning_rate": 4.305696477750399e-05, "loss": 0.8936, "step": 4790 }, { "epoch": 0.6957277964996196, "grad_norm": 1.0259195566177368, "learning_rate": 4.3042469923177274e-05, "loss": 0.761, "step": 4800 }, { "epoch": 0.6971772294089937, "grad_norm": 2.490692615509033, "learning_rate": 4.3027975068850565e-05, "loss": 0.8069, "step": 4810 }, { "epoch": 0.6986266623183679, "grad_norm": 1.8551379442214966, "learning_rate": 4.301348021452384e-05, "loss": 0.7879, "step": 4820 }, { "epoch": 0.7000760952277422, "grad_norm": 2.0735645294189453, "learning_rate": 4.299898536019713e-05, "loss": 0.8176, "step": 4830 }, { "epoch": 0.7015255281371163, "grad_norm": 0.8412244915962219, "learning_rate": 4.298449050587042e-05, "loss": 0.8356, "step": 4840 }, { "epoch": 0.7029749610464906, "grad_norm": 3.692345142364502, "learning_rate": 4.29699956515437e-05, "loss": 0.7893, "step": 4850 }, { "epoch": 0.7044243939558648, "grad_norm": 3.127885103225708, "learning_rate": 4.2955500797216994e-05, "loss": 0.8115, "step": 4860 }, { "epoch": 0.705873826865239, "grad_norm": 3.361967086791992, "learning_rate": 4.294100594289028e-05, "loss": 0.8344, "step": 4870 }, { "epoch": 0.7073232597746132, "grad_norm": 0.7663640975952148, "learning_rate": 4.292651108856356e-05, "loss": 0.7669, "step": 4880 }, { "epoch": 0.7087726926839873, "grad_norm": 3.2423362731933594, "learning_rate": 4.291201623423685e-05, "loss": 0.9231, "step": 4890 }, { "epoch": 0.7102221255933616, "grad_norm": 2.1033742427825928, "learning_rate": 4.289752137991013e-05, "loss": 0.8546, "step": 4900 }, { "epoch": 0.7116715585027358, "grad_norm": 1.6948819160461426, "learning_rate": 4.288302652558342e-05, "loss": 0.8038, "step": 4910 }, { "epoch": 0.71312099141211, "grad_norm": 1.5998060703277588, "learning_rate": 4.286853167125671e-05, "loss": 0.7877, "step": 4920 }, { "epoch": 0.7145704243214842, "grad_norm": 1.865884780883789, "learning_rate": 4.285403681692999e-05, "loss": 0.8306, "step": 4930 }, { "epoch": 0.7160198572308585, "grad_norm": 0.7841973304748535, "learning_rate": 4.283954196260328e-05, "loss": 0.737, "step": 4940 }, { "epoch": 0.7174692901402326, "grad_norm": 0.9612240195274353, "learning_rate": 4.282504710827656e-05, "loss": 0.8115, "step": 4950 }, { "epoch": 0.7189187230496068, "grad_norm": 0.8772614002227783, "learning_rate": 4.281055225394985e-05, "loss": 0.8226, "step": 4960 }, { "epoch": 0.7203681559589811, "grad_norm": 0.9083195328712463, "learning_rate": 4.2796057399623135e-05, "loss": 0.8494, "step": 4970 }, { "epoch": 0.7218175888683552, "grad_norm": 2.2124178409576416, "learning_rate": 4.278156254529642e-05, "loss": 0.8175, "step": 4980 }, { "epoch": 0.7232670217777295, "grad_norm": 3.139594793319702, "learning_rate": 4.276706769096971e-05, "loss": 0.8368, "step": 4990 }, { "epoch": 0.7247164546871037, "grad_norm": 0.9544805884361267, "learning_rate": 4.2752572836642995e-05, "loss": 0.7776, "step": 5000 }, { "epoch": 0.7247164546871037, "eval_loss": 0.8146118521690369, "eval_runtime": 671.7402, "eval_samples_per_second": 51.352, "eval_steps_per_second": 2.568, "eval_token_accuracy": 0.00041561502982327355, "step": 5000 }, { "epoch": 0.7261658875964779, "grad_norm": 0.878653883934021, "learning_rate": 4.273807798231628e-05, "loss": 0.8326, "step": 5010 }, { "epoch": 0.7276153205058521, "grad_norm": 0.8273966312408447, "learning_rate": 4.2723583127989564e-05, "loss": 0.7153, "step": 5020 }, { "epoch": 0.7290647534152263, "grad_norm": 1.2781431674957275, "learning_rate": 4.270908827366285e-05, "loss": 0.7926, "step": 5030 }, { "epoch": 0.7305141863246005, "grad_norm": 1.093881607055664, "learning_rate": 4.269459341933614e-05, "loss": 0.8349, "step": 5040 }, { "epoch": 0.7319636192339747, "grad_norm": 2.6112184524536133, "learning_rate": 4.2680098565009424e-05, "loss": 0.8498, "step": 5050 }, { "epoch": 0.733413052143349, "grad_norm": 2.0127315521240234, "learning_rate": 4.266560371068271e-05, "loss": 0.8129, "step": 5060 }, { "epoch": 0.7348624850527231, "grad_norm": 0.8400161266326904, "learning_rate": 4.2651108856356e-05, "loss": 0.8914, "step": 5070 }, { "epoch": 0.7363119179620974, "grad_norm": 2.1788601875305176, "learning_rate": 4.263661400202928e-05, "loss": 0.9231, "step": 5080 }, { "epoch": 0.7377613508714715, "grad_norm": 0.8961378335952759, "learning_rate": 4.262211914770257e-05, "loss": 0.833, "step": 5090 }, { "epoch": 0.7392107837808457, "grad_norm": 1.742361068725586, "learning_rate": 4.260762429337585e-05, "loss": 0.7927, "step": 5100 }, { "epoch": 0.74066021669022, "grad_norm": 3.132824420928955, "learning_rate": 4.259312943904914e-05, "loss": 0.7623, "step": 5110 }, { "epoch": 0.7421096495995941, "grad_norm": 1.33770751953125, "learning_rate": 4.257863458472243e-05, "loss": 0.8726, "step": 5120 }, { "epoch": 0.7435590825089684, "grad_norm": 2.396254301071167, "learning_rate": 4.256413973039571e-05, "loss": 0.8686, "step": 5130 }, { "epoch": 0.7450085154183426, "grad_norm": 0.8666991591453552, "learning_rate": 4.2549644876069e-05, "loss": 0.728, "step": 5140 }, { "epoch": 0.7464579483277168, "grad_norm": 1.0336198806762695, "learning_rate": 4.253515002174228e-05, "loss": 0.7384, "step": 5150 }, { "epoch": 0.747907381237091, "grad_norm": 3.091517448425293, "learning_rate": 4.2520655167415565e-05, "loss": 0.8342, "step": 5160 }, { "epoch": 0.7493568141464652, "grad_norm": 1.0534110069274902, "learning_rate": 4.2506160313088856e-05, "loss": 0.8453, "step": 5170 }, { "epoch": 0.7508062470558394, "grad_norm": 0.7332333326339722, "learning_rate": 4.249166545876214e-05, "loss": 0.9213, "step": 5180 }, { "epoch": 0.7522556799652136, "grad_norm": 0.9510803818702698, "learning_rate": 4.247717060443543e-05, "loss": 0.7008, "step": 5190 }, { "epoch": 0.7537051128745879, "grad_norm": 0.9351806044578552, "learning_rate": 4.2462675750108716e-05, "loss": 0.8127, "step": 5200 }, { "epoch": 0.755154545783962, "grad_norm": 0.739949107170105, "learning_rate": 4.2448180895781994e-05, "loss": 0.7393, "step": 5210 }, { "epoch": 0.7566039786933363, "grad_norm": 0.7901501655578613, "learning_rate": 4.2433686041455285e-05, "loss": 0.7996, "step": 5220 }, { "epoch": 0.7580534116027104, "grad_norm": 1.0017391443252563, "learning_rate": 4.241919118712857e-05, "loss": 0.7722, "step": 5230 }, { "epoch": 0.7595028445120846, "grad_norm": 2.1011905670166016, "learning_rate": 4.240469633280186e-05, "loss": 0.7743, "step": 5240 }, { "epoch": 0.7609522774214589, "grad_norm": 0.9775656461715698, "learning_rate": 4.2390201478475145e-05, "loss": 0.7604, "step": 5250 }, { "epoch": 0.762401710330833, "grad_norm": 2.5780832767486572, "learning_rate": 4.237570662414843e-05, "loss": 0.8864, "step": 5260 }, { "epoch": 0.7638511432402073, "grad_norm": 2.137580394744873, "learning_rate": 4.2361211769821713e-05, "loss": 0.7924, "step": 5270 }, { "epoch": 0.7653005761495815, "grad_norm": 1.0586779117584229, "learning_rate": 4.2346716915495e-05, "loss": 0.7467, "step": 5280 }, { "epoch": 0.7667500090589556, "grad_norm": 1.0569285154342651, "learning_rate": 4.233222206116829e-05, "loss": 0.8568, "step": 5290 }, { "epoch": 0.7681994419683299, "grad_norm": 2.8792178630828857, "learning_rate": 4.231772720684157e-05, "loss": 0.8079, "step": 5300 }, { "epoch": 0.7696488748777041, "grad_norm": 2.2021849155426025, "learning_rate": 4.230323235251486e-05, "loss": 0.8115, "step": 5310 }, { "epoch": 0.7710983077870783, "grad_norm": 0.730385422706604, "learning_rate": 4.228873749818815e-05, "loss": 0.8685, "step": 5320 }, { "epoch": 0.7725477406964525, "grad_norm": 1.3898003101348877, "learning_rate": 4.227424264386143e-05, "loss": 0.7707, "step": 5330 }, { "epoch": 0.7739971736058268, "grad_norm": 0.990164041519165, "learning_rate": 4.225974778953472e-05, "loss": 0.8342, "step": 5340 }, { "epoch": 0.7754466065152009, "grad_norm": 0.7976711392402649, "learning_rate": 4.2245252935208e-05, "loss": 0.7386, "step": 5350 }, { "epoch": 0.7768960394245751, "grad_norm": 1.8743420839309692, "learning_rate": 4.2230758080881286e-05, "loss": 0.7369, "step": 5360 }, { "epoch": 0.7783454723339494, "grad_norm": 0.827246367931366, "learning_rate": 4.221626322655458e-05, "loss": 0.7587, "step": 5370 }, { "epoch": 0.7797949052433235, "grad_norm": 2.4631876945495605, "learning_rate": 4.220176837222786e-05, "loss": 0.7761, "step": 5380 }, { "epoch": 0.7812443381526978, "grad_norm": 0.9340019226074219, "learning_rate": 4.2187273517901146e-05, "loss": 0.8844, "step": 5390 }, { "epoch": 0.7826937710620719, "grad_norm": 0.8450477123260498, "learning_rate": 4.217277866357443e-05, "loss": 0.7458, "step": 5400 }, { "epoch": 0.7841432039714462, "grad_norm": 0.7873104214668274, "learning_rate": 4.2158283809247715e-05, "loss": 0.7478, "step": 5410 }, { "epoch": 0.7855926368808204, "grad_norm": 0.9672576785087585, "learning_rate": 4.2143788954921006e-05, "loss": 0.8117, "step": 5420 }, { "epoch": 0.7870420697901945, "grad_norm": 0.8910959959030151, "learning_rate": 4.212929410059429e-05, "loss": 0.8051, "step": 5430 }, { "epoch": 0.7884915026995688, "grad_norm": 2.4677529335021973, "learning_rate": 4.2114799246267575e-05, "loss": 0.8244, "step": 5440 }, { "epoch": 0.789940935608943, "grad_norm": 0.9692900776863098, "learning_rate": 4.2100304391940866e-05, "loss": 0.7718, "step": 5450 }, { "epoch": 0.7913903685183172, "grad_norm": 1.1083464622497559, "learning_rate": 4.208580953761415e-05, "loss": 0.8499, "step": 5460 }, { "epoch": 0.7928398014276914, "grad_norm": 0.7506137490272522, "learning_rate": 4.2071314683287435e-05, "loss": 0.7642, "step": 5470 }, { "epoch": 0.7942892343370657, "grad_norm": 0.9144595265388489, "learning_rate": 4.205681982896072e-05, "loss": 0.8737, "step": 5480 }, { "epoch": 0.7957386672464398, "grad_norm": 2.1372525691986084, "learning_rate": 4.2042324974634e-05, "loss": 0.8602, "step": 5490 }, { "epoch": 0.797188100155814, "grad_norm": 0.9848815202713013, "learning_rate": 4.2027830120307294e-05, "loss": 0.7839, "step": 5500 }, { "epoch": 0.7986375330651883, "grad_norm": 0.9968371391296387, "learning_rate": 4.201333526598058e-05, "loss": 0.8499, "step": 5510 }, { "epoch": 0.8000869659745624, "grad_norm": 0.8149365186691284, "learning_rate": 4.199884041165387e-05, "loss": 0.7291, "step": 5520 }, { "epoch": 0.8015363988839367, "grad_norm": 3.4558465480804443, "learning_rate": 4.198434555732715e-05, "loss": 0.8413, "step": 5530 }, { "epoch": 0.8029858317933108, "grad_norm": 1.0479544401168823, "learning_rate": 4.196985070300043e-05, "loss": 0.8553, "step": 5540 }, { "epoch": 0.8044352647026851, "grad_norm": 2.724478006362915, "learning_rate": 4.195535584867372e-05, "loss": 0.8351, "step": 5550 }, { "epoch": 0.8058846976120593, "grad_norm": 1.0499247312545776, "learning_rate": 4.194086099434701e-05, "loss": 0.83, "step": 5560 }, { "epoch": 0.8073341305214334, "grad_norm": 1.0782171487808228, "learning_rate": 4.19263661400203e-05, "loss": 0.7593, "step": 5570 }, { "epoch": 0.8087835634308077, "grad_norm": 0.8276395201683044, "learning_rate": 4.191187128569358e-05, "loss": 0.793, "step": 5580 }, { "epoch": 0.8102329963401819, "grad_norm": 0.7851141691207886, "learning_rate": 4.189737643136687e-05, "loss": 0.8528, "step": 5590 }, { "epoch": 0.8116824292495561, "grad_norm": 0.865790605545044, "learning_rate": 4.188288157704015e-05, "loss": 0.855, "step": 5600 }, { "epoch": 0.8131318621589303, "grad_norm": 0.9776528477668762, "learning_rate": 4.1868386722713436e-05, "loss": 0.7644, "step": 5610 }, { "epoch": 0.8145812950683046, "grad_norm": 0.8355238437652588, "learning_rate": 4.185389186838673e-05, "loss": 0.8027, "step": 5620 }, { "epoch": 0.8160307279776787, "grad_norm": 2.3449933528900146, "learning_rate": 4.183939701406001e-05, "loss": 0.8623, "step": 5630 }, { "epoch": 0.8174801608870529, "grad_norm": 0.879385769367218, "learning_rate": 4.1824902159733296e-05, "loss": 0.8522, "step": 5640 }, { "epoch": 0.8189295937964272, "grad_norm": 0.9040646553039551, "learning_rate": 4.181040730540659e-05, "loss": 0.7904, "step": 5650 }, { "epoch": 0.8203790267058013, "grad_norm": 1.8557476997375488, "learning_rate": 4.1795912451079864e-05, "loss": 0.9005, "step": 5660 }, { "epoch": 0.8218284596151756, "grad_norm": 2.7867159843444824, "learning_rate": 4.1781417596753156e-05, "loss": 0.8912, "step": 5670 }, { "epoch": 0.8232778925245497, "grad_norm": 1.0696690082550049, "learning_rate": 4.176692274242644e-05, "loss": 0.8365, "step": 5680 }, { "epoch": 0.824727325433924, "grad_norm": 2.0290820598602295, "learning_rate": 4.1752427888099724e-05, "loss": 0.7567, "step": 5690 }, { "epoch": 0.8261767583432982, "grad_norm": 1.7348473072052002, "learning_rate": 4.1737933033773015e-05, "loss": 0.8233, "step": 5700 }, { "epoch": 0.8276261912526723, "grad_norm": 1.0425660610198975, "learning_rate": 4.17234381794463e-05, "loss": 0.9019, "step": 5710 }, { "epoch": 0.8290756241620466, "grad_norm": 1.9040955305099487, "learning_rate": 4.1708943325119584e-05, "loss": 0.8035, "step": 5720 }, { "epoch": 0.8305250570714208, "grad_norm": 0.7795917391777039, "learning_rate": 4.169444847079287e-05, "loss": 0.8519, "step": 5730 }, { "epoch": 0.831974489980795, "grad_norm": 1.099768042564392, "learning_rate": 4.167995361646615e-05, "loss": 0.7934, "step": 5740 }, { "epoch": 0.8334239228901692, "grad_norm": 1.0211305618286133, "learning_rate": 4.1665458762139444e-05, "loss": 0.8082, "step": 5750 }, { "epoch": 0.8348733557995435, "grad_norm": 0.8864409327507019, "learning_rate": 4.165096390781273e-05, "loss": 0.7796, "step": 5760 }, { "epoch": 0.8363227887089176, "grad_norm": 0.8594615459442139, "learning_rate": 4.163646905348601e-05, "loss": 0.7961, "step": 5770 }, { "epoch": 0.8377722216182918, "grad_norm": 0.9918568134307861, "learning_rate": 4.1621974199159304e-05, "loss": 0.7387, "step": 5780 }, { "epoch": 0.8392216545276661, "grad_norm": 0.6663451790809631, "learning_rate": 4.160747934483258e-05, "loss": 0.8327, "step": 5790 }, { "epoch": 0.8406710874370402, "grad_norm": 0.8948430418968201, "learning_rate": 4.159298449050587e-05, "loss": 0.8362, "step": 5800 }, { "epoch": 0.8421205203464145, "grad_norm": 1.9746590852737427, "learning_rate": 4.157848963617916e-05, "loss": 0.8044, "step": 5810 }, { "epoch": 0.8435699532557887, "grad_norm": 0.7772048115730286, "learning_rate": 4.156399478185244e-05, "loss": 0.7423, "step": 5820 }, { "epoch": 0.8450193861651629, "grad_norm": 1.7775204181671143, "learning_rate": 4.154949992752573e-05, "loss": 0.8133, "step": 5830 }, { "epoch": 0.8464688190745371, "grad_norm": 1.7171192169189453, "learning_rate": 4.153500507319902e-05, "loss": 0.7657, "step": 5840 }, { "epoch": 0.8479182519839112, "grad_norm": 0.6374487280845642, "learning_rate": 4.152051021887231e-05, "loss": 0.786, "step": 5850 }, { "epoch": 0.8493676848932855, "grad_norm": 0.764743447303772, "learning_rate": 4.1506015364545586e-05, "loss": 0.8204, "step": 5860 }, { "epoch": 0.8508171178026597, "grad_norm": 3.3664588928222656, "learning_rate": 4.149152051021887e-05, "loss": 0.8363, "step": 5870 }, { "epoch": 0.8522665507120339, "grad_norm": 1.1142141819000244, "learning_rate": 4.147702565589216e-05, "loss": 0.8146, "step": 5880 }, { "epoch": 0.8537159836214081, "grad_norm": 0.7387230396270752, "learning_rate": 4.1462530801565445e-05, "loss": 0.7297, "step": 5890 }, { "epoch": 0.8551654165307824, "grad_norm": 0.9522792100906372, "learning_rate": 4.1448035947238737e-05, "loss": 0.794, "step": 5900 }, { "epoch": 0.8566148494401565, "grad_norm": 1.995259165763855, "learning_rate": 4.143354109291202e-05, "loss": 0.8431, "step": 5910 }, { "epoch": 0.8580642823495307, "grad_norm": 1.9007961750030518, "learning_rate": 4.14190462385853e-05, "loss": 0.8472, "step": 5920 }, { "epoch": 0.859513715258905, "grad_norm": 2.864264488220215, "learning_rate": 4.140455138425859e-05, "loss": 0.7948, "step": 5930 }, { "epoch": 0.8609631481682791, "grad_norm": 0.8150015473365784, "learning_rate": 4.1390056529931874e-05, "loss": 0.8264, "step": 5940 }, { "epoch": 0.8624125810776534, "grad_norm": 1.1385763883590698, "learning_rate": 4.1375561675605165e-05, "loss": 0.8622, "step": 5950 }, { "epoch": 0.8638620139870276, "grad_norm": 0.8771058320999146, "learning_rate": 4.136106682127845e-05, "loss": 0.7388, "step": 5960 }, { "epoch": 0.8653114468964018, "grad_norm": 0.7156450748443604, "learning_rate": 4.1346571966951734e-05, "loss": 0.7222, "step": 5970 }, { "epoch": 0.866760879805776, "grad_norm": 2.9387946128845215, "learning_rate": 4.1332077112625025e-05, "loss": 0.8499, "step": 5980 }, { "epoch": 0.8682103127151501, "grad_norm": 0.827257513999939, "learning_rate": 4.13175822582983e-05, "loss": 0.7695, "step": 5990 }, { "epoch": 0.8696597456245244, "grad_norm": 1.7753419876098633, "learning_rate": 4.1303087403971594e-05, "loss": 0.8177, "step": 6000 }, { "epoch": 0.8696597456245244, "eval_loss": 0.8002915382385254, "eval_runtime": 670.9604, "eval_samples_per_second": 51.411, "eval_steps_per_second": 2.571, "eval_token_accuracy": 0.00040709041814346184, "step": 6000 }, { "epoch": 0.8711091785338986, "grad_norm": 0.7696251273155212, "learning_rate": 4.128859254964488e-05, "loss": 0.741, "step": 6010 }, { "epoch": 0.8725586114432728, "grad_norm": Infinity, "learning_rate": 4.127554718075084e-05, "loss": 0.8342, "step": 6020 }, { "epoch": 0.874008044352647, "grad_norm": 0.7678489089012146, "learning_rate": 4.126105232642412e-05, "loss": 0.7815, "step": 6030 }, { "epoch": 0.8754574772620213, "grad_norm": 0.6283605694770813, "learning_rate": 4.1246557472097406e-05, "loss": 0.8367, "step": 6040 }, { "epoch": 0.8769069101713954, "grad_norm": 0.9361746907234192, "learning_rate": 4.12320626177707e-05, "loss": 0.8052, "step": 6050 }, { "epoch": 0.8783563430807696, "grad_norm": 2.359816074371338, "learning_rate": 4.121756776344398e-05, "loss": 0.8143, "step": 6060 }, { "epoch": 0.8798057759901439, "grad_norm": 2.2239186763763428, "learning_rate": 4.1203072909117266e-05, "loss": 0.9177, "step": 6070 }, { "epoch": 0.881255208899518, "grad_norm": 2.528871536254883, "learning_rate": 4.118857805479055e-05, "loss": 0.7642, "step": 6080 }, { "epoch": 0.8827046418088923, "grad_norm": 2.7372164726257324, "learning_rate": 4.1174083200463835e-05, "loss": 0.8501, "step": 6090 }, { "epoch": 0.8841540747182665, "grad_norm": 0.9686703681945801, "learning_rate": 4.1159588346137126e-05, "loss": 0.9077, "step": 6100 }, { "epoch": 0.8856035076276407, "grad_norm": 0.7706042528152466, "learning_rate": 4.114509349181041e-05, "loss": 0.7017, "step": 6110 }, { "epoch": 0.8870529405370149, "grad_norm": 0.7924395203590393, "learning_rate": 4.1130598637483695e-05, "loss": 0.7459, "step": 6120 }, { "epoch": 0.888502373446389, "grad_norm": 1.8335968255996704, "learning_rate": 4.1116103783156986e-05, "loss": 0.8256, "step": 6130 }, { "epoch": 0.8899518063557633, "grad_norm": 1.7597600221633911, "learning_rate": 4.110160892883026e-05, "loss": 0.782, "step": 6140 }, { "epoch": 0.8914012392651375, "grad_norm": 1.0810132026672363, "learning_rate": 4.1087114074503554e-05, "loss": 0.8474, "step": 6150 }, { "epoch": 0.8928506721745118, "grad_norm": 1.7811000347137451, "learning_rate": 4.107261922017684e-05, "loss": 0.8355, "step": 6160 }, { "epoch": 0.8943001050838859, "grad_norm": 2.644036054611206, "learning_rate": 4.105812436585012e-05, "loss": 0.7535, "step": 6170 }, { "epoch": 0.8957495379932602, "grad_norm": 1.6075462102890015, "learning_rate": 4.1043629511523414e-05, "loss": 0.7093, "step": 6180 }, { "epoch": 0.8971989709026343, "grad_norm": 1.9953532218933105, "learning_rate": 4.10291346571967e-05, "loss": 0.8419, "step": 6190 }, { "epoch": 0.8986484038120085, "grad_norm": 2.267934799194336, "learning_rate": 4.101463980286998e-05, "loss": 0.8523, "step": 6200 }, { "epoch": 0.9000978367213828, "grad_norm": 1.9035049676895142, "learning_rate": 4.100014494854327e-05, "loss": 0.7483, "step": 6210 }, { "epoch": 0.9015472696307569, "grad_norm": 0.8251744508743286, "learning_rate": 4.098565009421655e-05, "loss": 0.8831, "step": 6220 }, { "epoch": 0.9029967025401312, "grad_norm": 2.1203315258026123, "learning_rate": 4.097115523988984e-05, "loss": 0.7755, "step": 6230 }, { "epoch": 0.9044461354495054, "grad_norm": 0.8929731249809265, "learning_rate": 4.095666038556313e-05, "loss": 0.8435, "step": 6240 }, { "epoch": 0.9058955683588796, "grad_norm": 2.2965400218963623, "learning_rate": 4.094216553123642e-05, "loss": 0.8037, "step": 6250 }, { "epoch": 0.9073450012682538, "grad_norm": 0.6761788725852966, "learning_rate": 4.0927670676909696e-05, "loss": 0.8395, "step": 6260 }, { "epoch": 0.908794434177628, "grad_norm": 0.7339790463447571, "learning_rate": 4.091317582258298e-05, "loss": 0.8268, "step": 6270 }, { "epoch": 0.9102438670870022, "grad_norm": 0.9395334720611572, "learning_rate": 4.089868096825627e-05, "loss": 0.7619, "step": 6280 }, { "epoch": 0.9116932999963764, "grad_norm": 1.431176781654358, "learning_rate": 4.0884186113929556e-05, "loss": 0.7839, "step": 6290 }, { "epoch": 0.9131427329057507, "grad_norm": 0.9616233706474304, "learning_rate": 4.086969125960285e-05, "loss": 0.8028, "step": 6300 }, { "epoch": 0.9145921658151248, "grad_norm": 1.2412314414978027, "learning_rate": 4.085519640527613e-05, "loss": 0.9369, "step": 6310 }, { "epoch": 0.9160415987244991, "grad_norm": 0.7780980467796326, "learning_rate": 4.0840701550949416e-05, "loss": 0.8283, "step": 6320 }, { "epoch": 0.9174910316338732, "grad_norm": 2.292849540710449, "learning_rate": 4.08262066966227e-05, "loss": 0.8124, "step": 6330 }, { "epoch": 0.9189404645432474, "grad_norm": 2.3807780742645264, "learning_rate": 4.0811711842295984e-05, "loss": 0.7707, "step": 6340 }, { "epoch": 0.9203898974526217, "grad_norm": 0.8536470532417297, "learning_rate": 4.0797216987969275e-05, "loss": 0.868, "step": 6350 }, { "epoch": 0.9218393303619958, "grad_norm": 3.343093156814575, "learning_rate": 4.078272213364256e-05, "loss": 0.8104, "step": 6360 }, { "epoch": 0.9232887632713701, "grad_norm": 0.824166476726532, "learning_rate": 4.0768227279315844e-05, "loss": 0.7457, "step": 6370 }, { "epoch": 0.9247381961807443, "grad_norm": 0.8960769772529602, "learning_rate": 4.0753732424989135e-05, "loss": 0.7728, "step": 6380 }, { "epoch": 0.9261876290901185, "grad_norm": 0.7696554064750671, "learning_rate": 4.073923757066241e-05, "loss": 0.8353, "step": 6390 }, { "epoch": 0.9276370619994927, "grad_norm": 3.4166648387908936, "learning_rate": 4.0724742716335704e-05, "loss": 0.8143, "step": 6400 }, { "epoch": 0.9290864949088669, "grad_norm": 2.698413133621216, "learning_rate": 4.071024786200899e-05, "loss": 0.7716, "step": 6410 }, { "epoch": 0.9305359278182411, "grad_norm": 1.7810639142990112, "learning_rate": 4.069575300768227e-05, "loss": 0.8451, "step": 6420 }, { "epoch": 0.9319853607276153, "grad_norm": 2.0858755111694336, "learning_rate": 4.0681258153355564e-05, "loss": 0.8048, "step": 6430 }, { "epoch": 0.9334347936369896, "grad_norm": 2.5020108222961426, "learning_rate": 4.066676329902885e-05, "loss": 0.8316, "step": 6440 }, { "epoch": 0.9348842265463637, "grad_norm": 0.8029810190200806, "learning_rate": 4.065226844470213e-05, "loss": 0.8335, "step": 6450 }, { "epoch": 0.936333659455738, "grad_norm": 0.7290728688240051, "learning_rate": 4.063777359037542e-05, "loss": 0.8842, "step": 6460 }, { "epoch": 0.9377830923651121, "grad_norm": 0.8825303316116333, "learning_rate": 4.06232787360487e-05, "loss": 0.7534, "step": 6470 }, { "epoch": 0.9392325252744863, "grad_norm": 2.034435510635376, "learning_rate": 4.060878388172199e-05, "loss": 0.7776, "step": 6480 }, { "epoch": 0.9406819581838606, "grad_norm": 1.9346160888671875, "learning_rate": 4.059428902739528e-05, "loss": 0.8008, "step": 6490 }, { "epoch": 0.9421313910932347, "grad_norm": 0.8853064179420471, "learning_rate": 4.057979417306856e-05, "loss": 0.8289, "step": 6500 }, { "epoch": 0.943580824002609, "grad_norm": 2.0305445194244385, "learning_rate": 4.056529931874185e-05, "loss": 0.8135, "step": 6510 }, { "epoch": 0.9450302569119832, "grad_norm": 2.7980148792266846, "learning_rate": 4.055080446441513e-05, "loss": 0.8089, "step": 6520 }, { "epoch": 0.9464796898213574, "grad_norm": 1.0745283365249634, "learning_rate": 4.053630961008842e-05, "loss": 0.9252, "step": 6530 }, { "epoch": 0.9479291227307316, "grad_norm": 0.9550811648368835, "learning_rate": 4.0521814755761705e-05, "loss": 0.7947, "step": 6540 }, { "epoch": 0.9493785556401058, "grad_norm": 2.408703088760376, "learning_rate": 4.050731990143499e-05, "loss": 0.7917, "step": 6550 }, { "epoch": 0.95082798854948, "grad_norm": 1.9390877485275269, "learning_rate": 4.049282504710828e-05, "loss": 0.8671, "step": 6560 }, { "epoch": 0.9522774214588542, "grad_norm": 1.9944474697113037, "learning_rate": 4.0478330192781565e-05, "loss": 0.7685, "step": 6570 }, { "epoch": 0.9537268543682285, "grad_norm": 2.1434199810028076, "learning_rate": 4.0463835338454856e-05, "loss": 0.7979, "step": 6580 }, { "epoch": 0.9551762872776026, "grad_norm": 2.005647897720337, "learning_rate": 4.0449340484128134e-05, "loss": 0.7932, "step": 6590 }, { "epoch": 0.9566257201869769, "grad_norm": 1.3373275995254517, "learning_rate": 4.043484562980142e-05, "loss": 0.7332, "step": 6600 }, { "epoch": 0.958075153096351, "grad_norm": 3.3556673526763916, "learning_rate": 4.042035077547471e-05, "loss": 0.7623, "step": 6610 }, { "epoch": 0.9595245860057252, "grad_norm": 0.8822040557861328, "learning_rate": 4.0405855921147994e-05, "loss": 0.8023, "step": 6620 }, { "epoch": 0.9609740189150995, "grad_norm": 1.7681927680969238, "learning_rate": 4.0391361066821285e-05, "loss": 0.8174, "step": 6630 }, { "epoch": 0.9624234518244736, "grad_norm": 0.8971022963523865, "learning_rate": 4.037686621249457e-05, "loss": 0.7967, "step": 6640 }, { "epoch": 0.9638728847338479, "grad_norm": 2.181123733520508, "learning_rate": 4.036237135816785e-05, "loss": 0.7561, "step": 6650 }, { "epoch": 0.9653223176432221, "grad_norm": 2.5860283374786377, "learning_rate": 4.034787650384114e-05, "loss": 0.741, "step": 6660 }, { "epoch": 0.9667717505525963, "grad_norm": 2.3298215866088867, "learning_rate": 4.033338164951442e-05, "loss": 0.8468, "step": 6670 }, { "epoch": 0.9682211834619705, "grad_norm": 2.238698959350586, "learning_rate": 4.0318886795187713e-05, "loss": 0.7435, "step": 6680 }, { "epoch": 0.9696706163713447, "grad_norm": 2.973845958709717, "learning_rate": 4.0304391940861e-05, "loss": 0.8298, "step": 6690 }, { "epoch": 0.9711200492807189, "grad_norm": 0.7994486689567566, "learning_rate": 4.028989708653428e-05, "loss": 0.7821, "step": 6700 }, { "epoch": 0.9725694821900931, "grad_norm": 2.1712687015533447, "learning_rate": 4.027540223220757e-05, "loss": 0.7828, "step": 6710 }, { "epoch": 0.9740189150994674, "grad_norm": 1.6569911241531372, "learning_rate": 4.026090737788085e-05, "loss": 0.7252, "step": 6720 }, { "epoch": 0.9754683480088415, "grad_norm": 2.9206511974334717, "learning_rate": 4.024641252355414e-05, "loss": 0.904, "step": 6730 }, { "epoch": 0.9769177809182158, "grad_norm": 0.7334924340248108, "learning_rate": 4.0231917669227426e-05, "loss": 0.8184, "step": 6740 }, { "epoch": 0.97836721382759, "grad_norm": 0.9845920205116272, "learning_rate": 4.021742281490071e-05, "loss": 0.7562, "step": 6750 }, { "epoch": 0.9798166467369641, "grad_norm": 0.7724540829658508, "learning_rate": 4.0202927960574e-05, "loss": 0.8182, "step": 6760 }, { "epoch": 0.9812660796463384, "grad_norm": 2.2663064002990723, "learning_rate": 4.0188433106247286e-05, "loss": 0.7824, "step": 6770 }, { "epoch": 0.9827155125557125, "grad_norm": 1.2169333696365356, "learning_rate": 4.017393825192057e-05, "loss": 0.759, "step": 6780 }, { "epoch": 0.9841649454650868, "grad_norm": 0.6806200742721558, "learning_rate": 4.0159443397593855e-05, "loss": 0.8752, "step": 6790 }, { "epoch": 0.985614378374461, "grad_norm": 1.1035833358764648, "learning_rate": 4.014494854326714e-05, "loss": 0.8627, "step": 6800 }, { "epoch": 0.9870638112838352, "grad_norm": 0.8531244397163391, "learning_rate": 4.013045368894043e-05, "loss": 0.7522, "step": 6810 }, { "epoch": 0.9885132441932094, "grad_norm": 2.015822410583496, "learning_rate": 4.0115958834613715e-05, "loss": 0.8365, "step": 6820 }, { "epoch": 0.9899626771025836, "grad_norm": 2.455010414123535, "learning_rate": 4.0101463980287e-05, "loss": 0.8219, "step": 6830 }, { "epoch": 0.9914121100119578, "grad_norm": 3.0342283248901367, "learning_rate": 4.008696912596029e-05, "loss": 0.8452, "step": 6840 }, { "epoch": 0.992861542921332, "grad_norm": 2.531339406967163, "learning_rate": 4.007247427163357e-05, "loss": 0.8152, "step": 6850 }, { "epoch": 0.9943109758307063, "grad_norm": 0.8472199440002441, "learning_rate": 4.005797941730686e-05, "loss": 0.7319, "step": 6860 }, { "epoch": 0.9957604087400804, "grad_norm": 2.572490930557251, "learning_rate": 4.0043484562980143e-05, "loss": 0.8502, "step": 6870 }, { "epoch": 0.9972098416494547, "grad_norm": 3.1185014247894287, "learning_rate": 4.002898970865343e-05, "loss": 0.776, "step": 6880 }, { "epoch": 0.9986592745588289, "grad_norm": 2.1352272033691406, "learning_rate": 4.001449485432672e-05, "loss": 0.7082, "step": 6890 }, { "epoch": 1.0001087074682031, "grad_norm": Infinity, "learning_rate": 4.000144948543267e-05, "loss": 0.8426, "step": 6900 }, { "epoch": 1.0015581403775773, "grad_norm": 1.6490182876586914, "learning_rate": 3.998695463110596e-05, "loss": 0.6672, "step": 6910 }, { "epoch": 1.0030075732869514, "grad_norm": 1.2665408849716187, "learning_rate": 3.997245977677925e-05, "loss": 0.6411, "step": 6920 }, { "epoch": 1.0044570061963256, "grad_norm": 1.9385483264923096, "learning_rate": 3.995796492245253e-05, "loss": 0.6512, "step": 6930 }, { "epoch": 1.0059064391057, "grad_norm": 1.9456660747528076, "learning_rate": 3.9943470068125816e-05, "loss": 0.7186, "step": 6940 }, { "epoch": 1.0073558720150741, "grad_norm": 0.8936344981193542, "learning_rate": 3.99289752137991e-05, "loss": 0.7411, "step": 6950 }, { "epoch": 1.0088053049244483, "grad_norm": 0.8895001411437988, "learning_rate": 3.991448035947239e-05, "loss": 0.732, "step": 6960 }, { "epoch": 1.0102547378338225, "grad_norm": 2.4484055042266846, "learning_rate": 3.9899985505145676e-05, "loss": 0.6834, "step": 6970 }, { "epoch": 1.0117041707431966, "grad_norm": 1.6009628772735596, "learning_rate": 3.988549065081896e-05, "loss": 0.7056, "step": 6980 }, { "epoch": 1.013153603652571, "grad_norm": 0.817646324634552, "learning_rate": 3.987099579649225e-05, "loss": 0.7023, "step": 6990 }, { "epoch": 1.0146030365619452, "grad_norm": 2.423677444458008, "learning_rate": 3.985650094216553e-05, "loss": 0.6912, "step": 7000 }, { "epoch": 1.0146030365619452, "eval_loss": 0.792147159576416, "eval_runtime": 670.8402, "eval_samples_per_second": 51.421, "eval_steps_per_second": 2.571, "eval_token_accuracy": 0.0003976007560848035, "step": 7000 }, { "epoch": 1.0160524694713193, "grad_norm": 0.8038765788078308, "learning_rate": 3.984200608783882e-05, "loss": 0.7457, "step": 7010 }, { "epoch": 1.0175019023806935, "grad_norm": 2.2214059829711914, "learning_rate": 3.9827511233512104e-05, "loss": 0.6803, "step": 7020 }, { "epoch": 1.0189513352900677, "grad_norm": 0.8514269590377808, "learning_rate": 3.981301637918539e-05, "loss": 0.7552, "step": 7030 }, { "epoch": 1.020400768199442, "grad_norm": 0.9722759127616882, "learning_rate": 3.979852152485868e-05, "loss": 0.6756, "step": 7040 }, { "epoch": 1.0218502011088162, "grad_norm": 2.627790689468384, "learning_rate": 3.9784026670531964e-05, "loss": 0.7058, "step": 7050 }, { "epoch": 1.0232996340181904, "grad_norm": 0.8177839517593384, "learning_rate": 3.976953181620525e-05, "loss": 0.6768, "step": 7060 }, { "epoch": 1.0247490669275645, "grad_norm": 0.7263636589050293, "learning_rate": 3.975503696187853e-05, "loss": 0.6945, "step": 7070 }, { "epoch": 1.026198499836939, "grad_norm": 1.9415383338928223, "learning_rate": 3.974054210755182e-05, "loss": 0.6608, "step": 7080 }, { "epoch": 1.027647932746313, "grad_norm": 0.9528020024299622, "learning_rate": 3.972604725322511e-05, "loss": 0.6775, "step": 7090 }, { "epoch": 1.0290973656556872, "grad_norm": 1.0678958892822266, "learning_rate": 3.971155239889839e-05, "loss": 0.7564, "step": 7100 }, { "epoch": 1.0305467985650614, "grad_norm": 1.6998884677886963, "learning_rate": 3.9697057544571684e-05, "loss": 0.8101, "step": 7110 }, { "epoch": 1.0319962314744355, "grad_norm": 0.9464067816734314, "learning_rate": 3.968256269024497e-05, "loss": 0.6839, "step": 7120 }, { "epoch": 1.03344566438381, "grad_norm": 0.9953280091285706, "learning_rate": 3.9668067835918246e-05, "loss": 0.6776, "step": 7130 }, { "epoch": 1.034895097293184, "grad_norm": 2.3919310569763184, "learning_rate": 3.965357298159154e-05, "loss": 0.7536, "step": 7140 }, { "epoch": 1.0363445302025582, "grad_norm": 2.497450113296509, "learning_rate": 3.963907812726482e-05, "loss": 0.7147, "step": 7150 }, { "epoch": 1.0377939631119324, "grad_norm": 0.7969416379928589, "learning_rate": 3.962458327293811e-05, "loss": 0.7205, "step": 7160 }, { "epoch": 1.0392433960213068, "grad_norm": 2.086747884750366, "learning_rate": 3.9610088418611397e-05, "loss": 0.6671, "step": 7170 }, { "epoch": 1.040692828930681, "grad_norm": 0.8285253643989563, "learning_rate": 3.959559356428468e-05, "loss": 0.6685, "step": 7180 }, { "epoch": 1.042142261840055, "grad_norm": 0.897041380405426, "learning_rate": 3.9581098709957965e-05, "loss": 0.6681, "step": 7190 }, { "epoch": 1.0435916947494293, "grad_norm": 0.9613838195800781, "learning_rate": 3.956660385563125e-05, "loss": 0.5794, "step": 7200 }, { "epoch": 1.0450411276588034, "grad_norm": 0.9189807176589966, "learning_rate": 3.955210900130454e-05, "loss": 0.6748, "step": 7210 }, { "epoch": 1.0464905605681778, "grad_norm": 2.2880289554595947, "learning_rate": 3.9537614146977825e-05, "loss": 0.6471, "step": 7220 }, { "epoch": 1.047939993477552, "grad_norm": 2.2064247131347656, "learning_rate": 3.952311929265111e-05, "loss": 0.7417, "step": 7230 }, { "epoch": 1.0493894263869261, "grad_norm": 1.0736846923828125, "learning_rate": 3.95086244383244e-05, "loss": 0.6161, "step": 7240 }, { "epoch": 1.0508388592963003, "grad_norm": 0.9019289612770081, "learning_rate": 3.9494129583997685e-05, "loss": 0.7279, "step": 7250 }, { "epoch": 1.0522882922056744, "grad_norm": 0.6698160767555237, "learning_rate": 3.947963472967097e-05, "loss": 0.6323, "step": 7260 }, { "epoch": 1.0537377251150488, "grad_norm": 2.861865282058716, "learning_rate": 3.9465139875344254e-05, "loss": 0.7293, "step": 7270 }, { "epoch": 1.055187158024423, "grad_norm": 0.9664188027381897, "learning_rate": 3.945064502101754e-05, "loss": 0.7297, "step": 7280 }, { "epoch": 1.0566365909337971, "grad_norm": 3.0016567707061768, "learning_rate": 3.943615016669083e-05, "loss": 0.7518, "step": 7290 }, { "epoch": 1.0580860238431713, "grad_norm": 0.9263351559638977, "learning_rate": 3.9421655312364114e-05, "loss": 0.7422, "step": 7300 }, { "epoch": 1.0595354567525455, "grad_norm": 1.9621021747589111, "learning_rate": 3.94071604580374e-05, "loss": 0.6925, "step": 7310 }, { "epoch": 1.0609848896619198, "grad_norm": 0.8665259480476379, "learning_rate": 3.939266560371068e-05, "loss": 0.7076, "step": 7320 }, { "epoch": 1.062434322571294, "grad_norm": 0.8874248266220093, "learning_rate": 3.937817074938397e-05, "loss": 0.7928, "step": 7330 }, { "epoch": 1.0638837554806682, "grad_norm": 0.9791090488433838, "learning_rate": 3.936367589505726e-05, "loss": 0.5994, "step": 7340 }, { "epoch": 1.0653331883900423, "grad_norm": 2.642836809158325, "learning_rate": 3.934918104073054e-05, "loss": 0.7622, "step": 7350 }, { "epoch": 1.0667826212994167, "grad_norm": 2.5333988666534424, "learning_rate": 3.9334686186403827e-05, "loss": 0.6799, "step": 7360 }, { "epoch": 1.0682320542087909, "grad_norm": 1.0640588998794556, "learning_rate": 3.932019133207712e-05, "loss": 0.7797, "step": 7370 }, { "epoch": 1.069681487118165, "grad_norm": 0.8988319039344788, "learning_rate": 3.9305696477750395e-05, "loss": 0.7214, "step": 7380 }, { "epoch": 1.0711309200275392, "grad_norm": 0.9383190870285034, "learning_rate": 3.9291201623423686e-05, "loss": 0.758, "step": 7390 }, { "epoch": 1.0725803529369133, "grad_norm": 0.8509389758110046, "learning_rate": 3.927670676909697e-05, "loss": 0.6936, "step": 7400 }, { "epoch": 1.0740297858462877, "grad_norm": 0.8831974864006042, "learning_rate": 3.9262211914770255e-05, "loss": 0.7335, "step": 7410 }, { "epoch": 1.0754792187556619, "grad_norm": 2.2875640392303467, "learning_rate": 3.9247717060443546e-05, "loss": 0.6867, "step": 7420 }, { "epoch": 1.076928651665036, "grad_norm": 2.576653242111206, "learning_rate": 3.923322220611683e-05, "loss": 0.7765, "step": 7430 }, { "epoch": 1.0783780845744102, "grad_norm": 3.6591567993164062, "learning_rate": 3.921872735179012e-05, "loss": 0.6902, "step": 7440 }, { "epoch": 1.0798275174837846, "grad_norm": 1.969195008277893, "learning_rate": 3.92042324974634e-05, "loss": 0.6604, "step": 7450 }, { "epoch": 1.0812769503931587, "grad_norm": 1.8259871006011963, "learning_rate": 3.9189737643136684e-05, "loss": 0.7414, "step": 7460 }, { "epoch": 1.082726383302533, "grad_norm": 1.0266244411468506, "learning_rate": 3.9175242788809975e-05, "loss": 0.6822, "step": 7470 }, { "epoch": 1.084175816211907, "grad_norm": 0.9187620878219604, "learning_rate": 3.916074793448326e-05, "loss": 0.7962, "step": 7480 }, { "epoch": 1.0856252491212812, "grad_norm": 2.446805000305176, "learning_rate": 3.914625308015655e-05, "loss": 0.6669, "step": 7490 }, { "epoch": 1.0870746820306556, "grad_norm": 1.7605246305465698, "learning_rate": 3.9131758225829835e-05, "loss": 0.6893, "step": 7500 }, { "epoch": 1.0885241149400298, "grad_norm": 1.388551950454712, "learning_rate": 3.911726337150311e-05, "loss": 0.7815, "step": 7510 }, { "epoch": 1.089973547849404, "grad_norm": 0.932859480381012, "learning_rate": 3.91027685171764e-05, "loss": 0.7375, "step": 7520 }, { "epoch": 1.091422980758778, "grad_norm": 0.7429149150848389, "learning_rate": 3.908827366284969e-05, "loss": 0.6903, "step": 7530 }, { "epoch": 1.0928724136681522, "grad_norm": 1.107605218887329, "learning_rate": 3.907377880852298e-05, "loss": 0.8292, "step": 7540 }, { "epoch": 1.0943218465775266, "grad_norm": 1.9951454401016235, "learning_rate": 3.905928395419626e-05, "loss": 0.639, "step": 7550 }, { "epoch": 1.0957712794869008, "grad_norm": 3.669023275375366, "learning_rate": 3.904478909986955e-05, "loss": 0.7677, "step": 7560 }, { "epoch": 1.097220712396275, "grad_norm": 0.930507242679596, "learning_rate": 3.903029424554284e-05, "loss": 0.6948, "step": 7570 }, { "epoch": 1.098670145305649, "grad_norm": 2.206545829772949, "learning_rate": 3.9015799391216116e-05, "loss": 0.746, "step": 7580 }, { "epoch": 1.1001195782150233, "grad_norm": 2.361729860305786, "learning_rate": 3.900130453688941e-05, "loss": 0.6641, "step": 7590 }, { "epoch": 1.1015690111243976, "grad_norm": 0.863552451133728, "learning_rate": 3.898680968256269e-05, "loss": 0.7508, "step": 7600 }, { "epoch": 1.1030184440337718, "grad_norm": 0.9006494283676147, "learning_rate": 3.8972314828235976e-05, "loss": 0.6622, "step": 7610 }, { "epoch": 1.104467876943146, "grad_norm": 1.0481303930282593, "learning_rate": 3.895781997390927e-05, "loss": 0.6852, "step": 7620 }, { "epoch": 1.1059173098525201, "grad_norm": 2.5158543586730957, "learning_rate": 3.894332511958255e-05, "loss": 0.7763, "step": 7630 }, { "epoch": 1.1073667427618945, "grad_norm": 2.5790767669677734, "learning_rate": 3.8928830265255836e-05, "loss": 0.6905, "step": 7640 }, { "epoch": 1.1088161756712687, "grad_norm": 0.8664813041687012, "learning_rate": 3.891433541092912e-05, "loss": 0.6791, "step": 7650 }, { "epoch": 1.1102656085806428, "grad_norm": 0.7020731568336487, "learning_rate": 3.8899840556602405e-05, "loss": 0.6716, "step": 7660 }, { "epoch": 1.111715041490017, "grad_norm": 0.7595298886299133, "learning_rate": 3.8885345702275696e-05, "loss": 0.7458, "step": 7670 }, { "epoch": 1.1131644743993911, "grad_norm": 1.0612444877624512, "learning_rate": 3.887085084794898e-05, "loss": 0.6897, "step": 7680 }, { "epoch": 1.1146139073087655, "grad_norm": 1.821798324584961, "learning_rate": 3.8856355993622265e-05, "loss": 0.6691, "step": 7690 }, { "epoch": 1.1160633402181397, "grad_norm": 2.0372657775878906, "learning_rate": 3.8841861139295556e-05, "loss": 0.7072, "step": 7700 }, { "epoch": 1.1175127731275138, "grad_norm": 0.9012567400932312, "learning_rate": 3.882736628496883e-05, "loss": 0.6702, "step": 7710 }, { "epoch": 1.118962206036888, "grad_norm": 0.9007511734962463, "learning_rate": 3.8812871430642124e-05, "loss": 0.6839, "step": 7720 }, { "epoch": 1.1204116389462624, "grad_norm": 1.0055979490280151, "learning_rate": 3.879837657631541e-05, "loss": 0.7102, "step": 7730 }, { "epoch": 1.1218610718556365, "grad_norm": 3.4122612476348877, "learning_rate": 3.878388172198869e-05, "loss": 0.7228, "step": 7740 }, { "epoch": 1.1233105047650107, "grad_norm": 0.8962409496307373, "learning_rate": 3.8769386867661984e-05, "loss": 0.6315, "step": 7750 }, { "epoch": 1.1247599376743849, "grad_norm": 1.3250775337219238, "learning_rate": 3.875489201333527e-05, "loss": 0.7067, "step": 7760 }, { "epoch": 1.126209370583759, "grad_norm": 0.8613935112953186, "learning_rate": 3.874039715900855e-05, "loss": 0.6161, "step": 7770 }, { "epoch": 1.1276588034931332, "grad_norm": 2.337137222290039, "learning_rate": 3.872590230468184e-05, "loss": 0.7833, "step": 7780 }, { "epoch": 1.1291082364025076, "grad_norm": 1.5880218744277954, "learning_rate": 3.871140745035512e-05, "loss": 0.7372, "step": 7790 }, { "epoch": 1.1305576693118817, "grad_norm": 0.8534262776374817, "learning_rate": 3.869691259602841e-05, "loss": 0.6467, "step": 7800 }, { "epoch": 1.132007102221256, "grad_norm": 0.7763128280639648, "learning_rate": 3.86824177417017e-05, "loss": 0.7475, "step": 7810 }, { "epoch": 1.13345653513063, "grad_norm": 1.5850907564163208, "learning_rate": 3.866792288737499e-05, "loss": 0.6969, "step": 7820 }, { "epoch": 1.1349059680400044, "grad_norm": 1.0109902620315552, "learning_rate": 3.865342803304827e-05, "loss": 0.6547, "step": 7830 }, { "epoch": 1.1363554009493786, "grad_norm": 2.639951705932617, "learning_rate": 3.863893317872155e-05, "loss": 0.6913, "step": 7840 }, { "epoch": 1.1378048338587528, "grad_norm": 2.2629740238189697, "learning_rate": 3.862443832439484e-05, "loss": 0.7458, "step": 7850 }, { "epoch": 1.139254266768127, "grad_norm": 3.484891176223755, "learning_rate": 3.8609943470068126e-05, "loss": 0.7227, "step": 7860 }, { "epoch": 1.140703699677501, "grad_norm": 3.3114774227142334, "learning_rate": 3.859544861574142e-05, "loss": 0.6682, "step": 7870 }, { "epoch": 1.1421531325868755, "grad_norm": 0.9345071911811829, "learning_rate": 3.85809537614147e-05, "loss": 0.6499, "step": 7880 }, { "epoch": 1.1436025654962496, "grad_norm": 2.3660013675689697, "learning_rate": 3.8566458907087986e-05, "loss": 0.65, "step": 7890 }, { "epoch": 1.1450519984056238, "grad_norm": 0.6031567454338074, "learning_rate": 3.855196405276127e-05, "loss": 0.6494, "step": 7900 }, { "epoch": 1.146501431314998, "grad_norm": 1.2019445896148682, "learning_rate": 3.8537469198434554e-05, "loss": 0.7211, "step": 7910 }, { "epoch": 1.1479508642243723, "grad_norm": 1.701231598854065, "learning_rate": 3.8522974344107845e-05, "loss": 0.7197, "step": 7920 }, { "epoch": 1.1494002971337465, "grad_norm": 2.265749931335449, "learning_rate": 3.850847948978113e-05, "loss": 0.7113, "step": 7930 }, { "epoch": 1.1508497300431206, "grad_norm": 0.8541210293769836, "learning_rate": 3.8493984635454414e-05, "loss": 0.7063, "step": 7940 }, { "epoch": 1.1522991629524948, "grad_norm": 1.4700528383255005, "learning_rate": 3.8479489781127705e-05, "loss": 0.6937, "step": 7950 }, { "epoch": 1.153748595861869, "grad_norm": 1.0620396137237549, "learning_rate": 3.846499492680099e-05, "loss": 0.7409, "step": 7960 }, { "epoch": 1.1551980287712433, "grad_norm": 1.703613519668579, "learning_rate": 3.8450500072474274e-05, "loss": 0.6171, "step": 7970 }, { "epoch": 1.1566474616806175, "grad_norm": 3.0533642768859863, "learning_rate": 3.843600521814756e-05, "loss": 0.78, "step": 7980 }, { "epoch": 1.1580968945899917, "grad_norm": 2.968996286392212, "learning_rate": 3.842151036382084e-05, "loss": 0.665, "step": 7990 }, { "epoch": 1.1595463274993658, "grad_norm": 1.5719397068023682, "learning_rate": 3.8407015509494134e-05, "loss": 0.6968, "step": 8000 }, { "epoch": 1.1595463274993658, "eval_loss": 0.7815335988998413, "eval_runtime": 669.2359, "eval_samples_per_second": 51.544, "eval_steps_per_second": 2.578, "eval_token_accuracy": 0.000397922439544419, "step": 8000 }, { "epoch": 1.1609957604087402, "grad_norm": 1.8867238759994507, "learning_rate": 3.839252065516742e-05, "loss": 0.7018, "step": 8010 }, { "epoch": 1.1624451933181144, "grad_norm": 0.8305041193962097, "learning_rate": 3.83780258008407e-05, "loss": 0.7591, "step": 8020 }, { "epoch": 1.1638946262274885, "grad_norm": 0.9616385102272034, "learning_rate": 3.836353094651399e-05, "loss": 0.7355, "step": 8030 }, { "epoch": 1.1653440591368627, "grad_norm": 0.797763466835022, "learning_rate": 3.834903609218727e-05, "loss": 0.6915, "step": 8040 }, { "epoch": 1.1667934920462368, "grad_norm": 0.9838529825210571, "learning_rate": 3.833454123786056e-05, "loss": 0.7352, "step": 8050 }, { "epoch": 1.168242924955611, "grad_norm": 1.1470749378204346, "learning_rate": 3.832004638353385e-05, "loss": 0.7103, "step": 8060 }, { "epoch": 1.1696923578649854, "grad_norm": 1.6150496006011963, "learning_rate": 3.830555152920713e-05, "loss": 0.713, "step": 8070 }, { "epoch": 1.1711417907743595, "grad_norm": 2.4331045150756836, "learning_rate": 3.829105667488042e-05, "loss": 0.6846, "step": 8080 }, { "epoch": 1.1725912236837337, "grad_norm": 0.7783316969871521, "learning_rate": 3.827656182055371e-05, "loss": 0.6121, "step": 8090 }, { "epoch": 1.1740406565931079, "grad_norm": 0.7985559701919556, "learning_rate": 3.826206696622699e-05, "loss": 0.7786, "step": 8100 }, { "epoch": 1.1754900895024822, "grad_norm": 2.05668568611145, "learning_rate": 3.8247572111900275e-05, "loss": 0.6657, "step": 8110 }, { "epoch": 1.1769395224118564, "grad_norm": 1.422728419303894, "learning_rate": 3.823307725757356e-05, "loss": 0.6807, "step": 8120 }, { "epoch": 1.1783889553212306, "grad_norm": 0.692974865436554, "learning_rate": 3.821858240324685e-05, "loss": 0.8165, "step": 8130 }, { "epoch": 1.1798383882306047, "grad_norm": 0.6786725521087646, "learning_rate": 3.8204087548920135e-05, "loss": 0.6539, "step": 8140 }, { "epoch": 1.1812878211399789, "grad_norm": 1.600859522819519, "learning_rate": 3.8189592694593426e-05, "loss": 0.6922, "step": 8150 }, { "epoch": 1.1827372540493533, "grad_norm": 1.9323725700378418, "learning_rate": 3.8175097840266704e-05, "loss": 0.6823, "step": 8160 }, { "epoch": 1.1841866869587274, "grad_norm": 0.9999890327453613, "learning_rate": 3.816060298593999e-05, "loss": 0.7203, "step": 8170 }, { "epoch": 1.1856361198681016, "grad_norm": 0.8995366096496582, "learning_rate": 3.814610813161328e-05, "loss": 0.6565, "step": 8180 }, { "epoch": 1.1870855527774757, "grad_norm": 2.447030544281006, "learning_rate": 3.8131613277286564e-05, "loss": 0.7161, "step": 8190 }, { "epoch": 1.1885349856868501, "grad_norm": 3.476491689682007, "learning_rate": 3.8117118422959855e-05, "loss": 0.7355, "step": 8200 }, { "epoch": 1.1899844185962243, "grad_norm": 0.8819745779037476, "learning_rate": 3.810262356863314e-05, "loss": 0.6723, "step": 8210 }, { "epoch": 1.1914338515055984, "grad_norm": 0.8547608256340027, "learning_rate": 3.8088128714306424e-05, "loss": 0.739, "step": 8220 }, { "epoch": 1.1928832844149726, "grad_norm": 2.171769142150879, "learning_rate": 3.807363385997971e-05, "loss": 0.7394, "step": 8230 }, { "epoch": 1.1943327173243468, "grad_norm": 0.7322458624839783, "learning_rate": 3.805913900565299e-05, "loss": 0.7139, "step": 8240 }, { "epoch": 1.1957821502337211, "grad_norm": 2.5103094577789307, "learning_rate": 3.8044644151326284e-05, "loss": 0.6333, "step": 8250 }, { "epoch": 1.1972315831430953, "grad_norm": 2.6011738777160645, "learning_rate": 3.803014929699957e-05, "loss": 0.6952, "step": 8260 }, { "epoch": 1.1986810160524695, "grad_norm": 0.9356118440628052, "learning_rate": 3.801565444267285e-05, "loss": 0.7426, "step": 8270 }, { "epoch": 1.2001304489618436, "grad_norm": 0.8487895131111145, "learning_rate": 3.8001159588346143e-05, "loss": 0.6723, "step": 8280 }, { "epoch": 1.201579881871218, "grad_norm": 0.6286924481391907, "learning_rate": 3.798666473401942e-05, "loss": 0.6607, "step": 8290 }, { "epoch": 1.2030293147805922, "grad_norm": 0.8928942680358887, "learning_rate": 3.797216987969271e-05, "loss": 0.6233, "step": 8300 }, { "epoch": 1.2044787476899663, "grad_norm": 2.156315326690674, "learning_rate": 3.7957675025365996e-05, "loss": 0.6939, "step": 8310 }, { "epoch": 1.2059281805993405, "grad_norm": 1.7508461475372314, "learning_rate": 3.794318017103928e-05, "loss": 0.6814, "step": 8320 }, { "epoch": 1.2073776135087146, "grad_norm": 2.0084280967712402, "learning_rate": 3.792868531671257e-05, "loss": 0.7559, "step": 8330 }, { "epoch": 1.2088270464180888, "grad_norm": 0.8208935260772705, "learning_rate": 3.7914190462385856e-05, "loss": 0.6455, "step": 8340 }, { "epoch": 1.2102764793274632, "grad_norm": 0.764270007610321, "learning_rate": 3.789969560805914e-05, "loss": 0.7392, "step": 8350 }, { "epoch": 1.2117259122368373, "grad_norm": 1.3177683353424072, "learning_rate": 3.7885200753732425e-05, "loss": 0.7061, "step": 8360 }, { "epoch": 1.2131753451462115, "grad_norm": 1.047242283821106, "learning_rate": 3.787070589940571e-05, "loss": 0.6914, "step": 8370 }, { "epoch": 1.2146247780555857, "grad_norm": 0.938040018081665, "learning_rate": 3.7856211045079e-05, "loss": 0.7243, "step": 8380 }, { "epoch": 1.21607421096496, "grad_norm": 3.838555335998535, "learning_rate": 3.7841716190752285e-05, "loss": 0.7684, "step": 8390 }, { "epoch": 1.2175236438743342, "grad_norm": 0.834057629108429, "learning_rate": 3.782722133642557e-05, "loss": 0.6682, "step": 8400 }, { "epoch": 1.2189730767837084, "grad_norm": 1.0617311000823975, "learning_rate": 3.781272648209886e-05, "loss": 0.7431, "step": 8410 }, { "epoch": 1.2204225096930825, "grad_norm": 1.9687778949737549, "learning_rate": 3.779823162777214e-05, "loss": 0.6855, "step": 8420 }, { "epoch": 1.2218719426024567, "grad_norm": 1.4709055423736572, "learning_rate": 3.778373677344543e-05, "loss": 0.7468, "step": 8430 }, { "epoch": 1.223321375511831, "grad_norm": 1.0255465507507324, "learning_rate": 3.7769241919118713e-05, "loss": 0.6915, "step": 8440 }, { "epoch": 1.2247708084212052, "grad_norm": 0.910531759262085, "learning_rate": 3.7754747064792e-05, "loss": 0.7406, "step": 8450 }, { "epoch": 1.2262202413305794, "grad_norm": 1.3547700643539429, "learning_rate": 3.774025221046529e-05, "loss": 0.7018, "step": 8460 }, { "epoch": 1.2276696742399535, "grad_norm": 1.63336181640625, "learning_rate": 3.772575735613857e-05, "loss": 0.6442, "step": 8470 }, { "epoch": 1.229119107149328, "grad_norm": 2.7954370975494385, "learning_rate": 3.771126250181186e-05, "loss": 0.6536, "step": 8480 }, { "epoch": 1.230568540058702, "grad_norm": 1.6382683515548706, "learning_rate": 3.769676764748514e-05, "loss": 0.6524, "step": 8490 }, { "epoch": 1.2320179729680762, "grad_norm": 1.616657018661499, "learning_rate": 3.7682272793158426e-05, "loss": 0.7309, "step": 8500 }, { "epoch": 1.2334674058774504, "grad_norm": 0.8286413550376892, "learning_rate": 3.766777793883172e-05, "loss": 0.7236, "step": 8510 }, { "epoch": 1.2349168387868246, "grad_norm": 2.312886953353882, "learning_rate": 3.7653283084505e-05, "loss": 0.6688, "step": 8520 }, { "epoch": 1.236366271696199, "grad_norm": 2.2434260845184326, "learning_rate": 3.763878823017829e-05, "loss": 0.6509, "step": 8530 }, { "epoch": 1.237815704605573, "grad_norm": 0.8880380988121033, "learning_rate": 3.762429337585158e-05, "loss": 0.8008, "step": 8540 }, { "epoch": 1.2392651375149473, "grad_norm": 3.9391345977783203, "learning_rate": 3.7609798521524855e-05, "loss": 0.6917, "step": 8550 }, { "epoch": 1.2407145704243214, "grad_norm": 1.7457298040390015, "learning_rate": 3.7595303667198146e-05, "loss": 0.7069, "step": 8560 }, { "epoch": 1.2421640033336958, "grad_norm": 0.8259201645851135, "learning_rate": 3.758080881287143e-05, "loss": 0.7088, "step": 8570 }, { "epoch": 1.24361343624307, "grad_norm": 0.7493927478790283, "learning_rate": 3.756631395854472e-05, "loss": 0.6286, "step": 8580 }, { "epoch": 1.2450628691524441, "grad_norm": 3.048274040222168, "learning_rate": 3.7551819104218006e-05, "loss": 0.6848, "step": 8590 }, { "epoch": 1.2465123020618183, "grad_norm": 1.0291824340820312, "learning_rate": 3.753732424989129e-05, "loss": 0.6626, "step": 8600 }, { "epoch": 1.2479617349711924, "grad_norm": 1.0793825387954712, "learning_rate": 3.7522829395564575e-05, "loss": 0.6966, "step": 8610 }, { "epoch": 1.2494111678805666, "grad_norm": 1.4464988708496094, "learning_rate": 3.750833454123786e-05, "loss": 0.6607, "step": 8620 }, { "epoch": 1.250860600789941, "grad_norm": 0.8551133871078491, "learning_rate": 3.749383968691115e-05, "loss": 0.638, "step": 8630 }, { "epoch": 1.2523100336993152, "grad_norm": 2.6740128993988037, "learning_rate": 3.7479344832584435e-05, "loss": 0.742, "step": 8640 }, { "epoch": 1.2537594666086893, "grad_norm": 1.0272529125213623, "learning_rate": 3.746484997825772e-05, "loss": 0.6917, "step": 8650 }, { "epoch": 1.2552088995180637, "grad_norm": 0.8823620080947876, "learning_rate": 3.745035512393101e-05, "loss": 0.7034, "step": 8660 }, { "epoch": 1.2566583324274379, "grad_norm": 0.8183333873748779, "learning_rate": 3.7435860269604294e-05, "loss": 0.6761, "step": 8670 }, { "epoch": 1.258107765336812, "grad_norm": 2.188427209854126, "learning_rate": 3.742136541527758e-05, "loss": 0.6813, "step": 8680 }, { "epoch": 1.2595571982461862, "grad_norm": 0.7740078568458557, "learning_rate": 3.740687056095086e-05, "loss": 0.7348, "step": 8690 }, { "epoch": 1.2610066311555603, "grad_norm": 3.2154736518859863, "learning_rate": 3.739237570662415e-05, "loss": 0.7179, "step": 8700 }, { "epoch": 1.2624560640649345, "grad_norm": 2.6355783939361572, "learning_rate": 3.737788085229744e-05, "loss": 0.6656, "step": 8710 }, { "epoch": 1.2639054969743089, "grad_norm": 0.9518022537231445, "learning_rate": 3.736338599797072e-05, "loss": 0.66, "step": 8720 }, { "epoch": 1.265354929883683, "grad_norm": 0.9941968321800232, "learning_rate": 3.734889114364401e-05, "loss": 0.701, "step": 8730 }, { "epoch": 1.2668043627930572, "grad_norm": 2.278553009033203, "learning_rate": 3.733439628931729e-05, "loss": 0.6655, "step": 8740 }, { "epoch": 1.2682537957024314, "grad_norm": 2.2199084758758545, "learning_rate": 3.7319901434990576e-05, "loss": 0.642, "step": 8750 }, { "epoch": 1.2697032286118057, "grad_norm": 0.855959415435791, "learning_rate": 3.730540658066387e-05, "loss": 0.6768, "step": 8760 }, { "epoch": 1.27115266152118, "grad_norm": 0.7713690996170044, "learning_rate": 3.729091172633715e-05, "loss": 0.7202, "step": 8770 }, { "epoch": 1.272602094430554, "grad_norm": 3.6193957328796387, "learning_rate": 3.7276416872010436e-05, "loss": 0.8522, "step": 8780 }, { "epoch": 1.2740515273399282, "grad_norm": 0.8761569261550903, "learning_rate": 3.726192201768373e-05, "loss": 0.6998, "step": 8790 }, { "epoch": 1.2755009602493024, "grad_norm": 0.8228587508201599, "learning_rate": 3.724742716335701e-05, "loss": 0.6271, "step": 8800 }, { "epoch": 1.2769503931586765, "grad_norm": 0.967322826385498, "learning_rate": 3.7232932309030296e-05, "loss": 0.677, "step": 8810 }, { "epoch": 1.278399826068051, "grad_norm": 0.7491191029548645, "learning_rate": 3.721843745470358e-05, "loss": 0.6275, "step": 8820 }, { "epoch": 1.279849258977425, "grad_norm": 0.7908267974853516, "learning_rate": 3.7203942600376864e-05, "loss": 0.758, "step": 8830 }, { "epoch": 1.2812986918867992, "grad_norm": 2.2761118412017822, "learning_rate": 3.7189447746050156e-05, "loss": 0.6876, "step": 8840 }, { "epoch": 1.2827481247961736, "grad_norm": 1.9194432497024536, "learning_rate": 3.717495289172344e-05, "loss": 0.6291, "step": 8850 }, { "epoch": 1.2841975577055478, "grad_norm": 0.9832128882408142, "learning_rate": 3.716045803739673e-05, "loss": 0.738, "step": 8860 }, { "epoch": 1.285646990614922, "grad_norm": 1.0362465381622314, "learning_rate": 3.714596318307001e-05, "loss": 0.653, "step": 8870 }, { "epoch": 1.287096423524296, "grad_norm": 0.9563276171684265, "learning_rate": 3.713146832874329e-05, "loss": 0.712, "step": 8880 }, { "epoch": 1.2885458564336703, "grad_norm": 1.062441349029541, "learning_rate": 3.7116973474416584e-05, "loss": 0.6401, "step": 8890 }, { "epoch": 1.2899952893430444, "grad_norm": 1.0190941095352173, "learning_rate": 3.710247862008987e-05, "loss": 0.6558, "step": 8900 }, { "epoch": 1.2914447222524188, "grad_norm": 2.2705626487731934, "learning_rate": 3.708798376576316e-05, "loss": 0.6885, "step": 8910 }, { "epoch": 1.292894155161793, "grad_norm": 2.16426157951355, "learning_rate": 3.7073488911436444e-05, "loss": 0.7323, "step": 8920 }, { "epoch": 1.2943435880711671, "grad_norm": 0.7907456159591675, "learning_rate": 3.705899405710973e-05, "loss": 0.6317, "step": 8930 }, { "epoch": 1.2957930209805415, "grad_norm": 0.942328691482544, "learning_rate": 3.704449920278301e-05, "loss": 0.7002, "step": 8940 }, { "epoch": 1.2972424538899157, "grad_norm": 1.5620898008346558, "learning_rate": 3.70300043484563e-05, "loss": 0.6701, "step": 8950 }, { "epoch": 1.2986918867992898, "grad_norm": 2.1964192390441895, "learning_rate": 3.701550949412959e-05, "loss": 0.6785, "step": 8960 }, { "epoch": 1.300141319708664, "grad_norm": 1.622687816619873, "learning_rate": 3.700101463980287e-05, "loss": 0.7324, "step": 8970 }, { "epoch": 1.3015907526180381, "grad_norm": 0.9879828095436096, "learning_rate": 3.698651978547616e-05, "loss": 0.6718, "step": 8980 }, { "epoch": 1.3030401855274123, "grad_norm": 0.7968335747718811, "learning_rate": 3.697202493114945e-05, "loss": 0.7395, "step": 8990 }, { "epoch": 1.3044896184367867, "grad_norm": 1.0809406042099, "learning_rate": 3.6957530076822726e-05, "loss": 0.6902, "step": 9000 }, { "epoch": 1.3044896184367867, "eval_loss": 0.7758866548538208, "eval_runtime": 672.1803, "eval_samples_per_second": 51.318, "eval_steps_per_second": 2.566, "eval_token_accuracy": 0.000404195267006922, "step": 9000 }, { "epoch": 1.3059390513461608, "grad_norm": 1.9905250072479248, "learning_rate": 3.694303522249602e-05, "loss": 0.7069, "step": 9010 }, { "epoch": 1.307388484255535, "grad_norm": 3.8968214988708496, "learning_rate": 3.69285403681693e-05, "loss": 0.679, "step": 9020 }, { "epoch": 1.3088379171649092, "grad_norm": 0.9726070761680603, "learning_rate": 3.6914045513842586e-05, "loss": 0.7227, "step": 9030 }, { "epoch": 1.3102873500742835, "grad_norm": 2.276339054107666, "learning_rate": 3.689955065951588e-05, "loss": 0.7343, "step": 9040 }, { "epoch": 1.3117367829836577, "grad_norm": 2.7617475986480713, "learning_rate": 3.688505580518916e-05, "loss": 0.7368, "step": 9050 }, { "epoch": 1.3131862158930319, "grad_norm": 2.35437273979187, "learning_rate": 3.6870560950862445e-05, "loss": 0.7382, "step": 9060 }, { "epoch": 1.314635648802406, "grad_norm": 1.8291373252868652, "learning_rate": 3.685606609653573e-05, "loss": 0.6743, "step": 9070 }, { "epoch": 1.3160850817117802, "grad_norm": 0.8860570788383484, "learning_rate": 3.6841571242209014e-05, "loss": 0.7293, "step": 9080 }, { "epoch": 1.3175345146211543, "grad_norm": 2.3467090129852295, "learning_rate": 3.6827076387882305e-05, "loss": 0.7022, "step": 9090 }, { "epoch": 1.3189839475305287, "grad_norm": 0.8134902715682983, "learning_rate": 3.681258153355559e-05, "loss": 0.6055, "step": 9100 }, { "epoch": 1.3204333804399029, "grad_norm": 1.741827130317688, "learning_rate": 3.6798086679228874e-05, "loss": 0.6536, "step": 9110 }, { "epoch": 1.321882813349277, "grad_norm": 0.9208307266235352, "learning_rate": 3.6783591824902165e-05, "loss": 0.6733, "step": 9120 }, { "epoch": 1.3233322462586514, "grad_norm": 0.9000769257545471, "learning_rate": 3.676909697057544e-05, "loss": 0.7081, "step": 9130 }, { "epoch": 1.3247816791680256, "grad_norm": 0.9797672033309937, "learning_rate": 3.6754602116248734e-05, "loss": 0.6804, "step": 9140 }, { "epoch": 1.3262311120773997, "grad_norm": 3.0780463218688965, "learning_rate": 3.674010726192202e-05, "loss": 0.641, "step": 9150 }, { "epoch": 1.327680544986774, "grad_norm": 0.9936016201972961, "learning_rate": 3.67256124075953e-05, "loss": 0.6929, "step": 9160 }, { "epoch": 1.329129977896148, "grad_norm": 2.4128029346466064, "learning_rate": 3.6711117553268594e-05, "loss": 0.6985, "step": 9170 }, { "epoch": 1.3305794108055222, "grad_norm": 0.8239456415176392, "learning_rate": 3.669662269894188e-05, "loss": 0.6691, "step": 9180 }, { "epoch": 1.3320288437148966, "grad_norm": 2.196967124938965, "learning_rate": 3.668212784461516e-05, "loss": 0.7352, "step": 9190 }, { "epoch": 1.3334782766242708, "grad_norm": 1.011257529258728, "learning_rate": 3.666763299028845e-05, "loss": 0.6596, "step": 9200 }, { "epoch": 1.334927709533645, "grad_norm": 1.0266071557998657, "learning_rate": 3.665313813596173e-05, "loss": 0.7006, "step": 9210 }, { "epoch": 1.3363771424430193, "grad_norm": 0.8049505949020386, "learning_rate": 3.663864328163502e-05, "loss": 0.654, "step": 9220 }, { "epoch": 1.3378265753523935, "grad_norm": 1.4919633865356445, "learning_rate": 3.6624148427308307e-05, "loss": 0.6717, "step": 9230 }, { "epoch": 1.3392760082617676, "grad_norm": 1.3777735233306885, "learning_rate": 3.66096535729816e-05, "loss": 0.7646, "step": 9240 }, { "epoch": 1.3407254411711418, "grad_norm": 1.0090413093566895, "learning_rate": 3.659515871865488e-05, "loss": 0.661, "step": 9250 }, { "epoch": 1.342174874080516, "grad_norm": 0.9088075757026672, "learning_rate": 3.658066386432816e-05, "loss": 0.7843, "step": 9260 }, { "epoch": 1.34362430698989, "grad_norm": 2.6569576263427734, "learning_rate": 3.656616901000145e-05, "loss": 0.6978, "step": 9270 }, { "epoch": 1.3450737398992645, "grad_norm": 2.193643808364868, "learning_rate": 3.6551674155674735e-05, "loss": 0.6263, "step": 9280 }, { "epoch": 1.3465231728086386, "grad_norm": 1.4514281749725342, "learning_rate": 3.6537179301348026e-05, "loss": 0.6815, "step": 9290 }, { "epoch": 1.3479726057180128, "grad_norm": 1.3338249921798706, "learning_rate": 3.652268444702131e-05, "loss": 0.7358, "step": 9300 }, { "epoch": 1.349422038627387, "grad_norm": 0.815692663192749, "learning_rate": 3.6508189592694595e-05, "loss": 0.6351, "step": 9310 }, { "epoch": 1.3508714715367613, "grad_norm": 1.9944225549697876, "learning_rate": 3.649369473836788e-05, "loss": 0.7492, "step": 9320 }, { "epoch": 1.3523209044461355, "grad_norm": 0.8797019720077515, "learning_rate": 3.6479199884041164e-05, "loss": 0.7493, "step": 9330 }, { "epoch": 1.3537703373555097, "grad_norm": 0.9795637726783752, "learning_rate": 3.6464705029714455e-05, "loss": 0.6286, "step": 9340 }, { "epoch": 1.3552197702648838, "grad_norm": 1.6963341236114502, "learning_rate": 3.645021017538774e-05, "loss": 0.6751, "step": 9350 }, { "epoch": 1.356669203174258, "grad_norm": 2.1599597930908203, "learning_rate": 3.6435715321061024e-05, "loss": 0.7438, "step": 9360 }, { "epoch": 1.3581186360836321, "grad_norm": 2.2890264987945557, "learning_rate": 3.6421220466734315e-05, "loss": 0.666, "step": 9370 }, { "epoch": 1.3595680689930065, "grad_norm": 0.921067476272583, "learning_rate": 3.64067256124076e-05, "loss": 0.6272, "step": 9380 }, { "epoch": 1.3610175019023807, "grad_norm": 2.4551358222961426, "learning_rate": 3.6392230758080883e-05, "loss": 0.7137, "step": 9390 }, { "epoch": 1.3624669348117548, "grad_norm": 2.1481635570526123, "learning_rate": 3.637773590375417e-05, "loss": 0.6762, "step": 9400 }, { "epoch": 1.3639163677211292, "grad_norm": 2.6500132083892822, "learning_rate": 3.636324104942745e-05, "loss": 0.7908, "step": 9410 }, { "epoch": 1.3653658006305034, "grad_norm": 1.9957408905029297, "learning_rate": 3.634874619510074e-05, "loss": 0.6225, "step": 9420 }, { "epoch": 1.3668152335398775, "grad_norm": 1.0306966304779053, "learning_rate": 3.633425134077403e-05, "loss": 0.7203, "step": 9430 }, { "epoch": 1.3682646664492517, "grad_norm": 2.174342393875122, "learning_rate": 3.631975648644731e-05, "loss": 0.7314, "step": 9440 }, { "epoch": 1.3697140993586259, "grad_norm": 0.7824757099151611, "learning_rate": 3.6305261632120596e-05, "loss": 0.7611, "step": 9450 }, { "epoch": 1.371163532268, "grad_norm": 2.024747371673584, "learning_rate": 3.629076677779388e-05, "loss": 0.7054, "step": 9460 }, { "epoch": 1.3726129651773744, "grad_norm": 0.7712695598602295, "learning_rate": 3.627627192346717e-05, "loss": 0.7164, "step": 9470 }, { "epoch": 1.3740623980867486, "grad_norm": 2.13800311088562, "learning_rate": 3.6261777069140456e-05, "loss": 0.6677, "step": 9480 }, { "epoch": 1.3755118309961227, "grad_norm": 3.425255060195923, "learning_rate": 3.624728221481374e-05, "loss": 0.7723, "step": 9490 }, { "epoch": 1.3769612639054971, "grad_norm": 0.7990706562995911, "learning_rate": 3.623278736048703e-05, "loss": 0.7634, "step": 9500 }, { "epoch": 1.3784106968148713, "grad_norm": 1.1414493322372437, "learning_rate": 3.6218292506160316e-05, "loss": 0.7022, "step": 9510 }, { "epoch": 1.3798601297242454, "grad_norm": 0.868614137172699, "learning_rate": 3.62037976518336e-05, "loss": 0.6991, "step": 9520 }, { "epoch": 1.3813095626336196, "grad_norm": 1.5563892126083374, "learning_rate": 3.6189302797506885e-05, "loss": 0.7179, "step": 9530 }, { "epoch": 1.3827589955429938, "grad_norm": 2.05755615234375, "learning_rate": 3.617480794318017e-05, "loss": 0.6415, "step": 9540 }, { "epoch": 1.384208428452368, "grad_norm": 0.7626024484634399, "learning_rate": 3.616031308885346e-05, "loss": 0.6597, "step": 9550 }, { "epoch": 1.3856578613617423, "grad_norm": 1.085626244544983, "learning_rate": 3.6145818234526745e-05, "loss": 0.6923, "step": 9560 }, { "epoch": 1.3871072942711165, "grad_norm": 1.0802031755447388, "learning_rate": 3.6131323380200036e-05, "loss": 0.7292, "step": 9570 }, { "epoch": 1.3885567271804906, "grad_norm": 1.0880290269851685, "learning_rate": 3.6116828525873313e-05, "loss": 0.6502, "step": 9580 }, { "epoch": 1.3900061600898648, "grad_norm": 0.8710733652114868, "learning_rate": 3.61023336715466e-05, "loss": 0.7041, "step": 9590 }, { "epoch": 1.3914555929992392, "grad_norm": 2.254011631011963, "learning_rate": 3.608783881721989e-05, "loss": 0.7389, "step": 9600 }, { "epoch": 1.3929050259086133, "grad_norm": 0.849483847618103, "learning_rate": 3.607334396289317e-05, "loss": 0.7202, "step": 9610 }, { "epoch": 1.3943544588179875, "grad_norm": 1.5035589933395386, "learning_rate": 3.6058849108566464e-05, "loss": 0.6131, "step": 9620 }, { "epoch": 1.3958038917273616, "grad_norm": 2.1819663047790527, "learning_rate": 3.604435425423975e-05, "loss": 0.6658, "step": 9630 }, { "epoch": 1.3972533246367358, "grad_norm": 0.8759439587593079, "learning_rate": 3.602985939991303e-05, "loss": 0.5972, "step": 9640 }, { "epoch": 1.39870275754611, "grad_norm": 1.233659267425537, "learning_rate": 3.601536454558632e-05, "loss": 0.7624, "step": 9650 }, { "epoch": 1.4001521904554843, "grad_norm": 1.6374999284744263, "learning_rate": 3.60008696912596e-05, "loss": 0.6939, "step": 9660 }, { "epoch": 1.4016016233648585, "grad_norm": 0.7916937470436096, "learning_rate": 3.598637483693289e-05, "loss": 0.7305, "step": 9670 }, { "epoch": 1.4030510562742327, "grad_norm": 0.9866163730621338, "learning_rate": 3.597187998260618e-05, "loss": 0.6483, "step": 9680 }, { "epoch": 1.404500489183607, "grad_norm": 0.7897905707359314, "learning_rate": 3.595738512827946e-05, "loss": 0.7158, "step": 9690 }, { "epoch": 1.4059499220929812, "grad_norm": 0.9635679125785828, "learning_rate": 3.594289027395275e-05, "loss": 0.7239, "step": 9700 }, { "epoch": 1.4073993550023554, "grad_norm": 0.8831052780151367, "learning_rate": 3.592839541962603e-05, "loss": 0.5833, "step": 9710 }, { "epoch": 1.4088487879117295, "grad_norm": 0.8457877039909363, "learning_rate": 3.591390056529932e-05, "loss": 0.6611, "step": 9720 }, { "epoch": 1.4102982208211037, "grad_norm": 1.9585788249969482, "learning_rate": 3.5899405710972606e-05, "loss": 0.742, "step": 9730 }, { "epoch": 1.4117476537304778, "grad_norm": 0.816891610622406, "learning_rate": 3.588491085664589e-05, "loss": 0.6449, "step": 9740 }, { "epoch": 1.4131970866398522, "grad_norm": 1.15848970413208, "learning_rate": 3.587041600231918e-05, "loss": 0.6755, "step": 9750 }, { "epoch": 1.4146465195492264, "grad_norm": 1.4050579071044922, "learning_rate": 3.5855921147992466e-05, "loss": 0.7529, "step": 9760 }, { "epoch": 1.4160959524586005, "grad_norm": 0.9132912755012512, "learning_rate": 3.584142629366575e-05, "loss": 0.7623, "step": 9770 }, { "epoch": 1.4175453853679747, "grad_norm": 2.101269006729126, "learning_rate": 3.5826931439339034e-05, "loss": 0.6348, "step": 9780 }, { "epoch": 1.418994818277349, "grad_norm": 1.0391186475753784, "learning_rate": 3.581243658501232e-05, "loss": 0.6816, "step": 9790 }, { "epoch": 1.4204442511867232, "grad_norm": 0.8907914757728577, "learning_rate": 3.579794173068561e-05, "loss": 0.6829, "step": 9800 }, { "epoch": 1.4218936840960974, "grad_norm": 1.922855257987976, "learning_rate": 3.5783446876358894e-05, "loss": 0.6947, "step": 9810 }, { "epoch": 1.4233431170054716, "grad_norm": 2.7909116744995117, "learning_rate": 3.576895202203218e-05, "loss": 0.7583, "step": 9820 }, { "epoch": 1.4247925499148457, "grad_norm": 0.9503197073936462, "learning_rate": 3.575445716770547e-05, "loss": 0.6714, "step": 9830 }, { "epoch": 1.42624198282422, "grad_norm": 1.3675655126571655, "learning_rate": 3.573996231337875e-05, "loss": 0.6434, "step": 9840 }, { "epoch": 1.4276914157335943, "grad_norm": 0.931876003742218, "learning_rate": 3.572546745905204e-05, "loss": 0.6797, "step": 9850 }, { "epoch": 1.4291408486429684, "grad_norm": 3.2150590419769287, "learning_rate": 3.571097260472532e-05, "loss": 0.6415, "step": 9860 }, { "epoch": 1.4305902815523426, "grad_norm": 0.887059211730957, "learning_rate": 3.569647775039861e-05, "loss": 0.7293, "step": 9870 }, { "epoch": 1.432039714461717, "grad_norm": 0.790198028087616, "learning_rate": 3.56819828960719e-05, "loss": 0.7376, "step": 9880 }, { "epoch": 1.4334891473710911, "grad_norm": 2.035407781600952, "learning_rate": 3.566748804174518e-05, "loss": 0.7065, "step": 9890 }, { "epoch": 1.4349385802804653, "grad_norm": 4.058956623077393, "learning_rate": 3.5652993187418474e-05, "loss": 0.6915, "step": 9900 }, { "epoch": 1.4363880131898394, "grad_norm": 1.8480651378631592, "learning_rate": 3.563849833309175e-05, "loss": 0.6387, "step": 9910 }, { "epoch": 1.4378374460992136, "grad_norm": 2.976912021636963, "learning_rate": 3.5624003478765036e-05, "loss": 0.734, "step": 9920 }, { "epoch": 1.4392868790085878, "grad_norm": 0.7814016342163086, "learning_rate": 3.560950862443833e-05, "loss": 0.6622, "step": 9930 }, { "epoch": 1.4407363119179621, "grad_norm": 2.2150237560272217, "learning_rate": 3.559501377011161e-05, "loss": 0.7197, "step": 9940 }, { "epoch": 1.4421857448273363, "grad_norm": 0.8639504909515381, "learning_rate": 3.55805189157849e-05, "loss": 0.6108, "step": 9950 }, { "epoch": 1.4436351777367105, "grad_norm": 1.8026032447814941, "learning_rate": 3.556602406145819e-05, "loss": 0.7659, "step": 9960 }, { "epoch": 1.4450846106460848, "grad_norm": 2.1232337951660156, "learning_rate": 3.5551529207131464e-05, "loss": 0.7001, "step": 9970 }, { "epoch": 1.446534043555459, "grad_norm": 0.9476084113121033, "learning_rate": 3.5537034352804756e-05, "loss": 0.672, "step": 9980 }, { "epoch": 1.4479834764648332, "grad_norm": 1.823028802871704, "learning_rate": 3.552253949847804e-05, "loss": 0.7238, "step": 9990 }, { "epoch": 1.4494329093742073, "grad_norm": 1.8687421083450317, "learning_rate": 3.550804464415133e-05, "loss": 0.6683, "step": 10000 }, { "epoch": 1.4494329093742073, "eval_loss": 0.7658461332321167, "eval_runtime": 671.2876, "eval_samples_per_second": 51.386, "eval_steps_per_second": 2.57, "eval_token_accuracy": 0.00042253122420500756, "step": 10000 }, { "epoch": 1.4508823422835815, "grad_norm": 0.9342031478881836, "learning_rate": 3.5493549789824615e-05, "loss": 0.7305, "step": 10010 }, { "epoch": 1.4523317751929556, "grad_norm": 0.755462110042572, "learning_rate": 3.54790549354979e-05, "loss": 0.6838, "step": 10020 }, { "epoch": 1.45378120810233, "grad_norm": 1.6950440406799316, "learning_rate": 3.546456008117119e-05, "loss": 0.6614, "step": 10030 }, { "epoch": 1.4552306410117042, "grad_norm": 0.9943178296089172, "learning_rate": 3.545006522684447e-05, "loss": 0.7104, "step": 10040 }, { "epoch": 1.4566800739210783, "grad_norm": 1.9179010391235352, "learning_rate": 3.543557037251776e-05, "loss": 0.6876, "step": 10050 }, { "epoch": 1.4581295068304525, "grad_norm": 2.0479156970977783, "learning_rate": 3.5421075518191044e-05, "loss": 0.6735, "step": 10060 }, { "epoch": 1.4595789397398269, "grad_norm": 2.0140061378479004, "learning_rate": 3.540658066386433e-05, "loss": 0.7058, "step": 10070 }, { "epoch": 1.461028372649201, "grad_norm": 0.9706962704658508, "learning_rate": 3.539208580953762e-05, "loss": 0.6184, "step": 10080 }, { "epoch": 1.4624778055585752, "grad_norm": 0.7795096039772034, "learning_rate": 3.5377590955210904e-05, "loss": 0.7585, "step": 10090 }, { "epoch": 1.4639272384679494, "grad_norm": 0.8623164296150208, "learning_rate": 3.536309610088419e-05, "loss": 0.6902, "step": 10100 }, { "epoch": 1.4653766713773235, "grad_norm": 1.4100171327590942, "learning_rate": 3.534860124655747e-05, "loss": 0.7216, "step": 10110 }, { "epoch": 1.466826104286698, "grad_norm": 0.8636854887008667, "learning_rate": 3.533410639223076e-05, "loss": 0.6922, "step": 10120 }, { "epoch": 1.468275537196072, "grad_norm": 0.8046514987945557, "learning_rate": 3.531961153790405e-05, "loss": 0.7511, "step": 10130 }, { "epoch": 1.4697249701054462, "grad_norm": 1.5246775150299072, "learning_rate": 3.530511668357733e-05, "loss": 0.6348, "step": 10140 }, { "epoch": 1.4711744030148204, "grad_norm": 0.7951986789703369, "learning_rate": 3.529062182925062e-05, "loss": 0.6492, "step": 10150 }, { "epoch": 1.4726238359241948, "grad_norm": 1.9236490726470947, "learning_rate": 3.52761269749239e-05, "loss": 0.7468, "step": 10160 }, { "epoch": 1.474073268833569, "grad_norm": 3.2273659706115723, "learning_rate": 3.5261632120597185e-05, "loss": 0.6982, "step": 10170 }, { "epoch": 1.475522701742943, "grad_norm": 2.427692413330078, "learning_rate": 3.5247137266270477e-05, "loss": 0.8, "step": 10180 }, { "epoch": 1.4769721346523172, "grad_norm": 1.7644684314727783, "learning_rate": 3.523264241194376e-05, "loss": 0.7468, "step": 10190 }, { "epoch": 1.4784215675616914, "grad_norm": 0.9527018666267395, "learning_rate": 3.5218147557617045e-05, "loss": 0.6499, "step": 10200 }, { "epoch": 1.4798710004710656, "grad_norm": 0.6727984547615051, "learning_rate": 3.5203652703290336e-05, "loss": 0.6432, "step": 10210 }, { "epoch": 1.48132043338044, "grad_norm": 1.005982518196106, "learning_rate": 3.518915784896362e-05, "loss": 0.7578, "step": 10220 }, { "epoch": 1.482769866289814, "grad_norm": 0.9719845056533813, "learning_rate": 3.5174662994636905e-05, "loss": 0.6569, "step": 10230 }, { "epoch": 1.4842192991991883, "grad_norm": 0.7539176344871521, "learning_rate": 3.516016814031019e-05, "loss": 0.6088, "step": 10240 }, { "epoch": 1.4856687321085627, "grad_norm": 2.297215700149536, "learning_rate": 3.5145673285983474e-05, "loss": 0.572, "step": 10250 }, { "epoch": 1.4871181650179368, "grad_norm": 0.7237049341201782, "learning_rate": 3.5131178431656765e-05, "loss": 0.679, "step": 10260 }, { "epoch": 1.488567597927311, "grad_norm": 0.7301941514015198, "learning_rate": 3.511668357733005e-05, "loss": 0.7794, "step": 10270 }, { "epoch": 1.4900170308366851, "grad_norm": 2.2130892276763916, "learning_rate": 3.510218872300334e-05, "loss": 0.6488, "step": 10280 }, { "epoch": 1.4914664637460593, "grad_norm": 1.0272775888442993, "learning_rate": 3.508769386867662e-05, "loss": 0.7033, "step": 10290 }, { "epoch": 1.4929158966554334, "grad_norm": 1.5348803997039795, "learning_rate": 3.50731990143499e-05, "loss": 0.7062, "step": 10300 }, { "epoch": 1.4943653295648078, "grad_norm": 2.1071600914001465, "learning_rate": 3.5058704160023194e-05, "loss": 0.6284, "step": 10310 }, { "epoch": 1.495814762474182, "grad_norm": 1.7620905637741089, "learning_rate": 3.504420930569648e-05, "loss": 0.6207, "step": 10320 }, { "epoch": 1.4972641953835562, "grad_norm": 1.4796020984649658, "learning_rate": 3.502971445136977e-05, "loss": 0.7086, "step": 10330 }, { "epoch": 1.4987136282929303, "grad_norm": 0.8734525442123413, "learning_rate": 3.5015219597043053e-05, "loss": 0.6757, "step": 10340 }, { "epoch": 1.5001630612023047, "grad_norm": 0.849749743938446, "learning_rate": 3.500072474271634e-05, "loss": 0.7431, "step": 10350 }, { "epoch": 1.5016124941116789, "grad_norm": 1.0296956300735474, "learning_rate": 3.498622988838962e-05, "loss": 0.6848, "step": 10360 }, { "epoch": 1.503061927021053, "grad_norm": 1.796150803565979, "learning_rate": 3.4971735034062907e-05, "loss": 0.635, "step": 10370 }, { "epoch": 1.5045113599304272, "grad_norm": 3.076692581176758, "learning_rate": 3.49572401797362e-05, "loss": 0.7448, "step": 10380 }, { "epoch": 1.5059607928398013, "grad_norm": 0.9386917352676392, "learning_rate": 3.494274532540948e-05, "loss": 0.7112, "step": 10390 }, { "epoch": 1.5074102257491755, "grad_norm": 2.2855920791625977, "learning_rate": 3.4928250471082766e-05, "loss": 0.7775, "step": 10400 }, { "epoch": 1.5088596586585499, "grad_norm": 0.7951090931892395, "learning_rate": 3.491375561675606e-05, "loss": 0.7277, "step": 10410 }, { "epoch": 1.510309091567924, "grad_norm": 2.5202653408050537, "learning_rate": 3.4899260762429335e-05, "loss": 0.6719, "step": 10420 }, { "epoch": 1.5117585244772984, "grad_norm": 1.999173879623413, "learning_rate": 3.4884765908102626e-05, "loss": 0.7104, "step": 10430 }, { "epoch": 1.5132079573866726, "grad_norm": 0.9255297183990479, "learning_rate": 3.487027105377591e-05, "loss": 0.6208, "step": 10440 }, { "epoch": 1.5146573902960467, "grad_norm": 1.7983359098434448, "learning_rate": 3.4855776199449195e-05, "loss": 0.7865, "step": 10450 }, { "epoch": 1.516106823205421, "grad_norm": 0.9478282332420349, "learning_rate": 3.4841281345122486e-05, "loss": 0.7451, "step": 10460 }, { "epoch": 1.517556256114795, "grad_norm": 1.0491830110549927, "learning_rate": 3.482678649079577e-05, "loss": 0.7612, "step": 10470 }, { "epoch": 1.5190056890241692, "grad_norm": 0.9677424430847168, "learning_rate": 3.4812291636469055e-05, "loss": 0.6674, "step": 10480 }, { "epoch": 1.5204551219335434, "grad_norm": 1.719191312789917, "learning_rate": 3.479779678214234e-05, "loss": 0.6897, "step": 10490 }, { "epoch": 1.5219045548429178, "grad_norm": 0.9445870518684387, "learning_rate": 3.4783301927815624e-05, "loss": 0.6801, "step": 10500 }, { "epoch": 1.523353987752292, "grad_norm": 2.001390218734741, "learning_rate": 3.4768807073488915e-05, "loss": 0.6875, "step": 10510 }, { "epoch": 1.524803420661666, "grad_norm": 2.875307321548462, "learning_rate": 3.47543122191622e-05, "loss": 0.6728, "step": 10520 }, { "epoch": 1.5262528535710405, "grad_norm": 2.2775304317474365, "learning_rate": 3.473981736483548e-05, "loss": 0.6632, "step": 10530 }, { "epoch": 1.5277022864804146, "grad_norm": 0.9073929190635681, "learning_rate": 3.4725322510508775e-05, "loss": 0.7309, "step": 10540 }, { "epoch": 1.5291517193897888, "grad_norm": 1.4111050367355347, "learning_rate": 3.471082765618205e-05, "loss": 0.6356, "step": 10550 }, { "epoch": 1.530601152299163, "grad_norm": 1.236938238143921, "learning_rate": 3.469633280185534e-05, "loss": 0.697, "step": 10560 }, { "epoch": 1.532050585208537, "grad_norm": 0.9304881691932678, "learning_rate": 3.468183794752863e-05, "loss": 0.66, "step": 10570 }, { "epoch": 1.5335000181179113, "grad_norm": 0.6954860091209412, "learning_rate": 3.466734309320191e-05, "loss": 0.6499, "step": 10580 }, { "epoch": 1.5349494510272854, "grad_norm": 1.1868486404418945, "learning_rate": 3.46528482388752e-05, "loss": 0.709, "step": 10590 }, { "epoch": 1.5363988839366598, "grad_norm": 2.335561513900757, "learning_rate": 3.463835338454849e-05, "loss": 0.6909, "step": 10600 }, { "epoch": 1.537848316846034, "grad_norm": 0.7953568696975708, "learning_rate": 3.462385853022178e-05, "loss": 0.6548, "step": 10610 }, { "epoch": 1.5392977497554083, "grad_norm": 0.893295168876648, "learning_rate": 3.4609363675895056e-05, "loss": 0.7508, "step": 10620 }, { "epoch": 1.5407471826647825, "grad_norm": 3.305266857147217, "learning_rate": 3.459486882156834e-05, "loss": 0.6312, "step": 10630 }, { "epoch": 1.5421966155741567, "grad_norm": 1.2510921955108643, "learning_rate": 3.458037396724163e-05, "loss": 0.6669, "step": 10640 }, { "epoch": 1.5436460484835308, "grad_norm": 0.8094971776008606, "learning_rate": 3.4565879112914916e-05, "loss": 0.7096, "step": 10650 }, { "epoch": 1.545095481392905, "grad_norm": 0.8499931693077087, "learning_rate": 3.455138425858821e-05, "loss": 0.6208, "step": 10660 }, { "epoch": 1.5465449143022791, "grad_norm": 2.0002918243408203, "learning_rate": 3.453688940426149e-05, "loss": 0.7067, "step": 10670 }, { "epoch": 1.5479943472116533, "grad_norm": 3.303682565689087, "learning_rate": 3.452239454993477e-05, "loss": 0.6802, "step": 10680 }, { "epoch": 1.5494437801210277, "grad_norm": 0.8864135146141052, "learning_rate": 3.450789969560806e-05, "loss": 0.7016, "step": 10690 }, { "epoch": 1.5508932130304018, "grad_norm": 1.1588127613067627, "learning_rate": 3.4493404841281345e-05, "loss": 0.6362, "step": 10700 }, { "epoch": 1.5523426459397762, "grad_norm": 1.947551965713501, "learning_rate": 3.4478909986954636e-05, "loss": 0.7554, "step": 10710 }, { "epoch": 1.5537920788491504, "grad_norm": 2.90364933013916, "learning_rate": 3.446441513262792e-05, "loss": 0.724, "step": 10720 }, { "epoch": 1.5552415117585245, "grad_norm": 0.898250937461853, "learning_rate": 3.4449920278301204e-05, "loss": 0.6948, "step": 10730 }, { "epoch": 1.5566909446678987, "grad_norm": 2.2128922939300537, "learning_rate": 3.4435425423974496e-05, "loss": 0.6681, "step": 10740 }, { "epoch": 1.5581403775772729, "grad_norm": 0.7464655041694641, "learning_rate": 3.442093056964777e-05, "loss": 0.6382, "step": 10750 }, { "epoch": 1.559589810486647, "grad_norm": 0.7855749726295471, "learning_rate": 3.4406435715321064e-05, "loss": 0.7867, "step": 10760 }, { "epoch": 1.5610392433960212, "grad_norm": 0.9440972805023193, "learning_rate": 3.439194086099435e-05, "loss": 0.7226, "step": 10770 }, { "epoch": 1.5624886763053956, "grad_norm": 1.0518885850906372, "learning_rate": 3.437744600666763e-05, "loss": 0.7334, "step": 10780 }, { "epoch": 1.5639381092147697, "grad_norm": 0.8372756242752075, "learning_rate": 3.4362951152340924e-05, "loss": 0.7222, "step": 10790 }, { "epoch": 1.5653875421241439, "grad_norm": 0.901339054107666, "learning_rate": 3.434845629801421e-05, "loss": 0.7014, "step": 10800 }, { "epoch": 1.5668369750335183, "grad_norm": 3.179217576980591, "learning_rate": 3.433396144368749e-05, "loss": 0.6731, "step": 10810 }, { "epoch": 1.5682864079428924, "grad_norm": 0.967142641544342, "learning_rate": 3.431946658936078e-05, "loss": 0.7114, "step": 10820 }, { "epoch": 1.5697358408522666, "grad_norm": 3.2705516815185547, "learning_rate": 3.430497173503406e-05, "loss": 0.6808, "step": 10830 }, { "epoch": 1.5711852737616407, "grad_norm": 1.0143731832504272, "learning_rate": 3.429047688070735e-05, "loss": 0.6969, "step": 10840 }, { "epoch": 1.572634706671015, "grad_norm": 0.7936012744903564, "learning_rate": 3.427598202638064e-05, "loss": 0.7324, "step": 10850 }, { "epoch": 1.574084139580389, "grad_norm": 2.3230628967285156, "learning_rate": 3.426148717205392e-05, "loss": 0.666, "step": 10860 }, { "epoch": 1.5755335724897632, "grad_norm": 2.224515199661255, "learning_rate": 3.424699231772721e-05, "loss": 0.6755, "step": 10870 }, { "epoch": 1.5769830053991376, "grad_norm": 1.1359072923660278, "learning_rate": 3.423249746340049e-05, "loss": 0.7035, "step": 10880 }, { "epoch": 1.5784324383085118, "grad_norm": 0.8372917175292969, "learning_rate": 3.421800260907378e-05, "loss": 0.5668, "step": 10890 }, { "epoch": 1.5798818712178861, "grad_norm": 0.7077991962432861, "learning_rate": 3.4203507754747066e-05, "loss": 0.6532, "step": 10900 }, { "epoch": 1.5813313041272603, "grad_norm": 1.7431057691574097, "learning_rate": 3.418901290042035e-05, "loss": 0.6941, "step": 10910 }, { "epoch": 1.5827807370366345, "grad_norm": 0.7109362483024597, "learning_rate": 3.417596753152631e-05, "loss": 0.6996, "step": 10920 }, { "epoch": 1.5842301699460086, "grad_norm": 0.884335994720459, "learning_rate": 3.4161472677199594e-05, "loss": 0.7177, "step": 10930 }, { "epoch": 1.5856796028553828, "grad_norm": 2.881589651107788, "learning_rate": 3.4146977822872885e-05, "loss": 0.6996, "step": 10940 }, { "epoch": 1.587129035764757, "grad_norm": 1.1488120555877686, "learning_rate": 3.413248296854617e-05, "loss": 0.684, "step": 10950 }, { "epoch": 1.588578468674131, "grad_norm": 2.397216320037842, "learning_rate": 3.4117988114219454e-05, "loss": 0.677, "step": 10960 }, { "epoch": 1.5900279015835055, "grad_norm": 1.1147899627685547, "learning_rate": 3.410349325989274e-05, "loss": 0.6599, "step": 10970 }, { "epoch": 1.5914773344928796, "grad_norm": 2.646649122238159, "learning_rate": 3.408899840556602e-05, "loss": 0.6607, "step": 10980 }, { "epoch": 1.592926767402254, "grad_norm": 1.1824229955673218, "learning_rate": 3.407450355123931e-05, "loss": 0.6214, "step": 10990 }, { "epoch": 1.5943762003116282, "grad_norm": 1.8018920421600342, "learning_rate": 3.40600086969126e-05, "loss": 0.6321, "step": 11000 }, { "epoch": 1.5943762003116282, "eval_loss": 0.7519996166229248, "eval_runtime": 672.5842, "eval_samples_per_second": 51.287, "eval_steps_per_second": 2.565, "eval_token_accuracy": 0.00041191567003769487, "step": 11000 }, { "epoch": 1.5958256332210023, "grad_norm": 1.9211559295654297, "learning_rate": 3.404551384258589e-05, "loss": 0.681, "step": 11010 }, { "epoch": 1.5972750661303765, "grad_norm": 0.8976821303367615, "learning_rate": 3.403101898825917e-05, "loss": 0.5972, "step": 11020 }, { "epoch": 1.5987244990397507, "grad_norm": 0.7794384956359863, "learning_rate": 3.401652413393245e-05, "loss": 0.7753, "step": 11030 }, { "epoch": 1.6001739319491248, "grad_norm": 1.1415067911148071, "learning_rate": 3.400202927960574e-05, "loss": 0.7083, "step": 11040 }, { "epoch": 1.601623364858499, "grad_norm": 1.0612363815307617, "learning_rate": 3.3987534425279026e-05, "loss": 0.7217, "step": 11050 }, { "epoch": 1.6030727977678734, "grad_norm": 2.422513246536255, "learning_rate": 3.397303957095232e-05, "loss": 0.7079, "step": 11060 }, { "epoch": 1.6045222306772475, "grad_norm": 0.8872206211090088, "learning_rate": 3.39585447166256e-05, "loss": 0.7362, "step": 11070 }, { "epoch": 1.6059716635866217, "grad_norm": 0.8796588778495789, "learning_rate": 3.3944049862298886e-05, "loss": 0.7146, "step": 11080 }, { "epoch": 1.607421096495996, "grad_norm": 1.5327075719833374, "learning_rate": 3.392955500797217e-05, "loss": 0.6852, "step": 11090 }, { "epoch": 1.6088705294053702, "grad_norm": 0.8125131130218506, "learning_rate": 3.3915060153645455e-05, "loss": 0.7014, "step": 11100 }, { "epoch": 1.6103199623147444, "grad_norm": 0.8616804480552673, "learning_rate": 3.3900565299318746e-05, "loss": 0.6572, "step": 11110 }, { "epoch": 1.6117693952241186, "grad_norm": 2.233217477798462, "learning_rate": 3.388607044499203e-05, "loss": 0.6935, "step": 11120 }, { "epoch": 1.6132188281334927, "grad_norm": 2.3844566345214844, "learning_rate": 3.3871575590665315e-05, "loss": 0.7073, "step": 11130 }, { "epoch": 1.6146682610428669, "grad_norm": 2.445873975753784, "learning_rate": 3.3857080736338606e-05, "loss": 0.7148, "step": 11140 }, { "epoch": 1.616117693952241, "grad_norm": 1.7319201231002808, "learning_rate": 3.384258588201189e-05, "loss": 0.6878, "step": 11150 }, { "epoch": 1.6175671268616154, "grad_norm": 2.917090654373169, "learning_rate": 3.3828091027685175e-05, "loss": 0.6819, "step": 11160 }, { "epoch": 1.6190165597709896, "grad_norm": 2.5769288539886475, "learning_rate": 3.381359617335846e-05, "loss": 0.7332, "step": 11170 }, { "epoch": 1.620465992680364, "grad_norm": 0.9746682047843933, "learning_rate": 3.379910131903174e-05, "loss": 0.7014, "step": 11180 }, { "epoch": 1.6219154255897381, "grad_norm": 1.9940383434295654, "learning_rate": 3.3784606464705034e-05, "loss": 0.6732, "step": 11190 }, { "epoch": 1.6233648584991123, "grad_norm": 1.0099761486053467, "learning_rate": 3.377011161037832e-05, "loss": 0.5949, "step": 11200 }, { "epoch": 1.6248142914084864, "grad_norm": 2.6662089824676514, "learning_rate": 3.37556167560516e-05, "loss": 0.6592, "step": 11210 }, { "epoch": 1.6262637243178606, "grad_norm": 0.6609460115432739, "learning_rate": 3.374112190172489e-05, "loss": 0.6284, "step": 11220 }, { "epoch": 1.6277131572272348, "grad_norm": 2.3695130348205566, "learning_rate": 3.372662704739817e-05, "loss": 0.6975, "step": 11230 }, { "epoch": 1.629162590136609, "grad_norm": 0.8688599467277527, "learning_rate": 3.371213219307146e-05, "loss": 0.6692, "step": 11240 }, { "epoch": 1.6306120230459833, "grad_norm": 2.008312702178955, "learning_rate": 3.369763733874475e-05, "loss": 0.6637, "step": 11250 }, { "epoch": 1.6320614559553575, "grad_norm": 0.8913775682449341, "learning_rate": 3.368314248441803e-05, "loss": 0.6861, "step": 11260 }, { "epoch": 1.6335108888647318, "grad_norm": 1.5506291389465332, "learning_rate": 3.366864763009132e-05, "loss": 0.6753, "step": 11270 }, { "epoch": 1.634960321774106, "grad_norm": 2.029127359390259, "learning_rate": 3.36541527757646e-05, "loss": 0.7136, "step": 11280 }, { "epoch": 1.6364097546834802, "grad_norm": 1.9097111225128174, "learning_rate": 3.363965792143789e-05, "loss": 0.5921, "step": 11290 }, { "epoch": 1.6378591875928543, "grad_norm": 2.0805654525756836, "learning_rate": 3.3625163067111176e-05, "loss": 0.6608, "step": 11300 }, { "epoch": 1.6393086205022285, "grad_norm": 2.208421230316162, "learning_rate": 3.361066821278446e-05, "loss": 0.6562, "step": 11310 }, { "epoch": 1.6407580534116026, "grad_norm": 1.505969762802124, "learning_rate": 3.359617335845775e-05, "loss": 0.693, "step": 11320 }, { "epoch": 1.6422074863209768, "grad_norm": 2.6280758380889893, "learning_rate": 3.3581678504131036e-05, "loss": 0.673, "step": 11330 }, { "epoch": 1.6436569192303512, "grad_norm": 0.8961797952651978, "learning_rate": 3.356718364980432e-05, "loss": 0.6021, "step": 11340 }, { "epoch": 1.6451063521397253, "grad_norm": 0.6946638822555542, "learning_rate": 3.3552688795477605e-05, "loss": 0.6796, "step": 11350 }, { "epoch": 1.6465557850490995, "grad_norm": 2.8928956985473633, "learning_rate": 3.353819394115089e-05, "loss": 0.7382, "step": 11360 }, { "epoch": 1.6480052179584739, "grad_norm": 0.8265762329101562, "learning_rate": 3.352369908682418e-05, "loss": 0.7026, "step": 11370 }, { "epoch": 1.649454650867848, "grad_norm": 4.2738938331604, "learning_rate": 3.3509204232497464e-05, "loss": 0.6302, "step": 11380 }, { "epoch": 1.6509040837772222, "grad_norm": 2.031883955001831, "learning_rate": 3.3494709378170756e-05, "loss": 0.7206, "step": 11390 }, { "epoch": 1.6523535166865964, "grad_norm": 0.7447781562805176, "learning_rate": 3.348021452384404e-05, "loss": 0.6582, "step": 11400 }, { "epoch": 1.6538029495959705, "grad_norm": 0.9610055685043335, "learning_rate": 3.346571966951732e-05, "loss": 0.6741, "step": 11410 }, { "epoch": 1.6552523825053447, "grad_norm": 2.5023865699768066, "learning_rate": 3.345122481519061e-05, "loss": 0.6875, "step": 11420 }, { "epoch": 1.6567018154147188, "grad_norm": 1.0197991132736206, "learning_rate": 3.343817944629656e-05, "loss": 0.7759, "step": 11430 }, { "epoch": 1.6581512483240932, "grad_norm": 0.8140033483505249, "learning_rate": 3.342368459196985e-05, "loss": 0.6356, "step": 11440 }, { "epoch": 1.6596006812334674, "grad_norm": 1.772228717803955, "learning_rate": 3.340918973764314e-05, "loss": 0.7157, "step": 11450 }, { "epoch": 1.6610501141428418, "grad_norm": 1.5189317464828491, "learning_rate": 3.339469488331642e-05, "loss": 0.6381, "step": 11460 }, { "epoch": 1.662499547052216, "grad_norm": 2.599026679992676, "learning_rate": 3.338020002898971e-05, "loss": 0.7256, "step": 11470 }, { "epoch": 1.66394897996159, "grad_norm": 0.9813500642776489, "learning_rate": 3.3365705174662996e-05, "loss": 0.7397, "step": 11480 }, { "epoch": 1.6653984128709642, "grad_norm": 1.4737082719802856, "learning_rate": 3.335121032033629e-05, "loss": 0.7078, "step": 11490 }, { "epoch": 1.6668478457803384, "grad_norm": 1.0761265754699707, "learning_rate": 3.3336715466009565e-05, "loss": 0.7165, "step": 11500 }, { "epoch": 1.6682972786897126, "grad_norm": 4.028006553649902, "learning_rate": 3.332222061168285e-05, "loss": 0.7517, "step": 11510 }, { "epoch": 1.6697467115990867, "grad_norm": 2.7268528938293457, "learning_rate": 3.330772575735614e-05, "loss": 0.6512, "step": 11520 }, { "epoch": 1.671196144508461, "grad_norm": 1.2567470073699951, "learning_rate": 3.3293230903029425e-05, "loss": 0.6798, "step": 11530 }, { "epoch": 1.6726455774178353, "grad_norm": 0.9180237054824829, "learning_rate": 3.3278736048702716e-05, "loss": 0.8243, "step": 11540 }, { "epoch": 1.6740950103272096, "grad_norm": 2.680816650390625, "learning_rate": 3.3264241194376e-05, "loss": 0.6361, "step": 11550 }, { "epoch": 1.6755444432365838, "grad_norm": 2.539856433868408, "learning_rate": 3.324974634004928e-05, "loss": 0.6767, "step": 11560 }, { "epoch": 1.676993876145958, "grad_norm": 0.8751583695411682, "learning_rate": 3.323525148572257e-05, "loss": 0.7148, "step": 11570 }, { "epoch": 1.6784433090553321, "grad_norm": 0.7487082481384277, "learning_rate": 3.3220756631395854e-05, "loss": 0.6886, "step": 11580 }, { "epoch": 1.6798927419647063, "grad_norm": 2.2744905948638916, "learning_rate": 3.3206261777069145e-05, "loss": 0.6029, "step": 11590 }, { "epoch": 1.6813421748740804, "grad_norm": 2.1104652881622314, "learning_rate": 3.319176692274243e-05, "loss": 0.6817, "step": 11600 }, { "epoch": 1.6827916077834546, "grad_norm": 1.006417155265808, "learning_rate": 3.3177272068415713e-05, "loss": 0.7137, "step": 11610 }, { "epoch": 1.684241040692829, "grad_norm": 2.490140914916992, "learning_rate": 3.3162777214089005e-05, "loss": 0.6737, "step": 11620 }, { "epoch": 1.6856904736022031, "grad_norm": 1.7343158721923828, "learning_rate": 3.314828235976228e-05, "loss": 0.6307, "step": 11630 }, { "epoch": 1.6871399065115773, "grad_norm": 0.7881498336791992, "learning_rate": 3.313378750543557e-05, "loss": 0.703, "step": 11640 }, { "epoch": 1.6885893394209517, "grad_norm": 1.097777009010315, "learning_rate": 3.311929265110886e-05, "loss": 0.7322, "step": 11650 }, { "epoch": 1.6900387723303258, "grad_norm": 1.9791345596313477, "learning_rate": 3.310479779678214e-05, "loss": 0.6046, "step": 11660 }, { "epoch": 1.6914882052397, "grad_norm": 0.9018293619155884, "learning_rate": 3.309030294245543e-05, "loss": 0.6562, "step": 11670 }, { "epoch": 1.6929376381490742, "grad_norm": 3.6132423877716064, "learning_rate": 3.307580808812872e-05, "loss": 0.7399, "step": 11680 }, { "epoch": 1.6943870710584483, "grad_norm": 1.8991214036941528, "learning_rate": 3.3061313233802e-05, "loss": 0.7522, "step": 11690 }, { "epoch": 1.6958365039678225, "grad_norm": 0.8896854519844055, "learning_rate": 3.3046818379475286e-05, "loss": 0.6909, "step": 11700 }, { "epoch": 1.6972859368771966, "grad_norm": 1.0252200365066528, "learning_rate": 3.303232352514857e-05, "loss": 0.7392, "step": 11710 }, { "epoch": 1.698735369786571, "grad_norm": 1.8175885677337646, "learning_rate": 3.301782867082186e-05, "loss": 0.7272, "step": 11720 }, { "epoch": 1.7001848026959452, "grad_norm": 1.0172370672225952, "learning_rate": 3.3003333816495146e-05, "loss": 0.6532, "step": 11730 }, { "epoch": 1.7016342356053196, "grad_norm": 2.1411426067352295, "learning_rate": 3.298883896216843e-05, "loss": 0.6795, "step": 11740 }, { "epoch": 1.7030836685146937, "grad_norm": 1.9220808744430542, "learning_rate": 3.297434410784172e-05, "loss": 0.6655, "step": 11750 }, { "epoch": 1.7045331014240679, "grad_norm": 3.3900511264801025, "learning_rate": 3.2959849253515e-05, "loss": 0.6648, "step": 11760 }, { "epoch": 1.705982534333442, "grad_norm": 3.2443623542785645, "learning_rate": 3.294535439918829e-05, "loss": 0.6851, "step": 11770 }, { "epoch": 1.7074319672428162, "grad_norm": 1.7702313661575317, "learning_rate": 3.2930859544861575e-05, "loss": 0.6403, "step": 11780 }, { "epoch": 1.7088814001521904, "grad_norm": 2.2568228244781494, "learning_rate": 3.291636469053486e-05, "loss": 0.6541, "step": 11790 }, { "epoch": 1.7103308330615645, "grad_norm": 3.1511707305908203, "learning_rate": 3.290186983620815e-05, "loss": 0.691, "step": 11800 }, { "epoch": 1.711780265970939, "grad_norm": 1.8583332300186157, "learning_rate": 3.2887374981881435e-05, "loss": 0.6347, "step": 11810 }, { "epoch": 1.713229698880313, "grad_norm": 1.8265938758850098, "learning_rate": 3.287288012755472e-05, "loss": 0.6762, "step": 11820 }, { "epoch": 1.7146791317896874, "grad_norm": 2.006502866744995, "learning_rate": 3.2858385273228e-05, "loss": 0.6834, "step": 11830 }, { "epoch": 1.7161285646990616, "grad_norm": 0.8520517945289612, "learning_rate": 3.284389041890129e-05, "loss": 0.6735, "step": 11840 }, { "epoch": 1.7175779976084358, "grad_norm": 0.9941295385360718, "learning_rate": 3.282939556457458e-05, "loss": 0.6592, "step": 11850 }, { "epoch": 1.71902743051781, "grad_norm": 0.8577190637588501, "learning_rate": 3.281490071024786e-05, "loss": 0.6025, "step": 11860 }, { "epoch": 1.720476863427184, "grad_norm": 3.2018096446990967, "learning_rate": 3.2800405855921154e-05, "loss": 0.7097, "step": 11870 }, { "epoch": 1.7219262963365582, "grad_norm": 0.7818105816841125, "learning_rate": 3.278591100159444e-05, "loss": 0.628, "step": 11880 }, { "epoch": 1.7233757292459324, "grad_norm": 3.172701358795166, "learning_rate": 3.2771416147267716e-05, "loss": 0.6854, "step": 11890 }, { "epoch": 1.7248251621553068, "grad_norm": 0.8485254049301147, "learning_rate": 3.275692129294101e-05, "loss": 0.6506, "step": 11900 }, { "epoch": 1.726274595064681, "grad_norm": 1.6423099040985107, "learning_rate": 3.274242643861429e-05, "loss": 0.6422, "step": 11910 }, { "epoch": 1.727724027974055, "grad_norm": 0.7199040055274963, "learning_rate": 3.272793158428758e-05, "loss": 0.6863, "step": 11920 }, { "epoch": 1.7291734608834295, "grad_norm": 0.9496470093727112, "learning_rate": 3.271343672996087e-05, "loss": 0.6509, "step": 11930 }, { "epoch": 1.7306228937928037, "grad_norm": 0.8391323685646057, "learning_rate": 3.269894187563415e-05, "loss": 0.7397, "step": 11940 }, { "epoch": 1.7320723267021778, "grad_norm": 0.752637505531311, "learning_rate": 3.2684447021307436e-05, "loss": 0.6644, "step": 11950 }, { "epoch": 1.733521759611552, "grad_norm": 0.8174326419830322, "learning_rate": 3.266995216698072e-05, "loss": 0.7158, "step": 11960 }, { "epoch": 1.7349711925209261, "grad_norm": 1.8735527992248535, "learning_rate": 3.265545731265401e-05, "loss": 0.6833, "step": 11970 }, { "epoch": 1.7364206254303003, "grad_norm": 0.8878263235092163, "learning_rate": 3.2640962458327296e-05, "loss": 0.6542, "step": 11980 }, { "epoch": 1.7378700583396745, "grad_norm": 1.0730360746383667, "learning_rate": 3.262646760400058e-05, "loss": 0.6864, "step": 11990 }, { "epoch": 1.7393194912490488, "grad_norm": 2.2720248699188232, "learning_rate": 3.261197274967387e-05, "loss": 0.7018, "step": 12000 }, { "epoch": 1.7393194912490488, "eval_loss": 0.7453984022140503, "eval_runtime": 671.5911, "eval_samples_per_second": 51.363, "eval_steps_per_second": 2.569, "eval_token_accuracy": 0.0004127198786867337, "step": 12000 }, { "epoch": 1.740768924158423, "grad_norm": 0.742331862449646, "learning_rate": 3.2597477895347156e-05, "loss": 0.6351, "step": 12010 }, { "epoch": 1.7422183570677974, "grad_norm": 1.6918139457702637, "learning_rate": 3.258298304102044e-05, "loss": 0.6707, "step": 12020 }, { "epoch": 1.7436677899771715, "grad_norm": 2.084019422531128, "learning_rate": 3.2568488186693724e-05, "loss": 0.6947, "step": 12030 }, { "epoch": 1.7451172228865457, "grad_norm": 1.5365352630615234, "learning_rate": 3.255399333236701e-05, "loss": 0.6949, "step": 12040 }, { "epoch": 1.7465666557959199, "grad_norm": 0.9291880130767822, "learning_rate": 3.25394984780403e-05, "loss": 0.7178, "step": 12050 }, { "epoch": 1.748016088705294, "grad_norm": 2.788231134414673, "learning_rate": 3.2525003623713584e-05, "loss": 0.6189, "step": 12060 }, { "epoch": 1.7494655216146682, "grad_norm": 0.8338028788566589, "learning_rate": 3.251050876938687e-05, "loss": 0.6596, "step": 12070 }, { "epoch": 1.7509149545240423, "grad_norm": 1.9956914186477661, "learning_rate": 3.249601391506015e-05, "loss": 0.686, "step": 12080 }, { "epoch": 1.7523643874334167, "grad_norm": 2.4082913398742676, "learning_rate": 3.248151906073344e-05, "loss": 0.6606, "step": 12090 }, { "epoch": 1.7538138203427909, "grad_norm": 0.8920974135398865, "learning_rate": 3.246702420640673e-05, "loss": 0.6806, "step": 12100 }, { "epoch": 1.7552632532521653, "grad_norm": 0.6411871314048767, "learning_rate": 3.245252935208001e-05, "loss": 0.6898, "step": 12110 }, { "epoch": 1.7567126861615394, "grad_norm": 1.895133376121521, "learning_rate": 3.24380344977533e-05, "loss": 0.6771, "step": 12120 }, { "epoch": 1.7581621190709136, "grad_norm": 0.8246234059333801, "learning_rate": 3.242353964342659e-05, "loss": 0.7074, "step": 12130 }, { "epoch": 1.7596115519802877, "grad_norm": 0.9914174675941467, "learning_rate": 3.240904478909987e-05, "loss": 0.7025, "step": 12140 }, { "epoch": 1.761060984889662, "grad_norm": 1.5609525442123413, "learning_rate": 3.239454993477316e-05, "loss": 0.7268, "step": 12150 }, { "epoch": 1.762510417799036, "grad_norm": 0.7251628637313843, "learning_rate": 3.238005508044644e-05, "loss": 0.7015, "step": 12160 }, { "epoch": 1.7639598507084102, "grad_norm": 2.229095697402954, "learning_rate": 3.2365560226119726e-05, "loss": 0.5969, "step": 12170 }, { "epoch": 1.7654092836177846, "grad_norm": 0.9089709520339966, "learning_rate": 3.235106537179302e-05, "loss": 0.6905, "step": 12180 }, { "epoch": 1.7668587165271588, "grad_norm": 0.8589386343955994, "learning_rate": 3.23365705174663e-05, "loss": 0.6675, "step": 12190 }, { "epoch": 1.768308149436533, "grad_norm": 0.9688293933868408, "learning_rate": 3.232207566313959e-05, "loss": 0.5953, "step": 12200 }, { "epoch": 1.7697575823459073, "grad_norm": 0.7265881896018982, "learning_rate": 3.230758080881287e-05, "loss": 0.6947, "step": 12210 }, { "epoch": 1.7712070152552815, "grad_norm": 0.90655517578125, "learning_rate": 3.2293085954486154e-05, "loss": 0.661, "step": 12220 }, { "epoch": 1.7726564481646556, "grad_norm": 0.8843987584114075, "learning_rate": 3.2278591100159445e-05, "loss": 0.6676, "step": 12230 }, { "epoch": 1.7741058810740298, "grad_norm": 0.9245916604995728, "learning_rate": 3.226409624583273e-05, "loss": 0.6631, "step": 12240 }, { "epoch": 1.775555313983404, "grad_norm": 1.0180144309997559, "learning_rate": 3.224960139150602e-05, "loss": 0.6716, "step": 12250 }, { "epoch": 1.777004746892778, "grad_norm": 4.001671314239502, "learning_rate": 3.2235106537179305e-05, "loss": 0.8316, "step": 12260 }, { "epoch": 1.7784541798021523, "grad_norm": 2.808419942855835, "learning_rate": 3.222061168285258e-05, "loss": 0.7557, "step": 12270 }, { "epoch": 1.7799036127115266, "grad_norm": 1.6131603717803955, "learning_rate": 3.2206116828525874e-05, "loss": 0.6285, "step": 12280 }, { "epoch": 1.7813530456209008, "grad_norm": 0.8948814272880554, "learning_rate": 3.219162197419916e-05, "loss": 0.7099, "step": 12290 }, { "epoch": 1.7828024785302752, "grad_norm": 3.0544824600219727, "learning_rate": 3.217712711987245e-05, "loss": 0.6793, "step": 12300 }, { "epoch": 1.7842519114396493, "grad_norm": 1.1489214897155762, "learning_rate": 3.2162632265545734e-05, "loss": 0.6682, "step": 12310 }, { "epoch": 1.7857013443490235, "grad_norm": 1.5618456602096558, "learning_rate": 3.214813741121902e-05, "loss": 0.7276, "step": 12320 }, { "epoch": 1.7871507772583977, "grad_norm": 2.5483455657958984, "learning_rate": 3.213364255689231e-05, "loss": 0.6497, "step": 12330 }, { "epoch": 1.7886002101677718, "grad_norm": 2.1112256050109863, "learning_rate": 3.211914770256559e-05, "loss": 0.7096, "step": 12340 }, { "epoch": 1.790049643077146, "grad_norm": 1.7844610214233398, "learning_rate": 3.210465284823888e-05, "loss": 0.5964, "step": 12350 }, { "epoch": 1.7914990759865201, "grad_norm": 0.7652512192726135, "learning_rate": 3.209015799391216e-05, "loss": 0.7223, "step": 12360 }, { "epoch": 1.7929485088958945, "grad_norm": 1.2720307111740112, "learning_rate": 3.207566313958545e-05, "loss": 0.7185, "step": 12370 }, { "epoch": 1.7943979418052687, "grad_norm": 0.8202764391899109, "learning_rate": 3.206116828525874e-05, "loss": 0.6452, "step": 12380 }, { "epoch": 1.7958473747146428, "grad_norm": 1.2758756875991821, "learning_rate": 3.204667343093202e-05, "loss": 0.65, "step": 12390 }, { "epoch": 1.7972968076240172, "grad_norm": 2.4472177028656006, "learning_rate": 3.2032178576605307e-05, "loss": 0.6755, "step": 12400 }, { "epoch": 1.7987462405333914, "grad_norm": 3.1003291606903076, "learning_rate": 3.201768372227859e-05, "loss": 0.6934, "step": 12410 }, { "epoch": 1.8001956734427655, "grad_norm": 3.7856054306030273, "learning_rate": 3.2003188867951875e-05, "loss": 0.7038, "step": 12420 }, { "epoch": 1.8016451063521397, "grad_norm": 0.8813289403915405, "learning_rate": 3.1988694013625166e-05, "loss": 0.651, "step": 12430 }, { "epoch": 1.8030945392615139, "grad_norm": 2.7432374954223633, "learning_rate": 3.197419915929845e-05, "loss": 0.6651, "step": 12440 }, { "epoch": 1.804543972170888, "grad_norm": 0.8676012754440308, "learning_rate": 3.1959704304971735e-05, "loss": 0.7189, "step": 12450 }, { "epoch": 1.8059934050802622, "grad_norm": 1.9796631336212158, "learning_rate": 3.1945209450645026e-05, "loss": 0.6967, "step": 12460 }, { "epoch": 1.8074428379896366, "grad_norm": 0.8426458239555359, "learning_rate": 3.1930714596318304e-05, "loss": 0.6947, "step": 12470 }, { "epoch": 1.8088922708990107, "grad_norm": 0.9686286449432373, "learning_rate": 3.1916219741991595e-05, "loss": 0.6641, "step": 12480 }, { "epoch": 1.810341703808385, "grad_norm": 0.8845982551574707, "learning_rate": 3.190172488766488e-05, "loss": 0.6704, "step": 12490 }, { "epoch": 1.8117911367177593, "grad_norm": 1.16560959815979, "learning_rate": 3.1887230033338164e-05, "loss": 0.6233, "step": 12500 }, { "epoch": 1.8132405696271334, "grad_norm": 0.8579843044281006, "learning_rate": 3.1872735179011455e-05, "loss": 0.6219, "step": 12510 }, { "epoch": 1.8146900025365076, "grad_norm": 2.8450324535369873, "learning_rate": 3.185824032468474e-05, "loss": 0.6458, "step": 12520 }, { "epoch": 1.8161394354458817, "grad_norm": 2.29085111618042, "learning_rate": 3.1843745470358024e-05, "loss": 0.7369, "step": 12530 }, { "epoch": 1.817588868355256, "grad_norm": 2.1326372623443604, "learning_rate": 3.182925061603131e-05, "loss": 0.7111, "step": 12540 }, { "epoch": 1.81903830126463, "grad_norm": 1.4014294147491455, "learning_rate": 3.181475576170459e-05, "loss": 0.6926, "step": 12550 }, { "epoch": 1.8204877341740044, "grad_norm": 2.965717315673828, "learning_rate": 3.1800260907377883e-05, "loss": 0.6309, "step": 12560 }, { "epoch": 1.8219371670833786, "grad_norm": 1.889169454574585, "learning_rate": 3.178576605305117e-05, "loss": 0.6605, "step": 12570 }, { "epoch": 1.823386599992753, "grad_norm": 1.740950584411621, "learning_rate": 3.177127119872446e-05, "loss": 0.6657, "step": 12580 }, { "epoch": 1.8248360329021271, "grad_norm": 0.9073018431663513, "learning_rate": 3.175677634439774e-05, "loss": 0.5996, "step": 12590 }, { "epoch": 1.8262854658115013, "grad_norm": 2.453105926513672, "learning_rate": 3.174228149007102e-05, "loss": 0.6781, "step": 12600 }, { "epoch": 1.8277348987208755, "grad_norm": 2.1584982872009277, "learning_rate": 3.172778663574431e-05, "loss": 0.6798, "step": 12610 }, { "epoch": 1.8291843316302496, "grad_norm": 1.8437451124191284, "learning_rate": 3.1713291781417596e-05, "loss": 0.6918, "step": 12620 }, { "epoch": 1.8306337645396238, "grad_norm": 0.9406701326370239, "learning_rate": 3.169879692709089e-05, "loss": 0.6399, "step": 12630 }, { "epoch": 1.832083197448998, "grad_norm": 3.034640312194824, "learning_rate": 3.168430207276417e-05, "loss": 0.7079, "step": 12640 }, { "epoch": 1.8335326303583723, "grad_norm": 1.1881543397903442, "learning_rate": 3.1669807218437456e-05, "loss": 0.6354, "step": 12650 }, { "epoch": 1.8349820632677465, "grad_norm": 0.7651578783988953, "learning_rate": 3.165531236411074e-05, "loss": 0.6374, "step": 12660 }, { "epoch": 1.8364314961771206, "grad_norm": 3.246687650680542, "learning_rate": 3.1640817509784025e-05, "loss": 0.6265, "step": 12670 }, { "epoch": 1.837880929086495, "grad_norm": 2.519076108932495, "learning_rate": 3.1626322655457316e-05, "loss": 0.6974, "step": 12680 }, { "epoch": 1.8393303619958692, "grad_norm": 1.0031216144561768, "learning_rate": 3.16118278011306e-05, "loss": 0.6395, "step": 12690 }, { "epoch": 1.8407797949052433, "grad_norm": 1.622070550918579, "learning_rate": 3.1597332946803885e-05, "loss": 0.6563, "step": 12700 }, { "epoch": 1.8422292278146175, "grad_norm": 0.7034159302711487, "learning_rate": 3.1582838092477176e-05, "loss": 0.6605, "step": 12710 }, { "epoch": 1.8436786607239917, "grad_norm": 1.7830655574798584, "learning_rate": 3.156834323815046e-05, "loss": 0.6996, "step": 12720 }, { "epoch": 1.8451280936333658, "grad_norm": 2.7417211532592773, "learning_rate": 3.1553848383823745e-05, "loss": 0.6474, "step": 12730 }, { "epoch": 1.84657752654274, "grad_norm": 2.5987696647644043, "learning_rate": 3.153935352949703e-05, "loss": 0.6479, "step": 12740 }, { "epoch": 1.8480269594521144, "grad_norm": 0.8672921657562256, "learning_rate": 3.152485867517031e-05, "loss": 0.7378, "step": 12750 }, { "epoch": 1.8494763923614885, "grad_norm": 1.1308648586273193, "learning_rate": 3.1510363820843605e-05, "loss": 0.6805, "step": 12760 }, { "epoch": 1.850925825270863, "grad_norm": 1.4739187955856323, "learning_rate": 3.149586896651689e-05, "loss": 0.6196, "step": 12770 }, { "epoch": 1.852375258180237, "grad_norm": 0.7275584936141968, "learning_rate": 3.148137411219017e-05, "loss": 0.7302, "step": 12780 }, { "epoch": 1.8538246910896112, "grad_norm": 1.3452956676483154, "learning_rate": 3.146687925786346e-05, "loss": 0.695, "step": 12790 }, { "epoch": 1.8552741239989854, "grad_norm": 2.195021152496338, "learning_rate": 3.145238440353674e-05, "loss": 0.6259, "step": 12800 }, { "epoch": 1.8567235569083596, "grad_norm": 2.028768539428711, "learning_rate": 3.143788954921003e-05, "loss": 0.6325, "step": 12810 }, { "epoch": 1.8581729898177337, "grad_norm": 1.9663366079330444, "learning_rate": 3.142339469488332e-05, "loss": 0.5774, "step": 12820 }, { "epoch": 1.8596224227271079, "grad_norm": 0.9267606139183044, "learning_rate": 3.14088998405566e-05, "loss": 0.7084, "step": 12830 }, { "epoch": 1.8610718556364823, "grad_norm": 0.6876948475837708, "learning_rate": 3.139440498622989e-05, "loss": 0.7126, "step": 12840 }, { "epoch": 1.8625212885458564, "grad_norm": 1.1228951215744019, "learning_rate": 3.137991013190318e-05, "loss": 0.6687, "step": 12850 }, { "epoch": 1.8639707214552308, "grad_norm": 3.0569639205932617, "learning_rate": 3.136541527757646e-05, "loss": 0.7034, "step": 12860 }, { "epoch": 1.865420154364605, "grad_norm": 2.0416345596313477, "learning_rate": 3.1350920423249746e-05, "loss": 0.7078, "step": 12870 }, { "epoch": 1.8668695872739791, "grad_norm": 2.680041551589966, "learning_rate": 3.133642556892303e-05, "loss": 0.7417, "step": 12880 }, { "epoch": 1.8683190201833533, "grad_norm": 1.1946463584899902, "learning_rate": 3.132193071459632e-05, "loss": 0.7307, "step": 12890 }, { "epoch": 1.8697684530927274, "grad_norm": 0.8287043571472168, "learning_rate": 3.1307435860269606e-05, "loss": 0.6699, "step": 12900 }, { "epoch": 1.8712178860021016, "grad_norm": 1.7087205648422241, "learning_rate": 3.12929410059429e-05, "loss": 0.6111, "step": 12910 }, { "epoch": 1.8726673189114758, "grad_norm": 1.4836465120315552, "learning_rate": 3.1278446151616175e-05, "loss": 0.7251, "step": 12920 }, { "epoch": 1.8741167518208501, "grad_norm": 0.8842166066169739, "learning_rate": 3.126395129728946e-05, "loss": 0.7151, "step": 12930 }, { "epoch": 1.8755661847302243, "grad_norm": 3.3071022033691406, "learning_rate": 3.124945644296275e-05, "loss": 0.6943, "step": 12940 }, { "epoch": 1.8770156176395985, "grad_norm": 1.8857395648956299, "learning_rate": 3.1234961588636034e-05, "loss": 0.6141, "step": 12950 }, { "epoch": 1.8784650505489728, "grad_norm": 1.0635484457015991, "learning_rate": 3.1220466734309326e-05, "loss": 0.6381, "step": 12960 }, { "epoch": 1.879914483458347, "grad_norm": 1.4398080110549927, "learning_rate": 3.120597187998261e-05, "loss": 0.633, "step": 12970 }, { "epoch": 1.8813639163677212, "grad_norm": 1.040791392326355, "learning_rate": 3.1191477025655894e-05, "loss": 0.6147, "step": 12980 }, { "epoch": 1.8828133492770953, "grad_norm": 0.9879858493804932, "learning_rate": 3.117698217132918e-05, "loss": 0.646, "step": 12990 }, { "epoch": 1.8842627821864695, "grad_norm": 0.8812539577484131, "learning_rate": 3.116248731700246e-05, "loss": 0.7175, "step": 13000 }, { "epoch": 1.8842627821864695, "eval_loss": 0.7349093556404114, "eval_runtime": 669.3439, "eval_samples_per_second": 51.536, "eval_steps_per_second": 2.577, "eval_token_accuracy": 0.00039856580646365007, "step": 13000 }, { "epoch": 1.8857122150958436, "grad_norm": 0.8519927263259888, "learning_rate": 3.1147992462675754e-05, "loss": 0.6899, "step": 13010 }, { "epoch": 1.8871616480052178, "grad_norm": 0.9121435880661011, "learning_rate": 3.113349760834904e-05, "loss": 0.6665, "step": 13020 }, { "epoch": 1.8886110809145922, "grad_norm": 0.9590705037117004, "learning_rate": 3.111900275402232e-05, "loss": 0.5601, "step": 13030 }, { "epoch": 1.8900605138239663, "grad_norm": 0.8972257375717163, "learning_rate": 3.1104507899695614e-05, "loss": 0.6744, "step": 13040 }, { "epoch": 1.8915099467333407, "grad_norm": 0.8012029528617859, "learning_rate": 3.109001304536889e-05, "loss": 0.6178, "step": 13050 }, { "epoch": 1.8929593796427149, "grad_norm": 0.9889801740646362, "learning_rate": 3.107551819104218e-05, "loss": 0.729, "step": 13060 }, { "epoch": 1.894408812552089, "grad_norm": 1.8423585891723633, "learning_rate": 3.106102333671547e-05, "loss": 0.6544, "step": 13070 }, { "epoch": 1.8958582454614632, "grad_norm": 2.015009880065918, "learning_rate": 3.104652848238875e-05, "loss": 0.6447, "step": 13080 }, { "epoch": 1.8973076783708374, "grad_norm": 2.1070494651794434, "learning_rate": 3.103203362806204e-05, "loss": 0.575, "step": 13090 }, { "epoch": 1.8987571112802115, "grad_norm": 0.9845415353775024, "learning_rate": 3.101753877373533e-05, "loss": 0.7011, "step": 13100 }, { "epoch": 1.9002065441895857, "grad_norm": 1.782501220703125, "learning_rate": 3.100304391940861e-05, "loss": 0.6614, "step": 13110 }, { "epoch": 1.90165597709896, "grad_norm": 1.965434193611145, "learning_rate": 3.0988549065081896e-05, "loss": 0.7378, "step": 13120 }, { "epoch": 1.9031054100083342, "grad_norm": 0.8427996635437012, "learning_rate": 3.097405421075518e-05, "loss": 0.6494, "step": 13130 }, { "epoch": 1.9045548429177086, "grad_norm": 0.9848141670227051, "learning_rate": 3.095955935642847e-05, "loss": 0.7511, "step": 13140 }, { "epoch": 1.9060042758270828, "grad_norm": 1.678484559059143, "learning_rate": 3.0945064502101756e-05, "loss": 0.6437, "step": 13150 }, { "epoch": 1.907453708736457, "grad_norm": 0.7144997715950012, "learning_rate": 3.093056964777504e-05, "loss": 0.6478, "step": 13160 }, { "epoch": 1.908903141645831, "grad_norm": 0.9617393612861633, "learning_rate": 3.091607479344833e-05, "loss": 0.6981, "step": 13170 }, { "epoch": 1.9103525745552052, "grad_norm": 0.8014298677444458, "learning_rate": 3.090157993912161e-05, "loss": 0.6234, "step": 13180 }, { "epoch": 1.9118020074645794, "grad_norm": 1.6113104820251465, "learning_rate": 3.08870850847949e-05, "loss": 0.6508, "step": 13190 }, { "epoch": 1.9132514403739536, "grad_norm": 0.8225398659706116, "learning_rate": 3.0872590230468184e-05, "loss": 0.637, "step": 13200 }, { "epoch": 1.914700873283328, "grad_norm": 0.7952432036399841, "learning_rate": 3.085809537614147e-05, "loss": 0.6801, "step": 13210 }, { "epoch": 1.916150306192702, "grad_norm": 2.7938709259033203, "learning_rate": 3.084360052181476e-05, "loss": 0.7336, "step": 13220 }, { "epoch": 1.9175997391020763, "grad_norm": 0.8850911259651184, "learning_rate": 3.0829105667488044e-05, "loss": 0.6048, "step": 13230 }, { "epoch": 1.9190491720114506, "grad_norm": 1.023543357849121, "learning_rate": 3.081461081316133e-05, "loss": 0.6227, "step": 13240 }, { "epoch": 1.9204986049208248, "grad_norm": 0.89097660779953, "learning_rate": 3.080011595883461e-05, "loss": 0.6683, "step": 13250 }, { "epoch": 1.921948037830199, "grad_norm": 0.8899220824241638, "learning_rate": 3.07856211045079e-05, "loss": 0.6979, "step": 13260 }, { "epoch": 1.9233974707395731, "grad_norm": 1.9883252382278442, "learning_rate": 3.077112625018119e-05, "loss": 0.5952, "step": 13270 }, { "epoch": 1.9248469036489473, "grad_norm": 2.1873536109924316, "learning_rate": 3.075663139585447e-05, "loss": 0.665, "step": 13280 }, { "epoch": 1.9262963365583214, "grad_norm": 0.7692016363143921, "learning_rate": 3.0742136541527764e-05, "loss": 0.6827, "step": 13290 }, { "epoch": 1.9277457694676956, "grad_norm": 3.5339086055755615, "learning_rate": 3.072764168720105e-05, "loss": 0.6847, "step": 13300 }, { "epoch": 1.92919520237707, "grad_norm": 2.340005397796631, "learning_rate": 3.0713146832874326e-05, "loss": 0.7517, "step": 13310 }, { "epoch": 1.9306446352864441, "grad_norm": 2.5109386444091797, "learning_rate": 3.069865197854762e-05, "loss": 0.604, "step": 13320 }, { "epoch": 1.9320940681958185, "grad_norm": 0.8669785261154175, "learning_rate": 3.06841571242209e-05, "loss": 0.7108, "step": 13330 }, { "epoch": 1.9335435011051927, "grad_norm": 1.7247940301895142, "learning_rate": 3.066966226989419e-05, "loss": 0.6894, "step": 13340 }, { "epoch": 1.9349929340145668, "grad_norm": 2.6877431869506836, "learning_rate": 3.0655167415567477e-05, "loss": 0.6121, "step": 13350 }, { "epoch": 1.936442366923941, "grad_norm": 0.8619291186332703, "learning_rate": 3.064067256124076e-05, "loss": 0.6968, "step": 13360 }, { "epoch": 1.9378917998333152, "grad_norm": 0.9239633679389954, "learning_rate": 3.0626177706914045e-05, "loss": 0.6755, "step": 13370 }, { "epoch": 1.9393412327426893, "grad_norm": 0.8451610207557678, "learning_rate": 3.061168285258733e-05, "loss": 0.7067, "step": 13380 }, { "epoch": 1.9407906656520635, "grad_norm": 2.7170891761779785, "learning_rate": 3.059718799826062e-05, "loss": 0.5995, "step": 13390 }, { "epoch": 1.9422400985614379, "grad_norm": 0.9185802340507507, "learning_rate": 3.0582693143933905e-05, "loss": 0.6857, "step": 13400 }, { "epoch": 1.943689531470812, "grad_norm": 0.7790818214416504, "learning_rate": 3.056819828960719e-05, "loss": 0.6072, "step": 13410 }, { "epoch": 1.9451389643801864, "grad_norm": 0.8623969554901123, "learning_rate": 3.055370343528048e-05, "loss": 0.7032, "step": 13420 }, { "epoch": 1.9465883972895606, "grad_norm": 1.7718230485916138, "learning_rate": 3.0539208580953765e-05, "loss": 0.6544, "step": 13430 }, { "epoch": 1.9480378301989347, "grad_norm": 1.7005128860473633, "learning_rate": 3.052471372662705e-05, "loss": 0.5819, "step": 13440 }, { "epoch": 1.9494872631083089, "grad_norm": 1.0943654775619507, "learning_rate": 3.0510218872300334e-05, "loss": 0.6557, "step": 13450 }, { "epoch": 1.950936696017683, "grad_norm": 1.9594976902008057, "learning_rate": 3.049572401797362e-05, "loss": 0.5739, "step": 13460 }, { "epoch": 1.9523861289270572, "grad_norm": 2.1289260387420654, "learning_rate": 3.0481229163646906e-05, "loss": 0.6473, "step": 13470 }, { "epoch": 1.9538355618364314, "grad_norm": 0.8309232592582703, "learning_rate": 3.0466734309320194e-05, "loss": 0.6649, "step": 13480 }, { "epoch": 1.9552849947458057, "grad_norm": 0.8766238689422607, "learning_rate": 3.045223945499348e-05, "loss": 0.6607, "step": 13490 }, { "epoch": 1.95673442765518, "grad_norm": 1.4982854127883911, "learning_rate": 3.0437744600666762e-05, "loss": 0.7224, "step": 13500 }, { "epoch": 1.958183860564554, "grad_norm": 1.3108932971954346, "learning_rate": 3.042324974634005e-05, "loss": 0.5613, "step": 13510 }, { "epoch": 1.9596332934739285, "grad_norm": 2.0841736793518066, "learning_rate": 3.0408754892013334e-05, "loss": 0.6751, "step": 13520 }, { "epoch": 1.9610827263833026, "grad_norm": 0.7184624671936035, "learning_rate": 3.0394260037686622e-05, "loss": 0.6308, "step": 13530 }, { "epoch": 1.9625321592926768, "grad_norm": 1.8600022792816162, "learning_rate": 3.037976518335991e-05, "loss": 0.5928, "step": 13540 }, { "epoch": 1.963981592202051, "grad_norm": 0.7624354362487793, "learning_rate": 3.0365270329033198e-05, "loss": 0.685, "step": 13550 }, { "epoch": 1.965431025111425, "grad_norm": 0.7610770463943481, "learning_rate": 3.0350775474706482e-05, "loss": 0.6679, "step": 13560 }, { "epoch": 1.9668804580207992, "grad_norm": 2.7660579681396484, "learning_rate": 3.0336280620379763e-05, "loss": 0.6375, "step": 13570 }, { "epoch": 1.9683298909301734, "grad_norm": 2.7836456298828125, "learning_rate": 3.032178576605305e-05, "loss": 0.673, "step": 13580 }, { "epoch": 1.9697793238395478, "grad_norm": 2.4783413410186768, "learning_rate": 3.030729091172634e-05, "loss": 0.6507, "step": 13590 }, { "epoch": 1.971228756748922, "grad_norm": 2.4852845668792725, "learning_rate": 3.0292796057399626e-05, "loss": 0.6564, "step": 13600 }, { "epoch": 1.9726781896582963, "grad_norm": 1.1195975542068481, "learning_rate": 3.027830120307291e-05, "loss": 0.6899, "step": 13610 }, { "epoch": 1.9741276225676705, "grad_norm": 0.7854732275009155, "learning_rate": 3.02638063487462e-05, "loss": 0.6372, "step": 13620 }, { "epoch": 1.9755770554770447, "grad_norm": 0.8562934994697571, "learning_rate": 3.024931149441948e-05, "loss": 0.6611, "step": 13630 }, { "epoch": 1.9770264883864188, "grad_norm": 2.820359468460083, "learning_rate": 3.0234816640092767e-05, "loss": 0.6989, "step": 13640 }, { "epoch": 1.978475921295793, "grad_norm": 0.9774487018585205, "learning_rate": 3.0220321785766055e-05, "loss": 0.6408, "step": 13650 }, { "epoch": 1.9799253542051671, "grad_norm": 1.5797113180160522, "learning_rate": 3.020582693143934e-05, "loss": 0.6087, "step": 13660 }, { "epoch": 1.9813747871145413, "grad_norm": 1.3290632963180542, "learning_rate": 3.0191332077112627e-05, "loss": 0.6909, "step": 13670 }, { "epoch": 1.9828242200239157, "grad_norm": 0.8466750979423523, "learning_rate": 3.0176837222785915e-05, "loss": 0.7907, "step": 13680 }, { "epoch": 1.9842736529332898, "grad_norm": 1.6860663890838623, "learning_rate": 3.0162342368459202e-05, "loss": 0.6878, "step": 13690 }, { "epoch": 1.9857230858426642, "grad_norm": 2.6741063594818115, "learning_rate": 3.0147847514132483e-05, "loss": 0.6498, "step": 13700 }, { "epoch": 1.9871725187520384, "grad_norm": 0.9091131091117859, "learning_rate": 3.0133352659805768e-05, "loss": 0.751, "step": 13710 }, { "epoch": 1.9886219516614125, "grad_norm": 0.8405702710151672, "learning_rate": 3.0118857805479055e-05, "loss": 0.6445, "step": 13720 }, { "epoch": 1.9900713845707867, "grad_norm": 0.7596121430397034, "learning_rate": 3.0104362951152343e-05, "loss": 0.5749, "step": 13730 }, { "epoch": 1.9915208174801609, "grad_norm": 1.1137197017669678, "learning_rate": 3.008986809682563e-05, "loss": 0.6756, "step": 13740 }, { "epoch": 1.992970250389535, "grad_norm": 0.8212170004844666, "learning_rate": 3.0075373242498915e-05, "loss": 0.7122, "step": 13750 }, { "epoch": 1.9944196832989092, "grad_norm": 2.4602837562561035, "learning_rate": 3.0060878388172196e-05, "loss": 0.7309, "step": 13760 }, { "epoch": 1.9958691162082836, "grad_norm": 2.3783586025238037, "learning_rate": 3.0046383533845484e-05, "loss": 0.7033, "step": 13770 }, { "epoch": 1.9973185491176577, "grad_norm": 1.690285086631775, "learning_rate": 3.0031888679518772e-05, "loss": 0.6742, "step": 13780 }, { "epoch": 1.9987679820270319, "grad_norm": 0.956028401851654, "learning_rate": 3.001739382519206e-05, "loss": 0.6831, "step": 13790 }, { "epoch": 2.0002174149364063, "grad_norm": 2.1426761150360107, "learning_rate": 3.0002898970865344e-05, "loss": 0.641, "step": 13800 }, { "epoch": 2.0016668478457804, "grad_norm": 2.594532012939453, "learning_rate": 2.998840411653863e-05, "loss": 0.5254, "step": 13810 }, { "epoch": 2.0031162807551546, "grad_norm": 3.246509075164795, "learning_rate": 2.997390926221192e-05, "loss": 0.6204, "step": 13820 }, { "epoch": 2.0045657136645287, "grad_norm": 2.1272997856140137, "learning_rate": 2.99594144078852e-05, "loss": 0.616, "step": 13830 }, { "epoch": 2.006015146573903, "grad_norm": 0.8702901005744934, "learning_rate": 2.9944919553558488e-05, "loss": 0.5416, "step": 13840 }, { "epoch": 2.007464579483277, "grad_norm": 0.8716585040092468, "learning_rate": 2.9930424699231772e-05, "loss": 0.545, "step": 13850 }, { "epoch": 2.008914012392651, "grad_norm": 1.0876643657684326, "learning_rate": 2.991592984490506e-05, "loss": 0.5743, "step": 13860 }, { "epoch": 2.0103634453020254, "grad_norm": 0.886386513710022, "learning_rate": 2.9901434990578348e-05, "loss": 0.5763, "step": 13870 }, { "epoch": 2.0118128782114, "grad_norm": 1.9063818454742432, "learning_rate": 2.9886940136251636e-05, "loss": 0.5437, "step": 13880 }, { "epoch": 2.013262311120774, "grad_norm": 3.7688114643096924, "learning_rate": 2.9872445281924917e-05, "loss": 0.7232, "step": 13890 }, { "epoch": 2.0147117440301483, "grad_norm": 1.1015434265136719, "learning_rate": 2.98579504275982e-05, "loss": 0.6314, "step": 13900 }, { "epoch": 2.0161611769395225, "grad_norm": 1.0932585000991821, "learning_rate": 2.984345557327149e-05, "loss": 0.6195, "step": 13910 }, { "epoch": 2.0176106098488966, "grad_norm": 1.782259225845337, "learning_rate": 2.9828960718944777e-05, "loss": 0.5268, "step": 13920 }, { "epoch": 2.019060042758271, "grad_norm": 0.9539967775344849, "learning_rate": 2.9814465864618064e-05, "loss": 0.6348, "step": 13930 }, { "epoch": 2.020509475667645, "grad_norm": 0.9660699367523193, "learning_rate": 2.979997101029135e-05, "loss": 0.6033, "step": 13940 }, { "epoch": 2.021958908577019, "grad_norm": 1.874058723449707, "learning_rate": 2.9785476155964636e-05, "loss": 0.5756, "step": 13950 }, { "epoch": 2.0234083414863933, "grad_norm": 2.6562726497650146, "learning_rate": 2.9770981301637917e-05, "loss": 0.543, "step": 13960 }, { "epoch": 2.024857774395768, "grad_norm": 2.959782600402832, "learning_rate": 2.9756486447311205e-05, "loss": 0.494, "step": 13970 }, { "epoch": 2.026307207305142, "grad_norm": 1.5290549993515015, "learning_rate": 2.9741991592984493e-05, "loss": 0.5822, "step": 13980 }, { "epoch": 2.027756640214516, "grad_norm": 1.205371379852295, "learning_rate": 2.9727496738657777e-05, "loss": 0.5649, "step": 13990 }, { "epoch": 2.0292060731238903, "grad_norm": 0.8969298601150513, "learning_rate": 2.9713001884331065e-05, "loss": 0.5747, "step": 14000 }, { "epoch": 2.0292060731238903, "eval_loss": 0.7397032380104065, "eval_runtime": 669.3889, "eval_samples_per_second": 51.532, "eval_steps_per_second": 2.577, "eval_token_accuracy": 0.00038907614440499173, "step": 14000 }, { "epoch": 2.0306555060332645, "grad_norm": 2.183821439743042, "learning_rate": 2.9698507030004353e-05, "loss": 0.6279, "step": 14010 }, { "epoch": 2.0321049389426387, "grad_norm": 1.117087960243225, "learning_rate": 2.9684012175677634e-05, "loss": 0.5683, "step": 14020 }, { "epoch": 2.033554371852013, "grad_norm": 1.8336340188980103, "learning_rate": 2.966951732135092e-05, "loss": 0.6125, "step": 14030 }, { "epoch": 2.035003804761387, "grad_norm": 2.2379143238067627, "learning_rate": 2.9655022467024206e-05, "loss": 0.5941, "step": 14040 }, { "epoch": 2.036453237670761, "grad_norm": 0.7575920224189758, "learning_rate": 2.9640527612697494e-05, "loss": 0.5903, "step": 14050 }, { "epoch": 2.0379026705801353, "grad_norm": 3.254852294921875, "learning_rate": 2.962603275837078e-05, "loss": 0.562, "step": 14060 }, { "epoch": 2.03935210348951, "grad_norm": 1.0880857706069946, "learning_rate": 2.961153790404407e-05, "loss": 0.5554, "step": 14070 }, { "epoch": 2.040801536398884, "grad_norm": 0.8083730340003967, "learning_rate": 2.9597043049717353e-05, "loss": 0.5702, "step": 14080 }, { "epoch": 2.0422509693082582, "grad_norm": 0.9211301207542419, "learning_rate": 2.9582548195390634e-05, "loss": 0.5294, "step": 14090 }, { "epoch": 2.0437004022176324, "grad_norm": 1.0826817750930786, "learning_rate": 2.9568053341063922e-05, "loss": 0.6142, "step": 14100 }, { "epoch": 2.0451498351270065, "grad_norm": 1.09013032913208, "learning_rate": 2.955355848673721e-05, "loss": 0.5591, "step": 14110 }, { "epoch": 2.0465992680363807, "grad_norm": 0.9100773334503174, "learning_rate": 2.9539063632410498e-05, "loss": 0.5169, "step": 14120 }, { "epoch": 2.048048700945755, "grad_norm": 2.1786623001098633, "learning_rate": 2.9524568778083782e-05, "loss": 0.6315, "step": 14130 }, { "epoch": 2.049498133855129, "grad_norm": 3.048058271408081, "learning_rate": 2.951007392375707e-05, "loss": 0.557, "step": 14140 }, { "epoch": 2.050947566764503, "grad_norm": 0.8104945421218872, "learning_rate": 2.949557906943035e-05, "loss": 0.5159, "step": 14150 }, { "epoch": 2.052396999673878, "grad_norm": 1.006663203239441, "learning_rate": 2.948108421510364e-05, "loss": 0.5531, "step": 14160 }, { "epoch": 2.053846432583252, "grad_norm": 0.8596798181533813, "learning_rate": 2.9466589360776926e-05, "loss": 0.6046, "step": 14170 }, { "epoch": 2.055295865492626, "grad_norm": 0.9660754203796387, "learning_rate": 2.945209450645021e-05, "loss": 0.5892, "step": 14180 }, { "epoch": 2.0567452984020003, "grad_norm": 0.7551504969596863, "learning_rate": 2.9437599652123498e-05, "loss": 0.5422, "step": 14190 }, { "epoch": 2.0581947313113744, "grad_norm": 1.2956832647323608, "learning_rate": 2.9423104797796786e-05, "loss": 0.5931, "step": 14200 }, { "epoch": 2.0596441642207486, "grad_norm": 2.2845637798309326, "learning_rate": 2.9408609943470067e-05, "loss": 0.5701, "step": 14210 }, { "epoch": 2.0610935971301227, "grad_norm": 0.7346026301383972, "learning_rate": 2.9394115089143355e-05, "loss": 0.5839, "step": 14220 }, { "epoch": 2.062543030039497, "grad_norm": 2.995973825454712, "learning_rate": 2.938106972024931e-05, "loss": 0.6129, "step": 14230 }, { "epoch": 2.063992462948871, "grad_norm": 0.8397359251976013, "learning_rate": 2.93665748659226e-05, "loss": 0.6201, "step": 14240 }, { "epoch": 2.0654418958582457, "grad_norm": 2.088355302810669, "learning_rate": 2.9352080011595883e-05, "loss": 0.6016, "step": 14250 }, { "epoch": 2.06689132876762, "grad_norm": 1.5870708227157593, "learning_rate": 2.933758515726917e-05, "loss": 0.6194, "step": 14260 }, { "epoch": 2.068340761676994, "grad_norm": 3.212609052658081, "learning_rate": 2.9323090302942458e-05, "loss": 0.6338, "step": 14270 }, { "epoch": 2.069790194586368, "grad_norm": 1.227840542793274, "learning_rate": 2.9308595448615746e-05, "loss": 0.5664, "step": 14280 }, { "epoch": 2.0712396274957423, "grad_norm": 1.0554555654525757, "learning_rate": 2.929410059428903e-05, "loss": 0.5276, "step": 14290 }, { "epoch": 2.0726890604051165, "grad_norm": 1.9810999631881714, "learning_rate": 2.927960573996231e-05, "loss": 0.5682, "step": 14300 }, { "epoch": 2.0741384933144906, "grad_norm": 1.1422438621520996, "learning_rate": 2.92651108856356e-05, "loss": 0.5827, "step": 14310 }, { "epoch": 2.075587926223865, "grad_norm": 0.8738481402397156, "learning_rate": 2.9250616031308887e-05, "loss": 0.6125, "step": 14320 }, { "epoch": 2.077037359133239, "grad_norm": 0.8904774785041809, "learning_rate": 2.9236121176982175e-05, "loss": 0.595, "step": 14330 }, { "epoch": 2.0784867920426136, "grad_norm": 0.7666566967964172, "learning_rate": 2.922162632265546e-05, "loss": 0.5833, "step": 14340 }, { "epoch": 2.0799362249519877, "grad_norm": 2.4014010429382324, "learning_rate": 2.9207131468328747e-05, "loss": 0.5472, "step": 14350 }, { "epoch": 2.081385657861362, "grad_norm": 2.999542474746704, "learning_rate": 2.9192636614002028e-05, "loss": 0.5587, "step": 14360 }, { "epoch": 2.082835090770736, "grad_norm": 1.0062235593795776, "learning_rate": 2.9178141759675315e-05, "loss": 0.4953, "step": 14370 }, { "epoch": 2.08428452368011, "grad_norm": 2.3883652687072754, "learning_rate": 2.9163646905348603e-05, "loss": 0.5312, "step": 14380 }, { "epoch": 2.0857339565894844, "grad_norm": 3.436647653579712, "learning_rate": 2.9149152051021888e-05, "loss": 0.6025, "step": 14390 }, { "epoch": 2.0871833894988585, "grad_norm": 2.230759620666504, "learning_rate": 2.9134657196695175e-05, "loss": 0.5443, "step": 14400 }, { "epoch": 2.0886328224082327, "grad_norm": 1.5165539979934692, "learning_rate": 2.9120162342368463e-05, "loss": 0.5124, "step": 14410 }, { "epoch": 2.090082255317607, "grad_norm": 1.5038182735443115, "learning_rate": 2.9105667488041747e-05, "loss": 0.5254, "step": 14420 }, { "epoch": 2.091531688226981, "grad_norm": 1.835642695426941, "learning_rate": 2.9091172633715032e-05, "loss": 0.5832, "step": 14430 }, { "epoch": 2.0929811211363556, "grad_norm": 2.8495612144470215, "learning_rate": 2.9076677779388316e-05, "loss": 0.5343, "step": 14440 }, { "epoch": 2.0944305540457298, "grad_norm": 2.3448574542999268, "learning_rate": 2.9062182925061604e-05, "loss": 0.5667, "step": 14450 }, { "epoch": 2.095879986955104, "grad_norm": 1.0450005531311035, "learning_rate": 2.904768807073489e-05, "loss": 0.5853, "step": 14460 }, { "epoch": 2.097329419864478, "grad_norm": 2.5745956897735596, "learning_rate": 2.903319321640818e-05, "loss": 0.591, "step": 14470 }, { "epoch": 2.0987788527738522, "grad_norm": 1.0149450302124023, "learning_rate": 2.9018698362081464e-05, "loss": 0.6152, "step": 14480 }, { "epoch": 2.1002282856832264, "grad_norm": 1.0218358039855957, "learning_rate": 2.9004203507754745e-05, "loss": 0.537, "step": 14490 }, { "epoch": 2.1016777185926006, "grad_norm": 1.4043501615524292, "learning_rate": 2.8989708653428032e-05, "loss": 0.5272, "step": 14500 }, { "epoch": 2.1031271515019747, "grad_norm": 1.0450868606567383, "learning_rate": 2.897521379910132e-05, "loss": 0.5758, "step": 14510 }, { "epoch": 2.104576584411349, "grad_norm": 2.720461368560791, "learning_rate": 2.8960718944774608e-05, "loss": 0.5151, "step": 14520 }, { "epoch": 2.1060260173207235, "grad_norm": 2.295283317565918, "learning_rate": 2.8946224090447892e-05, "loss": 0.5786, "step": 14530 }, { "epoch": 2.1074754502300976, "grad_norm": 0.9478172063827515, "learning_rate": 2.893172923612118e-05, "loss": 0.5736, "step": 14540 }, { "epoch": 2.108924883139472, "grad_norm": 2.118621826171875, "learning_rate": 2.8917234381794468e-05, "loss": 0.5304, "step": 14550 }, { "epoch": 2.110374316048846, "grad_norm": 1.0907704830169678, "learning_rate": 2.890273952746775e-05, "loss": 0.5746, "step": 14560 }, { "epoch": 2.11182374895822, "grad_norm": 2.393841505050659, "learning_rate": 2.8888244673141036e-05, "loss": 0.5995, "step": 14570 }, { "epoch": 2.1132731818675943, "grad_norm": 2.9934537410736084, "learning_rate": 2.887374981881432e-05, "loss": 0.5894, "step": 14580 }, { "epoch": 2.1147226147769684, "grad_norm": 3.190861940383911, "learning_rate": 2.885925496448761e-05, "loss": 0.6324, "step": 14590 }, { "epoch": 2.1161720476863426, "grad_norm": 0.9476354122161865, "learning_rate": 2.8844760110160896e-05, "loss": 0.5397, "step": 14600 }, { "epoch": 2.1176214805957168, "grad_norm": 0.9767410755157471, "learning_rate": 2.883026525583418e-05, "loss": 0.5659, "step": 14610 }, { "epoch": 2.119070913505091, "grad_norm": 0.9902337193489075, "learning_rate": 2.8815770401507465e-05, "loss": 0.5553, "step": 14620 }, { "epoch": 2.1205203464144655, "grad_norm": 0.8071011900901794, "learning_rate": 2.880127554718075e-05, "loss": 0.5573, "step": 14630 }, { "epoch": 2.1219697793238397, "grad_norm": 1.1467477083206177, "learning_rate": 2.8786780692854037e-05, "loss": 0.6328, "step": 14640 }, { "epoch": 2.123419212233214, "grad_norm": 1.8961005210876465, "learning_rate": 2.8772285838527325e-05, "loss": 0.6178, "step": 14650 }, { "epoch": 2.124868645142588, "grad_norm": 0.9489744305610657, "learning_rate": 2.8757790984200613e-05, "loss": 0.5989, "step": 14660 }, { "epoch": 2.126318078051962, "grad_norm": 0.7456279397010803, "learning_rate": 2.8743296129873897e-05, "loss": 0.5063, "step": 14670 }, { "epoch": 2.1277675109613363, "grad_norm": 0.9865498542785645, "learning_rate": 2.8728801275547185e-05, "loss": 0.5289, "step": 14680 }, { "epoch": 2.1292169438707105, "grad_norm": 1.7938251495361328, "learning_rate": 2.8714306421220466e-05, "loss": 0.5128, "step": 14690 }, { "epoch": 2.1306663767800846, "grad_norm": 1.2648547887802124, "learning_rate": 2.8699811566893753e-05, "loss": 0.6091, "step": 14700 }, { "epoch": 2.132115809689459, "grad_norm": 1.101527214050293, "learning_rate": 2.868531671256704e-05, "loss": 0.5869, "step": 14710 }, { "epoch": 2.1335652425988334, "grad_norm": 1.024398922920227, "learning_rate": 2.8670821858240326e-05, "loss": 0.535, "step": 14720 }, { "epoch": 2.1350146755082076, "grad_norm": 0.8312330842018127, "learning_rate": 2.8656327003913613e-05, "loss": 0.571, "step": 14730 }, { "epoch": 2.1364641084175817, "grad_norm": 0.7251666784286499, "learning_rate": 2.86418321495869e-05, "loss": 0.5798, "step": 14740 }, { "epoch": 2.137913541326956, "grad_norm": 4.033049583435059, "learning_rate": 2.8627337295260182e-05, "loss": 0.5914, "step": 14750 }, { "epoch": 2.13936297423633, "grad_norm": 0.8272391557693481, "learning_rate": 2.861284244093347e-05, "loss": 0.553, "step": 14760 }, { "epoch": 2.140812407145704, "grad_norm": 2.1350879669189453, "learning_rate": 2.8598347586606754e-05, "loss": 0.5949, "step": 14770 }, { "epoch": 2.1422618400550784, "grad_norm": 1.17877197265625, "learning_rate": 2.8583852732280042e-05, "loss": 0.6239, "step": 14780 }, { "epoch": 2.1437112729644525, "grad_norm": 3.4440340995788574, "learning_rate": 2.856935787795333e-05, "loss": 0.5632, "step": 14790 }, { "epoch": 2.1451607058738267, "grad_norm": 2.5987493991851807, "learning_rate": 2.8554863023626614e-05, "loss": 0.609, "step": 14800 }, { "epoch": 2.146610138783201, "grad_norm": 1.0455831289291382, "learning_rate": 2.8540368169299902e-05, "loss": 0.6558, "step": 14810 }, { "epoch": 2.1480595716925754, "grad_norm": 1.111206293106079, "learning_rate": 2.8525873314973183e-05, "loss": 0.5465, "step": 14820 }, { "epoch": 2.1495090046019496, "grad_norm": 1.54293692111969, "learning_rate": 2.851137846064647e-05, "loss": 0.5884, "step": 14830 }, { "epoch": 2.1509584375113238, "grad_norm": 0.9174100160598755, "learning_rate": 2.8496883606319758e-05, "loss": 0.5876, "step": 14840 }, { "epoch": 2.152407870420698, "grad_norm": 0.9613193869590759, "learning_rate": 2.8482388751993046e-05, "loss": 0.5343, "step": 14850 }, { "epoch": 2.153857303330072, "grad_norm": 0.9370414018630981, "learning_rate": 2.846789389766633e-05, "loss": 0.5604, "step": 14860 }, { "epoch": 2.1553067362394462, "grad_norm": 0.9880764484405518, "learning_rate": 2.8453399043339618e-05, "loss": 0.5512, "step": 14870 }, { "epoch": 2.1567561691488204, "grad_norm": 0.9697717428207397, "learning_rate": 2.84389041890129e-05, "loss": 0.5975, "step": 14880 }, { "epoch": 2.1582056020581946, "grad_norm": 2.6026132106781006, "learning_rate": 2.8424409334686187e-05, "loss": 0.5156, "step": 14890 }, { "epoch": 2.159655034967569, "grad_norm": 1.054138422012329, "learning_rate": 2.8409914480359475e-05, "loss": 0.568, "step": 14900 }, { "epoch": 2.1611044678769433, "grad_norm": 1.9634565114974976, "learning_rate": 2.839541962603276e-05, "loss": 0.4952, "step": 14910 }, { "epoch": 2.1625539007863175, "grad_norm": 2.1717028617858887, "learning_rate": 2.8380924771706047e-05, "loss": 0.5208, "step": 14920 }, { "epoch": 2.1640033336956916, "grad_norm": 1.046442985534668, "learning_rate": 2.8366429917379334e-05, "loss": 0.5488, "step": 14930 }, { "epoch": 2.165452766605066, "grad_norm": 0.7042348980903625, "learning_rate": 2.835193506305262e-05, "loss": 0.5599, "step": 14940 }, { "epoch": 2.16690219951444, "grad_norm": 0.8977163434028625, "learning_rate": 2.8337440208725903e-05, "loss": 0.611, "step": 14950 }, { "epoch": 2.168351632423814, "grad_norm": 3.2048516273498535, "learning_rate": 2.8322945354399187e-05, "loss": 0.5933, "step": 14960 }, { "epoch": 2.1698010653331883, "grad_norm": 2.3574509620666504, "learning_rate": 2.8308450500072475e-05, "loss": 0.5766, "step": 14970 }, { "epoch": 2.1712504982425624, "grad_norm": 1.741523265838623, "learning_rate": 2.8293955645745763e-05, "loss": 0.5783, "step": 14980 }, { "epoch": 2.1726999311519366, "grad_norm": 1.014177680015564, "learning_rate": 2.827946079141905e-05, "loss": 0.5648, "step": 14990 }, { "epoch": 2.174149364061311, "grad_norm": 0.8169074058532715, "learning_rate": 2.8264965937092335e-05, "loss": 0.6333, "step": 15000 }, { "epoch": 2.174149364061311, "eval_loss": 0.7357287406921387, "eval_runtime": 671.8324, "eval_samples_per_second": 51.345, "eval_steps_per_second": 2.568, "eval_token_accuracy": 0.00039937001511268895, "step": 15000 }, { "epoch": 2.1755987969706854, "grad_norm": 1.113046407699585, "learning_rate": 2.8250471082765616e-05, "loss": 0.5788, "step": 15010 }, { "epoch": 2.1770482298800595, "grad_norm": 2.7498059272766113, "learning_rate": 2.8235976228438904e-05, "loss": 0.5749, "step": 15020 }, { "epoch": 2.1784976627894337, "grad_norm": 0.8708633780479431, "learning_rate": 2.822148137411219e-05, "loss": 0.5504, "step": 15030 }, { "epoch": 2.179947095698808, "grad_norm": 1.1141761541366577, "learning_rate": 2.820698651978548e-05, "loss": 0.4918, "step": 15040 }, { "epoch": 2.181396528608182, "grad_norm": 1.0088549852371216, "learning_rate": 2.8192491665458764e-05, "loss": 0.5798, "step": 15050 }, { "epoch": 2.182845961517556, "grad_norm": 0.924612283706665, "learning_rate": 2.817799681113205e-05, "loss": 0.5411, "step": 15060 }, { "epoch": 2.1842953944269303, "grad_norm": 3.6293270587921143, "learning_rate": 2.816350195680534e-05, "loss": 0.5907, "step": 15070 }, { "epoch": 2.1857448273363045, "grad_norm": 1.6444721221923828, "learning_rate": 2.814900710247862e-05, "loss": 0.5934, "step": 15080 }, { "epoch": 2.187194260245679, "grad_norm": 0.8702876567840576, "learning_rate": 2.8134512248151908e-05, "loss": 0.5257, "step": 15090 }, { "epoch": 2.1886436931550532, "grad_norm": 1.1607176065444946, "learning_rate": 2.8120017393825192e-05, "loss": 0.6226, "step": 15100 }, { "epoch": 2.1900931260644274, "grad_norm": 0.8363760113716125, "learning_rate": 2.810552253949848e-05, "loss": 0.555, "step": 15110 }, { "epoch": 2.1915425589738016, "grad_norm": 0.882907509803772, "learning_rate": 2.8091027685171768e-05, "loss": 0.5416, "step": 15120 }, { "epoch": 2.1929919918831757, "grad_norm": 1.025321364402771, "learning_rate": 2.8076532830845052e-05, "loss": 0.5778, "step": 15130 }, { "epoch": 2.19444142479255, "grad_norm": 1.1927074193954468, "learning_rate": 2.8062037976518336e-05, "loss": 0.4824, "step": 15140 }, { "epoch": 2.195890857701924, "grad_norm": 2.4487786293029785, "learning_rate": 2.804754312219162e-05, "loss": 0.5273, "step": 15150 }, { "epoch": 2.197340290611298, "grad_norm": 0.8211236596107483, "learning_rate": 2.803304826786491e-05, "loss": 0.5425, "step": 15160 }, { "epoch": 2.1987897235206724, "grad_norm": 1.0167851448059082, "learning_rate": 2.8018553413538196e-05, "loss": 0.5707, "step": 15170 }, { "epoch": 2.2002391564300465, "grad_norm": 0.7350836396217346, "learning_rate": 2.8004058559211484e-05, "loss": 0.5831, "step": 15180 }, { "epoch": 2.201688589339421, "grad_norm": 1.0720088481903076, "learning_rate": 2.798956370488477e-05, "loss": 0.6097, "step": 15190 }, { "epoch": 2.2031380222487953, "grad_norm": 2.9851224422454834, "learning_rate": 2.797506885055805e-05, "loss": 0.6407, "step": 15200 }, { "epoch": 2.2045874551581695, "grad_norm": 2.150064706802368, "learning_rate": 2.7960573996231337e-05, "loss": 0.565, "step": 15210 }, { "epoch": 2.2060368880675436, "grad_norm": 2.035189151763916, "learning_rate": 2.7946079141904625e-05, "loss": 0.5112, "step": 15220 }, { "epoch": 2.2074863209769178, "grad_norm": 1.1495574712753296, "learning_rate": 2.7931584287577913e-05, "loss": 0.5142, "step": 15230 }, { "epoch": 2.208935753886292, "grad_norm": 0.8203104734420776, "learning_rate": 2.7917089433251197e-05, "loss": 0.569, "step": 15240 }, { "epoch": 2.210385186795666, "grad_norm": 0.7035321593284607, "learning_rate": 2.7902594578924485e-05, "loss": 0.5688, "step": 15250 }, { "epoch": 2.2118346197050403, "grad_norm": 0.8323057889938354, "learning_rate": 2.7888099724597772e-05, "loss": 0.515, "step": 15260 }, { "epoch": 2.2132840526144144, "grad_norm": 2.930710792541504, "learning_rate": 2.7873604870271053e-05, "loss": 0.5306, "step": 15270 }, { "epoch": 2.214733485523789, "grad_norm": 0.955631673336029, "learning_rate": 2.785911001594434e-05, "loss": 0.6126, "step": 15280 }, { "epoch": 2.216182918433163, "grad_norm": 2.184109926223755, "learning_rate": 2.7844615161617626e-05, "loss": 0.5994, "step": 15290 }, { "epoch": 2.2176323513425373, "grad_norm": 1.508414626121521, "learning_rate": 2.7830120307290913e-05, "loss": 0.5375, "step": 15300 }, { "epoch": 2.2190817842519115, "grad_norm": 2.4226200580596924, "learning_rate": 2.78156254529642e-05, "loss": 0.567, "step": 15310 }, { "epoch": 2.2205312171612857, "grad_norm": 0.8827345371246338, "learning_rate": 2.7801130598637485e-05, "loss": 0.5822, "step": 15320 }, { "epoch": 2.22198065007066, "grad_norm": 2.3095834255218506, "learning_rate": 2.778663574431077e-05, "loss": 0.5613, "step": 15330 }, { "epoch": 2.223430082980034, "grad_norm": 3.297713279724121, "learning_rate": 2.7772140889984054e-05, "loss": 0.5968, "step": 15340 }, { "epoch": 2.224879515889408, "grad_norm": 3.122631788253784, "learning_rate": 2.7757646035657342e-05, "loss": 0.5367, "step": 15350 }, { "epoch": 2.2263289487987823, "grad_norm": 0.7298904061317444, "learning_rate": 2.774315118133063e-05, "loss": 0.5646, "step": 15360 }, { "epoch": 2.2277783817081565, "grad_norm": 2.29826283454895, "learning_rate": 2.7728656327003917e-05, "loss": 0.5341, "step": 15370 }, { "epoch": 2.229227814617531, "grad_norm": 3.6760542392730713, "learning_rate": 2.7714161472677202e-05, "loss": 0.5884, "step": 15380 }, { "epoch": 2.230677247526905, "grad_norm": 1.860216736793518, "learning_rate": 2.769966661835049e-05, "loss": 0.6191, "step": 15390 }, { "epoch": 2.2321266804362794, "grad_norm": 1.4394019842147827, "learning_rate": 2.768517176402377e-05, "loss": 0.5397, "step": 15400 }, { "epoch": 2.2335761133456535, "grad_norm": 0.8453474044799805, "learning_rate": 2.7670676909697058e-05, "loss": 0.5987, "step": 15410 }, { "epoch": 2.2350255462550277, "grad_norm": 0.797804594039917, "learning_rate": 2.7656182055370346e-05, "loss": 0.5442, "step": 15420 }, { "epoch": 2.236474979164402, "grad_norm": 0.9788017868995667, "learning_rate": 2.764168720104363e-05, "loss": 0.5397, "step": 15430 }, { "epoch": 2.237924412073776, "grad_norm": 1.2205966711044312, "learning_rate": 2.7627192346716918e-05, "loss": 0.5546, "step": 15440 }, { "epoch": 2.23937384498315, "grad_norm": 0.9380795359611511, "learning_rate": 2.7612697492390206e-05, "loss": 0.5002, "step": 15450 }, { "epoch": 2.240823277892525, "grad_norm": 0.9293228983879089, "learning_rate": 2.7598202638063487e-05, "loss": 0.5216, "step": 15460 }, { "epoch": 2.242272710801899, "grad_norm": 3.3147010803222656, "learning_rate": 2.7583707783736774e-05, "loss": 0.5185, "step": 15470 }, { "epoch": 2.243722143711273, "grad_norm": 2.7040462493896484, "learning_rate": 2.756921292941006e-05, "loss": 0.5597, "step": 15480 }, { "epoch": 2.2451715766206473, "grad_norm": 1.8201930522918701, "learning_rate": 2.7554718075083347e-05, "loss": 0.5687, "step": 15490 }, { "epoch": 2.2466210095300214, "grad_norm": 2.6963207721710205, "learning_rate": 2.7540223220756634e-05, "loss": 0.548, "step": 15500 }, { "epoch": 2.2480704424393956, "grad_norm": 2.536465644836426, "learning_rate": 2.752572836642992e-05, "loss": 0.5446, "step": 15510 }, { "epoch": 2.2495198753487697, "grad_norm": 1.1007113456726074, "learning_rate": 2.7511233512103206e-05, "loss": 0.5694, "step": 15520 }, { "epoch": 2.250969308258144, "grad_norm": 0.933037519454956, "learning_rate": 2.7496738657776487e-05, "loss": 0.5369, "step": 15530 }, { "epoch": 2.252418741167518, "grad_norm": 1.0118540525436401, "learning_rate": 2.7482243803449775e-05, "loss": 0.6256, "step": 15540 }, { "epoch": 2.253868174076892, "grad_norm": 1.152823805809021, "learning_rate": 2.7467748949123063e-05, "loss": 0.6073, "step": 15550 }, { "epoch": 2.2553176069862664, "grad_norm": 1.694582462310791, "learning_rate": 2.745325409479635e-05, "loss": 0.5662, "step": 15560 }, { "epoch": 2.256767039895641, "grad_norm": 1.02156400680542, "learning_rate": 2.7438759240469635e-05, "loss": 0.5219, "step": 15570 }, { "epoch": 2.258216472805015, "grad_norm": 1.1287500858306885, "learning_rate": 2.7424264386142923e-05, "loss": 0.5136, "step": 15580 }, { "epoch": 2.2596659057143893, "grad_norm": 3.2117209434509277, "learning_rate": 2.7409769531816204e-05, "loss": 0.536, "step": 15590 }, { "epoch": 2.2611153386237635, "grad_norm": 2.113884925842285, "learning_rate": 2.739527467748949e-05, "loss": 0.6149, "step": 15600 }, { "epoch": 2.2625647715331376, "grad_norm": 0.9687091708183289, "learning_rate": 2.738077982316278e-05, "loss": 0.5503, "step": 15610 }, { "epoch": 2.264014204442512, "grad_norm": 2.7099952697753906, "learning_rate": 2.7366284968836064e-05, "loss": 0.5276, "step": 15620 }, { "epoch": 2.265463637351886, "grad_norm": 3.803027629852295, "learning_rate": 2.735179011450935e-05, "loss": 0.5589, "step": 15630 }, { "epoch": 2.26691307026126, "grad_norm": 3.0070250034332275, "learning_rate": 2.733729526018264e-05, "loss": 0.6067, "step": 15640 }, { "epoch": 2.2683625031706347, "grad_norm": 1.7724757194519043, "learning_rate": 2.7322800405855923e-05, "loss": 0.6155, "step": 15650 }, { "epoch": 2.269811936080009, "grad_norm": 0.9404800534248352, "learning_rate": 2.7308305551529208e-05, "loss": 0.522, "step": 15660 }, { "epoch": 2.271261368989383, "grad_norm": 1.0227422714233398, "learning_rate": 2.7293810697202492e-05, "loss": 0.582, "step": 15670 }, { "epoch": 2.272710801898757, "grad_norm": 1.8441566228866577, "learning_rate": 2.727931584287578e-05, "loss": 0.6357, "step": 15680 }, { "epoch": 2.2741602348081313, "grad_norm": 1.2445653676986694, "learning_rate": 2.7264820988549068e-05, "loss": 0.5996, "step": 15690 }, { "epoch": 2.2756096677175055, "grad_norm": 1.0026644468307495, "learning_rate": 2.7250326134222352e-05, "loss": 0.5768, "step": 15700 }, { "epoch": 2.2770591006268797, "grad_norm": 1.053460717201233, "learning_rate": 2.723583127989564e-05, "loss": 0.5642, "step": 15710 }, { "epoch": 2.278508533536254, "grad_norm": 0.7311908006668091, "learning_rate": 2.722133642556892e-05, "loss": 0.5459, "step": 15720 }, { "epoch": 2.279957966445628, "grad_norm": 0.796303927898407, "learning_rate": 2.720684157124221e-05, "loss": 0.5471, "step": 15730 }, { "epoch": 2.281407399355002, "grad_norm": 1.0040631294250488, "learning_rate": 2.7192346716915496e-05, "loss": 0.5486, "step": 15740 }, { "epoch": 2.2828568322643767, "grad_norm": 0.8615463376045227, "learning_rate": 2.7177851862588784e-05, "loss": 0.5496, "step": 15750 }, { "epoch": 2.284306265173751, "grad_norm": 2.0899055004119873, "learning_rate": 2.716335700826207e-05, "loss": 0.553, "step": 15760 }, { "epoch": 2.285755698083125, "grad_norm": 1.0100775957107544, "learning_rate": 2.7148862153935356e-05, "loss": 0.5986, "step": 15770 }, { "epoch": 2.2872051309924992, "grad_norm": 2.7385852336883545, "learning_rate": 2.7134367299608644e-05, "loss": 0.5775, "step": 15780 }, { "epoch": 2.2886545639018734, "grad_norm": 1.0600306987762451, "learning_rate": 2.7119872445281925e-05, "loss": 0.5824, "step": 15790 }, { "epoch": 2.2901039968112475, "grad_norm": 2.653311014175415, "learning_rate": 2.7105377590955213e-05, "loss": 0.6391, "step": 15800 }, { "epoch": 2.2915534297206217, "grad_norm": 2.0966145992279053, "learning_rate": 2.7090882736628497e-05, "loss": 0.5532, "step": 15810 }, { "epoch": 2.293002862629996, "grad_norm": 1.9514657258987427, "learning_rate": 2.7076387882301785e-05, "loss": 0.5264, "step": 15820 }, { "epoch": 2.2944522955393705, "grad_norm": 0.8025791049003601, "learning_rate": 2.7061893027975072e-05, "loss": 0.5974, "step": 15830 }, { "epoch": 2.2959017284487446, "grad_norm": 0.8524109125137329, "learning_rate": 2.7047398173648357e-05, "loss": 0.4788, "step": 15840 }, { "epoch": 2.297351161358119, "grad_norm": 0.9747781157493591, "learning_rate": 2.703290331932164e-05, "loss": 0.5226, "step": 15850 }, { "epoch": 2.298800594267493, "grad_norm": 1.0575156211853027, "learning_rate": 2.7018408464994925e-05, "loss": 0.5671, "step": 15860 }, { "epoch": 2.300250027176867, "grad_norm": 1.6193407773971558, "learning_rate": 2.7003913610668213e-05, "loss": 0.5991, "step": 15870 }, { "epoch": 2.3016994600862413, "grad_norm": 2.8316709995269775, "learning_rate": 2.69894187563415e-05, "loss": 0.5654, "step": 15880 }, { "epoch": 2.3031488929956154, "grad_norm": 2.5655696392059326, "learning_rate": 2.6974923902014785e-05, "loss": 0.5776, "step": 15890 }, { "epoch": 2.3045983259049896, "grad_norm": 0.8109912276268005, "learning_rate": 2.6960429047688073e-05, "loss": 0.5703, "step": 15900 }, { "epoch": 2.3060477588143637, "grad_norm": 1.0215001106262207, "learning_rate": 2.694593419336136e-05, "loss": 0.6276, "step": 15910 }, { "epoch": 2.307497191723738, "grad_norm": 0.8743044137954712, "learning_rate": 2.6931439339034642e-05, "loss": 0.5279, "step": 15920 }, { "epoch": 2.308946624633112, "grad_norm": 3.493328809738159, "learning_rate": 2.691694448470793e-05, "loss": 0.5984, "step": 15930 }, { "epoch": 2.3103960575424867, "grad_norm": 0.842359185218811, "learning_rate": 2.6902449630381217e-05, "loss": 0.5373, "step": 15940 }, { "epoch": 2.311845490451861, "grad_norm": 0.8587477207183838, "learning_rate": 2.68879547760545e-05, "loss": 0.6291, "step": 15950 }, { "epoch": 2.313294923361235, "grad_norm": 1.6115646362304688, "learning_rate": 2.687345992172779e-05, "loss": 0.5439, "step": 15960 }, { "epoch": 2.314744356270609, "grad_norm": 1.8800400495529175, "learning_rate": 2.6858965067401077e-05, "loss": 0.5898, "step": 15970 }, { "epoch": 2.3161937891799833, "grad_norm": 1.8140876293182373, "learning_rate": 2.6844470213074358e-05, "loss": 0.6311, "step": 15980 }, { "epoch": 2.3176432220893575, "grad_norm": 0.8107464909553528, "learning_rate": 2.6829975358747646e-05, "loss": 0.5671, "step": 15990 }, { "epoch": 2.3190926549987316, "grad_norm": 2.7339236736297607, "learning_rate": 2.681548050442093e-05, "loss": 0.5767, "step": 16000 }, { "epoch": 2.3190926549987316, "eval_loss": 0.7302293181419373, "eval_runtime": 671.5101, "eval_samples_per_second": 51.369, "eval_steps_per_second": 2.569, "eval_token_accuracy": 0.0004098247275501939, "step": 16000 }, { "epoch": 2.320542087908106, "grad_norm": 0.9360762238502502, "learning_rate": 2.6800985650094218e-05, "loss": 0.5811, "step": 16010 }, { "epoch": 2.3219915208174804, "grad_norm": 1.0557974576950073, "learning_rate": 2.6786490795767506e-05, "loss": 0.5832, "step": 16020 }, { "epoch": 2.3234409537268546, "grad_norm": 0.8322176337242126, "learning_rate": 2.677199594144079e-05, "loss": 0.5846, "step": 16030 }, { "epoch": 2.3248903866362287, "grad_norm": 1.0784193277359009, "learning_rate": 2.6757501087114078e-05, "loss": 0.5706, "step": 16040 }, { "epoch": 2.326339819545603, "grad_norm": 1.1414878368377686, "learning_rate": 2.674300623278736e-05, "loss": 0.615, "step": 16050 }, { "epoch": 2.327789252454977, "grad_norm": 1.0786707401275635, "learning_rate": 2.6728511378460647e-05, "loss": 0.5238, "step": 16060 }, { "epoch": 2.329238685364351, "grad_norm": 2.1784827709198, "learning_rate": 2.6714016524133934e-05, "loss": 0.5548, "step": 16070 }, { "epoch": 2.3306881182737254, "grad_norm": 1.0003679990768433, "learning_rate": 2.6699521669807222e-05, "loss": 0.5994, "step": 16080 }, { "epoch": 2.3321375511830995, "grad_norm": 0.9344300627708435, "learning_rate": 2.6685026815480506e-05, "loss": 0.5763, "step": 16090 }, { "epoch": 2.3335869840924737, "grad_norm": 0.8955085277557373, "learning_rate": 2.6670531961153794e-05, "loss": 0.5758, "step": 16100 }, { "epoch": 2.335036417001848, "grad_norm": 0.9898104667663574, "learning_rate": 2.6656037106827075e-05, "loss": 0.5956, "step": 16110 }, { "epoch": 2.336485849911222, "grad_norm": 0.795774519443512, "learning_rate": 2.6641542252500363e-05, "loss": 0.5701, "step": 16120 }, { "epoch": 2.3379352828205966, "grad_norm": 0.7755706906318665, "learning_rate": 2.662704739817365e-05, "loss": 0.5359, "step": 16130 }, { "epoch": 2.3393847157299708, "grad_norm": 0.9177207946777344, "learning_rate": 2.6612552543846935e-05, "loss": 0.5405, "step": 16140 }, { "epoch": 2.340834148639345, "grad_norm": 0.9761675000190735, "learning_rate": 2.6598057689520223e-05, "loss": 0.6509, "step": 16150 }, { "epoch": 2.342283581548719, "grad_norm": 1.5821987390518188, "learning_rate": 2.658356283519351e-05, "loss": 0.5317, "step": 16160 }, { "epoch": 2.3437330144580932, "grad_norm": 1.0644711256027222, "learning_rate": 2.656906798086679e-05, "loss": 0.5593, "step": 16170 }, { "epoch": 2.3451824473674674, "grad_norm": 0.7431843280792236, "learning_rate": 2.655457312654008e-05, "loss": 0.5443, "step": 16180 }, { "epoch": 2.3466318802768416, "grad_norm": 1.9291008710861206, "learning_rate": 2.6540078272213364e-05, "loss": 0.5501, "step": 16190 }, { "epoch": 2.3480813131862157, "grad_norm": 1.1686913967132568, "learning_rate": 2.652558341788665e-05, "loss": 0.5686, "step": 16200 }, { "epoch": 2.3495307460955903, "grad_norm": 0.8081609606742859, "learning_rate": 2.651108856355994e-05, "loss": 0.5372, "step": 16210 }, { "epoch": 2.3509801790049645, "grad_norm": 2.6243176460266113, "learning_rate": 2.6496593709233223e-05, "loss": 0.5796, "step": 16220 }, { "epoch": 2.3524296119143386, "grad_norm": 2.3926889896392822, "learning_rate": 2.648209885490651e-05, "loss": 0.5658, "step": 16230 }, { "epoch": 2.353879044823713, "grad_norm": 0.7738615870475769, "learning_rate": 2.6467604000579792e-05, "loss": 0.5407, "step": 16240 }, { "epoch": 2.355328477733087, "grad_norm": 1.307656168937683, "learning_rate": 2.645310914625308e-05, "loss": 0.4653, "step": 16250 }, { "epoch": 2.356777910642461, "grad_norm": 3.530689001083374, "learning_rate": 2.6438614291926368e-05, "loss": 0.5606, "step": 16260 }, { "epoch": 2.3582273435518353, "grad_norm": 3.2263526916503906, "learning_rate": 2.6424119437599655e-05, "loss": 0.5717, "step": 16270 }, { "epoch": 2.3596767764612094, "grad_norm": 1.393611192703247, "learning_rate": 2.640962458327294e-05, "loss": 0.6187, "step": 16280 }, { "epoch": 2.3611262093705836, "grad_norm": 1.1633098125457764, "learning_rate": 2.6395129728946227e-05, "loss": 0.6338, "step": 16290 }, { "epoch": 2.3625756422799578, "grad_norm": 1.472760796546936, "learning_rate": 2.638063487461951e-05, "loss": 0.5886, "step": 16300 }, { "epoch": 2.3640250751893324, "grad_norm": 0.9567728042602539, "learning_rate": 2.6366140020292796e-05, "loss": 0.5278, "step": 16310 }, { "epoch": 2.3654745080987065, "grad_norm": 1.845215916633606, "learning_rate": 2.6351645165966084e-05, "loss": 0.6091, "step": 16320 }, { "epoch": 2.3669239410080807, "grad_norm": 2.0079545974731445, "learning_rate": 2.6337150311639368e-05, "loss": 0.6078, "step": 16330 }, { "epoch": 2.368373373917455, "grad_norm": 3.1055665016174316, "learning_rate": 2.6322655457312656e-05, "loss": 0.5967, "step": 16340 }, { "epoch": 2.369822806826829, "grad_norm": 2.2276206016540527, "learning_rate": 2.6308160602985944e-05, "loss": 0.546, "step": 16350 }, { "epoch": 2.371272239736203, "grad_norm": 1.090536117553711, "learning_rate": 2.6293665748659228e-05, "loss": 0.5972, "step": 16360 }, { "epoch": 2.3727216726455773, "grad_norm": 3.283720016479492, "learning_rate": 2.6279170894332513e-05, "loss": 0.5906, "step": 16370 }, { "epoch": 2.3741711055549515, "grad_norm": 1.0914537906646729, "learning_rate": 2.6264676040005797e-05, "loss": 0.6173, "step": 16380 }, { "epoch": 2.375620538464326, "grad_norm": 1.635926604270935, "learning_rate": 2.6250181185679085e-05, "loss": 0.6039, "step": 16390 }, { "epoch": 2.3770699713737002, "grad_norm": 0.960096001625061, "learning_rate": 2.6235686331352372e-05, "loss": 0.6001, "step": 16400 }, { "epoch": 2.3785194042830744, "grad_norm": 0.8694259524345398, "learning_rate": 2.6221191477025657e-05, "loss": 0.5376, "step": 16410 }, { "epoch": 2.3799688371924486, "grad_norm": 0.9353076815605164, "learning_rate": 2.6206696622698944e-05, "loss": 0.6184, "step": 16420 }, { "epoch": 2.3814182701018227, "grad_norm": 2.265793800354004, "learning_rate": 2.6192201768372225e-05, "loss": 0.5653, "step": 16430 }, { "epoch": 2.382867703011197, "grad_norm": 0.8421527147293091, "learning_rate": 2.6177706914045513e-05, "loss": 0.6106, "step": 16440 }, { "epoch": 2.384317135920571, "grad_norm": 1.8163224458694458, "learning_rate": 2.61632120597188e-05, "loss": 0.519, "step": 16450 }, { "epoch": 2.385766568829945, "grad_norm": 2.589510679244995, "learning_rate": 2.614871720539209e-05, "loss": 0.629, "step": 16460 }, { "epoch": 2.3872160017393194, "grad_norm": 0.9429428577423096, "learning_rate": 2.6134222351065373e-05, "loss": 0.616, "step": 16470 }, { "epoch": 2.3886654346486935, "grad_norm": 1.0219968557357788, "learning_rate": 2.611972749673866e-05, "loss": 0.5683, "step": 16480 }, { "epoch": 2.3901148675580677, "grad_norm": 0.883083701133728, "learning_rate": 2.610523264241195e-05, "loss": 0.5917, "step": 16490 }, { "epoch": 2.3915643004674423, "grad_norm": 2.5757100582122803, "learning_rate": 2.609073778808523e-05, "loss": 0.5856, "step": 16500 }, { "epoch": 2.3930137333768164, "grad_norm": 2.343125581741333, "learning_rate": 2.6076242933758517e-05, "loss": 0.5585, "step": 16510 }, { "epoch": 2.3944631662861906, "grad_norm": 2.2022042274475098, "learning_rate": 2.60617480794318e-05, "loss": 0.5742, "step": 16520 }, { "epoch": 2.3959125991955648, "grad_norm": 1.8321778774261475, "learning_rate": 2.604725322510509e-05, "loss": 0.6063, "step": 16530 }, { "epoch": 2.397362032104939, "grad_norm": 3.5156893730163574, "learning_rate": 2.6032758370778377e-05, "loss": 0.5967, "step": 16540 }, { "epoch": 2.398811465014313, "grad_norm": 0.9662124514579773, "learning_rate": 2.601826351645166e-05, "loss": 0.5819, "step": 16550 }, { "epoch": 2.4002608979236872, "grad_norm": 2.2844319343566895, "learning_rate": 2.6003768662124946e-05, "loss": 0.5899, "step": 16560 }, { "epoch": 2.4017103308330614, "grad_norm": 1.8318297863006592, "learning_rate": 2.598927380779823e-05, "loss": 0.5363, "step": 16570 }, { "epoch": 2.403159763742436, "grad_norm": 0.8952181935310364, "learning_rate": 2.5974778953471518e-05, "loss": 0.5589, "step": 16580 }, { "epoch": 2.40460919665181, "grad_norm": 0.9220480918884277, "learning_rate": 2.5960284099144806e-05, "loss": 0.5619, "step": 16590 }, { "epoch": 2.4060586295611843, "grad_norm": 1.3954976797103882, "learning_rate": 2.594578924481809e-05, "loss": 0.5288, "step": 16600 }, { "epoch": 2.4075080624705585, "grad_norm": 0.9671636819839478, "learning_rate": 2.5931294390491378e-05, "loss": 0.5749, "step": 16610 }, { "epoch": 2.4089574953799326, "grad_norm": 2.2830398082733154, "learning_rate": 2.5916799536164666e-05, "loss": 0.5306, "step": 16620 }, { "epoch": 2.410406928289307, "grad_norm": 2.176110029220581, "learning_rate": 2.5902304681837947e-05, "loss": 0.5812, "step": 16630 }, { "epoch": 2.411856361198681, "grad_norm": 0.8190110325813293, "learning_rate": 2.5887809827511234e-05, "loss": 0.5436, "step": 16640 }, { "epoch": 2.413305794108055, "grad_norm": 1.3935105800628662, "learning_rate": 2.5873314973184522e-05, "loss": 0.5721, "step": 16650 }, { "epoch": 2.4147552270174293, "grad_norm": 1.1035131216049194, "learning_rate": 2.5858820118857806e-05, "loss": 0.6168, "step": 16660 }, { "epoch": 2.4162046599268034, "grad_norm": 2.679525136947632, "learning_rate": 2.5844325264531094e-05, "loss": 0.6231, "step": 16670 }, { "epoch": 2.4176540928361776, "grad_norm": 0.9634791016578674, "learning_rate": 2.5829830410204382e-05, "loss": 0.5816, "step": 16680 }, { "epoch": 2.419103525745552, "grad_norm": 1.4482085704803467, "learning_rate": 2.5815335555877663e-05, "loss": 0.5483, "step": 16690 }, { "epoch": 2.4205529586549264, "grad_norm": 0.7800090312957764, "learning_rate": 2.580084070155095e-05, "loss": 0.524, "step": 16700 }, { "epoch": 2.4220023915643005, "grad_norm": 0.8343473076820374, "learning_rate": 2.5786345847224235e-05, "loss": 0.5098, "step": 16710 }, { "epoch": 2.4234518244736747, "grad_norm": 3.29060435295105, "learning_rate": 2.5771850992897523e-05, "loss": 0.588, "step": 16720 }, { "epoch": 2.424901257383049, "grad_norm": 0.890139639377594, "learning_rate": 2.575735613857081e-05, "loss": 0.6131, "step": 16730 }, { "epoch": 2.426350690292423, "grad_norm": 2.1215193271636963, "learning_rate": 2.5742861284244095e-05, "loss": 0.5887, "step": 16740 }, { "epoch": 2.427800123201797, "grad_norm": 1.007075548171997, "learning_rate": 2.5728366429917383e-05, "loss": 0.5369, "step": 16750 }, { "epoch": 2.4292495561111713, "grad_norm": 2.85379695892334, "learning_rate": 2.5713871575590663e-05, "loss": 0.5485, "step": 16760 }, { "epoch": 2.430698989020546, "grad_norm": 1.8373109102249146, "learning_rate": 2.569937672126395e-05, "loss": 0.5603, "step": 16770 }, { "epoch": 2.43214842192992, "grad_norm": 1.6939905881881714, "learning_rate": 2.568488186693724e-05, "loss": 0.5318, "step": 16780 }, { "epoch": 2.4335978548392942, "grad_norm": 1.1219695806503296, "learning_rate": 2.5670387012610523e-05, "loss": 0.5429, "step": 16790 }, { "epoch": 2.4350472877486684, "grad_norm": 3.929163932800293, "learning_rate": 2.565589215828381e-05, "loss": 0.5971, "step": 16800 }, { "epoch": 2.4364967206580426, "grad_norm": 2.685321807861328, "learning_rate": 2.56413973039571e-05, "loss": 0.6142, "step": 16810 }, { "epoch": 2.4379461535674167, "grad_norm": 2.1034326553344727, "learning_rate": 2.562690244963038e-05, "loss": 0.6629, "step": 16820 }, { "epoch": 2.439395586476791, "grad_norm": 3.248325824737549, "learning_rate": 2.5612407595303668e-05, "loss": 0.5933, "step": 16830 }, { "epoch": 2.440845019386165, "grad_norm": 0.8918882012367249, "learning_rate": 2.5597912740976955e-05, "loss": 0.608, "step": 16840 }, { "epoch": 2.442294452295539, "grad_norm": 2.837606906890869, "learning_rate": 2.558341788665024e-05, "loss": 0.51, "step": 16850 }, { "epoch": 2.4437438852049134, "grad_norm": 2.903970241546631, "learning_rate": 2.5568923032323527e-05, "loss": 0.6183, "step": 16860 }, { "epoch": 2.445193318114288, "grad_norm": 0.7252295017242432, "learning_rate": 2.5554428177996815e-05, "loss": 0.5576, "step": 16870 }, { "epoch": 2.446642751023662, "grad_norm": 2.373798370361328, "learning_rate": 2.55399333236701e-05, "loss": 0.5976, "step": 16880 }, { "epoch": 2.4480921839330363, "grad_norm": 0.8515360355377197, "learning_rate": 2.5525438469343384e-05, "loss": 0.576, "step": 16890 }, { "epoch": 2.4495416168424105, "grad_norm": 1.0345165729522705, "learning_rate": 2.5510943615016668e-05, "loss": 0.5842, "step": 16900 }, { "epoch": 2.4509910497517846, "grad_norm": 0.9085811972618103, "learning_rate": 2.5496448760689956e-05, "loss": 0.5532, "step": 16910 }, { "epoch": 2.4524404826611588, "grad_norm": 2.3138504028320312, "learning_rate": 2.5481953906363244e-05, "loss": 0.5709, "step": 16920 }, { "epoch": 2.453889915570533, "grad_norm": 2.1807808876037598, "learning_rate": 2.5467459052036528e-05, "loss": 0.5493, "step": 16930 }, { "epoch": 2.455339348479907, "grad_norm": 0.8590166568756104, "learning_rate": 2.5452964197709816e-05, "loss": 0.5581, "step": 16940 }, { "epoch": 2.4567887813892813, "grad_norm": 1.3868404626846313, "learning_rate": 2.5438469343383097e-05, "loss": 0.5393, "step": 16950 }, { "epoch": 2.458238214298656, "grad_norm": 2.7364912033081055, "learning_rate": 2.5423974489056385e-05, "loss": 0.5478, "step": 16960 }, { "epoch": 2.45968764720803, "grad_norm": 0.6827089786529541, "learning_rate": 2.5409479634729672e-05, "loss": 0.59, "step": 16970 }, { "epoch": 2.461137080117404, "grad_norm": 0.8788642287254333, "learning_rate": 2.5394984780402957e-05, "loss": 0.5632, "step": 16980 }, { "epoch": 2.4625865130267783, "grad_norm": 1.083719253540039, "learning_rate": 2.5380489926076244e-05, "loss": 0.5878, "step": 16990 }, { "epoch": 2.4640359459361525, "grad_norm": 0.7993260025978088, "learning_rate": 2.5365995071749532e-05, "loss": 0.5351, "step": 17000 }, { "epoch": 2.4640359459361525, "eval_loss": 0.7283534407615662, "eval_runtime": 670.6485, "eval_samples_per_second": 51.435, "eval_steps_per_second": 2.572, "eval_token_accuracy": 0.00039727907262518793, "step": 17000 }, { "epoch": 2.4654853788455267, "grad_norm": 2.387397527694702, "learning_rate": 2.535150021742282e-05, "loss": 0.5472, "step": 17010 }, { "epoch": 2.466934811754901, "grad_norm": 2.854649305343628, "learning_rate": 2.53370053630961e-05, "loss": 0.5452, "step": 17020 }, { "epoch": 2.468384244664275, "grad_norm": 1.1013128757476807, "learning_rate": 2.532251050876939e-05, "loss": 0.6453, "step": 17030 }, { "epoch": 2.469833677573649, "grad_norm": 0.925757110118866, "learning_rate": 2.5308015654442673e-05, "loss": 0.5083, "step": 17040 }, { "epoch": 2.4712831104830233, "grad_norm": 0.8227369785308838, "learning_rate": 2.529352080011596e-05, "loss": 0.5315, "step": 17050 }, { "epoch": 2.472732543392398, "grad_norm": 2.7134206295013428, "learning_rate": 2.527902594578925e-05, "loss": 0.548, "step": 17060 }, { "epoch": 2.474181976301772, "grad_norm": 1.0173161029815674, "learning_rate": 2.5264531091462533e-05, "loss": 0.5493, "step": 17070 }, { "epoch": 2.475631409211146, "grad_norm": 1.1902835369110107, "learning_rate": 2.5250036237135817e-05, "loss": 0.5379, "step": 17080 }, { "epoch": 2.4770808421205204, "grad_norm": 2.2522265911102295, "learning_rate": 2.52355413828091e-05, "loss": 0.5814, "step": 17090 }, { "epoch": 2.4785302750298945, "grad_norm": 1.12079918384552, "learning_rate": 2.522104652848239e-05, "loss": 0.578, "step": 17100 }, { "epoch": 2.4799797079392687, "grad_norm": 1.099763035774231, "learning_rate": 2.5206551674155677e-05, "loss": 0.5816, "step": 17110 }, { "epoch": 2.481429140848643, "grad_norm": 2.820364475250244, "learning_rate": 2.519205681982896e-05, "loss": 0.5926, "step": 17120 }, { "epoch": 2.482878573758017, "grad_norm": 1.0384719371795654, "learning_rate": 2.517756196550225e-05, "loss": 0.6269, "step": 17130 }, { "epoch": 2.4843280066673916, "grad_norm": 0.8136078715324402, "learning_rate": 2.516306711117553e-05, "loss": 0.5686, "step": 17140 }, { "epoch": 2.485777439576766, "grad_norm": 1.984144926071167, "learning_rate": 2.5148572256848818e-05, "loss": 0.5829, "step": 17150 }, { "epoch": 2.48722687248614, "grad_norm": 1.0443241596221924, "learning_rate": 2.5134077402522106e-05, "loss": 0.5972, "step": 17160 }, { "epoch": 2.488676305395514, "grad_norm": 2.3265018463134766, "learning_rate": 2.5119582548195393e-05, "loss": 0.5548, "step": 17170 }, { "epoch": 2.4901257383048883, "grad_norm": 2.045551061630249, "learning_rate": 2.5105087693868678e-05, "loss": 0.5481, "step": 17180 }, { "epoch": 2.4915751712142624, "grad_norm": 2.1172616481781006, "learning_rate": 2.5090592839541965e-05, "loss": 0.6316, "step": 17190 }, { "epoch": 2.4930246041236366, "grad_norm": 2.111454963684082, "learning_rate": 2.5076097985215253e-05, "loss": 0.5581, "step": 17200 }, { "epoch": 2.4944740370330107, "grad_norm": 0.8427057266235352, "learning_rate": 2.5061603130888534e-05, "loss": 0.5431, "step": 17210 }, { "epoch": 2.495923469942385, "grad_norm": 1.2653619050979614, "learning_rate": 2.5047108276561822e-05, "loss": 0.5838, "step": 17220 }, { "epoch": 2.497372902851759, "grad_norm": 0.9283726215362549, "learning_rate": 2.5032613422235106e-05, "loss": 0.6017, "step": 17230 }, { "epoch": 2.498822335761133, "grad_norm": 3.0021955966949463, "learning_rate": 2.5018118567908394e-05, "loss": 0.5924, "step": 17240 }, { "epoch": 2.500271768670508, "grad_norm": 0.9704183340072632, "learning_rate": 2.5003623713581682e-05, "loss": 0.6268, "step": 17250 }, { "epoch": 2.501721201579882, "grad_norm": 1.6761257648468018, "learning_rate": 2.4989128859254966e-05, "loss": 0.527, "step": 17260 }, { "epoch": 2.503170634489256, "grad_norm": 0.9880573153495789, "learning_rate": 2.497463400492825e-05, "loss": 0.5573, "step": 17270 }, { "epoch": 2.5046200673986303, "grad_norm": 0.8700310587882996, "learning_rate": 2.4960139150601538e-05, "loss": 0.6175, "step": 17280 }, { "epoch": 2.5060695003080045, "grad_norm": 2.353101968765259, "learning_rate": 2.4945644296274823e-05, "loss": 0.5717, "step": 17290 }, { "epoch": 2.5075189332173786, "grad_norm": 0.9893775582313538, "learning_rate": 2.493114944194811e-05, "loss": 0.5915, "step": 17300 }, { "epoch": 2.508968366126753, "grad_norm": 1.2474043369293213, "learning_rate": 2.4916654587621395e-05, "loss": 0.5724, "step": 17310 }, { "epoch": 2.5104177990361274, "grad_norm": 1.757016658782959, "learning_rate": 2.490215973329468e-05, "loss": 0.6423, "step": 17320 }, { "epoch": 2.5118672319455015, "grad_norm": 2.3559505939483643, "learning_rate": 2.4887664878967967e-05, "loss": 0.6315, "step": 17330 }, { "epoch": 2.5133166648548757, "grad_norm": 2.5416176319122314, "learning_rate": 2.4873170024641255e-05, "loss": 0.5803, "step": 17340 }, { "epoch": 2.51476609776425, "grad_norm": 0.9849221110343933, "learning_rate": 2.485867517031454e-05, "loss": 0.5358, "step": 17350 }, { "epoch": 2.516215530673624, "grad_norm": 1.1437593698501587, "learning_rate": 2.4844180315987827e-05, "loss": 0.6097, "step": 17360 }, { "epoch": 2.517664963582998, "grad_norm": 1.940948486328125, "learning_rate": 2.482968546166111e-05, "loss": 0.5221, "step": 17370 }, { "epoch": 2.5191143964923723, "grad_norm": 1.1979233026504517, "learning_rate": 2.48151906073344e-05, "loss": 0.5467, "step": 17380 }, { "epoch": 2.5205638294017465, "grad_norm": 0.8096486330032349, "learning_rate": 2.4800695753007683e-05, "loss": 0.6043, "step": 17390 }, { "epoch": 2.5220132623111207, "grad_norm": 1.0167595148086548, "learning_rate": 2.478620089868097e-05, "loss": 0.5643, "step": 17400 }, { "epoch": 2.523462695220495, "grad_norm": 0.9136523008346558, "learning_rate": 2.4771706044354255e-05, "loss": 0.5803, "step": 17410 }, { "epoch": 2.524912128129869, "grad_norm": 0.9761309027671814, "learning_rate": 2.475721119002754e-05, "loss": 0.5619, "step": 17420 }, { "epoch": 2.526361561039243, "grad_norm": 0.8645803928375244, "learning_rate": 2.4742716335700827e-05, "loss": 0.5925, "step": 17430 }, { "epoch": 2.5278109939486177, "grad_norm": 1.0616345405578613, "learning_rate": 2.4728221481374115e-05, "loss": 0.5879, "step": 17440 }, { "epoch": 2.529260426857992, "grad_norm": 1.0399123430252075, "learning_rate": 2.47137266270474e-05, "loss": 0.5642, "step": 17450 }, { "epoch": 2.530709859767366, "grad_norm": 1.3098094463348389, "learning_rate": 2.4699231772720684e-05, "loss": 0.5524, "step": 17460 }, { "epoch": 2.5321592926767402, "grad_norm": 2.086995840072632, "learning_rate": 2.468473691839397e-05, "loss": 0.5656, "step": 17470 }, { "epoch": 2.5336087255861144, "grad_norm": 0.8559460043907166, "learning_rate": 2.4670242064067256e-05, "loss": 0.5693, "step": 17480 }, { "epoch": 2.5350581584954885, "grad_norm": 1.3102972507476807, "learning_rate": 2.4655747209740544e-05, "loss": 0.6378, "step": 17490 }, { "epoch": 2.5365075914048627, "grad_norm": 2.715803623199463, "learning_rate": 2.4641252355413828e-05, "loss": 0.589, "step": 17500 }, { "epoch": 2.5379570243142373, "grad_norm": 1.1338856220245361, "learning_rate": 2.4626757501087116e-05, "loss": 0.573, "step": 17510 }, { "epoch": 2.5394064572236115, "grad_norm": 1.6766473054885864, "learning_rate": 2.46122626467604e-05, "loss": 0.5306, "step": 17520 }, { "epoch": 2.5408558901329856, "grad_norm": 1.0871402025222778, "learning_rate": 2.4597767792433688e-05, "loss": 0.5491, "step": 17530 }, { "epoch": 2.54230532304236, "grad_norm": 1.2214182615280151, "learning_rate": 2.4583272938106976e-05, "loss": 0.6273, "step": 17540 }, { "epoch": 2.543754755951734, "grad_norm": 1.6520607471466064, "learning_rate": 2.456877808378026e-05, "loss": 0.6235, "step": 17550 }, { "epoch": 2.545204188861108, "grad_norm": 2.7489123344421387, "learning_rate": 2.4554283229453544e-05, "loss": 0.5907, "step": 17560 }, { "epoch": 2.5466536217704823, "grad_norm": 1.1337274312973022, "learning_rate": 2.4539788375126832e-05, "loss": 0.5336, "step": 17570 }, { "epoch": 2.5481030546798564, "grad_norm": 1.1375880241394043, "learning_rate": 2.4525293520800116e-05, "loss": 0.5142, "step": 17580 }, { "epoch": 2.5495524875892306, "grad_norm": 0.8471434712409973, "learning_rate": 2.4510798666473404e-05, "loss": 0.5173, "step": 17590 }, { "epoch": 2.5510019204986047, "grad_norm": 0.8759787678718567, "learning_rate": 2.449630381214669e-05, "loss": 0.5195, "step": 17600 }, { "epoch": 2.552451353407979, "grad_norm": 0.8988997936248779, "learning_rate": 2.4481808957819973e-05, "loss": 0.5688, "step": 17610 }, { "epoch": 2.553900786317353, "grad_norm": 2.0469727516174316, "learning_rate": 2.446731410349326e-05, "loss": 0.5593, "step": 17620 }, { "epoch": 2.5553502192267277, "grad_norm": 1.4637449979782104, "learning_rate": 2.445281924916655e-05, "loss": 0.5109, "step": 17630 }, { "epoch": 2.556799652136102, "grad_norm": 1.2323262691497803, "learning_rate": 2.4438324394839833e-05, "loss": 0.6227, "step": 17640 }, { "epoch": 2.558249085045476, "grad_norm": 4.2845940589904785, "learning_rate": 2.4423829540513117e-05, "loss": 0.5726, "step": 17650 }, { "epoch": 2.55969851795485, "grad_norm": 0.966750979423523, "learning_rate": 2.4409334686186405e-05, "loss": 0.629, "step": 17660 }, { "epoch": 2.5611479508642243, "grad_norm": 2.271350145339966, "learning_rate": 2.4394839831859693e-05, "loss": 0.6151, "step": 17670 }, { "epoch": 2.5625973837735985, "grad_norm": 0.9974566698074341, "learning_rate": 2.4380344977532977e-05, "loss": 0.4867, "step": 17680 }, { "epoch": 2.5640468166829726, "grad_norm": 1.0083458423614502, "learning_rate": 2.436585012320626e-05, "loss": 0.531, "step": 17690 }, { "epoch": 2.5654962495923472, "grad_norm": 1.5749166011810303, "learning_rate": 2.435135526887955e-05, "loss": 0.5425, "step": 17700 }, { "epoch": 2.5669456825017214, "grad_norm": 1.1030534505844116, "learning_rate": 2.4336860414552833e-05, "loss": 0.5484, "step": 17710 }, { "epoch": 2.5683951154110956, "grad_norm": 1.7335549592971802, "learning_rate": 2.432236556022612e-05, "loss": 0.6168, "step": 17720 }, { "epoch": 2.5698445483204697, "grad_norm": 1.9986166954040527, "learning_rate": 2.430787070589941e-05, "loss": 0.5892, "step": 17730 }, { "epoch": 2.571293981229844, "grad_norm": 0.8699630498886108, "learning_rate": 2.4293375851572693e-05, "loss": 0.472, "step": 17740 }, { "epoch": 2.572743414139218, "grad_norm": 0.9094012379646301, "learning_rate": 2.4278880997245978e-05, "loss": 0.5479, "step": 17750 }, { "epoch": 2.574192847048592, "grad_norm": 1.0403581857681274, "learning_rate": 2.4264386142919265e-05, "loss": 0.5751, "step": 17760 }, { "epoch": 2.5756422799579664, "grad_norm": 2.722165822982788, "learning_rate": 2.424989128859255e-05, "loss": 0.5864, "step": 17770 }, { "epoch": 2.5770917128673405, "grad_norm": 1.9678312540054321, "learning_rate": 2.4235396434265838e-05, "loss": 0.6081, "step": 17780 }, { "epoch": 2.5785411457767147, "grad_norm": 0.9112073183059692, "learning_rate": 2.4220901579939122e-05, "loss": 0.5357, "step": 17790 }, { "epoch": 2.579990578686089, "grad_norm": 2.1713218688964844, "learning_rate": 2.420640672561241e-05, "loss": 0.5997, "step": 17800 }, { "epoch": 2.5814400115954634, "grad_norm": 1.1149449348449707, "learning_rate": 2.4191911871285694e-05, "loss": 0.5664, "step": 17810 }, { "epoch": 2.5828894445048376, "grad_norm": 2.689030885696411, "learning_rate": 2.4177417016958982e-05, "loss": 0.5555, "step": 17820 }, { "epoch": 2.5843388774142118, "grad_norm": 0.8640225529670715, "learning_rate": 2.4162922162632266e-05, "loss": 0.5226, "step": 17830 }, { "epoch": 2.585788310323586, "grad_norm": 2.8747336864471436, "learning_rate": 2.414842730830555e-05, "loss": 0.5797, "step": 17840 }, { "epoch": 2.58723774323296, "grad_norm": 1.1739059686660767, "learning_rate": 2.4133932453978838e-05, "loss": 0.5475, "step": 17850 }, { "epoch": 2.5886871761423342, "grad_norm": 1.213647723197937, "learning_rate": 2.4119437599652126e-05, "loss": 0.5722, "step": 17860 }, { "epoch": 2.5901366090517084, "grad_norm": 1.078586459159851, "learning_rate": 2.410494274532541e-05, "loss": 0.5696, "step": 17870 }, { "epoch": 2.591586041961083, "grad_norm": 0.7184905409812927, "learning_rate": 2.4090447890998695e-05, "loss": 0.5729, "step": 17880 }, { "epoch": 2.593035474870457, "grad_norm": 1.021785020828247, "learning_rate": 2.4075953036671982e-05, "loss": 0.6097, "step": 17890 }, { "epoch": 2.5944849077798313, "grad_norm": 2.1129040718078613, "learning_rate": 2.4061458182345267e-05, "loss": 0.5692, "step": 17900 }, { "epoch": 2.5959343406892055, "grad_norm": 2.2645630836486816, "learning_rate": 2.4046963328018555e-05, "loss": 0.5428, "step": 17910 }, { "epoch": 2.5973837735985796, "grad_norm": 2.277709484100342, "learning_rate": 2.4032468473691842e-05, "loss": 0.5822, "step": 17920 }, { "epoch": 2.598833206507954, "grad_norm": 2.4047906398773193, "learning_rate": 2.4017973619365127e-05, "loss": 0.5544, "step": 17930 }, { "epoch": 2.600282639417328, "grad_norm": 1.217633605003357, "learning_rate": 2.400347876503841e-05, "loss": 0.5927, "step": 17940 }, { "epoch": 2.601732072326702, "grad_norm": 1.1137984991073608, "learning_rate": 2.39889839107117e-05, "loss": 0.5649, "step": 17950 }, { "epoch": 2.6031815052360763, "grad_norm": 2.8685598373413086, "learning_rate": 2.3974489056384987e-05, "loss": 0.63, "step": 17960 }, { "epoch": 2.6046309381454504, "grad_norm": 2.009880781173706, "learning_rate": 2.395999420205827e-05, "loss": 0.6152, "step": 17970 }, { "epoch": 2.6060803710548246, "grad_norm": 2.669762134552002, "learning_rate": 2.3945499347731555e-05, "loss": 0.63, "step": 17980 }, { "epoch": 2.6075298039641988, "grad_norm": 4.854875564575195, "learning_rate": 2.3931004493404843e-05, "loss": 0.6127, "step": 17990 }, { "epoch": 2.6089792368735734, "grad_norm": 0.7523910403251648, "learning_rate": 2.3916509639078127e-05, "loss": 0.567, "step": 18000 }, { "epoch": 2.6089792368735734, "eval_loss": 0.7176042795181274, "eval_runtime": 670.7992, "eval_samples_per_second": 51.424, "eval_steps_per_second": 2.572, "eval_token_accuracy": 0.00039052371997326166, "step": 18000 }, { "epoch": 2.6104286697829475, "grad_norm": 1.8868845701217651, "learning_rate": 2.3902014784751415e-05, "loss": 0.5227, "step": 18010 }, { "epoch": 2.6118781026923217, "grad_norm": 3.6070969104766846, "learning_rate": 2.38875199304247e-05, "loss": 0.5724, "step": 18020 }, { "epoch": 2.613327535601696, "grad_norm": 1.313196063041687, "learning_rate": 2.3873025076097984e-05, "loss": 0.6014, "step": 18030 }, { "epoch": 2.61477696851107, "grad_norm": 2.521747350692749, "learning_rate": 2.385853022177127e-05, "loss": 0.5432, "step": 18040 }, { "epoch": 2.616226401420444, "grad_norm": 4.846094131469727, "learning_rate": 2.384403536744456e-05, "loss": 0.528, "step": 18050 }, { "epoch": 2.6176758343298183, "grad_norm": 1.277256727218628, "learning_rate": 2.3829540513117847e-05, "loss": 0.5767, "step": 18060 }, { "epoch": 2.619125267239193, "grad_norm": 1.5727423429489136, "learning_rate": 2.381504565879113e-05, "loss": 0.4942, "step": 18070 }, { "epoch": 2.620574700148567, "grad_norm": 0.8757898211479187, "learning_rate": 2.3800550804464416e-05, "loss": 0.5481, "step": 18080 }, { "epoch": 2.6220241330579412, "grad_norm": 2.6296439170837402, "learning_rate": 2.3786055950137703e-05, "loss": 0.5735, "step": 18090 }, { "epoch": 2.6234735659673154, "grad_norm": 1.3403879404067993, "learning_rate": 2.3771561095810988e-05, "loss": 0.5564, "step": 18100 }, { "epoch": 2.6249229988766896, "grad_norm": 1.4756869077682495, "learning_rate": 2.3757066241484276e-05, "loss": 0.5626, "step": 18110 }, { "epoch": 2.6263724317860637, "grad_norm": 1.0589386224746704, "learning_rate": 2.374257138715756e-05, "loss": 0.5912, "step": 18120 }, { "epoch": 2.627821864695438, "grad_norm": 1.1331684589385986, "learning_rate": 2.3728076532830844e-05, "loss": 0.5754, "step": 18130 }, { "epoch": 2.629271297604812, "grad_norm": 3.5398993492126465, "learning_rate": 2.3713581678504132e-05, "loss": 0.5491, "step": 18140 }, { "epoch": 2.630720730514186, "grad_norm": 1.6487380266189575, "learning_rate": 2.369908682417742e-05, "loss": 0.5244, "step": 18150 }, { "epoch": 2.6321701634235604, "grad_norm": 2.910534620285034, "learning_rate": 2.3684591969850704e-05, "loss": 0.5973, "step": 18160 }, { "epoch": 2.6336195963329345, "grad_norm": 1.6422079801559448, "learning_rate": 2.367009711552399e-05, "loss": 0.6234, "step": 18170 }, { "epoch": 2.6350690292423087, "grad_norm": 1.9400933980941772, "learning_rate": 2.3655602261197276e-05, "loss": 0.5992, "step": 18180 }, { "epoch": 2.6365184621516833, "grad_norm": 0.6542257070541382, "learning_rate": 2.3641107406870564e-05, "loss": 0.6516, "step": 18190 }, { "epoch": 2.6379678950610574, "grad_norm": 2.389939308166504, "learning_rate": 2.362661255254385e-05, "loss": 0.4967, "step": 18200 }, { "epoch": 2.6394173279704316, "grad_norm": 4.163430213928223, "learning_rate": 2.3612117698217133e-05, "loss": 0.5762, "step": 18210 }, { "epoch": 2.6408667608798058, "grad_norm": 0.9543486833572388, "learning_rate": 2.359762284389042e-05, "loss": 0.6407, "step": 18220 }, { "epoch": 2.64231619378918, "grad_norm": 2.0527260303497314, "learning_rate": 2.3583127989563705e-05, "loss": 0.5274, "step": 18230 }, { "epoch": 2.643765626698554, "grad_norm": 2.0211446285247803, "learning_rate": 2.3568633135236993e-05, "loss": 0.5337, "step": 18240 }, { "epoch": 2.6452150596079282, "grad_norm": 0.931024968624115, "learning_rate": 2.355413828091028e-05, "loss": 0.65, "step": 18250 }, { "epoch": 2.646664492517303, "grad_norm": 1.0443034172058105, "learning_rate": 2.3539643426583565e-05, "loss": 0.629, "step": 18260 }, { "epoch": 2.648113925426677, "grad_norm": 1.0292446613311768, "learning_rate": 2.352514857225685e-05, "loss": 0.556, "step": 18270 }, { "epoch": 2.649563358336051, "grad_norm": 3.1219959259033203, "learning_rate": 2.3510653717930137e-05, "loss": 0.568, "step": 18280 }, { "epoch": 2.6510127912454253, "grad_norm": 1.1402798891067505, "learning_rate": 2.349615886360342e-05, "loss": 0.6201, "step": 18290 }, { "epoch": 2.6524622241547995, "grad_norm": 1.141597867012024, "learning_rate": 2.348166400927671e-05, "loss": 0.638, "step": 18300 }, { "epoch": 2.6539116570641736, "grad_norm": 3.5763368606567383, "learning_rate": 2.3468618640382665e-05, "loss": 0.5554, "step": 18310 }, { "epoch": 2.655361089973548, "grad_norm": 0.9100473523139954, "learning_rate": 2.3454123786055953e-05, "loss": 0.549, "step": 18320 }, { "epoch": 2.656810522882922, "grad_norm": 2.972250461578369, "learning_rate": 2.3439628931729237e-05, "loss": 0.5549, "step": 18330 }, { "epoch": 2.658259955792296, "grad_norm": 0.8812468647956848, "learning_rate": 2.342513407740252e-05, "loss": 0.5164, "step": 18340 }, { "epoch": 2.6597093887016703, "grad_norm": 1.1236684322357178, "learning_rate": 2.341063922307581e-05, "loss": 0.5589, "step": 18350 }, { "epoch": 2.6611588216110444, "grad_norm": 2.689864158630371, "learning_rate": 2.3396144368749097e-05, "loss": 0.535, "step": 18360 }, { "epoch": 2.662608254520419, "grad_norm": 1.1456810235977173, "learning_rate": 2.338164951442238e-05, "loss": 0.5154, "step": 18370 }, { "epoch": 2.664057687429793, "grad_norm": 2.042848587036133, "learning_rate": 2.3367154660095666e-05, "loss": 0.5722, "step": 18380 }, { "epoch": 2.6655071203391674, "grad_norm": 1.8200641870498657, "learning_rate": 2.3352659805768953e-05, "loss": 0.5991, "step": 18390 }, { "epoch": 2.6669565532485415, "grad_norm": 3.0939741134643555, "learning_rate": 2.333816495144224e-05, "loss": 0.5762, "step": 18400 }, { "epoch": 2.6684059861579157, "grad_norm": 1.2191381454467773, "learning_rate": 2.3323670097115525e-05, "loss": 0.5775, "step": 18410 }, { "epoch": 2.66985541906729, "grad_norm": 3.6405093669891357, "learning_rate": 2.330917524278881e-05, "loss": 0.5567, "step": 18420 }, { "epoch": 2.671304851976664, "grad_norm": 0.6857460737228394, "learning_rate": 2.3294680388462097e-05, "loss": 0.531, "step": 18430 }, { "epoch": 2.6727542848860386, "grad_norm": 1.7805752754211426, "learning_rate": 2.3280185534135382e-05, "loss": 0.5528, "step": 18440 }, { "epoch": 2.6742037177954128, "grad_norm": 0.9542523622512817, "learning_rate": 2.326569067980867e-05, "loss": 0.4993, "step": 18450 }, { "epoch": 2.675653150704787, "grad_norm": 1.0148526430130005, "learning_rate": 2.3251195825481954e-05, "loss": 0.516, "step": 18460 }, { "epoch": 2.677102583614161, "grad_norm": 1.0944411754608154, "learning_rate": 2.323670097115524e-05, "loss": 0.6308, "step": 18470 }, { "epoch": 2.6785520165235353, "grad_norm": 1.1389912366867065, "learning_rate": 2.3222206116828526e-05, "loss": 0.5836, "step": 18480 }, { "epoch": 2.6800014494329094, "grad_norm": 1.4398833513259888, "learning_rate": 2.3207711262501814e-05, "loss": 0.5429, "step": 18490 }, { "epoch": 2.6814508823422836, "grad_norm": 1.2716155052185059, "learning_rate": 2.31932164081751e-05, "loss": 0.5753, "step": 18500 }, { "epoch": 2.6829003152516577, "grad_norm": 1.1771233081817627, "learning_rate": 2.3178721553848386e-05, "loss": 0.5961, "step": 18510 }, { "epoch": 2.684349748161032, "grad_norm": 3.2915737628936768, "learning_rate": 2.316422669952167e-05, "loss": 0.6194, "step": 18520 }, { "epoch": 2.685799181070406, "grad_norm": 3.6810009479522705, "learning_rate": 2.3149731845194958e-05, "loss": 0.5164, "step": 18530 }, { "epoch": 2.68724861397978, "grad_norm": 1.319347858428955, "learning_rate": 2.3135236990868242e-05, "loss": 0.4943, "step": 18540 }, { "epoch": 2.6886980468891544, "grad_norm": 0.9938089847564697, "learning_rate": 2.312074213654153e-05, "loss": 0.5421, "step": 18550 }, { "epoch": 2.690147479798529, "grad_norm": 0.9541263580322266, "learning_rate": 2.3106247282214814e-05, "loss": 0.5477, "step": 18560 }, { "epoch": 2.691596912707903, "grad_norm": 2.2706756591796875, "learning_rate": 2.30917524278881e-05, "loss": 0.5099, "step": 18570 }, { "epoch": 2.6930463456172773, "grad_norm": 1.2195298671722412, "learning_rate": 2.3077257573561387e-05, "loss": 0.6338, "step": 18580 }, { "epoch": 2.6944957785266515, "grad_norm": 0.7414268851280212, "learning_rate": 2.3062762719234674e-05, "loss": 0.5689, "step": 18590 }, { "epoch": 2.6959452114360256, "grad_norm": 3.0787389278411865, "learning_rate": 2.304826786490796e-05, "loss": 0.5427, "step": 18600 }, { "epoch": 2.6973946443453998, "grad_norm": 0.7223326563835144, "learning_rate": 2.3033773010581243e-05, "loss": 0.5665, "step": 18610 }, { "epoch": 2.698844077254774, "grad_norm": 0.889962375164032, "learning_rate": 2.301927815625453e-05, "loss": 0.5911, "step": 18620 }, { "epoch": 2.7002935101641485, "grad_norm": 1.0519202947616577, "learning_rate": 2.300478330192782e-05, "loss": 0.5512, "step": 18630 }, { "epoch": 2.7017429430735227, "grad_norm": 0.8857706785202026, "learning_rate": 2.2990288447601103e-05, "loss": 0.5119, "step": 18640 }, { "epoch": 2.703192375982897, "grad_norm": 1.0033340454101562, "learning_rate": 2.2975793593274387e-05, "loss": 0.5544, "step": 18650 }, { "epoch": 2.704641808892271, "grad_norm": 0.8942063450813293, "learning_rate": 2.2961298738947675e-05, "loss": 0.5107, "step": 18660 }, { "epoch": 2.706091241801645, "grad_norm": 1.006726861000061, "learning_rate": 2.294680388462096e-05, "loss": 0.5571, "step": 18670 }, { "epoch": 2.7075406747110193, "grad_norm": 1.1389527320861816, "learning_rate": 2.2932309030294247e-05, "loss": 0.5514, "step": 18680 }, { "epoch": 2.7089901076203935, "grad_norm": 2.5659804344177246, "learning_rate": 2.2917814175967535e-05, "loss": 0.6332, "step": 18690 }, { "epoch": 2.7104395405297677, "grad_norm": 1.050179123878479, "learning_rate": 2.290331932164082e-05, "loss": 0.5082, "step": 18700 }, { "epoch": 2.711888973439142, "grad_norm": 2.12483286857605, "learning_rate": 2.2888824467314104e-05, "loss": 0.5867, "step": 18710 }, { "epoch": 2.713338406348516, "grad_norm": 1.6364705562591553, "learning_rate": 2.287432961298739e-05, "loss": 0.6449, "step": 18720 }, { "epoch": 2.71478783925789, "grad_norm": 2.376473903656006, "learning_rate": 2.2859834758660676e-05, "loss": 0.562, "step": 18730 }, { "epoch": 2.7162372721672643, "grad_norm": 3.524937152862549, "learning_rate": 2.2845339904333963e-05, "loss": 0.5788, "step": 18740 }, { "epoch": 2.717686705076639, "grad_norm": 1.0858070850372314, "learning_rate": 2.2830845050007248e-05, "loss": 0.5169, "step": 18750 }, { "epoch": 2.719136137986013, "grad_norm": 0.9738995432853699, "learning_rate": 2.2816350195680536e-05, "loss": 0.5891, "step": 18760 }, { "epoch": 2.720585570895387, "grad_norm": 2.339165687561035, "learning_rate": 2.280185534135382e-05, "loss": 0.57, "step": 18770 }, { "epoch": 2.7220350038047614, "grad_norm": 1.0681856870651245, "learning_rate": 2.2787360487027108e-05, "loss": 0.6183, "step": 18780 }, { "epoch": 2.7234844367141355, "grad_norm": 0.8750693798065186, "learning_rate": 2.2772865632700392e-05, "loss": 0.5597, "step": 18790 }, { "epoch": 2.7249338696235097, "grad_norm": 1.375166654586792, "learning_rate": 2.2758370778373676e-05, "loss": 0.6153, "step": 18800 }, { "epoch": 2.726383302532884, "grad_norm": 1.8082159757614136, "learning_rate": 2.2743875924046964e-05, "loss": 0.6993, "step": 18810 }, { "epoch": 2.7278327354422585, "grad_norm": 2.435235023498535, "learning_rate": 2.2729381069720252e-05, "loss": 0.5326, "step": 18820 }, { "epoch": 2.7292821683516326, "grad_norm": 1.028337001800537, "learning_rate": 2.2714886215393536e-05, "loss": 0.5088, "step": 18830 }, { "epoch": 2.730731601261007, "grad_norm": 1.501955509185791, "learning_rate": 2.2700391361066824e-05, "loss": 0.4502, "step": 18840 }, { "epoch": 2.732181034170381, "grad_norm": 2.1301772594451904, "learning_rate": 2.268589650674011e-05, "loss": 0.5966, "step": 18850 }, { "epoch": 2.733630467079755, "grad_norm": 2.629920244216919, "learning_rate": 2.2671401652413393e-05, "loss": 0.5694, "step": 18860 }, { "epoch": 2.7350798999891293, "grad_norm": 2.9983248710632324, "learning_rate": 2.265690679808668e-05, "loss": 0.5938, "step": 18870 }, { "epoch": 2.7365293328985034, "grad_norm": 4.4188995361328125, "learning_rate": 2.2642411943759968e-05, "loss": 0.5527, "step": 18880 }, { "epoch": 2.7379787658078776, "grad_norm": 0.8108800053596497, "learning_rate": 2.2627917089433253e-05, "loss": 0.5222, "step": 18890 }, { "epoch": 2.7394281987172517, "grad_norm": 1.078866958618164, "learning_rate": 2.2613422235106537e-05, "loss": 0.5275, "step": 18900 }, { "epoch": 2.740877631626626, "grad_norm": 2.8870749473571777, "learning_rate": 2.2598927380779825e-05, "loss": 0.6341, "step": 18910 }, { "epoch": 2.742327064536, "grad_norm": 2.361414670944214, "learning_rate": 2.2584432526453112e-05, "loss": 0.5177, "step": 18920 }, { "epoch": 2.7437764974453747, "grad_norm": 0.6874875426292419, "learning_rate": 2.2569937672126397e-05, "loss": 0.5358, "step": 18930 }, { "epoch": 2.745225930354749, "grad_norm": 1.24635648727417, "learning_rate": 2.255544281779968e-05, "loss": 0.5636, "step": 18940 }, { "epoch": 2.746675363264123, "grad_norm": 1.061892032623291, "learning_rate": 2.254094796347297e-05, "loss": 0.5179, "step": 18950 }, { "epoch": 2.748124796173497, "grad_norm": 2.9943783283233643, "learning_rate": 2.2526453109146253e-05, "loss": 0.5643, "step": 18960 }, { "epoch": 2.7495742290828713, "grad_norm": 1.0257405042648315, "learning_rate": 2.251195825481954e-05, "loss": 0.5869, "step": 18970 }, { "epoch": 2.7510236619922455, "grad_norm": 0.9652470946311951, "learning_rate": 2.2497463400492825e-05, "loss": 0.5397, "step": 18980 }, { "epoch": 2.7524730949016196, "grad_norm": 2.92497181892395, "learning_rate": 2.248296854616611e-05, "loss": 0.5578, "step": 18990 }, { "epoch": 2.7539225278109942, "grad_norm": 1.0754202604293823, "learning_rate": 2.2468473691839397e-05, "loss": 0.6103, "step": 19000 }, { "epoch": 2.7539225278109942, "eval_loss": 0.7155176401138306, "eval_runtime": 669.6957, "eval_samples_per_second": 51.508, "eval_steps_per_second": 2.576, "eval_token_accuracy": 0.0004030693748982676, "step": 19000 }, { "epoch": 2.7553719607203684, "grad_norm": 2.33913516998291, "learning_rate": 2.2453978837512685e-05, "loss": 0.5592, "step": 19010 }, { "epoch": 2.7568213936297425, "grad_norm": 1.0931929349899292, "learning_rate": 2.243948398318597e-05, "loss": 0.5391, "step": 19020 }, { "epoch": 2.7582708265391167, "grad_norm": 2.506253480911255, "learning_rate": 2.2424989128859257e-05, "loss": 0.5415, "step": 19030 }, { "epoch": 2.759720259448491, "grad_norm": 1.6211233139038086, "learning_rate": 2.241049427453254e-05, "loss": 0.5904, "step": 19040 }, { "epoch": 2.761169692357865, "grad_norm": 0.7718712091445923, "learning_rate": 2.239599942020583e-05, "loss": 0.5529, "step": 19050 }, { "epoch": 2.762619125267239, "grad_norm": 2.422839641571045, "learning_rate": 2.2381504565879114e-05, "loss": 0.5402, "step": 19060 }, { "epoch": 2.7640685581766133, "grad_norm": 2.453468084335327, "learning_rate": 2.23670097115524e-05, "loss": 0.5041, "step": 19070 }, { "epoch": 2.7655179910859875, "grad_norm": 1.3519423007965088, "learning_rate": 2.2352514857225686e-05, "loss": 0.5736, "step": 19080 }, { "epoch": 2.7669674239953617, "grad_norm": 3.2335236072540283, "learning_rate": 2.233802000289897e-05, "loss": 0.5825, "step": 19090 }, { "epoch": 2.768416856904736, "grad_norm": 2.3906261920928955, "learning_rate": 2.2323525148572258e-05, "loss": 0.6199, "step": 19100 }, { "epoch": 2.76986628981411, "grad_norm": 3.0553524494171143, "learning_rate": 2.2309030294245546e-05, "loss": 0.5798, "step": 19110 }, { "epoch": 2.7713157227234846, "grad_norm": 2.046018600463867, "learning_rate": 2.229453543991883e-05, "loss": 0.5871, "step": 19120 }, { "epoch": 2.7727651556328587, "grad_norm": 2.1828296184539795, "learning_rate": 2.2280040585592114e-05, "loss": 0.5908, "step": 19130 }, { "epoch": 2.774214588542233, "grad_norm": 2.1796507835388184, "learning_rate": 2.2265545731265402e-05, "loss": 0.5838, "step": 19140 }, { "epoch": 2.775664021451607, "grad_norm": 1.3011929988861084, "learning_rate": 2.2251050876938687e-05, "loss": 0.6146, "step": 19150 }, { "epoch": 2.7771134543609812, "grad_norm": 0.858228862285614, "learning_rate": 2.2236556022611974e-05, "loss": 0.5222, "step": 19160 }, { "epoch": 2.7785628872703554, "grad_norm": 3.342522144317627, "learning_rate": 2.222206116828526e-05, "loss": 0.6894, "step": 19170 }, { "epoch": 2.7800123201797295, "grad_norm": 0.9294348359107971, "learning_rate": 2.2207566313958546e-05, "loss": 0.5157, "step": 19180 }, { "epoch": 2.781461753089104, "grad_norm": 0.7711198329925537, "learning_rate": 2.219307145963183e-05, "loss": 0.5299, "step": 19190 }, { "epoch": 2.7829111859984783, "grad_norm": 1.0399785041809082, "learning_rate": 2.217857660530512e-05, "loss": 0.524, "step": 19200 }, { "epoch": 2.7843606189078525, "grad_norm": 1.0538372993469238, "learning_rate": 2.2164081750978406e-05, "loss": 0.5483, "step": 19210 }, { "epoch": 2.7858100518172266, "grad_norm": 0.8236547708511353, "learning_rate": 2.214958689665169e-05, "loss": 0.5912, "step": 19220 }, { "epoch": 2.787259484726601, "grad_norm": 1.0914356708526611, "learning_rate": 2.2135092042324975e-05, "loss": 0.5343, "step": 19230 }, { "epoch": 2.788708917635975, "grad_norm": 2.484463691711426, "learning_rate": 2.2120597187998263e-05, "loss": 0.5842, "step": 19240 }, { "epoch": 2.790158350545349, "grad_norm": 1.2179654836654663, "learning_rate": 2.2106102333671547e-05, "loss": 0.4972, "step": 19250 }, { "epoch": 2.7916077834547233, "grad_norm": 3.0674054622650146, "learning_rate": 2.2091607479344835e-05, "loss": 0.5962, "step": 19260 }, { "epoch": 2.7930572163640974, "grad_norm": 1.1092416048049927, "learning_rate": 2.207711262501812e-05, "loss": 0.5121, "step": 19270 }, { "epoch": 2.7945066492734716, "grad_norm": 3.062148094177246, "learning_rate": 2.2062617770691404e-05, "loss": 0.5796, "step": 19280 }, { "epoch": 2.7959560821828457, "grad_norm": 0.682876467704773, "learning_rate": 2.204812291636469e-05, "loss": 0.5505, "step": 19290 }, { "epoch": 2.79740551509222, "grad_norm": 0.7456188201904297, "learning_rate": 2.203362806203798e-05, "loss": 0.6132, "step": 19300 }, { "epoch": 2.7988549480015945, "grad_norm": 1.0670653581619263, "learning_rate": 2.2019133207711263e-05, "loss": 0.6285, "step": 19310 }, { "epoch": 2.8003043809109687, "grad_norm": 2.5444772243499756, "learning_rate": 2.2004638353384548e-05, "loss": 0.532, "step": 19320 }, { "epoch": 2.801753813820343, "grad_norm": 3.2199554443359375, "learning_rate": 2.1990143499057836e-05, "loss": 0.5256, "step": 19330 }, { "epoch": 2.803203246729717, "grad_norm": 3.2121224403381348, "learning_rate": 2.1975648644731123e-05, "loss": 0.5406, "step": 19340 }, { "epoch": 2.804652679639091, "grad_norm": 2.8595330715179443, "learning_rate": 2.1961153790404408e-05, "loss": 0.6079, "step": 19350 }, { "epoch": 2.8061021125484653, "grad_norm": 1.9917750358581543, "learning_rate": 2.1946658936077692e-05, "loss": 0.5897, "step": 19360 }, { "epoch": 2.8075515454578395, "grad_norm": 0.9936667084693909, "learning_rate": 2.193216408175098e-05, "loss": 0.5673, "step": 19370 }, { "epoch": 2.809000978367214, "grad_norm": 0.9017758369445801, "learning_rate": 2.1917669227424264e-05, "loss": 0.5452, "step": 19380 }, { "epoch": 2.8104504112765882, "grad_norm": 0.9994916319847107, "learning_rate": 2.1903174373097552e-05, "loss": 0.5504, "step": 19390 }, { "epoch": 2.8118998441859624, "grad_norm": 2.0900986194610596, "learning_rate": 2.188867951877084e-05, "loss": 0.5033, "step": 19400 }, { "epoch": 2.8133492770953366, "grad_norm": 0.9001177549362183, "learning_rate": 2.1874184664444124e-05, "loss": 0.5446, "step": 19410 }, { "epoch": 2.8147987100047107, "grad_norm": 1.0415499210357666, "learning_rate": 2.1859689810117408e-05, "loss": 0.6218, "step": 19420 }, { "epoch": 2.816248142914085, "grad_norm": 0.9950922727584839, "learning_rate": 2.1845194955790696e-05, "loss": 0.5994, "step": 19430 }, { "epoch": 2.817697575823459, "grad_norm": 0.9685205817222595, "learning_rate": 2.183070010146398e-05, "loss": 0.609, "step": 19440 }, { "epoch": 2.819147008732833, "grad_norm": 1.086944341659546, "learning_rate": 2.1816205247137268e-05, "loss": 0.5858, "step": 19450 }, { "epoch": 2.8205964416422074, "grad_norm": 1.3267871141433716, "learning_rate": 2.1801710392810553e-05, "loss": 0.5554, "step": 19460 }, { "epoch": 2.8220458745515815, "grad_norm": 0.9111933708190918, "learning_rate": 2.178721553848384e-05, "loss": 0.5218, "step": 19470 }, { "epoch": 2.8234953074609557, "grad_norm": 2.4886646270751953, "learning_rate": 2.1772720684157125e-05, "loss": 0.51, "step": 19480 }, { "epoch": 2.82494474037033, "grad_norm": 2.937021017074585, "learning_rate": 2.1758225829830412e-05, "loss": 0.5775, "step": 19490 }, { "epoch": 2.8263941732797044, "grad_norm": 1.8829514980316162, "learning_rate": 2.1743730975503697e-05, "loss": 0.5318, "step": 19500 }, { "epoch": 2.8278436061890786, "grad_norm": 2.7370100021362305, "learning_rate": 2.172923612117698e-05, "loss": 0.5865, "step": 19510 }, { "epoch": 2.8292930390984528, "grad_norm": 2.0809028148651123, "learning_rate": 2.171474126685027e-05, "loss": 0.5616, "step": 19520 }, { "epoch": 2.830742472007827, "grad_norm": 1.7841269969940186, "learning_rate": 2.1700246412523557e-05, "loss": 0.5817, "step": 19530 }, { "epoch": 2.832191904917201, "grad_norm": 3.6458749771118164, "learning_rate": 2.168575155819684e-05, "loss": 0.5487, "step": 19540 }, { "epoch": 2.8336413378265752, "grad_norm": 0.8706045746803284, "learning_rate": 2.1671256703870125e-05, "loss": 0.5208, "step": 19550 }, { "epoch": 2.8350907707359494, "grad_norm": 0.9333102107048035, "learning_rate": 2.1656761849543413e-05, "loss": 0.5952, "step": 19560 }, { "epoch": 2.836540203645324, "grad_norm": 2.157334566116333, "learning_rate": 2.1642266995216697e-05, "loss": 0.5428, "step": 19570 }, { "epoch": 2.837989636554698, "grad_norm": 1.2280299663543701, "learning_rate": 2.1627772140889985e-05, "loss": 0.5521, "step": 19580 }, { "epoch": 2.8394390694640723, "grad_norm": 1.0699831247329712, "learning_rate": 2.1613277286563273e-05, "loss": 0.5647, "step": 19590 }, { "epoch": 2.8408885023734465, "grad_norm": 2.3743484020233154, "learning_rate": 2.1598782432236557e-05, "loss": 0.5353, "step": 19600 }, { "epoch": 2.8423379352828206, "grad_norm": 1.8679091930389404, "learning_rate": 2.158428757790984e-05, "loss": 0.5263, "step": 19610 }, { "epoch": 2.843787368192195, "grad_norm": 1.9208581447601318, "learning_rate": 2.156979272358313e-05, "loss": 0.5266, "step": 19620 }, { "epoch": 2.845236801101569, "grad_norm": 3.6413888931274414, "learning_rate": 2.1555297869256417e-05, "loss": 0.5512, "step": 19630 }, { "epoch": 2.846686234010943, "grad_norm": 2.0212066173553467, "learning_rate": 2.15408030149297e-05, "loss": 0.5954, "step": 19640 }, { "epoch": 2.8481356669203173, "grad_norm": 2.341249704360962, "learning_rate": 2.1526308160602986e-05, "loss": 0.5832, "step": 19650 }, { "epoch": 2.8495850998296914, "grad_norm": 3.7906134128570557, "learning_rate": 2.1511813306276274e-05, "loss": 0.5869, "step": 19660 }, { "epoch": 2.8510345327390656, "grad_norm": 2.7214527130126953, "learning_rate": 2.1497318451949558e-05, "loss": 0.6188, "step": 19670 }, { "epoch": 2.85248396564844, "grad_norm": 1.1105526685714722, "learning_rate": 2.1482823597622846e-05, "loss": 0.5926, "step": 19680 }, { "epoch": 2.8539333985578144, "grad_norm": 1.4884237051010132, "learning_rate": 2.146832874329613e-05, "loss": 0.5329, "step": 19690 }, { "epoch": 2.8553828314671885, "grad_norm": 1.7720947265625, "learning_rate": 2.1453833888969414e-05, "loss": 0.4844, "step": 19700 }, { "epoch": 2.8568322643765627, "grad_norm": 1.2303423881530762, "learning_rate": 2.1439339034642702e-05, "loss": 0.5637, "step": 19710 }, { "epoch": 2.858281697285937, "grad_norm": 3.0960593223571777, "learning_rate": 2.142484418031599e-05, "loss": 0.5773, "step": 19720 }, { "epoch": 2.859731130195311, "grad_norm": 3.4920639991760254, "learning_rate": 2.1410349325989274e-05, "loss": 0.5379, "step": 19730 }, { "epoch": 2.861180563104685, "grad_norm": 1.1619203090667725, "learning_rate": 2.139585447166256e-05, "loss": 0.5732, "step": 19740 }, { "epoch": 2.8626299960140598, "grad_norm": 1.4657340049743652, "learning_rate": 2.1381359617335846e-05, "loss": 0.5264, "step": 19750 }, { "epoch": 2.864079428923434, "grad_norm": 1.1278634071350098, "learning_rate": 2.1366864763009134e-05, "loss": 0.5607, "step": 19760 }, { "epoch": 2.865528861832808, "grad_norm": 1.0407209396362305, "learning_rate": 2.135236990868242e-05, "loss": 0.5141, "step": 19770 }, { "epoch": 2.8669782947421822, "grad_norm": 2.0300920009613037, "learning_rate": 2.1337875054355706e-05, "loss": 0.5624, "step": 19780 }, { "epoch": 2.8684277276515564, "grad_norm": 1.0135087966918945, "learning_rate": 2.132338020002899e-05, "loss": 0.5987, "step": 19790 }, { "epoch": 2.8698771605609306, "grad_norm": 1.8810256719589233, "learning_rate": 2.1308885345702275e-05, "loss": 0.5018, "step": 19800 }, { "epoch": 2.8713265934703047, "grad_norm": 1.8214755058288574, "learning_rate": 2.1294390491375563e-05, "loss": 0.6013, "step": 19810 }, { "epoch": 2.872776026379679, "grad_norm": 1.2672983407974243, "learning_rate": 2.127989563704885e-05, "loss": 0.5856, "step": 19820 }, { "epoch": 2.874225459289053, "grad_norm": 1.973645806312561, "learning_rate": 2.1265400782722135e-05, "loss": 0.5301, "step": 19830 }, { "epoch": 2.875674892198427, "grad_norm": 3.644392251968384, "learning_rate": 2.125090592839542e-05, "loss": 0.5789, "step": 19840 }, { "epoch": 2.8771243251078014, "grad_norm": 0.8183895349502563, "learning_rate": 2.1236411074068707e-05, "loss": 0.5546, "step": 19850 }, { "epoch": 2.8785737580171755, "grad_norm": 0.9706283211708069, "learning_rate": 2.122191621974199e-05, "loss": 0.5987, "step": 19860 }, { "epoch": 2.88002319092655, "grad_norm": 0.9787003993988037, "learning_rate": 2.120742136541528e-05, "loss": 0.5661, "step": 19870 }, { "epoch": 2.8814726238359243, "grad_norm": 3.2533276081085205, "learning_rate": 2.1192926511088563e-05, "loss": 0.5659, "step": 19880 }, { "epoch": 2.8829220567452984, "grad_norm": 1.3525904417037964, "learning_rate": 2.117843165676185e-05, "loss": 0.5648, "step": 19890 }, { "epoch": 2.8843714896546726, "grad_norm": 1.7455861568450928, "learning_rate": 2.1163936802435135e-05, "loss": 0.6197, "step": 19900 }, { "epoch": 2.8858209225640468, "grad_norm": 1.0630838871002197, "learning_rate": 2.1149441948108423e-05, "loss": 0.6389, "step": 19910 }, { "epoch": 2.887270355473421, "grad_norm": 1.7146954536437988, "learning_rate": 2.113494709378171e-05, "loss": 0.5689, "step": 19920 }, { "epoch": 2.888719788382795, "grad_norm": 1.6422052383422852, "learning_rate": 2.1120452239454995e-05, "loss": 0.5172, "step": 19930 }, { "epoch": 2.8901692212921697, "grad_norm": 2.0748939514160156, "learning_rate": 2.110595738512828e-05, "loss": 0.6374, "step": 19940 }, { "epoch": 2.891618654201544, "grad_norm": 1.256318211555481, "learning_rate": 2.1091462530801567e-05, "loss": 0.627, "step": 19950 }, { "epoch": 2.893068087110918, "grad_norm": 2.529940128326416, "learning_rate": 2.1076967676474852e-05, "loss": 0.5437, "step": 19960 }, { "epoch": 2.894517520020292, "grad_norm": 3.2431910037994385, "learning_rate": 2.106247282214814e-05, "loss": 0.5335, "step": 19970 }, { "epoch": 2.8959669529296663, "grad_norm": 2.3461062908172607, "learning_rate": 2.1047977967821424e-05, "loss": 0.5081, "step": 19980 }, { "epoch": 2.8974163858390405, "grad_norm": 1.2446269989013672, "learning_rate": 2.1033483113494708e-05, "loss": 0.5553, "step": 19990 }, { "epoch": 2.8988658187484146, "grad_norm": 2.994905948638916, "learning_rate": 2.1018988259167996e-05, "loss": 0.5612, "step": 20000 }, { "epoch": 2.8988658187484146, "eval_loss": 0.7098406553268433, "eval_runtime": 669.4934, "eval_samples_per_second": 51.524, "eval_steps_per_second": 2.577, "eval_token_accuracy": 0.0004001742237617278, "step": 20000 }, { "epoch": 2.900315251657789, "grad_norm": 2.2802417278289795, "learning_rate": 2.1004493404841284e-05, "loss": 0.6105, "step": 20010 }, { "epoch": 2.901764684567163, "grad_norm": 3.3665335178375244, "learning_rate": 2.0989998550514568e-05, "loss": 0.5633, "step": 20020 }, { "epoch": 2.903214117476537, "grad_norm": 2.1991968154907227, "learning_rate": 2.0975503696187852e-05, "loss": 0.5318, "step": 20030 }, { "epoch": 2.9046635503859113, "grad_norm": 2.621366024017334, "learning_rate": 2.096100884186114e-05, "loss": 0.5798, "step": 20040 }, { "epoch": 2.9061129832952854, "grad_norm": 0.9481135606765747, "learning_rate": 2.0946513987534428e-05, "loss": 0.5695, "step": 20050 }, { "epoch": 2.90756241620466, "grad_norm": 0.9128068685531616, "learning_rate": 2.0932019133207712e-05, "loss": 0.5408, "step": 20060 }, { "epoch": 2.909011849114034, "grad_norm": 2.810906171798706, "learning_rate": 2.0917524278880997e-05, "loss": 0.5821, "step": 20070 }, { "epoch": 2.9104612820234084, "grad_norm": 1.2242695093154907, "learning_rate": 2.0903029424554284e-05, "loss": 0.5532, "step": 20080 }, { "epoch": 2.9119107149327825, "grad_norm": 1.5578938722610474, "learning_rate": 2.088853457022757e-05, "loss": 0.5768, "step": 20090 }, { "epoch": 2.9133601478421567, "grad_norm": 0.8750723600387573, "learning_rate": 2.0874039715900857e-05, "loss": 0.5902, "step": 20100 }, { "epoch": 2.914809580751531, "grad_norm": 2.7830746173858643, "learning_rate": 2.0859544861574144e-05, "loss": 0.6105, "step": 20110 }, { "epoch": 2.916259013660905, "grad_norm": 0.8406007885932922, "learning_rate": 2.084505000724743e-05, "loss": 0.5396, "step": 20120 }, { "epoch": 2.9177084465702796, "grad_norm": 2.036412477493286, "learning_rate": 2.0830555152920713e-05, "loss": 0.4757, "step": 20130 }, { "epoch": 2.9191578794796538, "grad_norm": 0.9003493785858154, "learning_rate": 2.0816060298594e-05, "loss": 0.6022, "step": 20140 }, { "epoch": 2.920607312389028, "grad_norm": 1.2599937915802002, "learning_rate": 2.080156544426729e-05, "loss": 0.5633, "step": 20150 }, { "epoch": 2.922056745298402, "grad_norm": 1.3828151226043701, "learning_rate": 2.0787070589940573e-05, "loss": 0.5638, "step": 20160 }, { "epoch": 2.9235061782077763, "grad_norm": 1.4246114492416382, "learning_rate": 2.0772575735613857e-05, "loss": 0.526, "step": 20170 }, { "epoch": 2.9249556111171504, "grad_norm": 2.006903886795044, "learning_rate": 2.0758080881287145e-05, "loss": 0.5185, "step": 20180 }, { "epoch": 2.9264050440265246, "grad_norm": 1.0413299798965454, "learning_rate": 2.074358602696043e-05, "loss": 0.5867, "step": 20190 }, { "epoch": 2.9278544769358987, "grad_norm": 2.7793540954589844, "learning_rate": 2.0729091172633717e-05, "loss": 0.5626, "step": 20200 }, { "epoch": 2.929303909845273, "grad_norm": 1.0333194732666016, "learning_rate": 2.0714596318307e-05, "loss": 0.6196, "step": 20210 }, { "epoch": 2.930753342754647, "grad_norm": 1.1859326362609863, "learning_rate": 2.0700101463980286e-05, "loss": 0.6377, "step": 20220 }, { "epoch": 2.932202775664021, "grad_norm": 2.745549440383911, "learning_rate": 2.0685606609653574e-05, "loss": 0.5695, "step": 20230 }, { "epoch": 2.933652208573396, "grad_norm": 2.405184030532837, "learning_rate": 2.067111175532686e-05, "loss": 0.6088, "step": 20240 }, { "epoch": 2.93510164148277, "grad_norm": 1.1689724922180176, "learning_rate": 2.0656616901000146e-05, "loss": 0.5437, "step": 20250 }, { "epoch": 2.936551074392144, "grad_norm": 0.8889197111129761, "learning_rate": 2.064212204667343e-05, "loss": 0.5489, "step": 20260 }, { "epoch": 2.9380005073015183, "grad_norm": 3.05747127532959, "learning_rate": 2.0627627192346718e-05, "loss": 0.5463, "step": 20270 }, { "epoch": 2.9394499402108925, "grad_norm": 1.3492549657821655, "learning_rate": 2.0613132338020002e-05, "loss": 0.6216, "step": 20280 }, { "epoch": 2.9408993731202666, "grad_norm": 0.8226122856140137, "learning_rate": 2.059863748369329e-05, "loss": 0.5107, "step": 20290 }, { "epoch": 2.9423488060296408, "grad_norm": 0.8221601843833923, "learning_rate": 2.0584142629366578e-05, "loss": 0.5439, "step": 20300 }, { "epoch": 2.9437982389390154, "grad_norm": 0.9002091288566589, "learning_rate": 2.0571097260472534e-05, "loss": 0.5844, "step": 20310 }, { "epoch": 2.9452476718483895, "grad_norm": 2.082839012145996, "learning_rate": 2.0556602406145818e-05, "loss": 0.5639, "step": 20320 }, { "epoch": 2.9466971047577637, "grad_norm": 0.8620240688323975, "learning_rate": 2.0542107551819106e-05, "loss": 0.5234, "step": 20330 }, { "epoch": 2.948146537667138, "grad_norm": 2.8008594512939453, "learning_rate": 2.052761269749239e-05, "loss": 0.5547, "step": 20340 }, { "epoch": 2.949595970576512, "grad_norm": 0.930914580821991, "learning_rate": 2.0513117843165678e-05, "loss": 0.5172, "step": 20350 }, { "epoch": 2.951045403485886, "grad_norm": 0.9638407826423645, "learning_rate": 2.0498622988838965e-05, "loss": 0.6278, "step": 20360 }, { "epoch": 2.9524948363952603, "grad_norm": 1.2510921955108643, "learning_rate": 2.048412813451225e-05, "loss": 0.5396, "step": 20370 }, { "epoch": 2.9539442693046345, "grad_norm": 0.7251255512237549, "learning_rate": 2.0469633280185534e-05, "loss": 0.5134, "step": 20380 }, { "epoch": 2.9553937022140087, "grad_norm": 3.2369771003723145, "learning_rate": 2.0455138425858822e-05, "loss": 0.5612, "step": 20390 }, { "epoch": 2.956843135123383, "grad_norm": 1.9414756298065186, "learning_rate": 2.0440643571532106e-05, "loss": 0.5565, "step": 20400 }, { "epoch": 2.958292568032757, "grad_norm": 0.9527379274368286, "learning_rate": 2.0426148717205394e-05, "loss": 0.5756, "step": 20410 }, { "epoch": 2.959742000942131, "grad_norm": 2.340003728866577, "learning_rate": 2.041165386287868e-05, "loss": 0.5586, "step": 20420 }, { "epoch": 2.9611914338515057, "grad_norm": 1.1174806356430054, "learning_rate": 2.0397159008551963e-05, "loss": 0.5186, "step": 20430 }, { "epoch": 2.96264086676088, "grad_norm": 1.0091946125030518, "learning_rate": 2.038266415422525e-05, "loss": 0.5589, "step": 20440 }, { "epoch": 2.964090299670254, "grad_norm": 0.8579798936843872, "learning_rate": 2.0368169299898538e-05, "loss": 0.5222, "step": 20450 }, { "epoch": 2.965539732579628, "grad_norm": 1.2191307544708252, "learning_rate": 2.0353674445571823e-05, "loss": 0.5434, "step": 20460 }, { "epoch": 2.9669891654890024, "grad_norm": 3.384915828704834, "learning_rate": 2.0339179591245107e-05, "loss": 0.578, "step": 20470 }, { "epoch": 2.9684385983983765, "grad_norm": 3.718766927719116, "learning_rate": 2.0324684736918395e-05, "loss": 0.5868, "step": 20480 }, { "epoch": 2.9698880313077507, "grad_norm": 1.0504790544509888, "learning_rate": 2.0310189882591682e-05, "loss": 0.541, "step": 20490 }, { "epoch": 2.9713374642171253, "grad_norm": 0.9422523975372314, "learning_rate": 2.0295695028264967e-05, "loss": 0.5067, "step": 20500 }, { "epoch": 2.9727868971264995, "grad_norm": 0.9271829724311829, "learning_rate": 2.0281200173938255e-05, "loss": 0.5226, "step": 20510 }, { "epoch": 2.9742363300358736, "grad_norm": 1.016239881515503, "learning_rate": 2.026670531961154e-05, "loss": 0.5355, "step": 20520 }, { "epoch": 2.975685762945248, "grad_norm": 1.2245512008666992, "learning_rate": 2.0252210465284823e-05, "loss": 0.595, "step": 20530 }, { "epoch": 2.977135195854622, "grad_norm": 1.3465006351470947, "learning_rate": 2.023771561095811e-05, "loss": 0.544, "step": 20540 }, { "epoch": 2.978584628763996, "grad_norm": 1.4868266582489014, "learning_rate": 2.02232207566314e-05, "loss": 0.541, "step": 20550 }, { "epoch": 2.9800340616733703, "grad_norm": 3.016294479370117, "learning_rate": 2.0208725902304683e-05, "loss": 0.5838, "step": 20560 }, { "epoch": 2.9814834945827444, "grad_norm": 1.2055611610412598, "learning_rate": 2.0194231047977968e-05, "loss": 0.6035, "step": 20570 }, { "epoch": 2.9829329274921186, "grad_norm": 1.2255357503890991, "learning_rate": 2.0179736193651255e-05, "loss": 0.524, "step": 20580 }, { "epoch": 2.9843823604014927, "grad_norm": 2.1640145778656006, "learning_rate": 2.0165241339324543e-05, "loss": 0.453, "step": 20590 }, { "epoch": 2.985831793310867, "grad_norm": 1.263148546218872, "learning_rate": 2.0150746484997827e-05, "loss": 0.577, "step": 20600 }, { "epoch": 2.987281226220241, "grad_norm": 1.0417917966842651, "learning_rate": 2.0136251630671112e-05, "loss": 0.494, "step": 20610 }, { "epoch": 2.9887306591296157, "grad_norm": 2.8763279914855957, "learning_rate": 2.01217567763444e-05, "loss": 0.5378, "step": 20620 }, { "epoch": 2.99018009203899, "grad_norm": 2.2581329345703125, "learning_rate": 2.0107261922017684e-05, "loss": 0.5647, "step": 20630 }, { "epoch": 2.991629524948364, "grad_norm": 0.999770998954773, "learning_rate": 2.009276706769097e-05, "loss": 0.6144, "step": 20640 }, { "epoch": 2.993078957857738, "grad_norm": 2.5331544876098633, "learning_rate": 2.0078272213364256e-05, "loss": 0.5512, "step": 20650 }, { "epoch": 2.9945283907671123, "grad_norm": 1.2347875833511353, "learning_rate": 2.006377735903754e-05, "loss": 0.5147, "step": 20660 }, { "epoch": 2.9959778236764865, "grad_norm": 0.8321196436882019, "learning_rate": 2.0049282504710828e-05, "loss": 0.5601, "step": 20670 }, { "epoch": 2.9974272565858606, "grad_norm": 1.9766823053359985, "learning_rate": 2.0034787650384116e-05, "loss": 0.5128, "step": 20680 }, { "epoch": 2.9988766894952352, "grad_norm": 0.8170207738876343, "learning_rate": 2.00202927960574e-05, "loss": 0.6103, "step": 20690 }, { "epoch": 3.0003261224046094, "grad_norm": 1.5131596326828003, "learning_rate": 2.0005797941730688e-05, "loss": 0.5042, "step": 20700 }, { "epoch": 3.0017755553139835, "grad_norm": 1.2082328796386719, "learning_rate": 1.9991303087403972e-05, "loss": 0.4591, "step": 20710 }, { "epoch": 3.0032249882233577, "grad_norm": 0.784325361251831, "learning_rate": 1.997680823307726e-05, "loss": 0.4597, "step": 20720 }, { "epoch": 3.004674421132732, "grad_norm": 3.104140043258667, "learning_rate": 1.9962313378750544e-05, "loss": 0.4864, "step": 20730 }, { "epoch": 3.006123854042106, "grad_norm": 0.9828206896781921, "learning_rate": 1.9947818524423832e-05, "loss": 0.5007, "step": 20740 }, { "epoch": 3.00757328695148, "grad_norm": 2.5246028900146484, "learning_rate": 1.9933323670097116e-05, "loss": 0.5046, "step": 20750 }, { "epoch": 3.0090227198608543, "grad_norm": 1.0861430168151855, "learning_rate": 1.99188288157704e-05, "loss": 0.4932, "step": 20760 }, { "epoch": 3.0104721527702285, "grad_norm": 0.9487363696098328, "learning_rate": 1.990433396144369e-05, "loss": 0.471, "step": 20770 }, { "epoch": 3.0119215856796027, "grad_norm": 1.6627943515777588, "learning_rate": 1.9889839107116976e-05, "loss": 0.4333, "step": 20780 }, { "epoch": 3.0133710185889773, "grad_norm": 1.0334659814834595, "learning_rate": 1.987534425279026e-05, "loss": 0.4825, "step": 20790 }, { "epoch": 3.0148204514983514, "grad_norm": 0.946616530418396, "learning_rate": 1.9860849398463545e-05, "loss": 0.4572, "step": 20800 }, { "epoch": 3.0162698844077256, "grad_norm": 1.0239499807357788, "learning_rate": 1.9846354544136833e-05, "loss": 0.579, "step": 20810 }, { "epoch": 3.0177193173170997, "grad_norm": 1.330289602279663, "learning_rate": 1.9831859689810117e-05, "loss": 0.4576, "step": 20820 }, { "epoch": 3.019168750226474, "grad_norm": 2.2307474613189697, "learning_rate": 1.9817364835483405e-05, "loss": 0.4538, "step": 20830 }, { "epoch": 3.020618183135848, "grad_norm": 2.4169399738311768, "learning_rate": 1.980286998115669e-05, "loss": 0.4184, "step": 20840 }, { "epoch": 3.0220676160452222, "grad_norm": 2.4117231369018555, "learning_rate": 1.9788375126829974e-05, "loss": 0.5314, "step": 20850 }, { "epoch": 3.0235170489545964, "grad_norm": 1.6371862888336182, "learning_rate": 1.977388027250326e-05, "loss": 0.4839, "step": 20860 }, { "epoch": 3.0249664818639705, "grad_norm": 1.996282935142517, "learning_rate": 1.975938541817655e-05, "loss": 0.4951, "step": 20870 }, { "epoch": 3.026415914773345, "grad_norm": 1.1968421936035156, "learning_rate": 1.9744890563849837e-05, "loss": 0.4765, "step": 20880 }, { "epoch": 3.0278653476827193, "grad_norm": 0.709713876247406, "learning_rate": 1.973039570952312e-05, "loss": 0.4464, "step": 20890 }, { "epoch": 3.0293147805920935, "grad_norm": 2.853358507156372, "learning_rate": 1.9715900855196406e-05, "loss": 0.4124, "step": 20900 }, { "epoch": 3.0307642135014676, "grad_norm": 0.9398220777511597, "learning_rate": 1.9701406000869693e-05, "loss": 0.491, "step": 20910 }, { "epoch": 3.032213646410842, "grad_norm": 0.9328126311302185, "learning_rate": 1.9686911146542978e-05, "loss": 0.5014, "step": 20920 }, { "epoch": 3.033663079320216, "grad_norm": 1.1154359579086304, "learning_rate": 1.9672416292216265e-05, "loss": 0.4352, "step": 20930 }, { "epoch": 3.03511251222959, "grad_norm": 0.8593978881835938, "learning_rate": 1.965792143788955e-05, "loss": 0.4374, "step": 20940 }, { "epoch": 3.0365619451389643, "grad_norm": 1.374656319618225, "learning_rate": 1.9643426583562834e-05, "loss": 0.475, "step": 20950 }, { "epoch": 3.0380113780483384, "grad_norm": 2.1968801021575928, "learning_rate": 1.9628931729236122e-05, "loss": 0.4854, "step": 20960 }, { "epoch": 3.0394608109577126, "grad_norm": 2.48079252243042, "learning_rate": 1.961443687490941e-05, "loss": 0.4832, "step": 20970 }, { "epoch": 3.040910243867087, "grad_norm": 3.2282943725585938, "learning_rate": 1.9599942020582694e-05, "loss": 0.45, "step": 20980 }, { "epoch": 3.0423596767764614, "grad_norm": 1.1852225065231323, "learning_rate": 1.958544716625598e-05, "loss": 0.4655, "step": 20990 }, { "epoch": 3.0438091096858355, "grad_norm": 0.8068433403968811, "learning_rate": 1.9570952311929266e-05, "loss": 0.4486, "step": 21000 }, { "epoch": 3.0438091096858355, "eval_loss": 0.7307726740837097, "eval_runtime": 670.7043, "eval_samples_per_second": 51.431, "eval_steps_per_second": 2.572, "eval_token_accuracy": 0.0003918104538117238, "step": 21000 }, { "epoch": 3.0452585425952097, "grad_norm": 0.8493704795837402, "learning_rate": 1.9556457457602554e-05, "loss": 0.4957, "step": 21010 }, { "epoch": 3.046707975504584, "grad_norm": 1.2335125207901, "learning_rate": 1.9541962603275838e-05, "loss": 0.3801, "step": 21020 }, { "epoch": 3.048157408413958, "grad_norm": 1.2187851667404175, "learning_rate": 1.9527467748949123e-05, "loss": 0.472, "step": 21030 }, { "epoch": 3.049606841323332, "grad_norm": 1.7487982511520386, "learning_rate": 1.951297289462241e-05, "loss": 0.4714, "step": 21040 }, { "epoch": 3.0510562742327063, "grad_norm": 0.8521775007247925, "learning_rate": 1.9498478040295695e-05, "loss": 0.409, "step": 21050 }, { "epoch": 3.0525057071420805, "grad_norm": 1.2800102233886719, "learning_rate": 1.9483983185968982e-05, "loss": 0.4213, "step": 21060 }, { "epoch": 3.053955140051455, "grad_norm": 2.577768087387085, "learning_rate": 1.946948833164227e-05, "loss": 0.5315, "step": 21070 }, { "epoch": 3.0554045729608292, "grad_norm": 1.125487208366394, "learning_rate": 1.9454993477315555e-05, "loss": 0.4036, "step": 21080 }, { "epoch": 3.0568540058702034, "grad_norm": 1.9074199199676514, "learning_rate": 1.944049862298884e-05, "loss": 0.4442, "step": 21090 }, { "epoch": 3.0583034387795776, "grad_norm": 0.7598525285720825, "learning_rate": 1.9426003768662127e-05, "loss": 0.5219, "step": 21100 }, { "epoch": 3.0597528716889517, "grad_norm": 2.014727830886841, "learning_rate": 1.941150891433541e-05, "loss": 0.548, "step": 21110 }, { "epoch": 3.061202304598326, "grad_norm": 2.676212787628174, "learning_rate": 1.93970140600087e-05, "loss": 0.4656, "step": 21120 }, { "epoch": 3.0626517375077, "grad_norm": 2.4987432956695557, "learning_rate": 1.9382519205681983e-05, "loss": 0.4591, "step": 21130 }, { "epoch": 3.064101170417074, "grad_norm": 0.8059296011924744, "learning_rate": 1.936802435135527e-05, "loss": 0.4621, "step": 21140 }, { "epoch": 3.0655506033264484, "grad_norm": 2.7726733684539795, "learning_rate": 1.9353529497028555e-05, "loss": 0.488, "step": 21150 }, { "epoch": 3.0670000362358225, "grad_norm": 1.1009929180145264, "learning_rate": 1.9339034642701843e-05, "loss": 0.4934, "step": 21160 }, { "epoch": 3.068449469145197, "grad_norm": 0.9859609007835388, "learning_rate": 1.9324539788375127e-05, "loss": 0.4611, "step": 21170 }, { "epoch": 3.0698989020545713, "grad_norm": 1.0177518129348755, "learning_rate": 1.931004493404841e-05, "loss": 0.4368, "step": 21180 }, { "epoch": 3.0713483349639454, "grad_norm": 2.414206027984619, "learning_rate": 1.92955500797217e-05, "loss": 0.4755, "step": 21190 }, { "epoch": 3.0727977678733196, "grad_norm": 3.308610677719116, "learning_rate": 1.9281055225394987e-05, "loss": 0.4878, "step": 21200 }, { "epoch": 3.0742472007826938, "grad_norm": 1.0151442289352417, "learning_rate": 1.926656037106827e-05, "loss": 0.4834, "step": 21210 }, { "epoch": 3.075696633692068, "grad_norm": 0.8535460233688354, "learning_rate": 1.9252065516741556e-05, "loss": 0.52, "step": 21220 }, { "epoch": 3.077146066601442, "grad_norm": 0.9755415320396423, "learning_rate": 1.9237570662414844e-05, "loss": 0.4589, "step": 21230 }, { "epoch": 3.0785954995108162, "grad_norm": 0.7422819137573242, "learning_rate": 1.9223075808088128e-05, "loss": 0.4364, "step": 21240 }, { "epoch": 3.0800449324201904, "grad_norm": 1.1758068799972534, "learning_rate": 1.9208580953761416e-05, "loss": 0.4246, "step": 21250 }, { "epoch": 3.081494365329565, "grad_norm": 2.122065305709839, "learning_rate": 1.9194086099434703e-05, "loss": 0.4369, "step": 21260 }, { "epoch": 3.082943798238939, "grad_norm": 0.7626737356185913, "learning_rate": 1.9179591245107988e-05, "loss": 0.4277, "step": 21270 }, { "epoch": 3.0843932311483133, "grad_norm": 2.114098072052002, "learning_rate": 1.9165096390781272e-05, "loss": 0.486, "step": 21280 }, { "epoch": 3.0858426640576875, "grad_norm": 2.370668888092041, "learning_rate": 1.915060153645456e-05, "loss": 0.4631, "step": 21290 }, { "epoch": 3.0872920969670616, "grad_norm": 1.3175185918807983, "learning_rate": 1.9136106682127848e-05, "loss": 0.42, "step": 21300 }, { "epoch": 3.088741529876436, "grad_norm": 1.1968013048171997, "learning_rate": 1.9121611827801132e-05, "loss": 0.476, "step": 21310 }, { "epoch": 3.09019096278581, "grad_norm": 0.7684279084205627, "learning_rate": 1.9107116973474416e-05, "loss": 0.5142, "step": 21320 }, { "epoch": 3.091640395695184, "grad_norm": 0.9500318169593811, "learning_rate": 1.9092622119147704e-05, "loss": 0.4577, "step": 21330 }, { "epoch": 3.0930898286045583, "grad_norm": 2.970555067062378, "learning_rate": 1.907812726482099e-05, "loss": 0.4624, "step": 21340 }, { "epoch": 3.094539261513933, "grad_norm": 2.508944511413574, "learning_rate": 1.9063632410494276e-05, "loss": 0.3773, "step": 21350 }, { "epoch": 3.095988694423307, "grad_norm": 1.0545679330825806, "learning_rate": 1.904913755616756e-05, "loss": 0.4906, "step": 21360 }, { "epoch": 3.097438127332681, "grad_norm": 0.9826215505599976, "learning_rate": 1.9034642701840845e-05, "loss": 0.4465, "step": 21370 }, { "epoch": 3.0988875602420554, "grad_norm": 0.921008825302124, "learning_rate": 1.9020147847514133e-05, "loss": 0.4526, "step": 21380 }, { "epoch": 3.1003369931514295, "grad_norm": 1.1149829626083374, "learning_rate": 1.900565299318742e-05, "loss": 0.4398, "step": 21390 }, { "epoch": 3.1017864260608037, "grad_norm": 1.2042686939239502, "learning_rate": 1.8991158138860705e-05, "loss": 0.5067, "step": 21400 }, { "epoch": 3.103235858970178, "grad_norm": 1.7478713989257812, "learning_rate": 1.897666328453399e-05, "loss": 0.4704, "step": 21410 }, { "epoch": 3.104685291879552, "grad_norm": 1.2265759706497192, "learning_rate": 1.8962168430207277e-05, "loss": 0.4861, "step": 21420 }, { "epoch": 3.106134724788926, "grad_norm": 0.9611939191818237, "learning_rate": 1.8947673575880565e-05, "loss": 0.4485, "step": 21430 }, { "epoch": 3.1075841576983008, "grad_norm": 3.710465908050537, "learning_rate": 1.893317872155385e-05, "loss": 0.499, "step": 21440 }, { "epoch": 3.109033590607675, "grad_norm": 2.539863348007202, "learning_rate": 1.8918683867227137e-05, "loss": 0.4285, "step": 21450 }, { "epoch": 3.110483023517049, "grad_norm": 1.1959104537963867, "learning_rate": 1.890418901290042e-05, "loss": 0.4061, "step": 21460 }, { "epoch": 3.1119324564264232, "grad_norm": 3.4300787448883057, "learning_rate": 1.8889694158573706e-05, "loss": 0.4918, "step": 21470 }, { "epoch": 3.1133818893357974, "grad_norm": 1.6281919479370117, "learning_rate": 1.8875199304246993e-05, "loss": 0.4336, "step": 21480 }, { "epoch": 3.1148313222451716, "grad_norm": 1.0834424495697021, "learning_rate": 1.886070444992028e-05, "loss": 0.462, "step": 21490 }, { "epoch": 3.1162807551545457, "grad_norm": 1.0968761444091797, "learning_rate": 1.8846209595593565e-05, "loss": 0.4493, "step": 21500 }, { "epoch": 3.11773018806392, "grad_norm": 1.9969043731689453, "learning_rate": 1.883171474126685e-05, "loss": 0.4911, "step": 21510 }, { "epoch": 3.119179620973294, "grad_norm": 1.7604318857192993, "learning_rate": 1.8817219886940137e-05, "loss": 0.418, "step": 21520 }, { "epoch": 3.120629053882668, "grad_norm": 1.1388682126998901, "learning_rate": 1.8802725032613422e-05, "loss": 0.4105, "step": 21530 }, { "epoch": 3.122078486792043, "grad_norm": 0.8295134902000427, "learning_rate": 1.878823017828671e-05, "loss": 0.4364, "step": 21540 }, { "epoch": 3.123527919701417, "grad_norm": 1.134069561958313, "learning_rate": 1.8773735323959994e-05, "loss": 0.4599, "step": 21550 }, { "epoch": 3.124977352610791, "grad_norm": 1.7226959466934204, "learning_rate": 1.8759240469633282e-05, "loss": 0.4642, "step": 21560 }, { "epoch": 3.1264267855201653, "grad_norm": 1.3124228715896606, "learning_rate": 1.8744745615306566e-05, "loss": 0.4198, "step": 21570 }, { "epoch": 3.1278762184295394, "grad_norm": 1.0341984033584595, "learning_rate": 1.8730250760979854e-05, "loss": 0.4535, "step": 21580 }, { "epoch": 3.1293256513389136, "grad_norm": 0.8503829836845398, "learning_rate": 1.871575590665314e-05, "loss": 0.4629, "step": 21590 }, { "epoch": 3.1307750842482878, "grad_norm": 2.9683518409729004, "learning_rate": 1.8701261052326426e-05, "loss": 0.46, "step": 21600 }, { "epoch": 3.132224517157662, "grad_norm": 3.3407230377197266, "learning_rate": 1.868676619799971e-05, "loss": 0.4943, "step": 21610 }, { "epoch": 3.133673950067036, "grad_norm": 1.2744535207748413, "learning_rate": 1.8672271343672998e-05, "loss": 0.446, "step": 21620 }, { "epoch": 3.1351233829764107, "grad_norm": 1.004381775856018, "learning_rate": 1.8657776489346282e-05, "loss": 0.4428, "step": 21630 }, { "epoch": 3.136572815885785, "grad_norm": 1.5023181438446045, "learning_rate": 1.864328163501957e-05, "loss": 0.4492, "step": 21640 }, { "epoch": 3.138022248795159, "grad_norm": 0.7856612205505371, "learning_rate": 1.8628786780692854e-05, "loss": 0.4583, "step": 21650 }, { "epoch": 3.139471681704533, "grad_norm": 0.7462323904037476, "learning_rate": 1.861429192636614e-05, "loss": 0.4353, "step": 21660 }, { "epoch": 3.1409211146139073, "grad_norm": 0.8192468881607056, "learning_rate": 1.8599797072039427e-05, "loss": 0.4507, "step": 21670 }, { "epoch": 3.1423705475232815, "grad_norm": 1.2236969470977783, "learning_rate": 1.8585302217712714e-05, "loss": 0.5054, "step": 21680 }, { "epoch": 3.1438199804326556, "grad_norm": 1.431766152381897, "learning_rate": 1.8570807363386e-05, "loss": 0.4206, "step": 21690 }, { "epoch": 3.14526941334203, "grad_norm": 2.7869577407836914, "learning_rate": 1.8556312509059283e-05, "loss": 0.4735, "step": 21700 }, { "epoch": 3.146718846251404, "grad_norm": 1.9017903804779053, "learning_rate": 1.854181765473257e-05, "loss": 0.4565, "step": 21710 }, { "epoch": 3.148168279160778, "grad_norm": 1.2527071237564087, "learning_rate": 1.852732280040586e-05, "loss": 0.4762, "step": 21720 }, { "epoch": 3.1496177120701527, "grad_norm": 1.243003249168396, "learning_rate": 1.8512827946079143e-05, "loss": 0.4709, "step": 21730 }, { "epoch": 3.151067144979527, "grad_norm": 2.1310229301452637, "learning_rate": 1.8498333091752427e-05, "loss": 0.4461, "step": 21740 }, { "epoch": 3.152516577888901, "grad_norm": 2.815653085708618, "learning_rate": 1.8483838237425715e-05, "loss": 0.4761, "step": 21750 }, { "epoch": 3.153966010798275, "grad_norm": 3.001795768737793, "learning_rate": 1.8469343383099e-05, "loss": 0.4694, "step": 21760 }, { "epoch": 3.1554154437076494, "grad_norm": 2.344364643096924, "learning_rate": 1.8454848528772287e-05, "loss": 0.5223, "step": 21770 }, { "epoch": 3.1568648766170235, "grad_norm": 0.849949061870575, "learning_rate": 1.8440353674445575e-05, "loss": 0.4398, "step": 21780 }, { "epoch": 3.1583143095263977, "grad_norm": 0.9206576347351074, "learning_rate": 1.842585882011886e-05, "loss": 0.4655, "step": 21790 }, { "epoch": 3.159763742435772, "grad_norm": 2.563059091567993, "learning_rate": 1.8411363965792144e-05, "loss": 0.4491, "step": 21800 }, { "epoch": 3.161213175345146, "grad_norm": 2.454758882522583, "learning_rate": 1.839686911146543e-05, "loss": 0.4564, "step": 21810 }, { "epoch": 3.1626626082545206, "grad_norm": 2.338440418243408, "learning_rate": 1.8382374257138716e-05, "loss": 0.4938, "step": 21820 }, { "epoch": 3.1641120411638948, "grad_norm": 0.8359615802764893, "learning_rate": 1.8367879402812003e-05, "loss": 0.4533, "step": 21830 }, { "epoch": 3.165561474073269, "grad_norm": 1.198116660118103, "learning_rate": 1.8353384548485288e-05, "loss": 0.4597, "step": 21840 }, { "epoch": 3.167010906982643, "grad_norm": 1.0076746940612793, "learning_rate": 1.8338889694158576e-05, "loss": 0.4855, "step": 21850 }, { "epoch": 3.1684603398920173, "grad_norm": 0.7770813703536987, "learning_rate": 1.832584432526453e-05, "loss": 0.4484, "step": 21860 }, { "epoch": 3.1699097728013914, "grad_norm": 1.7894219160079956, "learning_rate": 1.831134947093782e-05, "loss": 0.4467, "step": 21870 }, { "epoch": 3.1713592057107656, "grad_norm": 0.9608204364776611, "learning_rate": 1.8296854616611104e-05, "loss": 0.4081, "step": 21880 }, { "epoch": 3.1728086386201397, "grad_norm": 3.3298492431640625, "learning_rate": 1.828235976228439e-05, "loss": 0.4777, "step": 21890 }, { "epoch": 3.174258071529514, "grad_norm": 2.6890292167663574, "learning_rate": 1.8267864907957676e-05, "loss": 0.4917, "step": 21900 }, { "epoch": 3.175707504438888, "grad_norm": 1.2616606950759888, "learning_rate": 1.825337005363096e-05, "loss": 0.4995, "step": 21910 }, { "epoch": 3.1771569373482627, "grad_norm": 1.0045479536056519, "learning_rate": 1.8238875199304248e-05, "loss": 0.4002, "step": 21920 }, { "epoch": 3.178606370257637, "grad_norm": 1.0455669164657593, "learning_rate": 1.8224380344977536e-05, "loss": 0.4975, "step": 21930 }, { "epoch": 3.180055803167011, "grad_norm": 0.8399646282196045, "learning_rate": 1.820988549065082e-05, "loss": 0.4358, "step": 21940 }, { "epoch": 3.181505236076385, "grad_norm": 1.2526215314865112, "learning_rate": 1.8195390636324104e-05, "loss": 0.4948, "step": 21950 }, { "epoch": 3.1829546689857593, "grad_norm": 1.0016404390335083, "learning_rate": 1.8180895781997392e-05, "loss": 0.4968, "step": 21960 }, { "epoch": 3.1844041018951335, "grad_norm": 0.9354867339134216, "learning_rate": 1.8166400927670676e-05, "loss": 0.4508, "step": 21970 }, { "epoch": 3.1858535348045076, "grad_norm": 1.2305155992507935, "learning_rate": 1.8151906073343964e-05, "loss": 0.4875, "step": 21980 }, { "epoch": 3.1873029677138818, "grad_norm": 2.4976658821105957, "learning_rate": 1.813741121901725e-05, "loss": 0.4869, "step": 21990 }, { "epoch": 3.1887524006232564, "grad_norm": 1.661704182624817, "learning_rate": 1.8122916364690536e-05, "loss": 0.4676, "step": 22000 }, { "epoch": 3.1887524006232564, "eval_loss": 0.725195050239563, "eval_runtime": 670.7568, "eval_samples_per_second": 51.427, "eval_steps_per_second": 2.572, "eval_token_accuracy": 0.000388915302675184, "step": 22000 }, { "epoch": 3.1902018335326305, "grad_norm": 1.16428542137146, "learning_rate": 1.810842151036382e-05, "loss": 0.4443, "step": 22010 }, { "epoch": 3.1916512664420047, "grad_norm": 3.376563549041748, "learning_rate": 1.809392665603711e-05, "loss": 0.4316, "step": 22020 }, { "epoch": 3.193100699351379, "grad_norm": 1.1599247455596924, "learning_rate": 1.8079431801710396e-05, "loss": 0.4499, "step": 22030 }, { "epoch": 3.194550132260753, "grad_norm": 1.023087739944458, "learning_rate": 1.806493694738368e-05, "loss": 0.514, "step": 22040 }, { "epoch": 3.195999565170127, "grad_norm": 1.0364000797271729, "learning_rate": 1.8050442093056965e-05, "loss": 0.3974, "step": 22050 }, { "epoch": 3.1974489980795013, "grad_norm": 1.0344669818878174, "learning_rate": 1.8035947238730253e-05, "loss": 0.5006, "step": 22060 }, { "epoch": 3.1988984309888755, "grad_norm": 0.8084372282028198, "learning_rate": 1.8021452384403537e-05, "loss": 0.4475, "step": 22070 }, { "epoch": 3.2003478638982497, "grad_norm": 2.4641897678375244, "learning_rate": 1.8006957530076825e-05, "loss": 0.4526, "step": 22080 }, { "epoch": 3.201797296807624, "grad_norm": 1.0928001403808594, "learning_rate": 1.799246267575011e-05, "loss": 0.435, "step": 22090 }, { "epoch": 3.2032467297169984, "grad_norm": 1.0270906686782837, "learning_rate": 1.7977967821423393e-05, "loss": 0.4208, "step": 22100 }, { "epoch": 3.2046961626263726, "grad_norm": 2.1553866863250732, "learning_rate": 1.796347296709668e-05, "loss": 0.4887, "step": 22110 }, { "epoch": 3.2061455955357467, "grad_norm": 2.221139907836914, "learning_rate": 1.794897811276997e-05, "loss": 0.4922, "step": 22120 }, { "epoch": 3.207595028445121, "grad_norm": 1.4236232042312622, "learning_rate": 1.7934483258443253e-05, "loss": 0.4619, "step": 22130 }, { "epoch": 3.209044461354495, "grad_norm": 0.9401782155036926, "learning_rate": 1.7919988404116538e-05, "loss": 0.491, "step": 22140 }, { "epoch": 3.210493894263869, "grad_norm": 2.1895623207092285, "learning_rate": 1.7905493549789825e-05, "loss": 0.4168, "step": 22150 }, { "epoch": 3.2119433271732434, "grad_norm": 2.5863196849823, "learning_rate": 1.7890998695463113e-05, "loss": 0.427, "step": 22160 }, { "epoch": 3.2133927600826175, "grad_norm": 0.9865674376487732, "learning_rate": 1.7876503841136397e-05, "loss": 0.5734, "step": 22170 }, { "epoch": 3.2148421929919917, "grad_norm": 1.3625892400741577, "learning_rate": 1.7862008986809685e-05, "loss": 0.434, "step": 22180 }, { "epoch": 3.2162916259013663, "grad_norm": 2.1608617305755615, "learning_rate": 1.784751413248297e-05, "loss": 0.4579, "step": 22190 }, { "epoch": 3.2177410588107405, "grad_norm": 1.851686716079712, "learning_rate": 1.7833019278156254e-05, "loss": 0.4108, "step": 22200 }, { "epoch": 3.2191904917201146, "grad_norm": 1.1519325971603394, "learning_rate": 1.781852442382954e-05, "loss": 0.4558, "step": 22210 }, { "epoch": 3.220639924629489, "grad_norm": 3.0943803787231445, "learning_rate": 1.780402956950283e-05, "loss": 0.4681, "step": 22220 }, { "epoch": 3.222089357538863, "grad_norm": 1.34933340549469, "learning_rate": 1.7789534715176114e-05, "loss": 0.4719, "step": 22230 }, { "epoch": 3.223538790448237, "grad_norm": 0.900416374206543, "learning_rate": 1.7775039860849398e-05, "loss": 0.4007, "step": 22240 }, { "epoch": 3.2249882233576113, "grad_norm": 1.4920786619186401, "learning_rate": 1.7760545006522686e-05, "loss": 0.377, "step": 22250 }, { "epoch": 3.2264376562669854, "grad_norm": 1.3333457708358765, "learning_rate": 1.7746050152195974e-05, "loss": 0.5223, "step": 22260 }, { "epoch": 3.2278870891763596, "grad_norm": 0.8491736054420471, "learning_rate": 1.7731555297869258e-05, "loss": 0.4507, "step": 22270 }, { "epoch": 3.2293365220857337, "grad_norm": 1.204391360282898, "learning_rate": 1.7717060443542542e-05, "loss": 0.4531, "step": 22280 }, { "epoch": 3.2307859549951083, "grad_norm": 0.8898743987083435, "learning_rate": 1.770256558921583e-05, "loss": 0.4785, "step": 22290 }, { "epoch": 3.2322353879044825, "grad_norm": 2.0492045879364014, "learning_rate": 1.7688070734889114e-05, "loss": 0.5007, "step": 22300 }, { "epoch": 3.2336848208138567, "grad_norm": 1.5305896997451782, "learning_rate": 1.7673575880562402e-05, "loss": 0.458, "step": 22310 }, { "epoch": 3.235134253723231, "grad_norm": 2.20937442779541, "learning_rate": 1.7659081026235687e-05, "loss": 0.4776, "step": 22320 }, { "epoch": 3.236583686632605, "grad_norm": 1.2759435176849365, "learning_rate": 1.764458617190897e-05, "loss": 0.4559, "step": 22330 }, { "epoch": 3.238033119541979, "grad_norm": 2.635709762573242, "learning_rate": 1.763009131758226e-05, "loss": 0.5438, "step": 22340 }, { "epoch": 3.2394825524513533, "grad_norm": 0.9792227745056152, "learning_rate": 1.7615596463255546e-05, "loss": 0.4645, "step": 22350 }, { "epoch": 3.2409319853607275, "grad_norm": 3.481822967529297, "learning_rate": 1.760110160892883e-05, "loss": 0.4575, "step": 22360 }, { "epoch": 3.2423814182701016, "grad_norm": 1.1393392086029053, "learning_rate": 1.758660675460212e-05, "loss": 0.4472, "step": 22370 }, { "epoch": 3.2438308511794762, "grad_norm": 2.524489641189575, "learning_rate": 1.7572111900275403e-05, "loss": 0.5124, "step": 22380 }, { "epoch": 3.2452802840888504, "grad_norm": 2.6743130683898926, "learning_rate": 1.7557617045948687e-05, "loss": 0.4173, "step": 22390 }, { "epoch": 3.2467297169982245, "grad_norm": 0.8293477892875671, "learning_rate": 1.7543122191621975e-05, "loss": 0.4444, "step": 22400 }, { "epoch": 3.2481791499075987, "grad_norm": 1.2171900272369385, "learning_rate": 1.7528627337295263e-05, "loss": 0.475, "step": 22410 }, { "epoch": 3.249628582816973, "grad_norm": 2.4464237689971924, "learning_rate": 1.7514132482968547e-05, "loss": 0.4502, "step": 22420 }, { "epoch": 3.251078015726347, "grad_norm": 2.3313779830932617, "learning_rate": 1.749963762864183e-05, "loss": 0.4472, "step": 22430 }, { "epoch": 3.252527448635721, "grad_norm": 1.9434154033660889, "learning_rate": 1.748514277431512e-05, "loss": 0.4169, "step": 22440 }, { "epoch": 3.2539768815450953, "grad_norm": 0.9746271371841431, "learning_rate": 1.7470647919988407e-05, "loss": 0.4508, "step": 22450 }, { "epoch": 3.2554263144544695, "grad_norm": 1.096774935722351, "learning_rate": 1.745615306566169e-05, "loss": 0.4454, "step": 22460 }, { "epoch": 3.2568757473638437, "grad_norm": 1.1862661838531494, "learning_rate": 1.7441658211334976e-05, "loss": 0.5229, "step": 22470 }, { "epoch": 3.2583251802732183, "grad_norm": 1.1553421020507812, "learning_rate": 1.7427163357008263e-05, "loss": 0.478, "step": 22480 }, { "epoch": 3.2597746131825924, "grad_norm": 3.2264418601989746, "learning_rate": 1.7412668502681548e-05, "loss": 0.4547, "step": 22490 }, { "epoch": 3.2612240460919666, "grad_norm": 1.1565039157867432, "learning_rate": 1.7398173648354835e-05, "loss": 0.4711, "step": 22500 }, { "epoch": 3.2626734790013407, "grad_norm": 2.3308446407318115, "learning_rate": 1.738367879402812e-05, "loss": 0.4084, "step": 22510 }, { "epoch": 3.264122911910715, "grad_norm": 1.196081519126892, "learning_rate": 1.7369183939701404e-05, "loss": 0.4103, "step": 22520 }, { "epoch": 3.265572344820089, "grad_norm": 0.8371307253837585, "learning_rate": 1.7354689085374692e-05, "loss": 0.4792, "step": 22530 }, { "epoch": 3.2670217777294632, "grad_norm": 0.8609314560890198, "learning_rate": 1.734019423104798e-05, "loss": 0.4364, "step": 22540 }, { "epoch": 3.2684712106388374, "grad_norm": 3.442795753479004, "learning_rate": 1.7325699376721267e-05, "loss": 0.45, "step": 22550 }, { "epoch": 3.269920643548212, "grad_norm": 2.094744920730591, "learning_rate": 1.7311204522394552e-05, "loss": 0.4826, "step": 22560 }, { "epoch": 3.271370076457586, "grad_norm": 1.333853840827942, "learning_rate": 1.7296709668067836e-05, "loss": 0.4741, "step": 22570 }, { "epoch": 3.2728195093669603, "grad_norm": 2.6992194652557373, "learning_rate": 1.7282214813741124e-05, "loss": 0.4722, "step": 22580 }, { "epoch": 3.2742689422763345, "grad_norm": 1.3147931098937988, "learning_rate": 1.7267719959414408e-05, "loss": 0.4681, "step": 22590 }, { "epoch": 3.2757183751857086, "grad_norm": 0.9021698236465454, "learning_rate": 1.7253225105087696e-05, "loss": 0.4493, "step": 22600 }, { "epoch": 3.277167808095083, "grad_norm": 1.2024173736572266, "learning_rate": 1.723873025076098e-05, "loss": 0.4583, "step": 22610 }, { "epoch": 3.278617241004457, "grad_norm": 3.715240716934204, "learning_rate": 1.7224235396434265e-05, "loss": 0.4554, "step": 22620 }, { "epoch": 3.280066673913831, "grad_norm": 2.2712244987487793, "learning_rate": 1.7209740542107552e-05, "loss": 0.4563, "step": 22630 }, { "epoch": 3.2815161068232053, "grad_norm": 1.429622769355774, "learning_rate": 1.719524568778084e-05, "loss": 0.4739, "step": 22640 }, { "epoch": 3.2829655397325794, "grad_norm": 3.115586757659912, "learning_rate": 1.7180750833454125e-05, "loss": 0.5281, "step": 22650 }, { "epoch": 3.2844149726419536, "grad_norm": 2.4304490089416504, "learning_rate": 1.716625597912741e-05, "loss": 0.3748, "step": 22660 }, { "epoch": 3.285864405551328, "grad_norm": 0.8875736594200134, "learning_rate": 1.7151761124800697e-05, "loss": 0.5341, "step": 22670 }, { "epoch": 3.2873138384607024, "grad_norm": 3.3635129928588867, "learning_rate": 1.7137266270473984e-05, "loss": 0.4654, "step": 22680 }, { "epoch": 3.2887632713700765, "grad_norm": 1.1657626628875732, "learning_rate": 1.712277141614727e-05, "loss": 0.436, "step": 22690 }, { "epoch": 3.2902127042794507, "grad_norm": 1.698906660079956, "learning_rate": 1.7108276561820553e-05, "loss": 0.4469, "step": 22700 }, { "epoch": 3.291662137188825, "grad_norm": 2.1452109813690186, "learning_rate": 1.709378170749384e-05, "loss": 0.4585, "step": 22710 }, { "epoch": 3.293111570098199, "grad_norm": 1.1009830236434937, "learning_rate": 1.7079286853167125e-05, "loss": 0.504, "step": 22720 }, { "epoch": 3.294561003007573, "grad_norm": 0.9174827933311462, "learning_rate": 1.7064791998840413e-05, "loss": 0.4765, "step": 22730 }, { "epoch": 3.2960104359169473, "grad_norm": 1.4893231391906738, "learning_rate": 1.70502971445137e-05, "loss": 0.4736, "step": 22740 }, { "epoch": 3.297459868826322, "grad_norm": 1.3025282621383667, "learning_rate": 1.7035802290186985e-05, "loss": 0.4695, "step": 22750 }, { "epoch": 3.298909301735696, "grad_norm": 1.2915959358215332, "learning_rate": 1.702130743586027e-05, "loss": 0.4178, "step": 22760 }, { "epoch": 3.3003587346450702, "grad_norm": 0.8622018098831177, "learning_rate": 1.7006812581533557e-05, "loss": 0.4237, "step": 22770 }, { "epoch": 3.3018081675544444, "grad_norm": 0.8027140498161316, "learning_rate": 1.699231772720684e-05, "loss": 0.4728, "step": 22780 }, { "epoch": 3.3032576004638186, "grad_norm": 0.9985853433609009, "learning_rate": 1.697782287288013e-05, "loss": 0.4351, "step": 22790 }, { "epoch": 3.3047070333731927, "grad_norm": 2.357565402984619, "learning_rate": 1.6963328018553414e-05, "loss": 0.4458, "step": 22800 }, { "epoch": 3.306156466282567, "grad_norm": 1.0484607219696045, "learning_rate": 1.6948833164226698e-05, "loss": 0.4322, "step": 22810 }, { "epoch": 3.307605899191941, "grad_norm": 1.152421474456787, "learning_rate": 1.6934338309899986e-05, "loss": 0.4471, "step": 22820 }, { "epoch": 3.309055332101315, "grad_norm": 1.0169322490692139, "learning_rate": 1.6919843455573274e-05, "loss": 0.5096, "step": 22830 }, { "epoch": 3.3105047650106894, "grad_norm": 1.0829248428344727, "learning_rate": 1.6905348601246558e-05, "loss": 0.4855, "step": 22840 }, { "epoch": 3.311954197920064, "grad_norm": 0.8866340517997742, "learning_rate": 1.6890853746919842e-05, "loss": 0.4509, "step": 22850 }, { "epoch": 3.313403630829438, "grad_norm": 1.7526586055755615, "learning_rate": 1.687635889259313e-05, "loss": 0.4671, "step": 22860 }, { "epoch": 3.3148530637388123, "grad_norm": 1.9207016229629517, "learning_rate": 1.6861864038266418e-05, "loss": 0.4679, "step": 22870 }, { "epoch": 3.3163024966481864, "grad_norm": 1.329699993133545, "learning_rate": 1.6847369183939702e-05, "loss": 0.4632, "step": 22880 }, { "epoch": 3.3177519295575606, "grad_norm": 3.8258862495422363, "learning_rate": 1.6832874329612986e-05, "loss": 0.4822, "step": 22890 }, { "epoch": 3.3192013624669348, "grad_norm": 1.1815471649169922, "learning_rate": 1.6818379475286274e-05, "loss": 0.4462, "step": 22900 }, { "epoch": 3.320650795376309, "grad_norm": 0.9723221063613892, "learning_rate": 1.680388462095956e-05, "loss": 0.4768, "step": 22910 }, { "epoch": 3.322100228285683, "grad_norm": 3.022918939590454, "learning_rate": 1.6789389766632846e-05, "loss": 0.4284, "step": 22920 }, { "epoch": 3.3235496611950577, "grad_norm": 0.8839090466499329, "learning_rate": 1.6774894912306134e-05, "loss": 0.5082, "step": 22930 }, { "epoch": 3.324999094104432, "grad_norm": 0.8981729745864868, "learning_rate": 1.676040005797942e-05, "loss": 0.4147, "step": 22940 }, { "epoch": 3.326448527013806, "grad_norm": 2.855755567550659, "learning_rate": 1.6745905203652703e-05, "loss": 0.4797, "step": 22950 }, { "epoch": 3.32789795992318, "grad_norm": 3.2790029048919678, "learning_rate": 1.673141034932599e-05, "loss": 0.5264, "step": 22960 }, { "epoch": 3.3293473928325543, "grad_norm": 0.9484438896179199, "learning_rate": 1.6716915494999278e-05, "loss": 0.4662, "step": 22970 }, { "epoch": 3.3307968257419285, "grad_norm": 1.0549322366714478, "learning_rate": 1.6702420640672563e-05, "loss": 0.4659, "step": 22980 }, { "epoch": 3.3322462586513026, "grad_norm": 1.038637399673462, "learning_rate": 1.6687925786345847e-05, "loss": 0.4936, "step": 22990 }, { "epoch": 3.333695691560677, "grad_norm": 2.0607287883758545, "learning_rate": 1.6673430932019135e-05, "loss": 0.4444, "step": 23000 }, { "epoch": 3.333695691560677, "eval_loss": 0.7252270579338074, "eval_runtime": 671.9965, "eval_samples_per_second": 51.332, "eval_steps_per_second": 2.567, "eval_token_accuracy": 0.0003898803530540306, "step": 23000 }, { "epoch": 3.335145124470051, "grad_norm": 1.2621514797210693, "learning_rate": 1.665893607769242e-05, "loss": 0.498, "step": 23010 }, { "epoch": 3.336594557379425, "grad_norm": 2.548382043838501, "learning_rate": 1.6644441223365707e-05, "loss": 0.5305, "step": 23020 }, { "epoch": 3.3380439902887993, "grad_norm": 1.046783208847046, "learning_rate": 1.662994636903899e-05, "loss": 0.4193, "step": 23030 }, { "epoch": 3.339493423198174, "grad_norm": 0.898941159248352, "learning_rate": 1.6615451514712276e-05, "loss": 0.4921, "step": 23040 }, { "epoch": 3.340942856107548, "grad_norm": 1.0653246641159058, "learning_rate": 1.6600956660385563e-05, "loss": 0.5408, "step": 23050 }, { "epoch": 3.342392289016922, "grad_norm": 1.2510879039764404, "learning_rate": 1.658646180605885e-05, "loss": 0.4975, "step": 23060 }, { "epoch": 3.3438417219262964, "grad_norm": 1.3344258069992065, "learning_rate": 1.6571966951732135e-05, "loss": 0.4376, "step": 23070 }, { "epoch": 3.3452911548356705, "grad_norm": 1.0579745769500732, "learning_rate": 1.655747209740542e-05, "loss": 0.4425, "step": 23080 }, { "epoch": 3.3467405877450447, "grad_norm": 1.7280727624893188, "learning_rate": 1.6542977243078708e-05, "loss": 0.5217, "step": 23090 }, { "epoch": 3.348190020654419, "grad_norm": 0.845836341381073, "learning_rate": 1.6528482388751995e-05, "loss": 0.4382, "step": 23100 }, { "epoch": 3.349639453563793, "grad_norm": 2.331657648086548, "learning_rate": 1.651398753442528e-05, "loss": 0.4631, "step": 23110 }, { "epoch": 3.3510888864731676, "grad_norm": 1.2817667722702026, "learning_rate": 1.6499492680098567e-05, "loss": 0.4699, "step": 23120 }, { "epoch": 3.3525383193825418, "grad_norm": 0.9075936675071716, "learning_rate": 1.6484997825771852e-05, "loss": 0.4654, "step": 23130 }, { "epoch": 3.353987752291916, "grad_norm": 1.164581537246704, "learning_rate": 1.6470502971445136e-05, "loss": 0.4603, "step": 23140 }, { "epoch": 3.35543718520129, "grad_norm": 2.0525588989257812, "learning_rate": 1.6456008117118424e-05, "loss": 0.4169, "step": 23150 }, { "epoch": 3.3568866181106642, "grad_norm": 1.2242342233657837, "learning_rate": 1.644151326279171e-05, "loss": 0.4956, "step": 23160 }, { "epoch": 3.3583360510200384, "grad_norm": 1.5872743129730225, "learning_rate": 1.6427018408464996e-05, "loss": 0.4644, "step": 23170 }, { "epoch": 3.3597854839294126, "grad_norm": 2.8896257877349854, "learning_rate": 1.641252355413828e-05, "loss": 0.4392, "step": 23180 }, { "epoch": 3.3612349168387867, "grad_norm": 3.553441286087036, "learning_rate": 1.6398028699811568e-05, "loss": 0.4607, "step": 23190 }, { "epoch": 3.362684349748161, "grad_norm": 1.3294955492019653, "learning_rate": 1.6383533845484852e-05, "loss": 0.4963, "step": 23200 }, { "epoch": 3.364133782657535, "grad_norm": 0.9303203821182251, "learning_rate": 1.636903899115814e-05, "loss": 0.4213, "step": 23210 }, { "epoch": 3.365583215566909, "grad_norm": 1.2010650634765625, "learning_rate": 1.6354544136831425e-05, "loss": 0.3944, "step": 23220 }, { "epoch": 3.367032648476284, "grad_norm": 1.1837284564971924, "learning_rate": 1.634004928250471e-05, "loss": 0.4574, "step": 23230 }, { "epoch": 3.368482081385658, "grad_norm": 0.9718520045280457, "learning_rate": 1.6325554428177997e-05, "loss": 0.4507, "step": 23240 }, { "epoch": 3.369931514295032, "grad_norm": 3.4572932720184326, "learning_rate": 1.6311059573851284e-05, "loss": 0.4998, "step": 23250 }, { "epoch": 3.3713809472044063, "grad_norm": 3.0571515560150146, "learning_rate": 1.6296564719524572e-05, "loss": 0.4589, "step": 23260 }, { "epoch": 3.3728303801137804, "grad_norm": 1.004155158996582, "learning_rate": 1.6282069865197857e-05, "loss": 0.4638, "step": 23270 }, { "epoch": 3.3742798130231546, "grad_norm": 3.1334617137908936, "learning_rate": 1.626757501087114e-05, "loss": 0.5358, "step": 23280 }, { "epoch": 3.3757292459325288, "grad_norm": 3.0823802947998047, "learning_rate": 1.625308015654443e-05, "loss": 0.4908, "step": 23290 }, { "epoch": 3.377178678841903, "grad_norm": 0.82496178150177, "learning_rate": 1.6238585302217713e-05, "loss": 0.4883, "step": 23300 }, { "epoch": 3.3786281117512775, "grad_norm": 1.1200013160705566, "learning_rate": 1.6224090447891e-05, "loss": 0.478, "step": 23310 }, { "epoch": 3.3800775446606517, "grad_norm": 3.1949923038482666, "learning_rate": 1.6209595593564285e-05, "loss": 0.5102, "step": 23320 }, { "epoch": 3.381526977570026, "grad_norm": 2.673584461212158, "learning_rate": 1.619510073923757e-05, "loss": 0.5098, "step": 23330 }, { "epoch": 3.3829764104794, "grad_norm": 0.9602708220481873, "learning_rate": 1.6180605884910857e-05, "loss": 0.4518, "step": 23340 }, { "epoch": 3.384425843388774, "grad_norm": 2.1943914890289307, "learning_rate": 1.6166111030584145e-05, "loss": 0.4692, "step": 23350 }, { "epoch": 3.3858752762981483, "grad_norm": 2.179941415786743, "learning_rate": 1.615161617625743e-05, "loss": 0.4706, "step": 23360 }, { "epoch": 3.3873247092075225, "grad_norm": 1.5116748809814453, "learning_rate": 1.6137121321930714e-05, "loss": 0.4411, "step": 23370 }, { "epoch": 3.3887741421168966, "grad_norm": 0.8765755295753479, "learning_rate": 1.6122626467604e-05, "loss": 0.4853, "step": 23380 }, { "epoch": 3.390223575026271, "grad_norm": 1.2487914562225342, "learning_rate": 1.610813161327729e-05, "loss": 0.4711, "step": 23390 }, { "epoch": 3.391673007935645, "grad_norm": 0.8878582715988159, "learning_rate": 1.6093636758950574e-05, "loss": 0.4899, "step": 23400 }, { "epoch": 3.3931224408450196, "grad_norm": 1.0657848119735718, "learning_rate": 1.6079141904623858e-05, "loss": 0.511, "step": 23410 }, { "epoch": 3.3945718737543937, "grad_norm": 1.696035385131836, "learning_rate": 1.6064647050297146e-05, "loss": 0.4409, "step": 23420 }, { "epoch": 3.396021306663768, "grad_norm": 1.4846292734146118, "learning_rate": 1.605015219597043e-05, "loss": 0.4929, "step": 23430 }, { "epoch": 3.397470739573142, "grad_norm": 5.0850419998168945, "learning_rate": 1.6035657341643718e-05, "loss": 0.5647, "step": 23440 }, { "epoch": 3.398920172482516, "grad_norm": 3.0048062801361084, "learning_rate": 1.6021162487317005e-05, "loss": 0.4346, "step": 23450 }, { "epoch": 3.4003696053918904, "grad_norm": 0.7665842175483704, "learning_rate": 1.600666763299029e-05, "loss": 0.4606, "step": 23460 }, { "epoch": 3.4018190383012645, "grad_norm": 2.541459798812866, "learning_rate": 1.5992172778663574e-05, "loss": 0.5251, "step": 23470 }, { "epoch": 3.4032684712106387, "grad_norm": 1.6156889200210571, "learning_rate": 1.5977677924336862e-05, "loss": 0.4589, "step": 23480 }, { "epoch": 3.4047179041200133, "grad_norm": 3.334068536758423, "learning_rate": 1.5963183070010146e-05, "loss": 0.4495, "step": 23490 }, { "epoch": 3.4061673370293875, "grad_norm": 1.9630388021469116, "learning_rate": 1.5948688215683434e-05, "loss": 0.4894, "step": 23500 }, { "epoch": 3.4076167699387616, "grad_norm": 1.2060139179229736, "learning_rate": 1.593419336135672e-05, "loss": 0.4561, "step": 23510 }, { "epoch": 3.4090662028481358, "grad_norm": 1.082797884941101, "learning_rate": 1.5919698507030006e-05, "loss": 0.4288, "step": 23520 }, { "epoch": 3.41051563575751, "grad_norm": 2.201314926147461, "learning_rate": 1.590520365270329e-05, "loss": 0.4776, "step": 23530 }, { "epoch": 3.411965068666884, "grad_norm": 0.8573411107063293, "learning_rate": 1.5890708798376578e-05, "loss": 0.4428, "step": 23540 }, { "epoch": 3.4134145015762583, "grad_norm": 0.9044166803359985, "learning_rate": 1.5876213944049863e-05, "loss": 0.4825, "step": 23550 }, { "epoch": 3.4148639344856324, "grad_norm": 2.2872838973999023, "learning_rate": 1.5861719089723147e-05, "loss": 0.4554, "step": 23560 }, { "epoch": 3.4163133673950066, "grad_norm": 1.6215200424194336, "learning_rate": 1.5847224235396435e-05, "loss": 0.4603, "step": 23570 }, { "epoch": 3.4177628003043807, "grad_norm": 1.0931036472320557, "learning_rate": 1.5832729381069722e-05, "loss": 0.4911, "step": 23580 }, { "epoch": 3.419212233213755, "grad_norm": 0.8152222037315369, "learning_rate": 1.5818234526743007e-05, "loss": 0.4648, "step": 23590 }, { "epoch": 3.4206616661231295, "grad_norm": 2.398775815963745, "learning_rate": 1.580373967241629e-05, "loss": 0.4543, "step": 23600 }, { "epoch": 3.4221110990325037, "grad_norm": 1.1491453647613525, "learning_rate": 1.578924481808958e-05, "loss": 0.4195, "step": 23610 }, { "epoch": 3.423560531941878, "grad_norm": 1.175913691520691, "learning_rate": 1.5774749963762863e-05, "loss": 0.5195, "step": 23620 }, { "epoch": 3.425009964851252, "grad_norm": 0.8074049949645996, "learning_rate": 1.576025510943615e-05, "loss": 0.5339, "step": 23630 }, { "epoch": 3.426459397760626, "grad_norm": 1.1793062686920166, "learning_rate": 1.574576025510944e-05, "loss": 0.4522, "step": 23640 }, { "epoch": 3.4279088306700003, "grad_norm": 0.9329249858856201, "learning_rate": 1.5731265400782723e-05, "loss": 0.4521, "step": 23650 }, { "epoch": 3.4293582635793745, "grad_norm": 1.9298598766326904, "learning_rate": 1.5716770546456008e-05, "loss": 0.4866, "step": 23660 }, { "epoch": 3.4308076964887486, "grad_norm": 1.1167508363723755, "learning_rate": 1.5702275692129295e-05, "loss": 0.4857, "step": 23670 }, { "epoch": 3.432257129398123, "grad_norm": 0.9384124875068665, "learning_rate": 1.5687780837802583e-05, "loss": 0.5053, "step": 23680 }, { "epoch": 3.4337065623074974, "grad_norm": 0.8454510569572449, "learning_rate": 1.5673285983475867e-05, "loss": 0.4679, "step": 23690 }, { "epoch": 3.4351559952168715, "grad_norm": 1.1908695697784424, "learning_rate": 1.5658791129149152e-05, "loss": 0.4517, "step": 23700 }, { "epoch": 3.4366054281262457, "grad_norm": 0.7445909976959229, "learning_rate": 1.564429627482244e-05, "loss": 0.4075, "step": 23710 }, { "epoch": 3.43805486103562, "grad_norm": 2.431640148162842, "learning_rate": 1.5629801420495724e-05, "loss": 0.4361, "step": 23720 }, { "epoch": 3.439504293944994, "grad_norm": 2.70326828956604, "learning_rate": 1.561530656616901e-05, "loss": 0.472, "step": 23730 }, { "epoch": 3.440953726854368, "grad_norm": 1.1950196027755737, "learning_rate": 1.5600811711842296e-05, "loss": 0.5528, "step": 23740 }, { "epoch": 3.4424031597637423, "grad_norm": 2.5100083351135254, "learning_rate": 1.558631685751558e-05, "loss": 0.4966, "step": 23750 }, { "epoch": 3.4438525926731165, "grad_norm": 0.8543462753295898, "learning_rate": 1.5571822003188868e-05, "loss": 0.4825, "step": 23760 }, { "epoch": 3.4453020255824907, "grad_norm": 0.789966344833374, "learning_rate": 1.5557327148862156e-05, "loss": 0.4606, "step": 23770 }, { "epoch": 3.446751458491865, "grad_norm": 1.8987724781036377, "learning_rate": 1.554283229453544e-05, "loss": 0.4361, "step": 23780 }, { "epoch": 3.4482008914012394, "grad_norm": 1.3493493795394897, "learning_rate": 1.5528337440208725e-05, "loss": 0.4777, "step": 23790 }, { "epoch": 3.4496503243106136, "grad_norm": 1.1933780908584595, "learning_rate": 1.5513842585882012e-05, "loss": 0.4228, "step": 23800 }, { "epoch": 3.4510997572199877, "grad_norm": 1.8975234031677246, "learning_rate": 1.54993477315553e-05, "loss": 0.4701, "step": 23810 }, { "epoch": 3.452549190129362, "grad_norm": 1.3678011894226074, "learning_rate": 1.5484852877228584e-05, "loss": 0.4882, "step": 23820 }, { "epoch": 3.453998623038736, "grad_norm": 1.091180443763733, "learning_rate": 1.5470358022901872e-05, "loss": 0.4821, "step": 23830 }, { "epoch": 3.45544805594811, "grad_norm": 3.8083252906799316, "learning_rate": 1.5455863168575156e-05, "loss": 0.4612, "step": 23840 }, { "epoch": 3.4568974888574844, "grad_norm": 1.3760696649551392, "learning_rate": 1.544136831424844e-05, "loss": 0.4554, "step": 23850 }, { "epoch": 3.4583469217668585, "grad_norm": 1.2829561233520508, "learning_rate": 1.542687345992173e-05, "loss": 0.4219, "step": 23860 }, { "epoch": 3.459796354676233, "grad_norm": 2.6034016609191895, "learning_rate": 1.5412378605595016e-05, "loss": 0.46, "step": 23870 }, { "epoch": 3.4612457875856073, "grad_norm": 3.3234379291534424, "learning_rate": 1.53978837512683e-05, "loss": 0.5564, "step": 23880 }, { "epoch": 3.4626952204949815, "grad_norm": 0.9328240156173706, "learning_rate": 1.5383388896941585e-05, "loss": 0.4576, "step": 23890 }, { "epoch": 3.4641446534043556, "grad_norm": 1.895174264907837, "learning_rate": 1.5368894042614873e-05, "loss": 0.4811, "step": 23900 }, { "epoch": 3.46559408631373, "grad_norm": 1.6813945770263672, "learning_rate": 1.5354399188288157e-05, "loss": 0.4107, "step": 23910 }, { "epoch": 3.467043519223104, "grad_norm": 1.6949057579040527, "learning_rate": 1.5339904333961445e-05, "loss": 0.4491, "step": 23920 }, { "epoch": 3.468492952132478, "grad_norm": 0.9117002487182617, "learning_rate": 1.532540947963473e-05, "loss": 0.4608, "step": 23930 }, { "epoch": 3.4699423850418523, "grad_norm": 1.4225014448165894, "learning_rate": 1.5310914625308017e-05, "loss": 0.4462, "step": 23940 }, { "epoch": 3.4713918179512264, "grad_norm": 0.9624481201171875, "learning_rate": 1.52964197709813e-05, "loss": 0.4547, "step": 23950 }, { "epoch": 3.4728412508606006, "grad_norm": 1.0208147764205933, "learning_rate": 1.528192491665459e-05, "loss": 0.4626, "step": 23960 }, { "epoch": 3.474290683769975, "grad_norm": 1.079161524772644, "learning_rate": 1.5267430062327877e-05, "loss": 0.4571, "step": 23970 }, { "epoch": 3.4757401166793493, "grad_norm": 2.516960859298706, "learning_rate": 1.525293520800116e-05, "loss": 0.4935, "step": 23980 }, { "epoch": 3.4771895495887235, "grad_norm": 1.0414551496505737, "learning_rate": 1.5238440353674446e-05, "loss": 0.5177, "step": 23990 }, { "epoch": 3.4786389824980977, "grad_norm": 2.013028383255005, "learning_rate": 1.5223945499347733e-05, "loss": 0.5026, "step": 24000 }, { "epoch": 3.4786389824980977, "eval_loss": 0.7205393314361572, "eval_runtime": 671.6003, "eval_samples_per_second": 51.362, "eval_steps_per_second": 2.568, "eval_token_accuracy": 0.0003924538207309549, "step": 24000 }, { "epoch": 3.480088415407472, "grad_norm": 1.3144657611846924, "learning_rate": 1.5209450645021018e-05, "loss": 0.4393, "step": 24010 }, { "epoch": 3.481537848316846, "grad_norm": 1.0083630084991455, "learning_rate": 1.5194955790694304e-05, "loss": 0.5047, "step": 24020 }, { "epoch": 3.48298728122622, "grad_norm": 1.2430204153060913, "learning_rate": 1.5180460936367591e-05, "loss": 0.4527, "step": 24030 }, { "epoch": 3.4844367141355943, "grad_norm": 2.474379062652588, "learning_rate": 1.5165966082040876e-05, "loss": 0.4234, "step": 24040 }, { "epoch": 3.4858861470449685, "grad_norm": 1.0105928182601929, "learning_rate": 1.5151471227714162e-05, "loss": 0.4968, "step": 24050 }, { "epoch": 3.487335579954343, "grad_norm": 1.6667371988296509, "learning_rate": 1.5136976373387448e-05, "loss": 0.4341, "step": 24060 }, { "epoch": 3.4887850128637172, "grad_norm": 5.3816914558410645, "learning_rate": 1.5122481519060736e-05, "loss": 0.4355, "step": 24070 }, { "epoch": 3.4902344457730914, "grad_norm": 1.5800060033798218, "learning_rate": 1.510798666473402e-05, "loss": 0.4395, "step": 24080 }, { "epoch": 3.4916838786824655, "grad_norm": 2.9925456047058105, "learning_rate": 1.5093491810407306e-05, "loss": 0.4635, "step": 24090 }, { "epoch": 3.4931333115918397, "grad_norm": 0.9235763549804688, "learning_rate": 1.5078996956080594e-05, "loss": 0.4665, "step": 24100 }, { "epoch": 3.494582744501214, "grad_norm": 2.027843952178955, "learning_rate": 1.5064502101753877e-05, "loss": 0.401, "step": 24110 }, { "epoch": 3.496032177410588, "grad_norm": 0.9020404815673828, "learning_rate": 1.5050007247427164e-05, "loss": 0.4479, "step": 24120 }, { "epoch": 3.497481610319962, "grad_norm": 0.9779691696166992, "learning_rate": 1.503551239310045e-05, "loss": 0.4969, "step": 24130 }, { "epoch": 3.4989310432293363, "grad_norm": 1.1350491046905518, "learning_rate": 1.5021017538773735e-05, "loss": 0.4676, "step": 24140 }, { "epoch": 3.5003804761387105, "grad_norm": 1.9279160499572754, "learning_rate": 1.5006522684447022e-05, "loss": 0.439, "step": 24150 }, { "epoch": 3.5018299090480847, "grad_norm": 1.7692646980285645, "learning_rate": 1.4992027830120308e-05, "loss": 0.4132, "step": 24160 }, { "epoch": 3.5032793419574593, "grad_norm": 1.7417184114456177, "learning_rate": 1.4977532975793593e-05, "loss": 0.4349, "step": 24170 }, { "epoch": 3.5047287748668334, "grad_norm": 0.9393298625946045, "learning_rate": 1.4963038121466879e-05, "loss": 0.4626, "step": 24180 }, { "epoch": 3.5061782077762076, "grad_norm": 1.083738923072815, "learning_rate": 1.4948543267140167e-05, "loss": 0.4049, "step": 24190 }, { "epoch": 3.5076276406855817, "grad_norm": 1.1180866956710815, "learning_rate": 1.4934048412813451e-05, "loss": 0.4522, "step": 24200 }, { "epoch": 3.509077073594956, "grad_norm": 0.866869330406189, "learning_rate": 1.4919553558486737e-05, "loss": 0.4841, "step": 24210 }, { "epoch": 3.51052650650433, "grad_norm": 1.915380835533142, "learning_rate": 1.4905058704160025e-05, "loss": 0.4705, "step": 24220 }, { "epoch": 3.5119759394137042, "grad_norm": 1.2995796203613281, "learning_rate": 1.489056384983331e-05, "loss": 0.4524, "step": 24230 }, { "epoch": 3.513425372323079, "grad_norm": 1.4961055517196655, "learning_rate": 1.4876068995506595e-05, "loss": 0.4159, "step": 24240 }, { "epoch": 3.514874805232453, "grad_norm": 1.1718025207519531, "learning_rate": 1.4861574141179881e-05, "loss": 0.4582, "step": 24250 }, { "epoch": 3.516324238141827, "grad_norm": 1.1462128162384033, "learning_rate": 1.4847079286853169e-05, "loss": 0.4214, "step": 24260 }, { "epoch": 3.5177736710512013, "grad_norm": 1.2410391569137573, "learning_rate": 1.4832584432526453e-05, "loss": 0.4753, "step": 24270 }, { "epoch": 3.5192231039605755, "grad_norm": 0.9186839461326599, "learning_rate": 1.481808957819974e-05, "loss": 0.5567, "step": 24280 }, { "epoch": 3.5206725368699496, "grad_norm": 1.1599667072296143, "learning_rate": 1.4803594723873027e-05, "loss": 0.398, "step": 24290 }, { "epoch": 3.522121969779324, "grad_norm": 1.4874101877212524, "learning_rate": 1.478909986954631e-05, "loss": 0.4761, "step": 24300 }, { "epoch": 3.523571402688698, "grad_norm": 1.1158050298690796, "learning_rate": 1.4774605015219598e-05, "loss": 0.469, "step": 24310 }, { "epoch": 3.525020835598072, "grad_norm": 1.2028007507324219, "learning_rate": 1.4760110160892884e-05, "loss": 0.4738, "step": 24320 }, { "epoch": 3.5264702685074463, "grad_norm": 2.6865055561065674, "learning_rate": 1.4745615306566168e-05, "loss": 0.5044, "step": 24330 }, { "epoch": 3.5279197014168204, "grad_norm": 1.1461601257324219, "learning_rate": 1.4731120452239456e-05, "loss": 0.4434, "step": 24340 }, { "epoch": 3.529369134326195, "grad_norm": 1.2691434621810913, "learning_rate": 1.4716625597912742e-05, "loss": 0.4977, "step": 24350 }, { "epoch": 3.530818567235569, "grad_norm": 1.963690996170044, "learning_rate": 1.470213074358603e-05, "loss": 0.4618, "step": 24360 }, { "epoch": 3.5322680001449434, "grad_norm": 1.308236837387085, "learning_rate": 1.4687635889259312e-05, "loss": 0.4043, "step": 24370 }, { "epoch": 3.5337174330543175, "grad_norm": 2.2788124084472656, "learning_rate": 1.46731410349326e-05, "loss": 0.4942, "step": 24380 }, { "epoch": 3.5351668659636917, "grad_norm": 2.374098300933838, "learning_rate": 1.4658646180605886e-05, "loss": 0.5085, "step": 24390 }, { "epoch": 3.536616298873066, "grad_norm": 0.970146119594574, "learning_rate": 1.464415132627917e-05, "loss": 0.5168, "step": 24400 }, { "epoch": 3.53806573178244, "grad_norm": 0.8941761255264282, "learning_rate": 1.4629656471952458e-05, "loss": 0.5228, "step": 24410 }, { "epoch": 3.5395151646918146, "grad_norm": 3.1685407161712646, "learning_rate": 1.4615161617625744e-05, "loss": 0.4554, "step": 24420 }, { "epoch": 3.5409645976011888, "grad_norm": 2.12044620513916, "learning_rate": 1.4600666763299029e-05, "loss": 0.449, "step": 24430 }, { "epoch": 3.542414030510563, "grad_norm": 1.0807843208312988, "learning_rate": 1.4586171908972315e-05, "loss": 0.4941, "step": 24440 }, { "epoch": 3.543863463419937, "grad_norm": 0.8228974938392639, "learning_rate": 1.4571677054645602e-05, "loss": 0.4542, "step": 24450 }, { "epoch": 3.5453128963293112, "grad_norm": 1.6381698846817017, "learning_rate": 1.4557182200318887e-05, "loss": 0.436, "step": 24460 }, { "epoch": 3.5467623292386854, "grad_norm": 0.8940735459327698, "learning_rate": 1.4542687345992173e-05, "loss": 0.415, "step": 24470 }, { "epoch": 3.5482117621480596, "grad_norm": 2.5338544845581055, "learning_rate": 1.452819249166546e-05, "loss": 0.4676, "step": 24480 }, { "epoch": 3.5496611950574337, "grad_norm": 2.5289785861968994, "learning_rate": 1.4513697637338747e-05, "loss": 0.4682, "step": 24490 }, { "epoch": 3.551110627966808, "grad_norm": 1.0406261682510376, "learning_rate": 1.4499202783012031e-05, "loss": 0.428, "step": 24500 }, { "epoch": 3.552560060876182, "grad_norm": 2.039503335952759, "learning_rate": 1.4484707928685317e-05, "loss": 0.437, "step": 24510 }, { "epoch": 3.554009493785556, "grad_norm": 1.3291372060775757, "learning_rate": 1.4470213074358605e-05, "loss": 0.4816, "step": 24520 }, { "epoch": 3.5554589266949304, "grad_norm": 2.6131370067596436, "learning_rate": 1.4455718220031889e-05, "loss": 0.4719, "step": 24530 }, { "epoch": 3.556908359604305, "grad_norm": 1.0953946113586426, "learning_rate": 1.4441223365705175e-05, "loss": 0.4539, "step": 24540 }, { "epoch": 3.558357792513679, "grad_norm": 0.8537795543670654, "learning_rate": 1.4426728511378463e-05, "loss": 0.4534, "step": 24550 }, { "epoch": 3.5598072254230533, "grad_norm": 1.268684983253479, "learning_rate": 1.4412233657051746e-05, "loss": 0.4564, "step": 24560 }, { "epoch": 3.5612566583324274, "grad_norm": 1.0174052715301514, "learning_rate": 1.4397738802725033e-05, "loss": 0.4801, "step": 24570 }, { "epoch": 3.5627060912418016, "grad_norm": 0.8783228397369385, "learning_rate": 1.438324394839832e-05, "loss": 0.4757, "step": 24580 }, { "epoch": 3.5641555241511758, "grad_norm": 1.3714172840118408, "learning_rate": 1.4368749094071604e-05, "loss": 0.4513, "step": 24590 }, { "epoch": 3.56560495706055, "grad_norm": 1.7273422479629517, "learning_rate": 1.4354254239744891e-05, "loss": 0.4579, "step": 24600 }, { "epoch": 3.5670543899699245, "grad_norm": 3.6264455318450928, "learning_rate": 1.4339759385418177e-05, "loss": 0.4518, "step": 24610 }, { "epoch": 3.5685038228792987, "grad_norm": 1.7638481855392456, "learning_rate": 1.4325264531091465e-05, "loss": 0.4583, "step": 24620 }, { "epoch": 3.569953255788673, "grad_norm": 1.263018012046814, "learning_rate": 1.4310769676764748e-05, "loss": 0.425, "step": 24630 }, { "epoch": 3.571402688698047, "grad_norm": 1.0282634496688843, "learning_rate": 1.4296274822438036e-05, "loss": 0.4828, "step": 24640 }, { "epoch": 3.572852121607421, "grad_norm": 1.0690590143203735, "learning_rate": 1.4281779968111322e-05, "loss": 0.4938, "step": 24650 }, { "epoch": 3.5743015545167953, "grad_norm": 1.256479024887085, "learning_rate": 1.4267285113784606e-05, "loss": 0.4262, "step": 24660 }, { "epoch": 3.5757509874261695, "grad_norm": 2.001762628555298, "learning_rate": 1.4252790259457894e-05, "loss": 0.4471, "step": 24670 }, { "epoch": 3.5772004203355436, "grad_norm": 0.9218072295188904, "learning_rate": 1.423829540513118e-05, "loss": 0.3911, "step": 24680 }, { "epoch": 3.578649853244918, "grad_norm": 2.517094135284424, "learning_rate": 1.4223800550804464e-05, "loss": 0.4531, "step": 24690 }, { "epoch": 3.580099286154292, "grad_norm": 2.5413553714752197, "learning_rate": 1.420930569647775e-05, "loss": 0.4515, "step": 24700 }, { "epoch": 3.581548719063666, "grad_norm": 2.471299409866333, "learning_rate": 1.4194810842151038e-05, "loss": 0.4395, "step": 24710 }, { "epoch": 3.5829981519730403, "grad_norm": 2.179800271987915, "learning_rate": 1.4180315987824322e-05, "loss": 0.3825, "step": 24720 }, { "epoch": 3.584447584882415, "grad_norm": 2.7383923530578613, "learning_rate": 1.4165821133497608e-05, "loss": 0.473, "step": 24730 }, { "epoch": 3.585897017791789, "grad_norm": 1.0128542184829712, "learning_rate": 1.4151326279170896e-05, "loss": 0.4712, "step": 24740 }, { "epoch": 3.587346450701163, "grad_norm": 2.4596316814422607, "learning_rate": 1.4136831424844179e-05, "loss": 0.4806, "step": 24750 }, { "epoch": 3.5887958836105374, "grad_norm": 1.1265087127685547, "learning_rate": 1.4122336570517467e-05, "loss": 0.4706, "step": 24760 }, { "epoch": 3.5902453165199115, "grad_norm": 2.9591550827026367, "learning_rate": 1.4107841716190753e-05, "loss": 0.4352, "step": 24770 }, { "epoch": 3.5916947494292857, "grad_norm": 1.0448544025421143, "learning_rate": 1.409334686186404e-05, "loss": 0.4267, "step": 24780 }, { "epoch": 3.59314418233866, "grad_norm": 2.992042064666748, "learning_rate": 1.4078852007537325e-05, "loss": 0.4428, "step": 24790 }, { "epoch": 3.5945936152480344, "grad_norm": 2.647773265838623, "learning_rate": 1.406435715321061e-05, "loss": 0.4718, "step": 24800 }, { "epoch": 3.5960430481574086, "grad_norm": 1.7813819646835327, "learning_rate": 1.4049862298883899e-05, "loss": 0.4702, "step": 24810 }, { "epoch": 3.5974924810667828, "grad_norm": 0.9270603656768799, "learning_rate": 1.4035367444557181e-05, "loss": 0.4603, "step": 24820 }, { "epoch": 3.598941913976157, "grad_norm": 5.342667102813721, "learning_rate": 1.4020872590230469e-05, "loss": 0.4678, "step": 24830 }, { "epoch": 3.600391346885531, "grad_norm": 0.9789586663246155, "learning_rate": 1.4006377735903755e-05, "loss": 0.4544, "step": 24840 }, { "epoch": 3.6018407797949052, "grad_norm": 1.0158215761184692, "learning_rate": 1.399188288157704e-05, "loss": 0.4339, "step": 24850 }, { "epoch": 3.6032902127042794, "grad_norm": 1.652602195739746, "learning_rate": 1.3977388027250327e-05, "loss": 0.457, "step": 24860 }, { "epoch": 3.6047396456136536, "grad_norm": 1.1833007335662842, "learning_rate": 1.3962893172923613e-05, "loss": 0.3642, "step": 24870 }, { "epoch": 3.6061890785230277, "grad_norm": 2.9165823459625244, "learning_rate": 1.3948398318596898e-05, "loss": 0.4549, "step": 24880 }, { "epoch": 3.607638511432402, "grad_norm": 1.0205824375152588, "learning_rate": 1.3933903464270184e-05, "loss": 0.5003, "step": 24890 }, { "epoch": 3.609087944341776, "grad_norm": 0.9283063411712646, "learning_rate": 1.3919408609943471e-05, "loss": 0.4788, "step": 24900 }, { "epoch": 3.6105373772511506, "grad_norm": 2.781982898712158, "learning_rate": 1.3904913755616757e-05, "loss": 0.4448, "step": 24910 }, { "epoch": 3.611986810160525, "grad_norm": 1.2095791101455688, "learning_rate": 1.3890418901290042e-05, "loss": 0.4371, "step": 24920 }, { "epoch": 3.613436243069899, "grad_norm": 1.2724969387054443, "learning_rate": 1.387592404696333e-05, "loss": 0.4208, "step": 24930 }, { "epoch": 3.614885675979273, "grad_norm": 0.857598066329956, "learning_rate": 1.3861429192636616e-05, "loss": 0.433, "step": 24940 }, { "epoch": 3.6163351088886473, "grad_norm": 3.5676772594451904, "learning_rate": 1.38469343383099e-05, "loss": 0.4787, "step": 24950 }, { "epoch": 3.6177845417980214, "grad_norm": 2.6916184425354004, "learning_rate": 1.3832439483983186e-05, "loss": 0.4176, "step": 24960 }, { "epoch": 3.6192339747073956, "grad_norm": 1.305999755859375, "learning_rate": 1.3817944629656474e-05, "loss": 0.4208, "step": 24970 }, { "epoch": 3.62068340761677, "grad_norm": 0.7682250738143921, "learning_rate": 1.3803449775329758e-05, "loss": 0.5436, "step": 24980 }, { "epoch": 3.6221328405261444, "grad_norm": 4.667750835418701, "learning_rate": 1.3788954921003044e-05, "loss": 0.4955, "step": 24990 }, { "epoch": 3.6235822734355185, "grad_norm": 1.856907844543457, "learning_rate": 1.3774460066676332e-05, "loss": 0.4153, "step": 25000 }, { "epoch": 3.6235822734355185, "eval_loss": 0.7244583964347839, "eval_runtime": 671.9338, "eval_samples_per_second": 51.337, "eval_steps_per_second": 2.567, "eval_token_accuracy": 0.0003887544609453762, "step": 25000 }, { "epoch": 3.6250317063448927, "grad_norm": 3.083552122116089, "learning_rate": 1.3759965212349615e-05, "loss": 0.4554, "step": 25010 }, { "epoch": 3.626481139254267, "grad_norm": 2.2491211891174316, "learning_rate": 1.3745470358022902e-05, "loss": 0.4488, "step": 25020 }, { "epoch": 3.627930572163641, "grad_norm": 1.8625367879867554, "learning_rate": 1.3730975503696188e-05, "loss": 0.4801, "step": 25030 }, { "epoch": 3.629380005073015, "grad_norm": 1.394662857055664, "learning_rate": 1.3716480649369476e-05, "loss": 0.4696, "step": 25040 }, { "epoch": 3.6308294379823893, "grad_norm": 2.025663137435913, "learning_rate": 1.370198579504276e-05, "loss": 0.5045, "step": 25050 }, { "epoch": 3.6322788708917635, "grad_norm": 0.8624992370605469, "learning_rate": 1.3687490940716046e-05, "loss": 0.4331, "step": 25060 }, { "epoch": 3.6337283038011376, "grad_norm": 1.1813052892684937, "learning_rate": 1.3674445571822002e-05, "loss": 0.4484, "step": 25070 }, { "epoch": 3.635177736710512, "grad_norm": 2.8747799396514893, "learning_rate": 1.365995071749529e-05, "loss": 0.4863, "step": 25080 }, { "epoch": 3.636627169619886, "grad_norm": 2.4268813133239746, "learning_rate": 1.3645455863168576e-05, "loss": 0.4073, "step": 25090 }, { "epoch": 3.6380766025292606, "grad_norm": 2.0211360454559326, "learning_rate": 1.363096100884186e-05, "loss": 0.4031, "step": 25100 }, { "epoch": 3.6395260354386347, "grad_norm": 1.2551864385604858, "learning_rate": 1.3616466154515148e-05, "loss": 0.4496, "step": 25110 }, { "epoch": 3.640975468348009, "grad_norm": 1.1632791757583618, "learning_rate": 1.3601971300188434e-05, "loss": 0.4563, "step": 25120 }, { "epoch": 3.642424901257383, "grad_norm": 1.17465078830719, "learning_rate": 1.3587476445861719e-05, "loss": 0.4994, "step": 25130 }, { "epoch": 3.643874334166757, "grad_norm": 3.1795096397399902, "learning_rate": 1.3572981591535005e-05, "loss": 0.4513, "step": 25140 }, { "epoch": 3.6453237670761314, "grad_norm": 2.9688425064086914, "learning_rate": 1.3558486737208293e-05, "loss": 0.4774, "step": 25150 }, { "epoch": 3.6467731999855055, "grad_norm": 1.1822007894515991, "learning_rate": 1.3543991882881577e-05, "loss": 0.4411, "step": 25160 }, { "epoch": 3.64822263289488, "grad_norm": 1.930434226989746, "learning_rate": 1.3529497028554863e-05, "loss": 0.48, "step": 25170 }, { "epoch": 3.6496720658042543, "grad_norm": 2.6842262744903564, "learning_rate": 1.351500217422815e-05, "loss": 0.4468, "step": 25180 }, { "epoch": 3.6511214987136285, "grad_norm": 2.477809190750122, "learning_rate": 1.3500507319901437e-05, "loss": 0.4006, "step": 25190 }, { "epoch": 3.6525709316230026, "grad_norm": 1.701682209968567, "learning_rate": 1.3486012465574721e-05, "loss": 0.4274, "step": 25200 }, { "epoch": 3.6540203645323768, "grad_norm": 2.3307900428771973, "learning_rate": 1.3471517611248007e-05, "loss": 0.462, "step": 25210 }, { "epoch": 3.655469797441751, "grad_norm": 2.1112964153289795, "learning_rate": 1.3457022756921295e-05, "loss": 0.4564, "step": 25220 }, { "epoch": 3.656919230351125, "grad_norm": 1.1961227655410767, "learning_rate": 1.344252790259458e-05, "loss": 0.4937, "step": 25230 }, { "epoch": 3.6583686632604993, "grad_norm": 3.18811297416687, "learning_rate": 1.3428033048267865e-05, "loss": 0.4762, "step": 25240 }, { "epoch": 3.6598180961698734, "grad_norm": 1.4780430793762207, "learning_rate": 1.3413538193941153e-05, "loss": 0.4465, "step": 25250 }, { "epoch": 3.6612675290792476, "grad_norm": 2.3059370517730713, "learning_rate": 1.3399043339614437e-05, "loss": 0.4737, "step": 25260 }, { "epoch": 3.6627169619886217, "grad_norm": 1.1574058532714844, "learning_rate": 1.3384548485287723e-05, "loss": 0.4123, "step": 25270 }, { "epoch": 3.664166394897996, "grad_norm": 2.186791181564331, "learning_rate": 1.337005363096101e-05, "loss": 0.5098, "step": 25280 }, { "epoch": 3.6656158278073705, "grad_norm": 1.4572516679763794, "learning_rate": 1.3355558776634294e-05, "loss": 0.4766, "step": 25290 }, { "epoch": 3.6670652607167447, "grad_norm": 3.103440523147583, "learning_rate": 1.3341063922307582e-05, "loss": 0.4361, "step": 25300 }, { "epoch": 3.668514693626119, "grad_norm": 2.1486423015594482, "learning_rate": 1.3326569067980868e-05, "loss": 0.4026, "step": 25310 }, { "epoch": 3.669964126535493, "grad_norm": 1.179584264755249, "learning_rate": 1.3312074213654152e-05, "loss": 0.4723, "step": 25320 }, { "epoch": 3.671413559444867, "grad_norm": 1.3920953273773193, "learning_rate": 1.3297579359327438e-05, "loss": 0.4538, "step": 25330 }, { "epoch": 3.6728629923542413, "grad_norm": 2.763362169265747, "learning_rate": 1.3283084505000726e-05, "loss": 0.4749, "step": 25340 }, { "epoch": 3.6743124252636155, "grad_norm": 2.5469725131988525, "learning_rate": 1.3268589650674012e-05, "loss": 0.4669, "step": 25350 }, { "epoch": 3.67576185817299, "grad_norm": 2.4919934272766113, "learning_rate": 1.3254094796347296e-05, "loss": 0.4622, "step": 25360 }, { "epoch": 3.677211291082364, "grad_norm": 1.1692922115325928, "learning_rate": 1.3239599942020584e-05, "loss": 0.4486, "step": 25370 }, { "epoch": 3.6786607239917384, "grad_norm": 2.3812007904052734, "learning_rate": 1.322510508769387e-05, "loss": 0.487, "step": 25380 }, { "epoch": 3.6801101569011125, "grad_norm": 0.8145487904548645, "learning_rate": 1.3210610233367154e-05, "loss": 0.3698, "step": 25390 }, { "epoch": 3.6815595898104867, "grad_norm": 1.1780279874801636, "learning_rate": 1.319611537904044e-05, "loss": 0.4306, "step": 25400 }, { "epoch": 3.683009022719861, "grad_norm": 1.7611922025680542, "learning_rate": 1.3181620524713728e-05, "loss": 0.4532, "step": 25410 }, { "epoch": 3.684458455629235, "grad_norm": 4.195759296417236, "learning_rate": 1.3167125670387013e-05, "loss": 0.4641, "step": 25420 }, { "epoch": 3.685907888538609, "grad_norm": 1.1716632843017578, "learning_rate": 1.3152630816060299e-05, "loss": 0.5277, "step": 25430 }, { "epoch": 3.6873573214479833, "grad_norm": 2.7256765365600586, "learning_rate": 1.3138135961733586e-05, "loss": 0.4893, "step": 25440 }, { "epoch": 3.6888067543573575, "grad_norm": 0.9985123872756958, "learning_rate": 1.312364110740687e-05, "loss": 0.4418, "step": 25450 }, { "epoch": 3.6902561872667317, "grad_norm": 1.2271908521652222, "learning_rate": 1.3109146253080157e-05, "loss": 0.4941, "step": 25460 }, { "epoch": 3.6917056201761063, "grad_norm": 2.9102699756622314, "learning_rate": 1.3094651398753443e-05, "loss": 0.4088, "step": 25470 }, { "epoch": 3.6931550530854804, "grad_norm": 2.339369773864746, "learning_rate": 1.308015654442673e-05, "loss": 0.4207, "step": 25480 }, { "epoch": 3.6946044859948546, "grad_norm": 2.232353687286377, "learning_rate": 1.3065661690100015e-05, "loss": 0.4714, "step": 25490 }, { "epoch": 3.6960539189042287, "grad_norm": 0.9081094861030579, "learning_rate": 1.3051166835773301e-05, "loss": 0.4106, "step": 25500 }, { "epoch": 3.697503351813603, "grad_norm": 0.7285069227218628, "learning_rate": 1.3036671981446589e-05, "loss": 0.4218, "step": 25510 }, { "epoch": 3.698952784722977, "grad_norm": 3.198972225189209, "learning_rate": 1.3022177127119871e-05, "loss": 0.4388, "step": 25520 }, { "epoch": 3.700402217632351, "grad_norm": 1.26154625415802, "learning_rate": 1.300768227279316e-05, "loss": 0.4395, "step": 25530 }, { "epoch": 3.701851650541726, "grad_norm": 3.3266167640686035, "learning_rate": 1.2993187418466445e-05, "loss": 0.4497, "step": 25540 }, { "epoch": 3.7033010834511, "grad_norm": 1.1532198190689087, "learning_rate": 1.297869256413973e-05, "loss": 0.4678, "step": 25550 }, { "epoch": 3.704750516360474, "grad_norm": 1.0286312103271484, "learning_rate": 1.2964197709813017e-05, "loss": 0.5017, "step": 25560 }, { "epoch": 3.7061999492698483, "grad_norm": 2.742030382156372, "learning_rate": 1.2949702855486303e-05, "loss": 0.4423, "step": 25570 }, { "epoch": 3.7076493821792225, "grad_norm": 1.203541874885559, "learning_rate": 1.2935208001159588e-05, "loss": 0.4548, "step": 25580 }, { "epoch": 3.7090988150885966, "grad_norm": 0.8006945848464966, "learning_rate": 1.2920713146832874e-05, "loss": 0.4748, "step": 25590 }, { "epoch": 3.710548247997971, "grad_norm": 1.4705841541290283, "learning_rate": 1.2906218292506162e-05, "loss": 0.4989, "step": 25600 }, { "epoch": 3.711997680907345, "grad_norm": 2.7385144233703613, "learning_rate": 1.2891723438179448e-05, "loss": 0.4733, "step": 25610 }, { "epoch": 3.713447113816719, "grad_norm": 1.4703665971755981, "learning_rate": 1.2877228583852732e-05, "loss": 0.4562, "step": 25620 }, { "epoch": 3.7148965467260933, "grad_norm": 1.0603382587432861, "learning_rate": 1.286273372952602e-05, "loss": 0.4353, "step": 25630 }, { "epoch": 3.7163459796354674, "grad_norm": 0.9223389625549316, "learning_rate": 1.2848238875199306e-05, "loss": 0.4356, "step": 25640 }, { "epoch": 3.7177954125448416, "grad_norm": 1.1404002904891968, "learning_rate": 1.283374402087259e-05, "loss": 0.4852, "step": 25650 }, { "epoch": 3.719244845454216, "grad_norm": 6.546042442321777, "learning_rate": 1.2819249166545876e-05, "loss": 0.4939, "step": 25660 }, { "epoch": 3.7206942783635903, "grad_norm": 1.3828874826431274, "learning_rate": 1.2804754312219164e-05, "loss": 0.4718, "step": 25670 }, { "epoch": 3.7221437112729645, "grad_norm": 4.71876859664917, "learning_rate": 1.2790259457892448e-05, "loss": 0.4888, "step": 25680 }, { "epoch": 3.7235931441823387, "grad_norm": 0.9733912944793701, "learning_rate": 1.2775764603565734e-05, "loss": 0.4908, "step": 25690 }, { "epoch": 3.725042577091713, "grad_norm": 1.7667779922485352, "learning_rate": 1.2761269749239022e-05, "loss": 0.4729, "step": 25700 }, { "epoch": 3.726492010001087, "grad_norm": 2.0237643718719482, "learning_rate": 1.2746774894912306e-05, "loss": 0.3917, "step": 25710 }, { "epoch": 3.727941442910461, "grad_norm": 1.8370978832244873, "learning_rate": 1.2732280040585592e-05, "loss": 0.565, "step": 25720 }, { "epoch": 3.7293908758198357, "grad_norm": 1.0702518224716187, "learning_rate": 1.2717785186258879e-05, "loss": 0.4429, "step": 25730 }, { "epoch": 3.73084030872921, "grad_norm": 2.3258419036865234, "learning_rate": 1.2703290331932163e-05, "loss": 0.4304, "step": 25740 }, { "epoch": 3.732289741638584, "grad_norm": 0.7933976650238037, "learning_rate": 1.268879547760545e-05, "loss": 0.5605, "step": 25750 }, { "epoch": 3.7337391745479582, "grad_norm": 0.957318127155304, "learning_rate": 1.2674300623278737e-05, "loss": 0.4661, "step": 25760 }, { "epoch": 3.7351886074573324, "grad_norm": 2.946476697921753, "learning_rate": 1.2659805768952024e-05, "loss": 0.4515, "step": 25770 }, { "epoch": 3.7366380403667065, "grad_norm": 1.1148138046264648, "learning_rate": 1.2645310914625307e-05, "loss": 0.4709, "step": 25780 }, { "epoch": 3.7380874732760807, "grad_norm": 1.095963716506958, "learning_rate": 1.2630816060298595e-05, "loss": 0.4869, "step": 25790 }, { "epoch": 3.739536906185455, "grad_norm": 1.272081971168518, "learning_rate": 1.2616321205971881e-05, "loss": 0.5348, "step": 25800 }, { "epoch": 3.740986339094829, "grad_norm": 0.9788115620613098, "learning_rate": 1.2601826351645165e-05, "loss": 0.4189, "step": 25810 }, { "epoch": 3.742435772004203, "grad_norm": 1.0667412281036377, "learning_rate": 1.2587331497318453e-05, "loss": 0.4168, "step": 25820 }, { "epoch": 3.7438852049135773, "grad_norm": 1.109317660331726, "learning_rate": 1.2572836642991739e-05, "loss": 0.4634, "step": 25830 }, { "epoch": 3.7453346378229515, "grad_norm": 1.8557586669921875, "learning_rate": 1.2558341788665023e-05, "loss": 0.4367, "step": 25840 }, { "epoch": 3.746784070732326, "grad_norm": 0.8158989548683167, "learning_rate": 1.254384693433831e-05, "loss": 0.4793, "step": 25850 }, { "epoch": 3.7482335036417003, "grad_norm": 3.7515580654144287, "learning_rate": 1.2529352080011597e-05, "loss": 0.4282, "step": 25860 }, { "epoch": 3.7496829365510744, "grad_norm": 2.240804433822632, "learning_rate": 1.2514857225684882e-05, "loss": 0.4173, "step": 25870 }, { "epoch": 3.7511323694604486, "grad_norm": 1.3406633138656616, "learning_rate": 1.2500362371358168e-05, "loss": 0.4622, "step": 25880 }, { "epoch": 3.7525818023698227, "grad_norm": 1.781441330909729, "learning_rate": 1.2485867517031455e-05, "loss": 0.4107, "step": 25890 }, { "epoch": 3.754031235279197, "grad_norm": 1.1386650800704956, "learning_rate": 1.247137266270474e-05, "loss": 0.4773, "step": 25900 }, { "epoch": 3.755480668188571, "grad_norm": 1.1496398448944092, "learning_rate": 1.2456877808378026e-05, "loss": 0.467, "step": 25910 }, { "epoch": 3.7569301010979457, "grad_norm": 0.9759547710418701, "learning_rate": 1.2442382954051312e-05, "loss": 0.4276, "step": 25920 }, { "epoch": 3.75837953400732, "grad_norm": 2.8767662048339844, "learning_rate": 1.2427888099724598e-05, "loss": 0.4146, "step": 25930 }, { "epoch": 3.759828966916694, "grad_norm": 0.8361555933952332, "learning_rate": 1.2413393245397886e-05, "loss": 0.4557, "step": 25940 }, { "epoch": 3.761278399826068, "grad_norm": 1.1116259098052979, "learning_rate": 1.239889839107117e-05, "loss": 0.4553, "step": 25950 }, { "epoch": 3.7627278327354423, "grad_norm": 1.1028730869293213, "learning_rate": 1.2384403536744456e-05, "loss": 0.438, "step": 25960 }, { "epoch": 3.7641772656448165, "grad_norm": 2.509413242340088, "learning_rate": 1.2369908682417742e-05, "loss": 0.4404, "step": 25970 }, { "epoch": 3.7656266985541906, "grad_norm": 0.8821387887001038, "learning_rate": 1.2355413828091028e-05, "loss": 0.4269, "step": 25980 }, { "epoch": 3.767076131463565, "grad_norm": 1.5066933631896973, "learning_rate": 1.2340918973764314e-05, "loss": 0.4466, "step": 25990 }, { "epoch": 3.768525564372939, "grad_norm": 0.8262946605682373, "learning_rate": 1.23264241194376e-05, "loss": 0.413, "step": 26000 }, { "epoch": 3.768525564372939, "eval_loss": 0.7187819480895996, "eval_runtime": 671.7423, "eval_samples_per_second": 51.352, "eval_steps_per_second": 2.568, "eval_token_accuracy": 0.00039164961208191607, "step": 26000 }, { "epoch": 3.769974997282313, "grad_norm": 2.168036699295044, "learning_rate": 1.2311929265110886e-05, "loss": 0.3854, "step": 26010 }, { "epoch": 3.7714244301916873, "grad_norm": 1.015336275100708, "learning_rate": 1.2297434410784172e-05, "loss": 0.4675, "step": 26020 }, { "epoch": 3.772873863101062, "grad_norm": 1.6049621105194092, "learning_rate": 1.2282939556457458e-05, "loss": 0.4444, "step": 26030 }, { "epoch": 3.774323296010436, "grad_norm": 1.115004301071167, "learning_rate": 1.2268444702130743e-05, "loss": 0.483, "step": 26040 }, { "epoch": 3.77577272891981, "grad_norm": 1.3188127279281616, "learning_rate": 1.225394984780403e-05, "loss": 0.4628, "step": 26050 }, { "epoch": 3.7772221618291844, "grad_norm": 1.8667047023773193, "learning_rate": 1.2239454993477317e-05, "loss": 0.4742, "step": 26060 }, { "epoch": 3.7786715947385585, "grad_norm": 1.2756224870681763, "learning_rate": 1.2224960139150603e-05, "loss": 0.4479, "step": 26070 }, { "epoch": 3.7801210276479327, "grad_norm": 1.8248677253723145, "learning_rate": 1.2210465284823889e-05, "loss": 0.4322, "step": 26080 }, { "epoch": 3.781570460557307, "grad_norm": 1.1138015985488892, "learning_rate": 1.2195970430497173e-05, "loss": 0.489, "step": 26090 }, { "epoch": 3.7830198934666814, "grad_norm": 1.01140558719635, "learning_rate": 1.218147557617046e-05, "loss": 0.4234, "step": 26100 }, { "epoch": 3.7844693263760556, "grad_norm": 0.93239426612854, "learning_rate": 1.2166980721843745e-05, "loss": 0.4982, "step": 26110 }, { "epoch": 3.7859187592854298, "grad_norm": 1.03866708278656, "learning_rate": 1.2152485867517031e-05, "loss": 0.495, "step": 26120 }, { "epoch": 3.787368192194804, "grad_norm": 1.112883448600769, "learning_rate": 1.2137991013190319e-05, "loss": 0.4461, "step": 26130 }, { "epoch": 3.788817625104178, "grad_norm": 1.2414758205413818, "learning_rate": 1.2123496158863603e-05, "loss": 0.4634, "step": 26140 }, { "epoch": 3.7902670580135522, "grad_norm": 1.368895411491394, "learning_rate": 1.2109001304536891e-05, "loss": 0.4817, "step": 26150 }, { "epoch": 3.7917164909229264, "grad_norm": 0.937118411064148, "learning_rate": 1.2094506450210175e-05, "loss": 0.448, "step": 26160 }, { "epoch": 3.7931659238323006, "grad_norm": 1.29416823387146, "learning_rate": 1.2080011595883461e-05, "loss": 0.5016, "step": 26170 }, { "epoch": 3.7946153567416747, "grad_norm": 1.7470028400421143, "learning_rate": 1.2065516741556748e-05, "loss": 0.4548, "step": 26180 }, { "epoch": 3.796064789651049, "grad_norm": 3.248729705810547, "learning_rate": 1.2051021887230034e-05, "loss": 0.4589, "step": 26190 }, { "epoch": 3.797514222560423, "grad_norm": 3.4089272022247314, "learning_rate": 1.2036527032903321e-05, "loss": 0.4598, "step": 26200 }, { "epoch": 3.798963655469797, "grad_norm": 1.076757550239563, "learning_rate": 1.2022032178576606e-05, "loss": 0.4611, "step": 26210 }, { "epoch": 3.800413088379172, "grad_norm": 1.8356128931045532, "learning_rate": 1.2007537324249892e-05, "loss": 0.4543, "step": 26220 }, { "epoch": 3.801862521288546, "grad_norm": 0.8461928367614746, "learning_rate": 1.1993042469923178e-05, "loss": 0.4862, "step": 26230 }, { "epoch": 3.80331195419792, "grad_norm": 2.1040289402008057, "learning_rate": 1.1978547615596464e-05, "loss": 0.5152, "step": 26240 }, { "epoch": 3.8047613871072943, "grad_norm": 0.9999257922172546, "learning_rate": 1.196405276126975e-05, "loss": 0.4309, "step": 26250 }, { "epoch": 3.8062108200166684, "grad_norm": 1.2932193279266357, "learning_rate": 1.1949557906943036e-05, "loss": 0.4541, "step": 26260 }, { "epoch": 3.8076602529260426, "grad_norm": 1.0218082666397095, "learning_rate": 1.1935063052616322e-05, "loss": 0.4744, "step": 26270 }, { "epoch": 3.8091096858354168, "grad_norm": 1.7632607221603394, "learning_rate": 1.1920568198289608e-05, "loss": 0.4709, "step": 26280 }, { "epoch": 3.8105591187447914, "grad_norm": 1.796879768371582, "learning_rate": 1.1906073343962894e-05, "loss": 0.4939, "step": 26290 }, { "epoch": 3.8120085516541655, "grad_norm": 1.1654222011566162, "learning_rate": 1.1891578489636178e-05, "loss": 0.5227, "step": 26300 }, { "epoch": 3.8134579845635397, "grad_norm": 2.0588326454162598, "learning_rate": 1.1877083635309466e-05, "loss": 0.4955, "step": 26310 }, { "epoch": 3.814907417472914, "grad_norm": 2.2136971950531006, "learning_rate": 1.1862588780982752e-05, "loss": 0.4586, "step": 26320 }, { "epoch": 3.816356850382288, "grad_norm": 1.157351016998291, "learning_rate": 1.1848093926656037e-05, "loss": 0.4496, "step": 26330 }, { "epoch": 3.817806283291662, "grad_norm": 1.9036961793899536, "learning_rate": 1.1833599072329324e-05, "loss": 0.4533, "step": 26340 }, { "epoch": 3.8192557162010363, "grad_norm": 2.055697441101074, "learning_rate": 1.1819104218002609e-05, "loss": 0.4858, "step": 26350 }, { "epoch": 3.8207051491104105, "grad_norm": 3.0185952186584473, "learning_rate": 1.1804609363675897e-05, "loss": 0.4604, "step": 26360 }, { "epoch": 3.8221545820197846, "grad_norm": 0.9421207308769226, "learning_rate": 1.1790114509349181e-05, "loss": 0.4845, "step": 26370 }, { "epoch": 3.823604014929159, "grad_norm": 1.070281982421875, "learning_rate": 1.1775619655022467e-05, "loss": 0.3999, "step": 26380 }, { "epoch": 3.825053447838533, "grad_norm": 1.2085710763931274, "learning_rate": 1.1761124800695755e-05, "loss": 0.4798, "step": 26390 }, { "epoch": 3.826502880747907, "grad_norm": 1.1355644464492798, "learning_rate": 1.1746629946369039e-05, "loss": 0.4467, "step": 26400 }, { "epoch": 3.8279523136572817, "grad_norm": 0.9834990501403809, "learning_rate": 1.1732135092042327e-05, "loss": 0.4489, "step": 26410 }, { "epoch": 3.829401746566656, "grad_norm": 2.566068649291992, "learning_rate": 1.1717640237715611e-05, "loss": 0.4419, "step": 26420 }, { "epoch": 3.83085117947603, "grad_norm": 1.464672327041626, "learning_rate": 1.1703145383388897e-05, "loss": 0.411, "step": 26430 }, { "epoch": 3.832300612385404, "grad_norm": 1.2169744968414307, "learning_rate": 1.1688650529062183e-05, "loss": 0.4677, "step": 26440 }, { "epoch": 3.8337500452947784, "grad_norm": 1.1080471277236938, "learning_rate": 1.167415567473547e-05, "loss": 0.5183, "step": 26450 }, { "epoch": 3.8351994782041525, "grad_norm": 2.5729031562805176, "learning_rate": 1.1659660820408755e-05, "loss": 0.4575, "step": 26460 }, { "epoch": 3.8366489111135267, "grad_norm": 2.8056209087371826, "learning_rate": 1.1645165966082041e-05, "loss": 0.4587, "step": 26470 }, { "epoch": 3.8380983440229013, "grad_norm": 2.494399070739746, "learning_rate": 1.1630671111755327e-05, "loss": 0.4396, "step": 26480 }, { "epoch": 3.8395477769322754, "grad_norm": 1.3486852645874023, "learning_rate": 1.1616176257428614e-05, "loss": 0.4613, "step": 26490 }, { "epoch": 3.8409972098416496, "grad_norm": 0.9714551568031311, "learning_rate": 1.16016814031019e-05, "loss": 0.4485, "step": 26500 }, { "epoch": 3.8424466427510238, "grad_norm": 1.7452738285064697, "learning_rate": 1.1587186548775186e-05, "loss": 0.4754, "step": 26510 }, { "epoch": 3.843896075660398, "grad_norm": 2.8123321533203125, "learning_rate": 1.1572691694448472e-05, "loss": 0.5065, "step": 26520 }, { "epoch": 3.845345508569772, "grad_norm": 1.3239609003067017, "learning_rate": 1.1558196840121758e-05, "loss": 0.4508, "step": 26530 }, { "epoch": 3.8467949414791462, "grad_norm": 2.126450777053833, "learning_rate": 1.1543701985795042e-05, "loss": 0.4637, "step": 26540 }, { "epoch": 3.8482443743885204, "grad_norm": 1.0921045541763306, "learning_rate": 1.152920713146833e-05, "loss": 0.5123, "step": 26550 }, { "epoch": 3.8496938072978946, "grad_norm": 2.9524543285369873, "learning_rate": 1.1514712277141614e-05, "loss": 0.4508, "step": 26560 }, { "epoch": 3.8511432402072687, "grad_norm": 0.8647069334983826, "learning_rate": 1.1500217422814902e-05, "loss": 0.4644, "step": 26570 }, { "epoch": 3.852592673116643, "grad_norm": 2.2827160358428955, "learning_rate": 1.1485722568488188e-05, "loss": 0.4451, "step": 26580 }, { "epoch": 3.854042106026017, "grad_norm": 2.2924423217773438, "learning_rate": 1.1471227714161472e-05, "loss": 0.4475, "step": 26590 }, { "epoch": 3.8554915389353916, "grad_norm": 1.771622896194458, "learning_rate": 1.145673285983476e-05, "loss": 0.4433, "step": 26600 }, { "epoch": 3.856940971844766, "grad_norm": 1.3076894283294678, "learning_rate": 1.1442238005508044e-05, "loss": 0.3952, "step": 26610 }, { "epoch": 3.85839040475414, "grad_norm": 1.0110349655151367, "learning_rate": 1.1427743151181332e-05, "loss": 0.4517, "step": 26620 }, { "epoch": 3.859839837663514, "grad_norm": 3.608510732650757, "learning_rate": 1.1413248296854617e-05, "loss": 0.4661, "step": 26630 }, { "epoch": 3.8612892705728883, "grad_norm": 2.3432466983795166, "learning_rate": 1.1398753442527903e-05, "loss": 0.4391, "step": 26640 }, { "epoch": 3.8627387034822624, "grad_norm": 1.1964057683944702, "learning_rate": 1.138425858820119e-05, "loss": 0.4588, "step": 26650 }, { "epoch": 3.8641881363916366, "grad_norm": 2.234063148498535, "learning_rate": 1.1369763733874475e-05, "loss": 0.4173, "step": 26660 }, { "epoch": 3.865637569301011, "grad_norm": 3.324658155441284, "learning_rate": 1.135526887954776e-05, "loss": 0.4125, "step": 26670 }, { "epoch": 3.8670870022103854, "grad_norm": 1.9230751991271973, "learning_rate": 1.1340774025221047e-05, "loss": 0.4814, "step": 26680 }, { "epoch": 3.8685364351197595, "grad_norm": 3.1007161140441895, "learning_rate": 1.1326279170894333e-05, "loss": 0.4195, "step": 26690 }, { "epoch": 3.8699858680291337, "grad_norm": 2.6854794025421143, "learning_rate": 1.1311784316567619e-05, "loss": 0.4355, "step": 26700 }, { "epoch": 3.871435300938508, "grad_norm": 0.9971222281455994, "learning_rate": 1.1297289462240905e-05, "loss": 0.4344, "step": 26710 }, { "epoch": 3.872884733847882, "grad_norm": 1.202845811843872, "learning_rate": 1.1282794607914191e-05, "loss": 0.5113, "step": 26720 }, { "epoch": 3.874334166757256, "grad_norm": 2.6811609268188477, "learning_rate": 1.1268299753587477e-05, "loss": 0.4481, "step": 26730 }, { "epoch": 3.8757835996666303, "grad_norm": 2.278353452682495, "learning_rate": 1.1253804899260763e-05, "loss": 0.4706, "step": 26740 }, { "epoch": 3.8772330325760045, "grad_norm": 3.3476059436798096, "learning_rate": 1.1239310044934047e-05, "loss": 0.513, "step": 26750 }, { "epoch": 3.8786824654853786, "grad_norm": 1.9155887365341187, "learning_rate": 1.1224815190607335e-05, "loss": 0.4431, "step": 26760 }, { "epoch": 3.880131898394753, "grad_norm": 1.286580204963684, "learning_rate": 1.1210320336280621e-05, "loss": 0.5217, "step": 26770 }, { "epoch": 3.8815813313041274, "grad_norm": 1.9572170972824097, "learning_rate": 1.1195825481953907e-05, "loss": 0.5095, "step": 26780 }, { "epoch": 3.8830307642135016, "grad_norm": 0.9819499254226685, "learning_rate": 1.1181330627627193e-05, "loss": 0.4438, "step": 26790 }, { "epoch": 3.8844801971228757, "grad_norm": 2.0708963871002197, "learning_rate": 1.1166835773300478e-05, "loss": 0.4139, "step": 26800 }, { "epoch": 3.88592963003225, "grad_norm": 3.3944318294525146, "learning_rate": 1.1152340918973766e-05, "loss": 0.459, "step": 26810 }, { "epoch": 3.887379062941624, "grad_norm": 1.255771279335022, "learning_rate": 1.113784606464705e-05, "loss": 0.438, "step": 26820 }, { "epoch": 3.888828495850998, "grad_norm": 0.9337121844291687, "learning_rate": 1.1123351210320338e-05, "loss": 0.4895, "step": 26830 }, { "epoch": 3.8902779287603724, "grad_norm": 1.0607869625091553, "learning_rate": 1.1108856355993624e-05, "loss": 0.4407, "step": 26840 }, { "epoch": 3.891727361669747, "grad_norm": 0.9897153377532959, "learning_rate": 1.1094361501666908e-05, "loss": 0.4146, "step": 26850 }, { "epoch": 3.893176794579121, "grad_norm": 0.8319249749183655, "learning_rate": 1.1079866647340196e-05, "loss": 0.4798, "step": 26860 }, { "epoch": 3.8946262274884953, "grad_norm": 2.0311408042907715, "learning_rate": 1.106537179301348e-05, "loss": 0.4787, "step": 26870 }, { "epoch": 3.8960756603978695, "grad_norm": 1.3111026287078857, "learning_rate": 1.1050876938686766e-05, "loss": 0.4265, "step": 26880 }, { "epoch": 3.8975250933072436, "grad_norm": 2.4597795009613037, "learning_rate": 1.1036382084360052e-05, "loss": 0.429, "step": 26890 }, { "epoch": 3.8989745262166178, "grad_norm": 2.432349443435669, "learning_rate": 1.1021887230033338e-05, "loss": 0.4206, "step": 26900 }, { "epoch": 3.900423959125992, "grad_norm": 0.9375098347663879, "learning_rate": 1.1007392375706626e-05, "loss": 0.4607, "step": 26910 }, { "epoch": 3.901873392035366, "grad_norm": 0.8173941373825073, "learning_rate": 1.099289752137991e-05, "loss": 0.3743, "step": 26920 }, { "epoch": 3.9033228249447403, "grad_norm": 1.374294638633728, "learning_rate": 1.0978402667053196e-05, "loss": 0.5601, "step": 26930 }, { "epoch": 3.9047722578541144, "grad_norm": 1.6747016906738281, "learning_rate": 1.0963907812726483e-05, "loss": 0.4217, "step": 26940 }, { "epoch": 3.9062216907634886, "grad_norm": 1.1631803512573242, "learning_rate": 1.0949412958399769e-05, "loss": 0.5497, "step": 26950 }, { "epoch": 3.9076711236728627, "grad_norm": 1.1909114122390747, "learning_rate": 1.0934918104073055e-05, "loss": 0.4395, "step": 26960 }, { "epoch": 3.9091205565822373, "grad_norm": 1.498050570487976, "learning_rate": 1.092042324974634e-05, "loss": 0.4709, "step": 26970 }, { "epoch": 3.9105699894916115, "grad_norm": 2.889643669128418, "learning_rate": 1.0905928395419627e-05, "loss": 0.4511, "step": 26980 }, { "epoch": 3.9120194224009857, "grad_norm": 1.0335190296173096, "learning_rate": 1.0891433541092913e-05, "loss": 0.5079, "step": 26990 }, { "epoch": 3.91346885531036, "grad_norm": 0.8706920146942139, "learning_rate": 1.0876938686766199e-05, "loss": 0.4817, "step": 27000 }, { "epoch": 3.91346885531036, "eval_loss": 0.7067095041275024, "eval_runtime": 670.7773, "eval_samples_per_second": 51.425, "eval_steps_per_second": 2.572, "eval_token_accuracy": 0.00039116708689249276, "step": 27000 }, { "epoch": 3.914918288219734, "grad_norm": 1.17312753200531, "learning_rate": 1.0862443832439483e-05, "loss": 0.438, "step": 27010 }, { "epoch": 3.916367721129108, "grad_norm": 1.008259892463684, "learning_rate": 1.0847948978112771e-05, "loss": 0.4378, "step": 27020 }, { "epoch": 3.9178171540384823, "grad_norm": 1.0406113862991333, "learning_rate": 1.0833454123786057e-05, "loss": 0.413, "step": 27030 }, { "epoch": 3.919266586947857, "grad_norm": 2.5961663722991943, "learning_rate": 1.0818959269459343e-05, "loss": 0.5262, "step": 27040 }, { "epoch": 3.920716019857231, "grad_norm": 0.8761188387870789, "learning_rate": 1.0804464415132629e-05, "loss": 0.5301, "step": 27050 }, { "epoch": 3.922165452766605, "grad_norm": 0.9454947113990784, "learning_rate": 1.0789969560805913e-05, "loss": 0.481, "step": 27060 }, { "epoch": 3.9236148856759794, "grad_norm": 3.1509249210357666, "learning_rate": 1.0775474706479201e-05, "loss": 0.4608, "step": 27070 }, { "epoch": 3.9250643185853535, "grad_norm": 3.173954963684082, "learning_rate": 1.0760979852152486e-05, "loss": 0.4451, "step": 27080 }, { "epoch": 3.9265137514947277, "grad_norm": 0.879859209060669, "learning_rate": 1.0746484997825772e-05, "loss": 0.4301, "step": 27090 }, { "epoch": 3.927963184404102, "grad_norm": 1.0729224681854248, "learning_rate": 1.073199014349906e-05, "loss": 0.4324, "step": 27100 }, { "epoch": 3.929412617313476, "grad_norm": 1.933181643486023, "learning_rate": 1.0717495289172344e-05, "loss": 0.4767, "step": 27110 }, { "epoch": 3.93086205022285, "grad_norm": 1.12887442111969, "learning_rate": 1.0703000434845631e-05, "loss": 0.3971, "step": 27120 }, { "epoch": 3.9323114831322243, "grad_norm": 0.8794549703598022, "learning_rate": 1.0688505580518916e-05, "loss": 0.4423, "step": 27130 }, { "epoch": 3.9337609160415985, "grad_norm": 1.0780173540115356, "learning_rate": 1.0674010726192202e-05, "loss": 0.4795, "step": 27140 }, { "epoch": 3.9352103489509727, "grad_norm": 1.5967097282409668, "learning_rate": 1.0659515871865488e-05, "loss": 0.4333, "step": 27150 }, { "epoch": 3.9366597818603473, "grad_norm": 1.123369574546814, "learning_rate": 1.0645021017538774e-05, "loss": 0.4561, "step": 27160 }, { "epoch": 3.9381092147697214, "grad_norm": 3.6029701232910156, "learning_rate": 1.0630526163212062e-05, "loss": 0.4613, "step": 27170 }, { "epoch": 3.9395586476790956, "grad_norm": 0.8784819841384888, "learning_rate": 1.0616031308885346e-05, "loss": 0.4745, "step": 27180 }, { "epoch": 3.9410080805884697, "grad_norm": 2.145846366882324, "learning_rate": 1.0601536454558632e-05, "loss": 0.4747, "step": 27190 }, { "epoch": 3.942457513497844, "grad_norm": 1.1199047565460205, "learning_rate": 1.0587041600231918e-05, "loss": 0.4514, "step": 27200 }, { "epoch": 3.943906946407218, "grad_norm": 0.8590757250785828, "learning_rate": 1.0572546745905204e-05, "loss": 0.3595, "step": 27210 }, { "epoch": 3.945356379316592, "grad_norm": 2.8308565616607666, "learning_rate": 1.055805189157849e-05, "loss": 0.4527, "step": 27220 }, { "epoch": 3.946805812225967, "grad_norm": 0.7825255990028381, "learning_rate": 1.0543557037251776e-05, "loss": 0.4501, "step": 27230 }, { "epoch": 3.948255245135341, "grad_norm": 2.58693790435791, "learning_rate": 1.0530511668357732e-05, "loss": 0.4467, "step": 27240 }, { "epoch": 3.949704678044715, "grad_norm": 0.8955137133598328, "learning_rate": 1.051601681403102e-05, "loss": 0.479, "step": 27250 }, { "epoch": 3.9511541109540893, "grad_norm": 1.1574593782424927, "learning_rate": 1.0501521959704304e-05, "loss": 0.456, "step": 27260 }, { "epoch": 3.9526035438634635, "grad_norm": 1.5674717426300049, "learning_rate": 1.0487027105377592e-05, "loss": 0.4149, "step": 27270 }, { "epoch": 3.9540529767728376, "grad_norm": 3.8193156719207764, "learning_rate": 1.0472532251050878e-05, "loss": 0.3872, "step": 27280 }, { "epoch": 3.955502409682212, "grad_norm": 4.670598030090332, "learning_rate": 1.0458037396724163e-05, "loss": 0.4626, "step": 27290 }, { "epoch": 3.956951842591586, "grad_norm": 1.0468988418579102, "learning_rate": 1.044354254239745e-05, "loss": 0.4514, "step": 27300 }, { "epoch": 3.95840127550096, "grad_norm": 1.9248346090316772, "learning_rate": 1.0429047688070735e-05, "loss": 0.4064, "step": 27310 }, { "epoch": 3.9598507084103343, "grad_norm": 1.1533076763153076, "learning_rate": 1.041455283374402e-05, "loss": 0.4469, "step": 27320 }, { "epoch": 3.9613001413197084, "grad_norm": 1.8082165718078613, "learning_rate": 1.0400057979417307e-05, "loss": 0.4182, "step": 27330 }, { "epoch": 3.962749574229083, "grad_norm": 2.489048957824707, "learning_rate": 1.0385563125090593e-05, "loss": 0.4603, "step": 27340 }, { "epoch": 3.964199007138457, "grad_norm": 1.018362045288086, "learning_rate": 1.037106827076388e-05, "loss": 0.415, "step": 27350 }, { "epoch": 3.9656484400478313, "grad_norm": 1.5398354530334473, "learning_rate": 1.0356573416437165e-05, "loss": 0.4022, "step": 27360 }, { "epoch": 3.9670978729572055, "grad_norm": 3.7499420642852783, "learning_rate": 1.0342078562110451e-05, "loss": 0.4887, "step": 27370 }, { "epoch": 3.9685473058665797, "grad_norm": 3.1090011596679688, "learning_rate": 1.0327583707783737e-05, "loss": 0.4385, "step": 27380 }, { "epoch": 3.969996738775954, "grad_norm": 2.216132402420044, "learning_rate": 1.0313088853457023e-05, "loss": 0.4689, "step": 27390 }, { "epoch": 3.971446171685328, "grad_norm": 1.6161932945251465, "learning_rate": 1.0298593999130309e-05, "loss": 0.4268, "step": 27400 }, { "epoch": 3.9728956045947026, "grad_norm": 0.9366248846054077, "learning_rate": 1.0284099144803595e-05, "loss": 0.4378, "step": 27410 }, { "epoch": 3.9743450375040767, "grad_norm": 1.3713431358337402, "learning_rate": 1.0269604290476881e-05, "loss": 0.429, "step": 27420 }, { "epoch": 3.975794470413451, "grad_norm": 1.966168999671936, "learning_rate": 1.0255109436150167e-05, "loss": 0.4333, "step": 27430 }, { "epoch": 3.977243903322825, "grad_norm": 3.127967119216919, "learning_rate": 1.0240614581823453e-05, "loss": 0.4401, "step": 27440 }, { "epoch": 3.9786933362321992, "grad_norm": 1.8882118463516235, "learning_rate": 1.0226119727496738e-05, "loss": 0.4623, "step": 27450 }, { "epoch": 3.9801427691415734, "grad_norm": 0.9691013097763062, "learning_rate": 1.0211624873170025e-05, "loss": 0.4796, "step": 27460 }, { "epoch": 3.9815922020509475, "grad_norm": 1.0078339576721191, "learning_rate": 1.0197130018843312e-05, "loss": 0.445, "step": 27470 }, { "epoch": 3.9830416349603217, "grad_norm": 4.346449375152588, "learning_rate": 1.0182635164516598e-05, "loss": 0.53, "step": 27480 }, { "epoch": 3.984491067869696, "grad_norm": 1.0477688312530518, "learning_rate": 1.0168140310189884e-05, "loss": 0.4503, "step": 27490 }, { "epoch": 3.98594050077907, "grad_norm": 2.4577136039733887, "learning_rate": 1.0153645455863168e-05, "loss": 0.4389, "step": 27500 }, { "epoch": 3.987389933688444, "grad_norm": 1.7137912511825562, "learning_rate": 1.0139150601536456e-05, "loss": 0.4215, "step": 27510 }, { "epoch": 3.9888393665978183, "grad_norm": 2.337385892868042, "learning_rate": 1.012465574720974e-05, "loss": 0.488, "step": 27520 }, { "epoch": 3.990288799507193, "grad_norm": 1.083498477935791, "learning_rate": 1.0110160892883028e-05, "loss": 0.4287, "step": 27530 }, { "epoch": 3.991738232416567, "grad_norm": 2.4905881881713867, "learning_rate": 1.0095666038556314e-05, "loss": 0.4793, "step": 27540 }, { "epoch": 3.9931876653259413, "grad_norm": 1.1195241212844849, "learning_rate": 1.0081171184229598e-05, "loss": 0.4409, "step": 27550 }, { "epoch": 3.9946370982353154, "grad_norm": 1.6588102579116821, "learning_rate": 1.0066676329902886e-05, "loss": 0.4468, "step": 27560 }, { "epoch": 3.9960865311446896, "grad_norm": 1.223429560661316, "learning_rate": 1.005218147557617e-05, "loss": 0.4396, "step": 27570 }, { "epoch": 3.9975359640540638, "grad_norm": 1.179884910583496, "learning_rate": 1.0037686621249456e-05, "loss": 0.4471, "step": 27580 }, { "epoch": 3.998985396963438, "grad_norm": 1.0562238693237305, "learning_rate": 1.0023191766922742e-05, "loss": 0.414, "step": 27590 }, { "epoch": 4.0004348298728125, "grad_norm": 1.1050264835357666, "learning_rate": 1.0008696912596029e-05, "loss": 0.3992, "step": 27600 }, { "epoch": 4.001884262782187, "grad_norm": 2.0935981273651123, "learning_rate": 9.994202058269316e-06, "loss": 0.3756, "step": 27610 }, { "epoch": 4.003333695691561, "grad_norm": 1.0844813585281372, "learning_rate": 9.9797072039426e-06, "loss": 0.4063, "step": 27620 }, { "epoch": 4.004783128600935, "grad_norm": 0.849988579750061, "learning_rate": 9.965212349615887e-06, "loss": 0.3845, "step": 27630 }, { "epoch": 4.006232561510309, "grad_norm": 0.8378141522407532, "learning_rate": 9.950717495289173e-06, "loss": 0.3691, "step": 27640 }, { "epoch": 4.007681994419683, "grad_norm": 2.816222906112671, "learning_rate": 9.936222640962459e-06, "loss": 0.3848, "step": 27650 }, { "epoch": 4.0091314273290575, "grad_norm": 1.0942838191986084, "learning_rate": 9.921727786635745e-06, "loss": 0.3793, "step": 27660 }, { "epoch": 4.010580860238432, "grad_norm": 0.8341204524040222, "learning_rate": 9.907232932309031e-06, "loss": 0.3884, "step": 27670 }, { "epoch": 4.012030293147806, "grad_norm": 1.0412321090698242, "learning_rate": 9.892738077982317e-06, "loss": 0.3347, "step": 27680 }, { "epoch": 4.01347972605718, "grad_norm": 1.2412534952163696, "learning_rate": 9.878243223655603e-06, "loss": 0.4247, "step": 27690 }, { "epoch": 4.014929158966554, "grad_norm": 1.1662929058074951, "learning_rate": 9.863748369328889e-06, "loss": 0.3439, "step": 27700 }, { "epoch": 4.016378591875928, "grad_norm": 1.7275515794754028, "learning_rate": 9.849253515002173e-06, "loss": 0.3666, "step": 27710 }, { "epoch": 4.017828024785302, "grad_norm": 1.2136681079864502, "learning_rate": 9.834758660675461e-06, "loss": 0.353, "step": 27720 }, { "epoch": 4.019277457694677, "grad_norm": 1.1961663961410522, "learning_rate": 9.820263806348747e-06, "loss": 0.3749, "step": 27730 }, { "epoch": 4.020726890604051, "grad_norm": 1.3632068634033203, "learning_rate": 9.805768952022033e-06, "loss": 0.3846, "step": 27740 }, { "epoch": 4.022176323513426, "grad_norm": 1.477921724319458, "learning_rate": 9.79127409769532e-06, "loss": 0.3621, "step": 27750 }, { "epoch": 4.0236257564228, "grad_norm": 1.5257829427719116, "learning_rate": 9.776779243368604e-06, "loss": 0.3435, "step": 27760 }, { "epoch": 4.025075189332174, "grad_norm": 1.464673638343811, "learning_rate": 9.762284389041891e-06, "loss": 0.3477, "step": 27770 }, { "epoch": 4.026524622241548, "grad_norm": 1.0875053405761719, "learning_rate": 9.747789534715176e-06, "loss": 0.414, "step": 27780 }, { "epoch": 4.027974055150922, "grad_norm": 2.292750120162964, "learning_rate": 9.733294680388462e-06, "loss": 0.3562, "step": 27790 }, { "epoch": 4.029423488060297, "grad_norm": 2.2468414306640625, "learning_rate": 9.71879982606175e-06, "loss": 0.3949, "step": 27800 }, { "epoch": 4.030872920969671, "grad_norm": 0.8002530932426453, "learning_rate": 9.704304971735034e-06, "loss": 0.362, "step": 27810 }, { "epoch": 4.032322353879045, "grad_norm": 0.9956921935081482, "learning_rate": 9.689810117408322e-06, "loss": 0.3498, "step": 27820 }, { "epoch": 4.033771786788419, "grad_norm": 2.087887763977051, "learning_rate": 9.675315263081606e-06, "loss": 0.3751, "step": 27830 }, { "epoch": 4.035221219697793, "grad_norm": 3.2509827613830566, "learning_rate": 9.660820408754892e-06, "loss": 0.3628, "step": 27840 }, { "epoch": 4.036670652607167, "grad_norm": 2.119020462036133, "learning_rate": 9.646325554428178e-06, "loss": 0.4038, "step": 27850 }, { "epoch": 4.038120085516542, "grad_norm": 1.0762407779693604, "learning_rate": 9.631830700101464e-06, "loss": 0.4082, "step": 27860 }, { "epoch": 4.039569518425916, "grad_norm": 1.4908643960952759, "learning_rate": 9.61733584577475e-06, "loss": 0.421, "step": 27870 }, { "epoch": 4.04101895133529, "grad_norm": 1.3782938718795776, "learning_rate": 9.602840991448036e-06, "loss": 0.3493, "step": 27880 }, { "epoch": 4.042468384244664, "grad_norm": 1.111978530883789, "learning_rate": 9.588346137121322e-06, "loss": 0.4287, "step": 27890 }, { "epoch": 4.043917817154038, "grad_norm": 1.9733977317810059, "learning_rate": 9.573851282794608e-06, "loss": 0.3812, "step": 27900 }, { "epoch": 4.045367250063412, "grad_norm": 0.8419144153594971, "learning_rate": 9.559356428467894e-06, "loss": 0.3388, "step": 27910 }, { "epoch": 4.0468166829727865, "grad_norm": 1.2305309772491455, "learning_rate": 9.54486157414118e-06, "loss": 0.4008, "step": 27920 }, { "epoch": 4.048266115882161, "grad_norm": 1.153856873512268, "learning_rate": 9.530366719814467e-06, "loss": 0.4178, "step": 27930 }, { "epoch": 4.049715548791536, "grad_norm": 0.8963118195533752, "learning_rate": 9.515871865487753e-06, "loss": 0.334, "step": 27940 }, { "epoch": 4.05116498170091, "grad_norm": 1.9021250009536743, "learning_rate": 9.501377011161039e-06, "loss": 0.3836, "step": 27950 }, { "epoch": 4.052614414610284, "grad_norm": 1.6093897819519043, "learning_rate": 9.486882156834325e-06, "loss": 0.3618, "step": 27960 }, { "epoch": 4.054063847519658, "grad_norm": 2.0067999362945557, "learning_rate": 9.472387302507609e-06, "loss": 0.3485, "step": 27970 }, { "epoch": 4.055513280429032, "grad_norm": 1.1295496225357056, "learning_rate": 9.457892448180897e-06, "loss": 0.4286, "step": 27980 }, { "epoch": 4.0569627133384065, "grad_norm": 1.2019520998001099, "learning_rate": 9.443397593854183e-06, "loss": 0.3784, "step": 27990 }, { "epoch": 4.058412146247781, "grad_norm": 1.1256000995635986, "learning_rate": 9.428902739527467e-06, "loss": 0.401, "step": 28000 }, { "epoch": 4.058412146247781, "eval_loss": 0.7432445287704468, "eval_runtime": 670.6325, "eval_samples_per_second": 51.437, "eval_steps_per_second": 2.572, "eval_token_accuracy": 0.0003882719357559529, "step": 28000 }, { "epoch": 4.059861579157155, "grad_norm": 1.4858769178390503, "learning_rate": 9.414407885200755e-06, "loss": 0.3862, "step": 28010 }, { "epoch": 4.061311012066529, "grad_norm": 3.2917826175689697, "learning_rate": 9.39991303087404e-06, "loss": 0.3758, "step": 28020 }, { "epoch": 4.062760444975903, "grad_norm": 1.42050302028656, "learning_rate": 9.385418176547327e-06, "loss": 0.3977, "step": 28030 }, { "epoch": 4.064209877885277, "grad_norm": 2.3631317615509033, "learning_rate": 9.370923322220611e-06, "loss": 0.3469, "step": 28040 }, { "epoch": 4.0656593107946515, "grad_norm": 0.9823781847953796, "learning_rate": 9.356428467893898e-06, "loss": 0.3718, "step": 28050 }, { "epoch": 4.067108743704026, "grad_norm": 2.334778070449829, "learning_rate": 9.341933613567185e-06, "loss": 0.3862, "step": 28060 }, { "epoch": 4.0685581766134, "grad_norm": 1.390881896018982, "learning_rate": 9.32743875924047e-06, "loss": 0.3643, "step": 28070 }, { "epoch": 4.070007609522774, "grad_norm": 1.4181934595108032, "learning_rate": 9.312943904913756e-06, "loss": 0.4207, "step": 28080 }, { "epoch": 4.071457042432148, "grad_norm": 0.8947294354438782, "learning_rate": 9.298449050587042e-06, "loss": 0.3836, "step": 28090 }, { "epoch": 4.072906475341522, "grad_norm": 0.939887285232544, "learning_rate": 9.283954196260328e-06, "loss": 0.3692, "step": 28100 }, { "epoch": 4.074355908250896, "grad_norm": 3.092376470565796, "learning_rate": 9.269459341933614e-06, "loss": 0.4199, "step": 28110 }, { "epoch": 4.075805341160271, "grad_norm": 1.2078293561935425, "learning_rate": 9.2549644876069e-06, "loss": 0.3482, "step": 28120 }, { "epoch": 4.077254774069646, "grad_norm": 2.018075704574585, "learning_rate": 9.240469633280186e-06, "loss": 0.3607, "step": 28130 }, { "epoch": 4.07870420697902, "grad_norm": 1.0871379375457764, "learning_rate": 9.225974778953472e-06, "loss": 0.3976, "step": 28140 }, { "epoch": 4.080153639888394, "grad_norm": 1.2211644649505615, "learning_rate": 9.211479924626758e-06, "loss": 0.3996, "step": 28150 }, { "epoch": 4.081603072797768, "grad_norm": 4.177424907684326, "learning_rate": 9.196985070300044e-06, "loss": 0.4018, "step": 28160 }, { "epoch": 4.083052505707142, "grad_norm": 0.9837602972984314, "learning_rate": 9.18249021597333e-06, "loss": 0.3776, "step": 28170 }, { "epoch": 4.0845019386165164, "grad_norm": 1.2474371194839478, "learning_rate": 9.167995361646616e-06, "loss": 0.3645, "step": 28180 }, { "epoch": 4.085951371525891, "grad_norm": 0.8499113917350769, "learning_rate": 9.153500507319902e-06, "loss": 0.3986, "step": 28190 }, { "epoch": 4.087400804435265, "grad_norm": 1.1272683143615723, "learning_rate": 9.139005652993188e-06, "loss": 0.3665, "step": 28200 }, { "epoch": 4.088850237344639, "grad_norm": 0.9529480338096619, "learning_rate": 9.124510798666473e-06, "loss": 0.3491, "step": 28210 }, { "epoch": 4.090299670254013, "grad_norm": 2.408130645751953, "learning_rate": 9.11001594433976e-06, "loss": 0.3772, "step": 28220 }, { "epoch": 4.091749103163387, "grad_norm": 1.2539433240890503, "learning_rate": 9.095521090013045e-06, "loss": 0.4046, "step": 28230 }, { "epoch": 4.093198536072761, "grad_norm": 2.582306146621704, "learning_rate": 9.081026235686333e-06, "loss": 0.3939, "step": 28240 }, { "epoch": 4.094647968982136, "grad_norm": 1.1347997188568115, "learning_rate": 9.066531381359619e-06, "loss": 0.3405, "step": 28250 }, { "epoch": 4.09609740189151, "grad_norm": 2.0996315479278564, "learning_rate": 9.052036527032903e-06, "loss": 0.3538, "step": 28260 }, { "epoch": 4.097546834800884, "grad_norm": 1.378403663635254, "learning_rate": 9.03754167270619e-06, "loss": 0.3505, "step": 28270 }, { "epoch": 4.098996267710258, "grad_norm": 1.0263172388076782, "learning_rate": 9.023046818379475e-06, "loss": 0.3402, "step": 28280 }, { "epoch": 4.100445700619632, "grad_norm": 1.7301150560379028, "learning_rate": 9.008551964052761e-06, "loss": 0.3679, "step": 28290 }, { "epoch": 4.101895133529006, "grad_norm": 1.9480246305465698, "learning_rate": 8.994057109726047e-06, "loss": 0.4301, "step": 28300 }, { "epoch": 4.103344566438381, "grad_norm": 1.4775640964508057, "learning_rate": 8.979562255399333e-06, "loss": 0.3665, "step": 28310 }, { "epoch": 4.104793999347756, "grad_norm": 2.209846258163452, "learning_rate": 8.965067401072621e-06, "loss": 0.3446, "step": 28320 }, { "epoch": 4.10624343225713, "grad_norm": 1.0079455375671387, "learning_rate": 8.950572546745905e-06, "loss": 0.3878, "step": 28330 }, { "epoch": 4.107692865166504, "grad_norm": 0.8966060876846313, "learning_rate": 8.936077692419191e-06, "loss": 0.3611, "step": 28340 }, { "epoch": 4.109142298075878, "grad_norm": 1.1710530519485474, "learning_rate": 8.921582838092477e-06, "loss": 0.3578, "step": 28350 }, { "epoch": 4.110591730985252, "grad_norm": 1.261860966682434, "learning_rate": 8.907087983765763e-06, "loss": 0.4166, "step": 28360 }, { "epoch": 4.112041163894626, "grad_norm": 1.0835785865783691, "learning_rate": 8.89259312943905e-06, "loss": 0.3574, "step": 28370 }, { "epoch": 4.1134905968040005, "grad_norm": 2.9820594787597656, "learning_rate": 8.878098275112336e-06, "loss": 0.3804, "step": 28380 }, { "epoch": 4.114940029713375, "grad_norm": 3.3930695056915283, "learning_rate": 8.863603420785622e-06, "loss": 0.3602, "step": 28390 }, { "epoch": 4.116389462622749, "grad_norm": 2.611999034881592, "learning_rate": 8.849108566458908e-06, "loss": 0.3743, "step": 28400 }, { "epoch": 4.117838895532123, "grad_norm": 1.090427279472351, "learning_rate": 8.834613712132194e-06, "loss": 0.3773, "step": 28410 }, { "epoch": 4.119288328441497, "grad_norm": 1.434935212135315, "learning_rate": 8.820118857805478e-06, "loss": 0.3808, "step": 28420 }, { "epoch": 4.120737761350871, "grad_norm": 6.215574264526367, "learning_rate": 8.805624003478766e-06, "loss": 0.3795, "step": 28430 }, { "epoch": 4.1221871942602455, "grad_norm": 1.3055042028427124, "learning_rate": 8.791129149152052e-06, "loss": 0.376, "step": 28440 }, { "epoch": 4.12363662716962, "grad_norm": 1.7235387563705444, "learning_rate": 8.776634294825338e-06, "loss": 0.4442, "step": 28450 }, { "epoch": 4.125086060078994, "grad_norm": 2.1163599491119385, "learning_rate": 8.762139440498624e-06, "loss": 0.3433, "step": 28460 }, { "epoch": 4.126535492988368, "grad_norm": 1.0409663915634155, "learning_rate": 8.747644586171908e-06, "loss": 0.3704, "step": 28470 }, { "epoch": 4.127984925897742, "grad_norm": 1.1953526735305786, "learning_rate": 8.733149731845196e-06, "loss": 0.3988, "step": 28480 }, { "epoch": 4.129434358807116, "grad_norm": 3.2460741996765137, "learning_rate": 8.71865487751848e-06, "loss": 0.3214, "step": 28490 }, { "epoch": 4.130883791716491, "grad_norm": 1.576264500617981, "learning_rate": 8.704160023191767e-06, "loss": 0.3375, "step": 28500 }, { "epoch": 4.1323332246258655, "grad_norm": 3.3589694499969482, "learning_rate": 8.689665168865054e-06, "loss": 0.3652, "step": 28510 }, { "epoch": 4.13378265753524, "grad_norm": 0.7416440844535828, "learning_rate": 8.675170314538339e-06, "loss": 0.3622, "step": 28520 }, { "epoch": 4.135232090444614, "grad_norm": 1.1319648027420044, "learning_rate": 8.660675460211626e-06, "loss": 0.371, "step": 28530 }, { "epoch": 4.136681523353988, "grad_norm": 1.8906817436218262, "learning_rate": 8.64618060588491e-06, "loss": 0.3673, "step": 28540 }, { "epoch": 4.138130956263362, "grad_norm": 0.9824037551879883, "learning_rate": 8.631685751558197e-06, "loss": 0.3147, "step": 28550 }, { "epoch": 4.139580389172736, "grad_norm": 1.198995590209961, "learning_rate": 8.617190897231483e-06, "loss": 0.3112, "step": 28560 }, { "epoch": 4.1410298220821105, "grad_norm": 1.260464072227478, "learning_rate": 8.602696042904769e-06, "loss": 0.326, "step": 28570 }, { "epoch": 4.142479254991485, "grad_norm": 1.0763384103775024, "learning_rate": 8.588201188578057e-06, "loss": 0.3248, "step": 28580 }, { "epoch": 4.143928687900859, "grad_norm": 0.7927284836769104, "learning_rate": 8.573706334251341e-06, "loss": 0.3897, "step": 28590 }, { "epoch": 4.145378120810233, "grad_norm": 0.9664852619171143, "learning_rate": 8.559211479924627e-06, "loss": 0.3691, "step": 28600 }, { "epoch": 4.146827553719607, "grad_norm": 1.0483916997909546, "learning_rate": 8.544716625597913e-06, "loss": 0.3442, "step": 28610 }, { "epoch": 4.148276986628981, "grad_norm": 0.9723681807518005, "learning_rate": 8.5302217712712e-06, "loss": 0.3085, "step": 28620 }, { "epoch": 4.149726419538355, "grad_norm": 1.37981116771698, "learning_rate": 8.515726916944485e-06, "loss": 0.4007, "step": 28630 }, { "epoch": 4.15117585244773, "grad_norm": 2.6564297676086426, "learning_rate": 8.501232062617771e-06, "loss": 0.3669, "step": 28640 }, { "epoch": 4.152625285357104, "grad_norm": 1.5353738069534302, "learning_rate": 8.486737208291057e-06, "loss": 0.4073, "step": 28650 }, { "epoch": 4.154074718266478, "grad_norm": 1.42356538772583, "learning_rate": 8.472242353964343e-06, "loss": 0.421, "step": 28660 }, { "epoch": 4.155524151175852, "grad_norm": 1.5719921588897705, "learning_rate": 8.45774749963763e-06, "loss": 0.3538, "step": 28670 }, { "epoch": 4.156973584085227, "grad_norm": 1.324820876121521, "learning_rate": 8.443252645310914e-06, "loss": 0.4142, "step": 28680 }, { "epoch": 4.158423016994601, "grad_norm": 1.1727113723754883, "learning_rate": 8.428757790984202e-06, "loss": 0.3242, "step": 28690 }, { "epoch": 4.159872449903975, "grad_norm": 1.2460103034973145, "learning_rate": 8.414262936657488e-06, "loss": 0.345, "step": 28700 }, { "epoch": 4.16132188281335, "grad_norm": 0.9975523352622986, "learning_rate": 8.399768082330774e-06, "loss": 0.36, "step": 28710 }, { "epoch": 4.162771315722724, "grad_norm": 0.8736883401870728, "learning_rate": 8.38527322800406e-06, "loss": 0.3209, "step": 28720 }, { "epoch": 4.164220748632098, "grad_norm": 1.1812388896942139, "learning_rate": 8.370778373677344e-06, "loss": 0.4303, "step": 28730 }, { "epoch": 4.165670181541472, "grad_norm": 3.161813259124756, "learning_rate": 8.356283519350632e-06, "loss": 0.4146, "step": 28740 }, { "epoch": 4.167119614450846, "grad_norm": 2.7170424461364746, "learning_rate": 8.341788665023916e-06, "loss": 0.3542, "step": 28750 }, { "epoch": 4.16856904736022, "grad_norm": 1.185248613357544, "learning_rate": 8.327293810697202e-06, "loss": 0.3726, "step": 28760 }, { "epoch": 4.1700184802695945, "grad_norm": 2.6338613033294678, "learning_rate": 8.31279895637049e-06, "loss": 0.3621, "step": 28770 }, { "epoch": 4.171467913178969, "grad_norm": 1.1241165399551392, "learning_rate": 8.298304102043774e-06, "loss": 0.3779, "step": 28780 }, { "epoch": 4.172917346088343, "grad_norm": 1.1666885614395142, "learning_rate": 8.283809247717062e-06, "loss": 0.3321, "step": 28790 }, { "epoch": 4.174366778997717, "grad_norm": 1.3057632446289062, "learning_rate": 8.269314393390346e-06, "loss": 0.3854, "step": 28800 }, { "epoch": 4.175816211907091, "grad_norm": 1.140620231628418, "learning_rate": 8.254819539063632e-06, "loss": 0.3756, "step": 28810 }, { "epoch": 4.177265644816465, "grad_norm": 1.1446179151535034, "learning_rate": 8.240324684736919e-06, "loss": 0.3853, "step": 28820 }, { "epoch": 4.1787150777258395, "grad_norm": 1.6761678457260132, "learning_rate": 8.225829830410205e-06, "loss": 0.3084, "step": 28830 }, { "epoch": 4.180164510635214, "grad_norm": 1.3657466173171997, "learning_rate": 8.21133497608349e-06, "loss": 0.3652, "step": 28840 }, { "epoch": 4.181613943544588, "grad_norm": 2.4275379180908203, "learning_rate": 8.196840121756777e-06, "loss": 0.3708, "step": 28850 }, { "epoch": 4.183063376453962, "grad_norm": 1.454341173171997, "learning_rate": 8.182345267430063e-06, "loss": 0.319, "step": 28860 }, { "epoch": 4.184512809363337, "grad_norm": 0.9970882534980774, "learning_rate": 8.167850413103349e-06, "loss": 0.3553, "step": 28870 }, { "epoch": 4.185962242272711, "grad_norm": 2.611583709716797, "learning_rate": 8.153355558776635e-06, "loss": 0.3761, "step": 28880 }, { "epoch": 4.187411675182085, "grad_norm": 2.846203327178955, "learning_rate": 8.138860704449921e-06, "loss": 0.3141, "step": 28890 }, { "epoch": 4.1888611080914595, "grad_norm": 4.5427422523498535, "learning_rate": 8.124365850123207e-06, "loss": 0.3629, "step": 28900 }, { "epoch": 4.190310541000834, "grad_norm": 0.8743450045585632, "learning_rate": 8.109870995796493e-06, "loss": 0.3784, "step": 28910 }, { "epoch": 4.191759973910208, "grad_norm": 1.2091394662857056, "learning_rate": 8.095376141469779e-06, "loss": 0.3752, "step": 28920 }, { "epoch": 4.193209406819582, "grad_norm": 0.9753203988075256, "learning_rate": 8.080881287143065e-06, "loss": 0.3474, "step": 28930 }, { "epoch": 4.194658839728956, "grad_norm": 1.3854143619537354, "learning_rate": 8.06638643281635e-06, "loss": 0.3687, "step": 28940 }, { "epoch": 4.19610827263833, "grad_norm": 0.9765649437904358, "learning_rate": 8.051891578489637e-06, "loss": 0.317, "step": 28950 }, { "epoch": 4.1975577055477045, "grad_norm": 1.0090285539627075, "learning_rate": 8.037396724162923e-06, "loss": 0.3461, "step": 28960 }, { "epoch": 4.199007138457079, "grad_norm": 4.256155014038086, "learning_rate": 8.022901869836208e-06, "loss": 0.3616, "step": 28970 }, { "epoch": 4.200456571366453, "grad_norm": 1.2576302289962769, "learning_rate": 8.008407015509495e-06, "loss": 0.4056, "step": 28980 }, { "epoch": 4.201906004275827, "grad_norm": 1.1981736421585083, "learning_rate": 7.99391216118278e-06, "loss": 0.3322, "step": 28990 }, { "epoch": 4.203355437185201, "grad_norm": 2.0531351566314697, "learning_rate": 7.979417306856067e-06, "loss": 0.3387, "step": 29000 }, { "epoch": 4.203355437185201, "eval_loss": 0.7457379698753357, "eval_runtime": 671.0276, "eval_samples_per_second": 51.406, "eval_steps_per_second": 2.571, "eval_token_accuracy": 0.00039100624516268497, "step": 29000 }, { "epoch": 4.204804870094575, "grad_norm": 1.163179874420166, "learning_rate": 7.964922452529352e-06, "loss": 0.3684, "step": 29010 }, { "epoch": 4.206254303003949, "grad_norm": 0.9141797423362732, "learning_rate": 7.950427598202638e-06, "loss": 0.3168, "step": 29020 }, { "epoch": 4.207703735913324, "grad_norm": 4.031828880310059, "learning_rate": 7.935932743875926e-06, "loss": 0.3199, "step": 29030 }, { "epoch": 4.209153168822698, "grad_norm": 0.893057644367218, "learning_rate": 7.92143788954921e-06, "loss": 0.3713, "step": 29040 }, { "epoch": 4.210602601732072, "grad_norm": 2.213886022567749, "learning_rate": 7.906943035222496e-06, "loss": 0.416, "step": 29050 }, { "epoch": 4.212052034641447, "grad_norm": 1.2855664491653442, "learning_rate": 7.892448180895782e-06, "loss": 0.3832, "step": 29060 }, { "epoch": 4.213501467550821, "grad_norm": 3.5074546337127686, "learning_rate": 7.877953326569068e-06, "loss": 0.3621, "step": 29070 }, { "epoch": 4.214950900460195, "grad_norm": 0.9053571224212646, "learning_rate": 7.863458472242354e-06, "loss": 0.3863, "step": 29080 }, { "epoch": 4.216400333369569, "grad_norm": 1.0677660703659058, "learning_rate": 7.84896361791564e-06, "loss": 0.3479, "step": 29090 }, { "epoch": 4.217849766278944, "grad_norm": 1.8787217140197754, "learning_rate": 7.834468763588926e-06, "loss": 0.3097, "step": 29100 }, { "epoch": 4.219299199188318, "grad_norm": 1.2158981561660767, "learning_rate": 7.819973909262212e-06, "loss": 0.3648, "step": 29110 }, { "epoch": 4.220748632097692, "grad_norm": 1.449982762336731, "learning_rate": 7.805479054935498e-06, "loss": 0.4157, "step": 29120 }, { "epoch": 4.222198065007066, "grad_norm": 1.1894491910934448, "learning_rate": 7.790984200608784e-06, "loss": 0.4087, "step": 29130 }, { "epoch": 4.22364749791644, "grad_norm": 1.0339592695236206, "learning_rate": 7.77648934628207e-06, "loss": 0.4189, "step": 29140 }, { "epoch": 4.225096930825814, "grad_norm": 1.3686072826385498, "learning_rate": 7.761994491955357e-06, "loss": 0.3937, "step": 29150 }, { "epoch": 4.2265463637351885, "grad_norm": 1.0015712976455688, "learning_rate": 7.747499637628643e-06, "loss": 0.3284, "step": 29160 }, { "epoch": 4.227995796644563, "grad_norm": 1.5643901824951172, "learning_rate": 7.733004783301929e-06, "loss": 0.38, "step": 29170 }, { "epoch": 4.229445229553937, "grad_norm": 2.5827622413635254, "learning_rate": 7.718509928975213e-06, "loss": 0.3809, "step": 29180 }, { "epoch": 4.230894662463311, "grad_norm": 1.25447678565979, "learning_rate": 7.7040150746485e-06, "loss": 0.3496, "step": 29190 }, { "epoch": 4.232344095372685, "grad_norm": 1.0713152885437012, "learning_rate": 7.689520220321785e-06, "loss": 0.3992, "step": 29200 }, { "epoch": 4.233793528282059, "grad_norm": 1.319899559020996, "learning_rate": 7.675025365995073e-06, "loss": 0.355, "step": 29210 }, { "epoch": 4.2352429611914335, "grad_norm": 4.824685096740723, "learning_rate": 7.660530511668359e-06, "loss": 0.3656, "step": 29220 }, { "epoch": 4.236692394100808, "grad_norm": 1.3379300832748413, "learning_rate": 7.646035657341643e-06, "loss": 0.3793, "step": 29230 }, { "epoch": 4.238141827010182, "grad_norm": 0.7790377140045166, "learning_rate": 7.631540803014931e-06, "loss": 0.3265, "step": 29240 }, { "epoch": 4.239591259919557, "grad_norm": 2.0793423652648926, "learning_rate": 7.617045948688216e-06, "loss": 0.4222, "step": 29250 }, { "epoch": 4.241040692828931, "grad_norm": 2.919572114944458, "learning_rate": 7.6025510943615015e-06, "loss": 0.3479, "step": 29260 }, { "epoch": 4.242490125738305, "grad_norm": 2.324187755584717, "learning_rate": 7.588056240034788e-06, "loss": 0.3964, "step": 29270 }, { "epoch": 4.243939558647679, "grad_norm": 1.03130304813385, "learning_rate": 7.573561385708074e-06, "loss": 0.3583, "step": 29280 }, { "epoch": 4.2453889915570535, "grad_norm": 1.1990162134170532, "learning_rate": 7.5590665313813605e-06, "loss": 0.3342, "step": 29290 }, { "epoch": 4.246838424466428, "grad_norm": 0.9026666879653931, "learning_rate": 7.544571677054646e-06, "loss": 0.3746, "step": 29300 }, { "epoch": 4.248287857375802, "grad_norm": 1.1271377801895142, "learning_rate": 7.530076822727932e-06, "loss": 0.3457, "step": 29310 }, { "epoch": 4.249737290285176, "grad_norm": 0.9937952756881714, "learning_rate": 7.515581968401219e-06, "loss": 0.343, "step": 29320 }, { "epoch": 4.25118672319455, "grad_norm": 1.5706512928009033, "learning_rate": 7.501087114074504e-06, "loss": 0.398, "step": 29330 }, { "epoch": 4.252636156103924, "grad_norm": 1.5114712715148926, "learning_rate": 7.486592259747791e-06, "loss": 0.4031, "step": 29340 }, { "epoch": 4.2540855890132985, "grad_norm": 1.1789222955703735, "learning_rate": 7.472097405421076e-06, "loss": 0.358, "step": 29350 }, { "epoch": 4.255535021922673, "grad_norm": 1.2037668228149414, "learning_rate": 7.457602551094361e-06, "loss": 0.3892, "step": 29360 }, { "epoch": 4.256984454832047, "grad_norm": 2.206874370574951, "learning_rate": 7.443107696767648e-06, "loss": 0.4155, "step": 29370 }, { "epoch": 4.258433887741421, "grad_norm": 1.2747567892074585, "learning_rate": 7.428612842440934e-06, "loss": 0.3568, "step": 29380 }, { "epoch": 4.259883320650795, "grad_norm": 3.8064351081848145, "learning_rate": 7.414117988114219e-06, "loss": 0.3754, "step": 29390 }, { "epoch": 4.261332753560169, "grad_norm": 1.3781629800796509, "learning_rate": 7.399623133787506e-06, "loss": 0.3825, "step": 29400 }, { "epoch": 4.262782186469543, "grad_norm": 1.3373839855194092, "learning_rate": 7.3851282794607914e-06, "loss": 0.3651, "step": 29410 }, { "epoch": 4.264231619378918, "grad_norm": 1.3949230909347534, "learning_rate": 7.370633425134078e-06, "loss": 0.3145, "step": 29420 }, { "epoch": 4.265681052288292, "grad_norm": 1.4587570428848267, "learning_rate": 7.3561385708073635e-06, "loss": 0.4271, "step": 29430 }, { "epoch": 4.267130485197667, "grad_norm": 1.9194419384002686, "learning_rate": 7.34164371648065e-06, "loss": 0.4094, "step": 29440 }, { "epoch": 4.268579918107041, "grad_norm": 1.3405871391296387, "learning_rate": 7.3271488621539365e-06, "loss": 0.3578, "step": 29450 }, { "epoch": 4.270029351016415, "grad_norm": 2.9382286071777344, "learning_rate": 7.312654007827222e-06, "loss": 0.3905, "step": 29460 }, { "epoch": 4.271478783925789, "grad_norm": 1.1766321659088135, "learning_rate": 7.298159153500507e-06, "loss": 0.3477, "step": 29470 }, { "epoch": 4.272928216835163, "grad_norm": 1.8907839059829712, "learning_rate": 7.283664299173794e-06, "loss": 0.3483, "step": 29480 }, { "epoch": 4.274377649744538, "grad_norm": 0.7925496697425842, "learning_rate": 7.269169444847079e-06, "loss": 0.3633, "step": 29490 }, { "epoch": 4.275827082653912, "grad_norm": 2.068272352218628, "learning_rate": 7.254674590520366e-06, "loss": 0.4196, "step": 29500 }, { "epoch": 4.277276515563286, "grad_norm": 1.5290910005569458, "learning_rate": 7.240179736193652e-06, "loss": 0.3769, "step": 29510 }, { "epoch": 4.27872594847266, "grad_norm": 1.2152941226959229, "learning_rate": 7.225684881866937e-06, "loss": 0.3677, "step": 29520 }, { "epoch": 4.280175381382034, "grad_norm": 1.2992326021194458, "learning_rate": 7.211190027540224e-06, "loss": 0.325, "step": 29530 }, { "epoch": 4.281624814291408, "grad_norm": 1.1786214113235474, "learning_rate": 7.196695173213509e-06, "loss": 0.3846, "step": 29540 }, { "epoch": 4.283074247200783, "grad_norm": 1.2369322776794434, "learning_rate": 7.182200318886796e-06, "loss": 0.3818, "step": 29550 }, { "epoch": 4.284523680110157, "grad_norm": 0.9638352990150452, "learning_rate": 7.167705464560081e-06, "loss": 0.3796, "step": 29560 }, { "epoch": 4.285973113019531, "grad_norm": 3.598480463027954, "learning_rate": 7.1532106102333674e-06, "loss": 0.4363, "step": 29570 }, { "epoch": 4.287422545928905, "grad_norm": 0.931704044342041, "learning_rate": 7.1387157559066535e-06, "loss": 0.3293, "step": 29580 }, { "epoch": 4.288871978838279, "grad_norm": 2.29419207572937, "learning_rate": 7.1242209015799396e-06, "loss": 0.4097, "step": 29590 }, { "epoch": 4.290321411747653, "grad_norm": 1.5176444053649902, "learning_rate": 7.109726047253225e-06, "loss": 0.388, "step": 29600 }, { "epoch": 4.291770844657028, "grad_norm": 2.4443917274475098, "learning_rate": 7.095231192926512e-06, "loss": 0.3435, "step": 29610 }, { "epoch": 4.293220277566402, "grad_norm": 0.9792112708091736, "learning_rate": 7.080736338599797e-06, "loss": 0.4226, "step": 29620 }, { "epoch": 4.294669710475777, "grad_norm": 2.7311971187591553, "learning_rate": 7.066241484273084e-06, "loss": 0.3917, "step": 29630 }, { "epoch": 4.296119143385151, "grad_norm": 1.7587502002716064, "learning_rate": 7.05174662994637e-06, "loss": 0.3605, "step": 29640 }, { "epoch": 4.297568576294525, "grad_norm": 1.7742453813552856, "learning_rate": 7.037251775619655e-06, "loss": 0.3729, "step": 29650 }, { "epoch": 4.299018009203899, "grad_norm": 2.9178965091705322, "learning_rate": 7.022756921292942e-06, "loss": 0.3903, "step": 29660 }, { "epoch": 4.300467442113273, "grad_norm": 2.7828633785247803, "learning_rate": 7.008262066966227e-06, "loss": 0.3831, "step": 29670 }, { "epoch": 4.3019168750226475, "grad_norm": 1.2201550006866455, "learning_rate": 6.993767212639512e-06, "loss": 0.3784, "step": 29680 }, { "epoch": 4.303366307932022, "grad_norm": 1.3287190198898315, "learning_rate": 6.979272358312799e-06, "loss": 0.3481, "step": 29690 }, { "epoch": 4.304815740841396, "grad_norm": 2.4197843074798584, "learning_rate": 6.964777503986085e-06, "loss": 0.4058, "step": 29700 }, { "epoch": 4.30626517375077, "grad_norm": 0.7661453485488892, "learning_rate": 6.950282649659371e-06, "loss": 0.42, "step": 29710 }, { "epoch": 4.307714606660144, "grad_norm": 1.3475944995880127, "learning_rate": 6.935787795332657e-06, "loss": 0.383, "step": 29720 }, { "epoch": 4.309164039569518, "grad_norm": 1.3460301160812378, "learning_rate": 6.921292941005943e-06, "loss": 0.3737, "step": 29730 }, { "epoch": 4.3106134724788925, "grad_norm": 1.1138921976089478, "learning_rate": 6.9067980866792295e-06, "loss": 0.3157, "step": 29740 }, { "epoch": 4.312062905388267, "grad_norm": 1.7547369003295898, "learning_rate": 6.892303232352515e-06, "loss": 0.3319, "step": 29750 }, { "epoch": 4.313512338297641, "grad_norm": 2.247833013534546, "learning_rate": 6.877808378025802e-06, "loss": 0.3575, "step": 29760 }, { "epoch": 4.314961771207015, "grad_norm": 1.2264424562454224, "learning_rate": 6.863313523699088e-06, "loss": 0.4161, "step": 29770 }, { "epoch": 4.316411204116389, "grad_norm": 3.117626428604126, "learning_rate": 6.848818669372373e-06, "loss": 0.4135, "step": 29780 }, { "epoch": 4.317860637025763, "grad_norm": 1.0561470985412598, "learning_rate": 6.83432381504566e-06, "loss": 0.3621, "step": 29790 }, { "epoch": 4.319310069935138, "grad_norm": 2.285572052001953, "learning_rate": 6.819828960718945e-06, "loss": 0.3992, "step": 29800 }, { "epoch": 4.3207595028445125, "grad_norm": 1.8720812797546387, "learning_rate": 6.80533410639223e-06, "loss": 0.4166, "step": 29810 }, { "epoch": 4.322208935753887, "grad_norm": 1.1257436275482178, "learning_rate": 6.790839252065517e-06, "loss": 0.4057, "step": 29820 }, { "epoch": 4.323658368663261, "grad_norm": 1.3093199729919434, "learning_rate": 6.776344397738803e-06, "loss": 0.396, "step": 29830 }, { "epoch": 4.325107801572635, "grad_norm": 1.3871891498565674, "learning_rate": 6.761849543412089e-06, "loss": 0.362, "step": 29840 }, { "epoch": 4.326557234482009, "grad_norm": 1.465741753578186, "learning_rate": 6.747354689085375e-06, "loss": 0.4141, "step": 29850 }, { "epoch": 4.328006667391383, "grad_norm": 2.450000047683716, "learning_rate": 6.7328598347586605e-06, "loss": 0.3643, "step": 29860 }, { "epoch": 4.3294561003007574, "grad_norm": 2.5313851833343506, "learning_rate": 6.718364980431947e-06, "loss": 0.3701, "step": 29870 }, { "epoch": 4.330905533210132, "grad_norm": 4.837574005126953, "learning_rate": 6.7038701261052326e-06, "loss": 0.3432, "step": 29880 }, { "epoch": 4.332354966119506, "grad_norm": 1.1457256078720093, "learning_rate": 6.6893752717785195e-06, "loss": 0.4141, "step": 29890 }, { "epoch": 4.33380439902888, "grad_norm": 2.2787954807281494, "learning_rate": 6.6748804174518055e-06, "loss": 0.3558, "step": 29900 }, { "epoch": 4.335253831938254, "grad_norm": 1.28756582736969, "learning_rate": 6.660385563125091e-06, "loss": 0.331, "step": 29910 }, { "epoch": 4.336703264847628, "grad_norm": 1.1767174005508423, "learning_rate": 6.645890708798378e-06, "loss": 0.375, "step": 29920 }, { "epoch": 4.338152697757002, "grad_norm": 0.9055870771408081, "learning_rate": 6.631395854471663e-06, "loss": 0.3931, "step": 29930 }, { "epoch": 4.339602130666377, "grad_norm": 0.9743188619613647, "learning_rate": 6.616901000144948e-06, "loss": 0.322, "step": 29940 }, { "epoch": 4.341051563575751, "grad_norm": 0.8511983156204224, "learning_rate": 6.602406145818235e-06, "loss": 0.3795, "step": 29950 }, { "epoch": 4.342500996485125, "grad_norm": 2.069638252258301, "learning_rate": 6.587911291491521e-06, "loss": 0.4144, "step": 29960 }, { "epoch": 4.343950429394499, "grad_norm": 1.1656533479690552, "learning_rate": 6.573416437164807e-06, "loss": 0.3852, "step": 29970 }, { "epoch": 4.345399862303873, "grad_norm": 0.7122425436973572, "learning_rate": 6.558921582838093e-06, "loss": 0.3483, "step": 29980 }, { "epoch": 4.346849295213248, "grad_norm": 2.2737338542938232, "learning_rate": 6.544426728511378e-06, "loss": 0.3555, "step": 29990 }, { "epoch": 4.348298728122622, "grad_norm": 3.028883934020996, "learning_rate": 6.529931874184665e-06, "loss": 0.3979, "step": 30000 }, { "epoch": 4.348298728122622, "eval_loss": 0.7482135891914368, "eval_runtime": 670.7938, "eval_samples_per_second": 51.424, "eval_steps_per_second": 2.572, "eval_token_accuracy": 0.00039020203651364614, "step": 30000 }, { "epoch": 4.349748161031997, "grad_norm": 5.186731815338135, "learning_rate": 6.51543701985795e-06, "loss": 0.3573, "step": 30010 }, { "epoch": 4.351197593941371, "grad_norm": 1.2703396081924438, "learning_rate": 6.5009421655312365e-06, "loss": 0.3683, "step": 30020 }, { "epoch": 4.352647026850745, "grad_norm": 1.4026705026626587, "learning_rate": 6.4864473112045225e-06, "loss": 0.3419, "step": 30030 }, { "epoch": 4.354096459760119, "grad_norm": 2.418346405029297, "learning_rate": 6.4719524568778086e-06, "loss": 0.3841, "step": 30040 }, { "epoch": 4.355545892669493, "grad_norm": 1.4166592359542847, "learning_rate": 6.4574576025510955e-06, "loss": 0.3962, "step": 30050 }, { "epoch": 4.356995325578867, "grad_norm": 3.7106871604919434, "learning_rate": 6.442962748224381e-06, "loss": 0.3566, "step": 30060 }, { "epoch": 4.3584447584882415, "grad_norm": 2.121419906616211, "learning_rate": 6.428467893897666e-06, "loss": 0.3455, "step": 30070 }, { "epoch": 4.359894191397616, "grad_norm": 1.0121147632598877, "learning_rate": 6.413973039570953e-06, "loss": 0.3533, "step": 30080 }, { "epoch": 4.36134362430699, "grad_norm": 1.1510685682296753, "learning_rate": 6.399478185244239e-06, "loss": 0.3555, "step": 30090 }, { "epoch": 4.362793057216364, "grad_norm": 1.9346082210540771, "learning_rate": 6.384983330917525e-06, "loss": 0.3558, "step": 30100 }, { "epoch": 4.364242490125738, "grad_norm": 0.8266393542289734, "learning_rate": 6.370488476590811e-06, "loss": 0.3293, "step": 30110 }, { "epoch": 4.365691923035112, "grad_norm": 3.2215356826782227, "learning_rate": 6.355993622264096e-06, "loss": 0.4042, "step": 30120 }, { "epoch": 4.3671413559444865, "grad_norm": 1.1464142799377441, "learning_rate": 6.341498767937383e-06, "loss": 0.3664, "step": 30130 }, { "epoch": 4.368590788853861, "grad_norm": 1.333441972732544, "learning_rate": 6.327003913610668e-06, "loss": 0.3791, "step": 30140 }, { "epoch": 4.370040221763235, "grad_norm": 2.17280912399292, "learning_rate": 6.312509059283954e-06, "loss": 0.3384, "step": 30150 }, { "epoch": 4.371489654672609, "grad_norm": 0.8691934943199158, "learning_rate": 6.29801420495724e-06, "loss": 0.3567, "step": 30160 }, { "epoch": 4.372939087581983, "grad_norm": 3.7443151473999023, "learning_rate": 6.283519350630526e-06, "loss": 0.351, "step": 30170 }, { "epoch": 4.374388520491358, "grad_norm": 1.8633484840393066, "learning_rate": 6.269024496303813e-06, "loss": 0.3379, "step": 30180 }, { "epoch": 4.375837953400732, "grad_norm": 1.3033164739608765, "learning_rate": 6.2545296419770985e-06, "loss": 0.3508, "step": 30190 }, { "epoch": 4.3772873863101065, "grad_norm": 1.2720485925674438, "learning_rate": 6.2400347876503846e-06, "loss": 0.3582, "step": 30200 }, { "epoch": 4.378736819219481, "grad_norm": 1.2477245330810547, "learning_rate": 6.22553993332367e-06, "loss": 0.3825, "step": 30210 }, { "epoch": 4.380186252128855, "grad_norm": 0.7797627449035645, "learning_rate": 6.211045078996957e-06, "loss": 0.3619, "step": 30220 }, { "epoch": 4.381635685038229, "grad_norm": 1.2631372213363647, "learning_rate": 6.196550224670243e-06, "loss": 0.4119, "step": 30230 }, { "epoch": 4.383085117947603, "grad_norm": 1.0257337093353271, "learning_rate": 6.182055370343529e-06, "loss": 0.3778, "step": 30240 }, { "epoch": 4.384534550856977, "grad_norm": 1.3090648651123047, "learning_rate": 6.167560516016814e-06, "loss": 0.389, "step": 30250 }, { "epoch": 4.3859839837663515, "grad_norm": 2.94443941116333, "learning_rate": 6.1530656616901e-06, "loss": 0.3542, "step": 30260 }, { "epoch": 4.387433416675726, "grad_norm": 2.2124414443969727, "learning_rate": 6.138570807363386e-06, "loss": 0.3965, "step": 30270 }, { "epoch": 4.3888828495851, "grad_norm": 0.953989565372467, "learning_rate": 6.124075953036672e-06, "loss": 0.3712, "step": 30280 }, { "epoch": 4.390332282494474, "grad_norm": 1.1157747507095337, "learning_rate": 6.109581098709958e-06, "loss": 0.3971, "step": 30290 }, { "epoch": 4.391781715403848, "grad_norm": 2.8602752685546875, "learning_rate": 6.095086244383244e-06, "loss": 0.3393, "step": 30300 }, { "epoch": 4.393231148313222, "grad_norm": 1.1085230112075806, "learning_rate": 6.08059139005653e-06, "loss": 0.3075, "step": 30310 }, { "epoch": 4.394680581222596, "grad_norm": 1.6991569995880127, "learning_rate": 6.066096535729816e-06, "loss": 0.3769, "step": 30320 }, { "epoch": 4.396130014131971, "grad_norm": 2.418412923812866, "learning_rate": 6.051601681403102e-06, "loss": 0.3531, "step": 30330 }, { "epoch": 4.397579447041345, "grad_norm": 3.8710434436798096, "learning_rate": 6.037106827076388e-06, "loss": 0.3861, "step": 30340 }, { "epoch": 4.399028879950719, "grad_norm": 1.5485104322433472, "learning_rate": 6.022611972749674e-06, "loss": 0.3489, "step": 30350 }, { "epoch": 4.400478312860093, "grad_norm": 1.4378596544265747, "learning_rate": 6.0081171184229606e-06, "loss": 0.3723, "step": 30360 }, { "epoch": 4.401927745769468, "grad_norm": 1.7529176473617554, "learning_rate": 5.993622264096247e-06, "loss": 0.3547, "step": 30370 }, { "epoch": 4.403377178678842, "grad_norm": 5.1034770011901855, "learning_rate": 5.979127409769532e-06, "loss": 0.3742, "step": 30380 }, { "epoch": 4.404826611588216, "grad_norm": 1.0158805847167969, "learning_rate": 5.964632555442818e-06, "loss": 0.2926, "step": 30390 }, { "epoch": 4.406276044497591, "grad_norm": 1.960871934890747, "learning_rate": 5.950137701116104e-06, "loss": 0.3627, "step": 30400 }, { "epoch": 4.407725477406965, "grad_norm": 1.0288090705871582, "learning_rate": 5.93564284678939e-06, "loss": 0.3859, "step": 30410 }, { "epoch": 4.409174910316339, "grad_norm": 2.829598903656006, "learning_rate": 5.921147992462676e-06, "loss": 0.411, "step": 30420 }, { "epoch": 4.410624343225713, "grad_norm": 2.1486783027648926, "learning_rate": 5.906653138135962e-06, "loss": 0.437, "step": 30430 }, { "epoch": 4.412073776135087, "grad_norm": 3.272394895553589, "learning_rate": 5.892158283809248e-06, "loss": 0.3843, "step": 30440 }, { "epoch": 4.413523209044461, "grad_norm": 2.6068198680877686, "learning_rate": 5.877663429482534e-06, "loss": 0.341, "step": 30450 }, { "epoch": 4.4149726419538355, "grad_norm": 1.2102731466293335, "learning_rate": 5.863168575155819e-06, "loss": 0.3798, "step": 30460 }, { "epoch": 4.41642207486321, "grad_norm": 1.095699429512024, "learning_rate": 5.8486737208291055e-06, "loss": 0.3586, "step": 30470 }, { "epoch": 4.417871507772584, "grad_norm": 0.7822348475456238, "learning_rate": 5.8341788665023915e-06, "loss": 0.4051, "step": 30480 }, { "epoch": 4.419320940681958, "grad_norm": 1.0242159366607666, "learning_rate": 5.819684012175678e-06, "loss": 0.3461, "step": 30490 }, { "epoch": 4.420770373591332, "grad_norm": 0.9580021500587463, "learning_rate": 5.8051891578489645e-06, "loss": 0.3512, "step": 30500 }, { "epoch": 4.422219806500706, "grad_norm": 0.9196089506149292, "learning_rate": 5.79069430352225e-06, "loss": 0.3654, "step": 30510 }, { "epoch": 4.4236692394100805, "grad_norm": 1.0600301027297974, "learning_rate": 5.776199449195536e-06, "loss": 0.36, "step": 30520 }, { "epoch": 4.425118672319455, "grad_norm": 1.2911509275436401, "learning_rate": 5.761704594868822e-06, "loss": 0.3656, "step": 30530 }, { "epoch": 4.426568105228829, "grad_norm": 1.0180575847625732, "learning_rate": 5.747209740542108e-06, "loss": 0.3845, "step": 30540 }, { "epoch": 4.428017538138203, "grad_norm": 2.772061586380005, "learning_rate": 5.732714886215394e-06, "loss": 0.4467, "step": 30550 }, { "epoch": 4.429466971047578, "grad_norm": 1.3060688972473145, "learning_rate": 5.71822003188868e-06, "loss": 0.4538, "step": 30560 }, { "epoch": 4.430916403956952, "grad_norm": 1.272159218788147, "learning_rate": 5.703725177561966e-06, "loss": 0.3817, "step": 30570 }, { "epoch": 4.432365836866326, "grad_norm": 1.1166160106658936, "learning_rate": 5.690679808667923e-06, "loss": 0.3476, "step": 30580 }, { "epoch": 4.4338152697757005, "grad_norm": 3.2157490253448486, "learning_rate": 5.676184954341209e-06, "loss": 0.3477, "step": 30590 }, { "epoch": 4.435264702685075, "grad_norm": 3.557138204574585, "learning_rate": 5.661690100014496e-06, "loss": 0.4244, "step": 30600 }, { "epoch": 4.436714135594449, "grad_norm": 1.1806175708770752, "learning_rate": 5.647195245687781e-06, "loss": 0.3459, "step": 30610 }, { "epoch": 4.438163568503823, "grad_norm": 2.0846927165985107, "learning_rate": 5.632700391361067e-06, "loss": 0.3608, "step": 30620 }, { "epoch": 4.439613001413197, "grad_norm": 1.833366870880127, "learning_rate": 5.618205537034353e-06, "loss": 0.3432, "step": 30630 }, { "epoch": 4.441062434322571, "grad_norm": 1.0325963497161865, "learning_rate": 5.603710682707639e-06, "loss": 0.3604, "step": 30640 }, { "epoch": 4.4425118672319455, "grad_norm": 2.9289095401763916, "learning_rate": 5.589215828380924e-06, "loss": 0.4177, "step": 30650 }, { "epoch": 4.44396130014132, "grad_norm": 5.7591233253479, "learning_rate": 5.574720974054211e-06, "loss": 0.3608, "step": 30660 }, { "epoch": 4.445410733050694, "grad_norm": 1.84727144241333, "learning_rate": 5.560226119727497e-06, "loss": 0.381, "step": 30670 }, { "epoch": 4.446860165960068, "grad_norm": 2.1353752613067627, "learning_rate": 5.545731265400783e-06, "loss": 0.3827, "step": 30680 }, { "epoch": 4.448309598869442, "grad_norm": 0.8009002208709717, "learning_rate": 5.531236411074069e-06, "loss": 0.3945, "step": 30690 }, { "epoch": 4.449759031778816, "grad_norm": 1.4904606342315674, "learning_rate": 5.5167415567473546e-06, "loss": 0.3419, "step": 30700 }, { "epoch": 4.45120846468819, "grad_norm": 0.9975236654281616, "learning_rate": 5.502246702420641e-06, "loss": 0.3785, "step": 30710 }, { "epoch": 4.452657897597565, "grad_norm": 1.2959935665130615, "learning_rate": 5.487751848093927e-06, "loss": 0.3375, "step": 30720 }, { "epoch": 4.454107330506939, "grad_norm": 0.96599942445755, "learning_rate": 5.4732569937672136e-06, "loss": 0.4297, "step": 30730 }, { "epoch": 4.455556763416313, "grad_norm": 3.204017400741577, "learning_rate": 5.458762139440499e-06, "loss": 0.4341, "step": 30740 }, { "epoch": 4.457006196325688, "grad_norm": 2.8199219703674316, "learning_rate": 5.444267285113785e-06, "loss": 0.4114, "step": 30750 }, { "epoch": 4.458455629235062, "grad_norm": 1.0257399082183838, "learning_rate": 5.429772430787071e-06, "loss": 0.3839, "step": 30760 }, { "epoch": 4.459905062144436, "grad_norm": 1.6263266801834106, "learning_rate": 5.415277576460357e-06, "loss": 0.3479, "step": 30770 }, { "epoch": 4.46135449505381, "grad_norm": 1.4419124126434326, "learning_rate": 5.400782722133642e-06, "loss": 0.3483, "step": 30780 }, { "epoch": 4.462803927963185, "grad_norm": 1.30050790309906, "learning_rate": 5.386287867806929e-06, "loss": 0.3943, "step": 30790 }, { "epoch": 4.464253360872559, "grad_norm": 1.2906863689422607, "learning_rate": 5.371793013480215e-06, "loss": 0.371, "step": 30800 }, { "epoch": 4.465702793781933, "grad_norm": 3.1720147132873535, "learning_rate": 5.357298159153501e-06, "loss": 0.4104, "step": 30810 }, { "epoch": 4.467152226691307, "grad_norm": 3.066178321838379, "learning_rate": 5.342803304826786e-06, "loss": 0.3952, "step": 30820 }, { "epoch": 4.468601659600681, "grad_norm": 1.2431939840316772, "learning_rate": 5.328308450500072e-06, "loss": 0.3695, "step": 30830 }, { "epoch": 4.470051092510055, "grad_norm": 1.892478108406067, "learning_rate": 5.3138135961733585e-06, "loss": 0.4132, "step": 30840 }, { "epoch": 4.4715005254194296, "grad_norm": 3.275731325149536, "learning_rate": 5.2993187418466445e-06, "loss": 0.4039, "step": 30850 }, { "epoch": 4.472949958328804, "grad_norm": 1.752671718597412, "learning_rate": 5.284823887519931e-06, "loss": 0.3586, "step": 30860 }, { "epoch": 4.474399391238178, "grad_norm": 1.0183453559875488, "learning_rate": 5.270329033193217e-06, "loss": 0.3996, "step": 30870 }, { "epoch": 4.475848824147552, "grad_norm": 1.460972547531128, "learning_rate": 5.255834178866503e-06, "loss": 0.4028, "step": 30880 }, { "epoch": 4.477298257056926, "grad_norm": 0.9415088295936584, "learning_rate": 5.241339324539789e-06, "loss": 0.3684, "step": 30890 }, { "epoch": 4.4787476899663, "grad_norm": 1.0596305131912231, "learning_rate": 5.226844470213075e-06, "loss": 0.3759, "step": 30900 }, { "epoch": 4.4801971228756745, "grad_norm": 1.233249306678772, "learning_rate": 5.21234961588636e-06, "loss": 0.3843, "step": 30910 }, { "epoch": 4.48164655578505, "grad_norm": 0.9509343504905701, "learning_rate": 5.197854761559647e-06, "loss": 0.3638, "step": 30920 }, { "epoch": 4.483095988694424, "grad_norm": 1.282771348953247, "learning_rate": 5.183359907232933e-06, "loss": 0.3996, "step": 30930 }, { "epoch": 4.484545421603798, "grad_norm": 0.8737091422080994, "learning_rate": 5.168865052906219e-06, "loss": 0.4465, "step": 30940 }, { "epoch": 4.485994854513172, "grad_norm": 1.1933667659759521, "learning_rate": 5.154370198579504e-06, "loss": 0.4358, "step": 30950 }, { "epoch": 4.487444287422546, "grad_norm": 3.5740418434143066, "learning_rate": 5.13987534425279e-06, "loss": 0.4641, "step": 30960 }, { "epoch": 4.48889372033192, "grad_norm": 1.8437187671661377, "learning_rate": 5.125380489926076e-06, "loss": 0.3242, "step": 30970 }, { "epoch": 4.4903431532412945, "grad_norm": 0.7734196782112122, "learning_rate": 5.110885635599362e-06, "loss": 0.3591, "step": 30980 }, { "epoch": 4.491792586150669, "grad_norm": 1.2379523515701294, "learning_rate": 5.096390781272648e-06, "loss": 0.3943, "step": 30990 }, { "epoch": 4.493242019060043, "grad_norm": 1.5033332109451294, "learning_rate": 5.0818959269459345e-06, "loss": 0.3712, "step": 31000 }, { "epoch": 4.493242019060043, "eval_loss": 0.7379248738288879, "eval_runtime": 671.531, "eval_samples_per_second": 51.368, "eval_steps_per_second": 2.569, "eval_token_accuracy": 0.00039438392148864813, "step": 31000 }, { "epoch": 4.494691451969417, "grad_norm": 0.9957663416862488, "learning_rate": 5.0674010726192205e-06, "loss": 0.3583, "step": 31010 }, { "epoch": 4.496140884878791, "grad_norm": 2.0881576538085938, "learning_rate": 5.0529062182925066e-06, "loss": 0.457, "step": 31020 }, { "epoch": 4.497590317788165, "grad_norm": 2.050384283065796, "learning_rate": 5.038411363965792e-06, "loss": 0.3787, "step": 31030 }, { "epoch": 4.4990397506975395, "grad_norm": 1.6135433912277222, "learning_rate": 5.023916509639078e-06, "loss": 0.3959, "step": 31040 }, { "epoch": 4.500489183606914, "grad_norm": 1.5975104570388794, "learning_rate": 5.009421655312365e-06, "loss": 0.3822, "step": 31050 }, { "epoch": 4.501938616516288, "grad_norm": 0.8000165820121765, "learning_rate": 4.994926800985651e-06, "loss": 0.4042, "step": 31060 }, { "epoch": 4.503388049425662, "grad_norm": 2.54587984085083, "learning_rate": 4.980431946658937e-06, "loss": 0.3736, "step": 31070 }, { "epoch": 4.504837482335036, "grad_norm": 1.709140419960022, "learning_rate": 4.965937092332222e-06, "loss": 0.3642, "step": 31080 }, { "epoch": 4.50628691524441, "grad_norm": 1.4000811576843262, "learning_rate": 4.951442238005508e-06, "loss": 0.4011, "step": 31090 }, { "epoch": 4.507736348153784, "grad_norm": 1.3397213220596313, "learning_rate": 4.936947383678794e-06, "loss": 0.3643, "step": 31100 }, { "epoch": 4.5091857810631595, "grad_norm": 1.9789170026779175, "learning_rate": 4.92245252935208e-06, "loss": 0.3884, "step": 31110 }, { "epoch": 4.510635213972533, "grad_norm": 1.190403938293457, "learning_rate": 4.907957675025366e-06, "loss": 0.3527, "step": 31120 }, { "epoch": 4.512084646881908, "grad_norm": 2.6957547664642334, "learning_rate": 4.893462820698652e-06, "loss": 0.3774, "step": 31130 }, { "epoch": 4.513534079791282, "grad_norm": 2.3824095726013184, "learning_rate": 4.878967966371938e-06, "loss": 0.3082, "step": 31140 }, { "epoch": 4.514983512700656, "grad_norm": 2.6183104515075684, "learning_rate": 4.864473112045224e-06, "loss": 0.384, "step": 31150 }, { "epoch": 4.51643294561003, "grad_norm": 3.184983491897583, "learning_rate": 4.84997825771851e-06, "loss": 0.4113, "step": 31160 }, { "epoch": 4.517882378519404, "grad_norm": 1.9513596296310425, "learning_rate": 4.835483403391796e-06, "loss": 0.3987, "step": 31170 }, { "epoch": 4.519331811428779, "grad_norm": 1.6810030937194824, "learning_rate": 4.8209885490650826e-06, "loss": 0.3304, "step": 31180 }, { "epoch": 4.520781244338153, "grad_norm": 1.9544678926467896, "learning_rate": 4.806493694738369e-06, "loss": 0.3758, "step": 31190 }, { "epoch": 4.522230677247527, "grad_norm": 1.5571215152740479, "learning_rate": 4.791998840411654e-06, "loss": 0.387, "step": 31200 }, { "epoch": 4.523680110156901, "grad_norm": 1.1763455867767334, "learning_rate": 4.77750398608494e-06, "loss": 0.3823, "step": 31210 }, { "epoch": 4.525129543066275, "grad_norm": 0.8894882798194885, "learning_rate": 4.763009131758226e-06, "loss": 0.3887, "step": 31220 }, { "epoch": 4.526578975975649, "grad_norm": 1.1514997482299805, "learning_rate": 4.748514277431512e-06, "loss": 0.3829, "step": 31230 }, { "epoch": 4.528028408885024, "grad_norm": 2.9833333492279053, "learning_rate": 4.734019423104798e-06, "loss": 0.3914, "step": 31240 }, { "epoch": 4.529477841794398, "grad_norm": 2.5646626949310303, "learning_rate": 4.719524568778084e-06, "loss": 0.3732, "step": 31250 }, { "epoch": 4.530927274703772, "grad_norm": 1.1297060251235962, "learning_rate": 4.70502971445137e-06, "loss": 0.4017, "step": 31260 }, { "epoch": 4.532376707613146, "grad_norm": 0.9327825903892517, "learning_rate": 4.690534860124656e-06, "loss": 0.4008, "step": 31270 }, { "epoch": 4.53382614052252, "grad_norm": 3.265214681625366, "learning_rate": 4.676040005797942e-06, "loss": 0.3892, "step": 31280 }, { "epoch": 4.535275573431894, "grad_norm": 1.352654218673706, "learning_rate": 4.6615451514712275e-06, "loss": 0.3863, "step": 31290 }, { "epoch": 4.536725006341269, "grad_norm": 2.224597692489624, "learning_rate": 4.6470502971445135e-06, "loss": 0.3573, "step": 31300 }, { "epoch": 4.538174439250644, "grad_norm": 3.5135717391967773, "learning_rate": 4.6325554428178e-06, "loss": 0.3702, "step": 31310 }, { "epoch": 4.539623872160018, "grad_norm": 1.3063206672668457, "learning_rate": 4.6180605884910865e-06, "loss": 0.3273, "step": 31320 }, { "epoch": 4.541073305069392, "grad_norm": 2.603647232055664, "learning_rate": 4.603565734164372e-06, "loss": 0.3497, "step": 31330 }, { "epoch": 4.542522737978766, "grad_norm": 2.3394999504089355, "learning_rate": 4.589070879837658e-06, "loss": 0.3514, "step": 31340 }, { "epoch": 4.54397217088814, "grad_norm": 2.2680232524871826, "learning_rate": 4.574576025510944e-06, "loss": 0.3498, "step": 31350 }, { "epoch": 4.545421603797514, "grad_norm": 3.2815279960632324, "learning_rate": 4.56008117118423e-06, "loss": 0.3533, "step": 31360 }, { "epoch": 4.5468710367068885, "grad_norm": 1.1699095964431763, "learning_rate": 4.545586316857516e-06, "loss": 0.3879, "step": 31370 }, { "epoch": 4.548320469616263, "grad_norm": 1.0367563962936401, "learning_rate": 4.531091462530802e-06, "loss": 0.3172, "step": 31380 }, { "epoch": 4.549769902525637, "grad_norm": 0.8867725133895874, "learning_rate": 4.516596608204088e-06, "loss": 0.3398, "step": 31390 }, { "epoch": 4.551219335435011, "grad_norm": 2.650059700012207, "learning_rate": 4.502101753877374e-06, "loss": 0.349, "step": 31400 }, { "epoch": 4.552668768344385, "grad_norm": 1.0593748092651367, "learning_rate": 4.487606899550659e-06, "loss": 0.3405, "step": 31410 }, { "epoch": 4.554118201253759, "grad_norm": 2.0027034282684326, "learning_rate": 4.473112045223945e-06, "loss": 0.4048, "step": 31420 }, { "epoch": 4.5555676341631335, "grad_norm": 0.9918128252029419, "learning_rate": 4.458617190897231e-06, "loss": 0.3633, "step": 31430 }, { "epoch": 4.557017067072508, "grad_norm": 1.1196961402893066, "learning_rate": 4.444122336570518e-06, "loss": 0.3797, "step": 31440 }, { "epoch": 4.558466499981882, "grad_norm": 0.9278882145881653, "learning_rate": 4.429627482243804e-06, "loss": 0.3514, "step": 31450 }, { "epoch": 4.559915932891256, "grad_norm": 3.254080057144165, "learning_rate": 4.4151326279170895e-06, "loss": 0.3482, "step": 31460 }, { "epoch": 4.56136536580063, "grad_norm": 1.4721205234527588, "learning_rate": 4.400637773590376e-06, "loss": 0.3894, "step": 31470 }, { "epoch": 4.562814798710004, "grad_norm": 2.2425873279571533, "learning_rate": 4.386142919263662e-06, "loss": 0.3487, "step": 31480 }, { "epoch": 4.564264231619379, "grad_norm": 1.2692406177520752, "learning_rate": 4.371648064936948e-06, "loss": 0.3727, "step": 31490 }, { "epoch": 4.5657136645287535, "grad_norm": 3.3129732608795166, "learning_rate": 4.357153210610234e-06, "loss": 0.378, "step": 31500 }, { "epoch": 4.567163097438128, "grad_norm": 1.16659414768219, "learning_rate": 4.34265835628352e-06, "loss": 0.363, "step": 31510 }, { "epoch": 4.568612530347502, "grad_norm": 2.6467912197113037, "learning_rate": 4.328163501956806e-06, "loss": 0.368, "step": 31520 }, { "epoch": 4.570061963256876, "grad_norm": 1.0563710927963257, "learning_rate": 4.313668647630092e-06, "loss": 0.3764, "step": 31530 }, { "epoch": 4.57151139616625, "grad_norm": 2.4709951877593994, "learning_rate": 4.299173793303377e-06, "loss": 0.4012, "step": 31540 }, { "epoch": 4.572960829075624, "grad_norm": 1.983708143234253, "learning_rate": 4.284678938976663e-06, "loss": 0.3856, "step": 31550 }, { "epoch": 4.5744102619849984, "grad_norm": 2.240994691848755, "learning_rate": 4.270184084649949e-06, "loss": 0.3514, "step": 31560 }, { "epoch": 4.575859694894373, "grad_norm": 0.9982314705848694, "learning_rate": 4.255689230323236e-06, "loss": 0.3412, "step": 31570 }, { "epoch": 4.577309127803747, "grad_norm": 1.9218682050704956, "learning_rate": 4.241194375996521e-06, "loss": 0.4052, "step": 31580 }, { "epoch": 4.578758560713121, "grad_norm": 0.9534339308738708, "learning_rate": 4.226699521669807e-06, "loss": 0.375, "step": 31590 }, { "epoch": 4.580207993622495, "grad_norm": 1.9388453960418701, "learning_rate": 4.212204667343093e-06, "loss": 0.2817, "step": 31600 }, { "epoch": 4.581657426531869, "grad_norm": 2.833406925201416, "learning_rate": 4.1977098130163795e-06, "loss": 0.3794, "step": 31610 }, { "epoch": 4.583106859441243, "grad_norm": 3.2463910579681396, "learning_rate": 4.183214958689665e-06, "loss": 0.3999, "step": 31620 }, { "epoch": 4.584556292350618, "grad_norm": 1.149850606918335, "learning_rate": 4.168720104362952e-06, "loss": 0.3686, "step": 31630 }, { "epoch": 4.586005725259992, "grad_norm": 1.5383622646331787, "learning_rate": 4.154225250036238e-06, "loss": 0.3935, "step": 31640 }, { "epoch": 4.587455158169366, "grad_norm": 2.9202868938446045, "learning_rate": 4.139730395709524e-06, "loss": 0.4011, "step": 31650 }, { "epoch": 4.588904591078741, "grad_norm": 2.718942403793335, "learning_rate": 4.12523554138281e-06, "loss": 0.3847, "step": 31660 }, { "epoch": 4.590354023988114, "grad_norm": 1.3251688480377197, "learning_rate": 4.110740687056095e-06, "loss": 0.3841, "step": 31670 }, { "epoch": 4.591803456897489, "grad_norm": 2.328781843185425, "learning_rate": 4.096245832729381e-06, "loss": 0.3276, "step": 31680 }, { "epoch": 4.593252889806863, "grad_norm": 1.1885885000228882, "learning_rate": 4.081750978402667e-06, "loss": 0.3735, "step": 31690 }, { "epoch": 4.594702322716238, "grad_norm": 1.1832793951034546, "learning_rate": 4.067256124075953e-06, "loss": 0.3657, "step": 31700 }, { "epoch": 4.596151755625612, "grad_norm": 1.5288221836090088, "learning_rate": 4.052761269749239e-06, "loss": 0.4204, "step": 31710 }, { "epoch": 4.597601188534986, "grad_norm": 4.330370903015137, "learning_rate": 4.038266415422525e-06, "loss": 0.378, "step": 31720 }, { "epoch": 4.59905062144436, "grad_norm": 3.168311595916748, "learning_rate": 4.023771561095811e-06, "loss": 0.3494, "step": 31730 }, { "epoch": 4.600500054353734, "grad_norm": 1.6006698608398438, "learning_rate": 4.009276706769097e-06, "loss": 0.3926, "step": 31740 }, { "epoch": 4.601949487263108, "grad_norm": 1.0302852392196655, "learning_rate": 3.9947818524423825e-06, "loss": 0.3911, "step": 31750 }, { "epoch": 4.6033989201724825, "grad_norm": 1.771335482597351, "learning_rate": 3.9802869981156694e-06, "loss": 0.3601, "step": 31760 }, { "epoch": 4.604848353081857, "grad_norm": 2.2682509422302246, "learning_rate": 3.9657921437889555e-06, "loss": 0.3695, "step": 31770 }, { "epoch": 4.606297785991231, "grad_norm": 3.5409722328186035, "learning_rate": 3.9512972894622415e-06, "loss": 0.335, "step": 31780 }, { "epoch": 4.607747218900605, "grad_norm": 1.0471943616867065, "learning_rate": 3.936802435135527e-06, "loss": 0.3629, "step": 31790 }, { "epoch": 4.609196651809979, "grad_norm": 3.680050849914551, "learning_rate": 3.922307580808813e-06, "loss": 0.4099, "step": 31800 }, { "epoch": 4.610646084719353, "grad_norm": 2.532383441925049, "learning_rate": 3.907812726482099e-06, "loss": 0.3722, "step": 31810 }, { "epoch": 4.6120955176287275, "grad_norm": 1.3008819818496704, "learning_rate": 3.893317872155385e-06, "loss": 0.3492, "step": 31820 }, { "epoch": 4.613544950538102, "grad_norm": 2.2614519596099854, "learning_rate": 3.878823017828671e-06, "loss": 0.3966, "step": 31830 }, { "epoch": 4.614994383447476, "grad_norm": 1.2360858917236328, "learning_rate": 3.864328163501957e-06, "loss": 0.3602, "step": 31840 }, { "epoch": 4.616443816356851, "grad_norm": 1.06869375705719, "learning_rate": 3.849833309175243e-06, "loss": 0.3444, "step": 31850 }, { "epoch": 4.617893249266224, "grad_norm": 1.8802971839904785, "learning_rate": 3.835338454848529e-06, "loss": 0.366, "step": 31860 }, { "epoch": 4.619342682175599, "grad_norm": 1.0579355955123901, "learning_rate": 3.820843600521815e-06, "loss": 0.3805, "step": 31870 }, { "epoch": 4.620792115084973, "grad_norm": 2.116001844406128, "learning_rate": 3.806348746195101e-06, "loss": 0.3232, "step": 31880 }, { "epoch": 4.6222415479943475, "grad_norm": 1.0102404356002808, "learning_rate": 3.791853891868387e-06, "loss": 0.3467, "step": 31890 }, { "epoch": 4.623690980903722, "grad_norm": 1.343885064125061, "learning_rate": 3.777359037541673e-06, "loss": 0.4108, "step": 31900 }, { "epoch": 4.625140413813096, "grad_norm": 1.089756727218628, "learning_rate": 3.762864183214959e-06, "loss": 0.3257, "step": 31910 }, { "epoch": 4.62658984672247, "grad_norm": 3.171680450439453, "learning_rate": 3.7483693288882446e-06, "loss": 0.3808, "step": 31920 }, { "epoch": 4.628039279631844, "grad_norm": 1.396655559539795, "learning_rate": 3.7338744745615306e-06, "loss": 0.3759, "step": 31930 }, { "epoch": 4.629488712541218, "grad_norm": 1.0053515434265137, "learning_rate": 3.7193796202348167e-06, "loss": 0.3902, "step": 31940 }, { "epoch": 4.6309381454505925, "grad_norm": 2.9222521781921387, "learning_rate": 3.704884765908103e-06, "loss": 0.428, "step": 31950 }, { "epoch": 4.632387578359967, "grad_norm": 1.056503176689148, "learning_rate": 3.6903899115813884e-06, "loss": 0.3976, "step": 31960 }, { "epoch": 4.633837011269341, "grad_norm": 1.6677240133285522, "learning_rate": 3.675895057254675e-06, "loss": 0.4082, "step": 31970 }, { "epoch": 4.635286444178715, "grad_norm": 4.744880676269531, "learning_rate": 3.661400202927961e-06, "loss": 0.4138, "step": 31980 }, { "epoch": 4.636735877088089, "grad_norm": 1.129930019378662, "learning_rate": 3.646905348601247e-06, "loss": 0.4117, "step": 31990 }, { "epoch": 4.638185309997463, "grad_norm": 1.0966359376907349, "learning_rate": 3.6324104942745326e-06, "loss": 0.3454, "step": 32000 }, { "epoch": 4.638185309997463, "eval_loss": 0.7402629256248474, "eval_runtime": 670.9833, "eval_samples_per_second": 51.41, "eval_steps_per_second": 2.571, "eval_token_accuracy": 0.0003950272884078792, "step": 32000 }, { "epoch": 4.639634742906837, "grad_norm": 0.9030367732048035, "learning_rate": 3.6179156399478186e-06, "loss": 0.3426, "step": 32010 }, { "epoch": 4.641084175816212, "grad_norm": 1.3874012231826782, "learning_rate": 3.6034207856211047e-06, "loss": 0.3564, "step": 32020 }, { "epoch": 4.642533608725586, "grad_norm": 1.7644416093826294, "learning_rate": 3.5889259312943907e-06, "loss": 0.4444, "step": 32030 }, { "epoch": 4.643983041634961, "grad_norm": 1.5777500867843628, "learning_rate": 3.574431076967677e-06, "loss": 0.4593, "step": 32040 }, { "epoch": 4.645432474544334, "grad_norm": 0.8278200030326843, "learning_rate": 3.5599362226409624e-06, "loss": 0.3529, "step": 32050 }, { "epoch": 4.646881907453709, "grad_norm": 2.230210542678833, "learning_rate": 3.5454413683142485e-06, "loss": 0.3893, "step": 32060 }, { "epoch": 4.648331340363083, "grad_norm": 1.252874493598938, "learning_rate": 3.5309465139875345e-06, "loss": 0.3803, "step": 32070 }, { "epoch": 4.649780773272457, "grad_norm": 1.3750890493392944, "learning_rate": 3.516451659660821e-06, "loss": 0.3525, "step": 32080 }, { "epoch": 4.651230206181832, "grad_norm": 2.124300241470337, "learning_rate": 3.5019568053341062e-06, "loss": 0.3254, "step": 32090 }, { "epoch": 4.652679639091206, "grad_norm": 2.438168525695801, "learning_rate": 3.4874619510073923e-06, "loss": 0.3373, "step": 32100 }, { "epoch": 4.65412907200058, "grad_norm": 2.202852487564087, "learning_rate": 3.4729670966806788e-06, "loss": 0.3415, "step": 32110 }, { "epoch": 4.655578504909954, "grad_norm": 2.5048999786376953, "learning_rate": 3.458472242353965e-06, "loss": 0.3981, "step": 32120 }, { "epoch": 4.657027937819328, "grad_norm": 1.2640185356140137, "learning_rate": 3.4439773880272504e-06, "loss": 0.4032, "step": 32130 }, { "epoch": 4.658477370728702, "grad_norm": 2.459134578704834, "learning_rate": 3.4294825337005365e-06, "loss": 0.3492, "step": 32140 }, { "epoch": 4.6599268036380765, "grad_norm": 1.477692723274231, "learning_rate": 3.4149876793738225e-06, "loss": 0.3743, "step": 32150 }, { "epoch": 4.661376236547451, "grad_norm": 1.0264941453933716, "learning_rate": 3.4004928250471086e-06, "loss": 0.3192, "step": 32160 }, { "epoch": 4.662825669456825, "grad_norm": 2.095033645629883, "learning_rate": 3.3859979707203942e-06, "loss": 0.4001, "step": 32170 }, { "epoch": 4.664275102366199, "grad_norm": 2.331382989883423, "learning_rate": 3.3715031163936803e-06, "loss": 0.3358, "step": 32180 }, { "epoch": 4.665724535275573, "grad_norm": 1.119020938873291, "learning_rate": 3.3570082620669663e-06, "loss": 0.3441, "step": 32190 }, { "epoch": 4.667173968184947, "grad_norm": 2.7784945964813232, "learning_rate": 3.3425134077402524e-06, "loss": 0.3859, "step": 32200 }, { "epoch": 4.6686234010943215, "grad_norm": 2.008835554122925, "learning_rate": 3.328018553413538e-06, "loss": 0.3245, "step": 32210 }, { "epoch": 4.670072834003696, "grad_norm": 1.0234019756317139, "learning_rate": 3.313523699086824e-06, "loss": 0.3272, "step": 32220 }, { "epoch": 4.671522266913071, "grad_norm": 1.2887885570526123, "learning_rate": 3.29902884476011e-06, "loss": 0.3998, "step": 32230 }, { "epoch": 4.672971699822444, "grad_norm": 1.0463289022445679, "learning_rate": 3.2845339904333966e-06, "loss": 0.4122, "step": 32240 }, { "epoch": 4.674421132731819, "grad_norm": 1.143078327178955, "learning_rate": 3.2700391361066826e-06, "loss": 0.4343, "step": 32250 }, { "epoch": 4.675870565641193, "grad_norm": 1.4409193992614746, "learning_rate": 3.255544281779968e-06, "loss": 0.3785, "step": 32260 }, { "epoch": 4.677319998550567, "grad_norm": 0.8818618059158325, "learning_rate": 3.2410494274532543e-06, "loss": 0.3965, "step": 32270 }, { "epoch": 4.6787694314599415, "grad_norm": 1.2765010595321655, "learning_rate": 3.2265545731265404e-06, "loss": 0.4304, "step": 32280 }, { "epoch": 4.680218864369316, "grad_norm": 1.0178990364074707, "learning_rate": 3.2120597187998264e-06, "loss": 0.3661, "step": 32290 }, { "epoch": 4.68166829727869, "grad_norm": 1.806789755821228, "learning_rate": 3.197564864473112e-06, "loss": 0.3513, "step": 32300 }, { "epoch": 4.683117730188064, "grad_norm": 2.861187219619751, "learning_rate": 3.183070010146398e-06, "loss": 0.38, "step": 32310 }, { "epoch": 4.684567163097438, "grad_norm": 1.3312263488769531, "learning_rate": 3.168575155819684e-06, "loss": 0.4133, "step": 32320 }, { "epoch": 4.686016596006812, "grad_norm": 2.996910572052002, "learning_rate": 3.1540803014929702e-06, "loss": 0.3746, "step": 32330 }, { "epoch": 4.6874660289161865, "grad_norm": 1.8551859855651855, "learning_rate": 3.139585447166256e-06, "loss": 0.3787, "step": 32340 }, { "epoch": 4.688915461825561, "grad_norm": 2.761601448059082, "learning_rate": 3.125090592839542e-06, "loss": 0.3392, "step": 32350 }, { "epoch": 4.690364894734935, "grad_norm": 1.2694810628890991, "learning_rate": 3.110595738512828e-06, "loss": 0.3822, "step": 32360 }, { "epoch": 4.691814327644309, "grad_norm": 3.686568260192871, "learning_rate": 3.096100884186114e-06, "loss": 0.4155, "step": 32370 }, { "epoch": 4.693263760553683, "grad_norm": 3.2763731479644775, "learning_rate": 3.0816060298594e-06, "loss": 0.4185, "step": 32380 }, { "epoch": 4.694713193463057, "grad_norm": 1.6245061159133911, "learning_rate": 3.0671111755326857e-06, "loss": 0.386, "step": 32390 }, { "epoch": 4.696162626372431, "grad_norm": 1.0963075160980225, "learning_rate": 3.052616321205972e-06, "loss": 0.4002, "step": 32400 }, { "epoch": 4.697612059281806, "grad_norm": 1.1938602924346924, "learning_rate": 3.038121466879258e-06, "loss": 0.3925, "step": 32410 }, { "epoch": 4.699061492191181, "grad_norm": 2.0994720458984375, "learning_rate": 3.023626612552544e-06, "loss": 0.3838, "step": 32420 }, { "epoch": 4.700510925100555, "grad_norm": 0.9986076354980469, "learning_rate": 3.0091317582258303e-06, "loss": 0.3786, "step": 32430 }, { "epoch": 4.701960358009929, "grad_norm": 3.9333064556121826, "learning_rate": 2.994636903899116e-06, "loss": 0.4101, "step": 32440 }, { "epoch": 4.703409790919303, "grad_norm": 2.4673099517822266, "learning_rate": 2.980142049572402e-06, "loss": 0.3625, "step": 32450 }, { "epoch": 4.704859223828677, "grad_norm": 2.997154712677002, "learning_rate": 2.965647195245688e-06, "loss": 0.3262, "step": 32460 }, { "epoch": 4.706308656738051, "grad_norm": 2.245668411254883, "learning_rate": 2.951152340918974e-06, "loss": 0.3691, "step": 32470 }, { "epoch": 4.707758089647426, "grad_norm": 1.5757455825805664, "learning_rate": 2.9366574865922598e-06, "loss": 0.4091, "step": 32480 }, { "epoch": 4.7092075225568, "grad_norm": 3.344550609588623, "learning_rate": 2.922162632265546e-06, "loss": 0.4019, "step": 32490 }, { "epoch": 4.710656955466174, "grad_norm": 1.346003532409668, "learning_rate": 2.907667777938832e-06, "loss": 0.3991, "step": 32500 }, { "epoch": 4.712106388375548, "grad_norm": 1.0615825653076172, "learning_rate": 2.893172923612118e-06, "loss": 0.3754, "step": 32510 }, { "epoch": 4.713555821284922, "grad_norm": 1.41044282913208, "learning_rate": 2.8786780692854035e-06, "loss": 0.3001, "step": 32520 }, { "epoch": 4.715005254194296, "grad_norm": 0.9503432512283325, "learning_rate": 2.86418321495869e-06, "loss": 0.381, "step": 32530 }, { "epoch": 4.7164546871036706, "grad_norm": 2.1342263221740723, "learning_rate": 2.8496883606319757e-06, "loss": 0.3472, "step": 32540 }, { "epoch": 4.717904120013045, "grad_norm": 1.4033797979354858, "learning_rate": 2.8351935063052617e-06, "loss": 0.387, "step": 32550 }, { "epoch": 4.719353552922419, "grad_norm": 1.4123448133468628, "learning_rate": 2.8206986519785478e-06, "loss": 0.3624, "step": 32560 }, { "epoch": 4.720802985831793, "grad_norm": 1.1497482061386108, "learning_rate": 2.806203797651834e-06, "loss": 0.3973, "step": 32570 }, { "epoch": 4.722252418741167, "grad_norm": 1.3808242082595825, "learning_rate": 2.7917089433251194e-06, "loss": 0.3993, "step": 32580 }, { "epoch": 4.723701851650541, "grad_norm": 1.8661115169525146, "learning_rate": 2.777214088998406e-06, "loss": 0.3679, "step": 32590 }, { "epoch": 4.7251512845599155, "grad_norm": 3.0913310050964355, "learning_rate": 2.7627192346716915e-06, "loss": 0.2996, "step": 32600 }, { "epoch": 4.726600717469291, "grad_norm": 2.8135464191436768, "learning_rate": 2.7482243803449776e-06, "loss": 0.3693, "step": 32610 }, { "epoch": 4.728050150378665, "grad_norm": 2.949455738067627, "learning_rate": 2.7337295260182637e-06, "loss": 0.3799, "step": 32620 }, { "epoch": 4.729499583288039, "grad_norm": 1.1281206607818604, "learning_rate": 2.7192346716915497e-06, "loss": 0.4465, "step": 32630 }, { "epoch": 4.730949016197413, "grad_norm": 1.0962039232254028, "learning_rate": 2.7047398173648358e-06, "loss": 0.3521, "step": 32640 }, { "epoch": 4.732398449106787, "grad_norm": 1.1542834043502808, "learning_rate": 2.6902449630381214e-06, "loss": 0.3267, "step": 32650 }, { "epoch": 4.733847882016161, "grad_norm": 2.2527689933776855, "learning_rate": 2.675750108711408e-06, "loss": 0.3637, "step": 32660 }, { "epoch": 4.7352973149255355, "grad_norm": 3.0720131397247314, "learning_rate": 2.6612552543846935e-06, "loss": 0.4143, "step": 32670 }, { "epoch": 4.73674674783491, "grad_norm": 3.2450637817382812, "learning_rate": 2.6467604000579796e-06, "loss": 0.3778, "step": 32680 }, { "epoch": 4.738196180744284, "grad_norm": 0.8772507309913635, "learning_rate": 2.6322655457312656e-06, "loss": 0.3518, "step": 32690 }, { "epoch": 4.739645613653658, "grad_norm": 2.1515049934387207, "learning_rate": 2.6177706914045517e-06, "loss": 0.3454, "step": 32700 }, { "epoch": 4.741095046563032, "grad_norm": 0.9839628338813782, "learning_rate": 2.6032758370778373e-06, "loss": 0.3805, "step": 32710 }, { "epoch": 4.742544479472406, "grad_norm": 1.3019304275512695, "learning_rate": 2.5887809827511238e-06, "loss": 0.3661, "step": 32720 }, { "epoch": 4.7439939123817805, "grad_norm": 1.898655891418457, "learning_rate": 2.5742861284244094e-06, "loss": 0.3405, "step": 32730 }, { "epoch": 4.745443345291155, "grad_norm": 1.5310337543487549, "learning_rate": 2.5597912740976954e-06, "loss": 0.3885, "step": 32740 }, { "epoch": 4.746892778200529, "grad_norm": 2.6126482486724854, "learning_rate": 2.5452964197709815e-06, "loss": 0.4087, "step": 32750 }, { "epoch": 4.748342211109903, "grad_norm": 2.2393836975097656, "learning_rate": 2.5308015654442676e-06, "loss": 0.3558, "step": 32760 }, { "epoch": 4.749791644019277, "grad_norm": 1.1990814208984375, "learning_rate": 2.516306711117553e-06, "loss": 0.3635, "step": 32770 }, { "epoch": 4.751241076928652, "grad_norm": 0.9254428148269653, "learning_rate": 2.5018118567908392e-06, "loss": 0.4035, "step": 32780 }, { "epoch": 4.752690509838025, "grad_norm": 3.164818525314331, "learning_rate": 2.4873170024641253e-06, "loss": 0.3199, "step": 32790 }, { "epoch": 4.7541399427474005, "grad_norm": 0.9472362995147705, "learning_rate": 2.4728221481374113e-06, "loss": 0.3885, "step": 32800 }, { "epoch": 4.755589375656775, "grad_norm": 1.333412528038025, "learning_rate": 2.458327293810697e-06, "loss": 0.4131, "step": 32810 }, { "epoch": 4.757038808566149, "grad_norm": 0.9033955335617065, "learning_rate": 2.4438324394839834e-06, "loss": 0.3789, "step": 32820 }, { "epoch": 4.758488241475523, "grad_norm": 2.3149867057800293, "learning_rate": 2.4293375851572695e-06, "loss": 0.3417, "step": 32830 }, { "epoch": 4.759937674384897, "grad_norm": 5.298091888427734, "learning_rate": 2.414842730830555e-06, "loss": 0.3469, "step": 32840 }, { "epoch": 4.761387107294271, "grad_norm": 1.0421656370162964, "learning_rate": 2.400347876503841e-06, "loss": 0.3931, "step": 32850 }, { "epoch": 4.762836540203645, "grad_norm": 1.1229223012924194, "learning_rate": 2.3858530221771272e-06, "loss": 0.3572, "step": 32860 }, { "epoch": 4.76428597311302, "grad_norm": 1.4950625896453857, "learning_rate": 2.3713581678504133e-06, "loss": 0.434, "step": 32870 }, { "epoch": 4.765735406022394, "grad_norm": 0.9826526641845703, "learning_rate": 2.3568633135236993e-06, "loss": 0.3881, "step": 32880 }, { "epoch": 4.767184838931768, "grad_norm": 1.790209174156189, "learning_rate": 2.3423684591969854e-06, "loss": 0.3745, "step": 32890 }, { "epoch": 4.768634271841142, "grad_norm": 3.7698655128479004, "learning_rate": 2.327873604870271e-06, "loss": 0.4164, "step": 32900 }, { "epoch": 4.770083704750516, "grad_norm": 1.2466737031936646, "learning_rate": 2.313378750543557e-06, "loss": 0.3579, "step": 32910 }, { "epoch": 4.77153313765989, "grad_norm": 1.1752427816390991, "learning_rate": 2.298883896216843e-06, "loss": 0.3975, "step": 32920 }, { "epoch": 4.772982570569265, "grad_norm": 4.076375961303711, "learning_rate": 2.284389041890129e-06, "loss": 0.4107, "step": 32930 }, { "epoch": 4.774432003478639, "grad_norm": 2.440988540649414, "learning_rate": 2.269894187563415e-06, "loss": 0.3744, "step": 32940 }, { "epoch": 4.775881436388013, "grad_norm": 2.235698699951172, "learning_rate": 2.2553993332367013e-06, "loss": 0.3726, "step": 32950 }, { "epoch": 4.777330869297387, "grad_norm": 2.558980941772461, "learning_rate": 2.240904478909987e-06, "loss": 0.4063, "step": 32960 }, { "epoch": 4.778780302206762, "grad_norm": 0.951149582862854, "learning_rate": 2.226409624583273e-06, "loss": 0.3733, "step": 32970 }, { "epoch": 4.780229735116135, "grad_norm": 1.0211766958236694, "learning_rate": 2.211914770256559e-06, "loss": 0.3332, "step": 32980 }, { "epoch": 4.78167916802551, "grad_norm": 2.6187455654144287, "learning_rate": 2.197419915929845e-06, "loss": 0.367, "step": 32990 }, { "epoch": 4.783128600934885, "grad_norm": 2.046088933944702, "learning_rate": 2.1829250616031307e-06, "loss": 0.3534, "step": 33000 }, { "epoch": 4.783128600934885, "eval_loss": 0.7387080788612366, "eval_runtime": 671.8242, "eval_samples_per_second": 51.345, "eval_steps_per_second": 2.568, "eval_token_accuracy": 0.00039422307975884034, "step": 33000 }, { "epoch": 4.784578033844259, "grad_norm": 1.1268091201782227, "learning_rate": 2.1684302072764168e-06, "loss": 0.3607, "step": 33010 }, { "epoch": 4.786027466753633, "grad_norm": 0.9580293297767639, "learning_rate": 2.1539353529497032e-06, "loss": 0.3174, "step": 33020 }, { "epoch": 4.787476899663007, "grad_norm": 0.8425546288490295, "learning_rate": 2.139440498622989e-06, "loss": 0.3413, "step": 33030 }, { "epoch": 4.788926332572381, "grad_norm": 2.578343391418457, "learning_rate": 2.124945644296275e-06, "loss": 0.3708, "step": 33040 }, { "epoch": 4.790375765481755, "grad_norm": 1.0360602140426636, "learning_rate": 2.110450789969561e-06, "loss": 0.4026, "step": 33050 }, { "epoch": 4.7918251983911295, "grad_norm": 1.632294774055481, "learning_rate": 2.095955935642847e-06, "loss": 0.3823, "step": 33060 }, { "epoch": 4.793274631300504, "grad_norm": 1.0471699237823486, "learning_rate": 2.0814610813161327e-06, "loss": 0.4129, "step": 33070 }, { "epoch": 4.794724064209878, "grad_norm": 1.4320305585861206, "learning_rate": 2.066966226989419e-06, "loss": 0.3969, "step": 33080 }, { "epoch": 4.796173497119252, "grad_norm": 0.9930534362792969, "learning_rate": 2.0524713726627048e-06, "loss": 0.349, "step": 33090 }, { "epoch": 4.797622930028626, "grad_norm": 1.012718915939331, "learning_rate": 2.037976518335991e-06, "loss": 0.3413, "step": 33100 }, { "epoch": 4.799072362938, "grad_norm": 1.2036629915237427, "learning_rate": 2.023481664009277e-06, "loss": 0.3922, "step": 33110 }, { "epoch": 4.8005217958473745, "grad_norm": 2.238356351852417, "learning_rate": 2.008986809682563e-06, "loss": 0.3291, "step": 33120 }, { "epoch": 4.801971228756749, "grad_norm": 1.101607084274292, "learning_rate": 1.9944919553558486e-06, "loss": 0.3901, "step": 33130 }, { "epoch": 4.803420661666123, "grad_norm": 2.1101644039154053, "learning_rate": 1.9799971010291346e-06, "loss": 0.3392, "step": 33140 }, { "epoch": 4.804870094575497, "grad_norm": 2.539505958557129, "learning_rate": 1.9655022467024207e-06, "loss": 0.3621, "step": 33150 }, { "epoch": 4.806319527484872, "grad_norm": 4.139045715332031, "learning_rate": 1.9510073923757067e-06, "loss": 0.4016, "step": 33160 }, { "epoch": 4.807768960394245, "grad_norm": 0.9701805114746094, "learning_rate": 1.9365125380489928e-06, "loss": 0.3427, "step": 33170 }, { "epoch": 4.80921839330362, "grad_norm": 0.8436054587364197, "learning_rate": 1.922017683722279e-06, "loss": 0.3647, "step": 33180 }, { "epoch": 4.8106678262129945, "grad_norm": 1.258991003036499, "learning_rate": 1.9075228293955645e-06, "loss": 0.3411, "step": 33190 }, { "epoch": 4.812117259122369, "grad_norm": 3.107102394104004, "learning_rate": 1.8930279750688507e-06, "loss": 0.4033, "step": 33200 }, { "epoch": 4.813566692031743, "grad_norm": 0.9850480556488037, "learning_rate": 1.8785331207421368e-06, "loss": 0.3227, "step": 33210 }, { "epoch": 4.815016124941117, "grad_norm": 1.4181708097457886, "learning_rate": 1.8640382664154226e-06, "loss": 0.3293, "step": 33220 }, { "epoch": 4.816465557850491, "grad_norm": 2.7566471099853516, "learning_rate": 1.8495434120887087e-06, "loss": 0.4135, "step": 33230 }, { "epoch": 4.817914990759865, "grad_norm": 2.4442412853240967, "learning_rate": 1.8350485577619945e-06, "loss": 0.383, "step": 33240 }, { "epoch": 4.8193644236692395, "grad_norm": 2.687974691390991, "learning_rate": 1.8205537034352808e-06, "loss": 0.3687, "step": 33250 }, { "epoch": 4.820813856578614, "grad_norm": 1.3777414560317993, "learning_rate": 1.8060588491085664e-06, "loss": 0.3538, "step": 33260 }, { "epoch": 4.822263289487988, "grad_norm": 1.8097695112228394, "learning_rate": 1.7915639947818527e-06, "loss": 0.2637, "step": 33270 }, { "epoch": 4.823712722397362, "grad_norm": 2.2218167781829834, "learning_rate": 1.7770691404551385e-06, "loss": 0.4544, "step": 33280 }, { "epoch": 4.825162155306736, "grad_norm": 1.296929955482483, "learning_rate": 1.7625742861284246e-06, "loss": 0.3599, "step": 33290 }, { "epoch": 4.82661158821611, "grad_norm": 2.5712153911590576, "learning_rate": 1.7480794318017104e-06, "loss": 0.4087, "step": 33300 }, { "epoch": 4.828061021125484, "grad_norm": 1.5026534795761108, "learning_rate": 1.7335845774749965e-06, "loss": 0.3474, "step": 33310 }, { "epoch": 4.829510454034859, "grad_norm": 1.357690691947937, "learning_rate": 1.7190897231482823e-06, "loss": 0.3939, "step": 33320 }, { "epoch": 4.830959886944233, "grad_norm": 1.5788393020629883, "learning_rate": 1.7045948688215686e-06, "loss": 0.3603, "step": 33330 }, { "epoch": 4.832409319853607, "grad_norm": 0.8207820653915405, "learning_rate": 1.6901000144948542e-06, "loss": 0.4107, "step": 33340 }, { "epoch": 4.833858752762982, "grad_norm": 1.154213309288025, "learning_rate": 1.6756051601681405e-06, "loss": 0.3741, "step": 33350 }, { "epoch": 4.835308185672355, "grad_norm": 1.2206467390060425, "learning_rate": 1.6611103058414263e-06, "loss": 0.3595, "step": 33360 }, { "epoch": 4.83675761858173, "grad_norm": 1.109948992729187, "learning_rate": 1.6466154515147124e-06, "loss": 0.4087, "step": 33370 }, { "epoch": 4.838207051491104, "grad_norm": 3.8707070350646973, "learning_rate": 1.6321205971879982e-06, "loss": 0.3811, "step": 33380 }, { "epoch": 4.839656484400479, "grad_norm": 2.4858224391937256, "learning_rate": 1.6176257428612842e-06, "loss": 0.3446, "step": 33390 }, { "epoch": 4.841105917309853, "grad_norm": 1.0518187284469604, "learning_rate": 1.60313088853457e-06, "loss": 0.384, "step": 33400 }, { "epoch": 4.842555350219227, "grad_norm": 1.2034097909927368, "learning_rate": 1.5886360342078564e-06, "loss": 0.3816, "step": 33410 }, { "epoch": 4.844004783128601, "grad_norm": 2.01949405670166, "learning_rate": 1.5741411798811424e-06, "loss": 0.4041, "step": 33420 }, { "epoch": 4.845454216037975, "grad_norm": 2.740180492401123, "learning_rate": 1.5596463255544283e-06, "loss": 0.3867, "step": 33430 }, { "epoch": 4.846903648947349, "grad_norm": 1.3823065757751465, "learning_rate": 1.5451514712277143e-06, "loss": 0.3484, "step": 33440 }, { "epoch": 4.8483530818567235, "grad_norm": 1.4719023704528809, "learning_rate": 1.5306566169010001e-06, "loss": 0.355, "step": 33450 }, { "epoch": 4.849802514766098, "grad_norm": 1.2143726348876953, "learning_rate": 1.5161617625742862e-06, "loss": 0.3884, "step": 33460 }, { "epoch": 4.851251947675472, "grad_norm": 3.5473649501800537, "learning_rate": 1.501666908247572e-06, "loss": 0.38, "step": 33470 }, { "epoch": 4.852701380584846, "grad_norm": 1.1043490171432495, "learning_rate": 1.487172053920858e-06, "loss": 0.3983, "step": 33480 }, { "epoch": 4.85415081349422, "grad_norm": 1.6615132093429565, "learning_rate": 1.4726771995941441e-06, "loss": 0.3192, "step": 33490 }, { "epoch": 4.855600246403594, "grad_norm": 3.8564488887786865, "learning_rate": 1.45818234526743e-06, "loss": 0.403, "step": 33500 }, { "epoch": 4.8570496793129685, "grad_norm": 1.4073970317840576, "learning_rate": 1.4436874909407163e-06, "loss": 0.3609, "step": 33510 }, { "epoch": 4.858499112222343, "grad_norm": 2.5036427974700928, "learning_rate": 1.429192636614002e-06, "loss": 0.3634, "step": 33520 }, { "epoch": 4.859948545131717, "grad_norm": 2.0004215240478516, "learning_rate": 1.4146977822872881e-06, "loss": 0.33, "step": 33530 }, { "epoch": 4.861397978041092, "grad_norm": 3.5735065937042236, "learning_rate": 1.4002029279605742e-06, "loss": 0.3731, "step": 33540 }, { "epoch": 4.862847410950465, "grad_norm": 1.5266069173812866, "learning_rate": 1.38570807363386e-06, "loss": 0.3769, "step": 33550 }, { "epoch": 4.86429684385984, "grad_norm": 2.06255841255188, "learning_rate": 1.371213219307146e-06, "loss": 0.3752, "step": 33560 }, { "epoch": 4.865746276769214, "grad_norm": 3.314347267150879, "learning_rate": 1.356718364980432e-06, "loss": 0.3887, "step": 33570 }, { "epoch": 4.8671957096785885, "grad_norm": 1.280548095703125, "learning_rate": 1.342223510653718e-06, "loss": 0.3756, "step": 33580 }, { "epoch": 4.868645142587963, "grad_norm": 2.8850739002227783, "learning_rate": 1.327728656327004e-06, "loss": 0.334, "step": 33590 }, { "epoch": 4.870094575497337, "grad_norm": 1.094756841659546, "learning_rate": 1.3132338020002899e-06, "loss": 0.3001, "step": 33600 }, { "epoch": 4.871544008406711, "grad_norm": 5.066037654876709, "learning_rate": 1.298738947673576e-06, "loss": 0.3696, "step": 33610 }, { "epoch": 4.872993441316085, "grad_norm": 3.2792046070098877, "learning_rate": 1.284244093346862e-06, "loss": 0.3982, "step": 33620 }, { "epoch": 4.874442874225459, "grad_norm": 0.984911322593689, "learning_rate": 1.2697492390201478e-06, "loss": 0.364, "step": 33630 }, { "epoch": 4.8758923071348335, "grad_norm": 1.0168277025222778, "learning_rate": 1.2552543846934339e-06, "loss": 0.4063, "step": 33640 }, { "epoch": 4.877341740044208, "grad_norm": 1.043861746788025, "learning_rate": 1.2407595303667197e-06, "loss": 0.3545, "step": 33650 }, { "epoch": 4.878791172953582, "grad_norm": 1.2901207208633423, "learning_rate": 1.2262646760400058e-06, "loss": 0.3993, "step": 33660 }, { "epoch": 4.880240605862956, "grad_norm": 2.493473768234253, "learning_rate": 1.2117698217132918e-06, "loss": 0.3616, "step": 33670 }, { "epoch": 4.88169003877233, "grad_norm": 1.1101723909378052, "learning_rate": 1.1972749673865777e-06, "loss": 0.3554, "step": 33680 }, { "epoch": 4.883139471681704, "grad_norm": 1.0349514484405518, "learning_rate": 1.1827801130598637e-06, "loss": 0.3509, "step": 33690 }, { "epoch": 4.884588904591078, "grad_norm": 1.1507887840270996, "learning_rate": 1.1682852587331498e-06, "loss": 0.364, "step": 33700 }, { "epoch": 4.886038337500453, "grad_norm": 3.162672281265259, "learning_rate": 1.1537904044064358e-06, "loss": 0.3366, "step": 33710 }, { "epoch": 4.887487770409827, "grad_norm": 1.3397330045700073, "learning_rate": 1.1392955500797219e-06, "loss": 0.4009, "step": 33720 }, { "epoch": 4.888937203319202, "grad_norm": 1.1104768514633179, "learning_rate": 1.1248006957530077e-06, "loss": 0.383, "step": 33730 }, { "epoch": 4.890386636228576, "grad_norm": 2.326978921890259, "learning_rate": 1.1103058414262938e-06, "loss": 0.3212, "step": 33740 }, { "epoch": 4.89183606913795, "grad_norm": 1.2988479137420654, "learning_rate": 1.0958109870995798e-06, "loss": 0.3584, "step": 33750 }, { "epoch": 4.893285502047324, "grad_norm": 2.5861144065856934, "learning_rate": 1.0813161327728657e-06, "loss": 0.3753, "step": 33760 }, { "epoch": 4.894734934956698, "grad_norm": 1.090281367301941, "learning_rate": 1.0668212784461517e-06, "loss": 0.3217, "step": 33770 }, { "epoch": 4.896184367866073, "grad_norm": 3.0620217323303223, "learning_rate": 1.0523264241194376e-06, "loss": 0.3265, "step": 33780 }, { "epoch": 4.897633800775447, "grad_norm": 1.767822265625, "learning_rate": 1.0378315697927236e-06, "loss": 0.3413, "step": 33790 }, { "epoch": 4.899083233684821, "grad_norm": 0.9307612776756287, "learning_rate": 1.0233367154660097e-06, "loss": 0.3716, "step": 33800 }, { "epoch": 4.900532666594195, "grad_norm": 2.0926198959350586, "learning_rate": 1.0088418611392955e-06, "loss": 0.2855, "step": 33810 }, { "epoch": 4.901982099503569, "grad_norm": 2.8857176303863525, "learning_rate": 9.943470068125816e-07, "loss": 0.3717, "step": 33820 }, { "epoch": 4.903431532412943, "grad_norm": 1.6708345413208008, "learning_rate": 9.798521524858676e-07, "loss": 0.3262, "step": 33830 }, { "epoch": 4.9048809653223175, "grad_norm": 1.5556457042694092, "learning_rate": 9.653572981591535e-07, "loss": 0.3121, "step": 33840 }, { "epoch": 4.906330398231692, "grad_norm": 3.1761248111724854, "learning_rate": 9.508624438324395e-07, "loss": 0.3898, "step": 33850 }, { "epoch": 4.907779831141066, "grad_norm": 1.3463460206985474, "learning_rate": 9.363675895057255e-07, "loss": 0.3161, "step": 33860 }, { "epoch": 4.90922926405044, "grad_norm": 2.4650721549987793, "learning_rate": 9.218727351790114e-07, "loss": 0.3677, "step": 33870 }, { "epoch": 4.910678696959814, "grad_norm": 1.9204206466674805, "learning_rate": 9.073778808522974e-07, "loss": 0.3759, "step": 33880 }, { "epoch": 4.912128129869188, "grad_norm": 4.739460468292236, "learning_rate": 8.928830265255834e-07, "loss": 0.37, "step": 33890 }, { "epoch": 4.9135775627785625, "grad_norm": 1.053930401802063, "learning_rate": 8.783881721988696e-07, "loss": 0.3451, "step": 33900 }, { "epoch": 4.915026995687937, "grad_norm": 0.9955337643623352, "learning_rate": 8.638933178721555e-07, "loss": 0.3697, "step": 33910 }, { "epoch": 4.916476428597312, "grad_norm": 2.0659215450286865, "learning_rate": 8.493984635454415e-07, "loss": 0.3888, "step": 33920 }, { "epoch": 4.917925861506686, "grad_norm": 0.9193480610847473, "learning_rate": 8.349036092187274e-07, "loss": 0.345, "step": 33930 }, { "epoch": 4.91937529441606, "grad_norm": 0.9872389435768127, "learning_rate": 8.204087548920135e-07, "loss": 0.3537, "step": 33940 }, { "epoch": 4.920824727325434, "grad_norm": 1.2617017030715942, "learning_rate": 8.059139005652994e-07, "loss": 0.3635, "step": 33950 }, { "epoch": 4.922274160234808, "grad_norm": 1.9014856815338135, "learning_rate": 7.914190462385854e-07, "loss": 0.306, "step": 33960 }, { "epoch": 4.9237235931441825, "grad_norm": 2.8198678493499756, "learning_rate": 7.769241919118713e-07, "loss": 0.2914, "step": 33970 }, { "epoch": 4.925173026053557, "grad_norm": 1.2952039241790771, "learning_rate": 7.624293375851574e-07, "loss": 0.3804, "step": 33980 }, { "epoch": 4.926622458962931, "grad_norm": 1.6215705871582031, "learning_rate": 7.479344832584433e-07, "loss": 0.3993, "step": 33990 }, { "epoch": 4.928071891872305, "grad_norm": 2.0749127864837646, "learning_rate": 7.334396289317293e-07, "loss": 0.37, "step": 34000 }, { "epoch": 4.928071891872305, "eval_loss": 0.7409716248512268, "eval_runtime": 672.128, "eval_samples_per_second": 51.322, "eval_steps_per_second": 2.566, "eval_token_accuracy": 0.0003939013962992248, "step": 34000 }, { "epoch": 4.929521324781679, "grad_norm": 1.1606099605560303, "learning_rate": 7.189447746050152e-07, "loss": 0.3569, "step": 34010 }, { "epoch": 4.930970757691053, "grad_norm": 1.0649526119232178, "learning_rate": 7.044499202783013e-07, "loss": 0.3471, "step": 34020 }, { "epoch": 4.9324201906004275, "grad_norm": 2.766159772872925, "learning_rate": 6.899550659515872e-07, "loss": 0.3769, "step": 34030 }, { "epoch": 4.933869623509802, "grad_norm": 2.2011117935180664, "learning_rate": 6.754602116248732e-07, "loss": 0.3361, "step": 34040 }, { "epoch": 4.935319056419176, "grad_norm": 1.4634935855865479, "learning_rate": 6.609653572981592e-07, "loss": 0.3589, "step": 34050 }, { "epoch": 4.93676848932855, "grad_norm": 2.5267245769500732, "learning_rate": 6.464705029714452e-07, "loss": 0.3581, "step": 34060 }, { "epoch": 4.938217922237924, "grad_norm": 3.63118314743042, "learning_rate": 6.319756486447312e-07, "loss": 0.3609, "step": 34070 }, { "epoch": 4.939667355147298, "grad_norm": 1.3346004486083984, "learning_rate": 6.174807943180172e-07, "loss": 0.4164, "step": 34080 }, { "epoch": 4.941116788056673, "grad_norm": 1.1010299921035767, "learning_rate": 6.029859399913031e-07, "loss": 0.3551, "step": 34090 }, { "epoch": 4.942566220966047, "grad_norm": 1.9690850973129272, "learning_rate": 5.884910856645891e-07, "loss": 0.3622, "step": 34100 }, { "epoch": 4.944015653875422, "grad_norm": 2.8707642555236816, "learning_rate": 5.739962313378751e-07, "loss": 0.3133, "step": 34110 }, { "epoch": 4.945465086784796, "grad_norm": 2.438140869140625, "learning_rate": 5.595013770111611e-07, "loss": 0.4127, "step": 34120 }, { "epoch": 4.94691451969417, "grad_norm": 1.2852033376693726, "learning_rate": 5.45006522684447e-07, "loss": 0.3827, "step": 34130 }, { "epoch": 4.948363952603544, "grad_norm": 1.318913221359253, "learning_rate": 5.30511668357733e-07, "loss": 0.4089, "step": 34140 }, { "epoch": 4.949813385512918, "grad_norm": 1.2271665334701538, "learning_rate": 5.16016814031019e-07, "loss": 0.3706, "step": 34150 }, { "epoch": 4.951262818422292, "grad_norm": 1.5632954835891724, "learning_rate": 5.015219597043051e-07, "loss": 0.3624, "step": 34160 }, { "epoch": 4.952712251331667, "grad_norm": 3.3097076416015625, "learning_rate": 4.87027105377591e-07, "loss": 0.3476, "step": 34170 }, { "epoch": 4.954161684241041, "grad_norm": 1.1198471784591675, "learning_rate": 4.7253225105087695e-07, "loss": 0.405, "step": 34180 }, { "epoch": 4.955611117150415, "grad_norm": 0.8534343838691711, "learning_rate": 4.5803739672416295e-07, "loss": 0.3386, "step": 34190 }, { "epoch": 4.957060550059789, "grad_norm": 2.1439709663391113, "learning_rate": 4.435425423974489e-07, "loss": 0.3165, "step": 34200 }, { "epoch": 4.958509982969163, "grad_norm": 1.0483152866363525, "learning_rate": 4.290476880707349e-07, "loss": 0.3532, "step": 34210 }, { "epoch": 4.959959415878537, "grad_norm": 0.9778922200202942, "learning_rate": 4.1455283374402085e-07, "loss": 0.3545, "step": 34220 }, { "epoch": 4.9614088487879116, "grad_norm": 1.4083807468414307, "learning_rate": 4.0005797941730685e-07, "loss": 0.4085, "step": 34230 }, { "epoch": 4.962858281697286, "grad_norm": 3.145470142364502, "learning_rate": 3.8556312509059285e-07, "loss": 0.3465, "step": 34240 }, { "epoch": 4.96430771460666, "grad_norm": 1.1022526025772095, "learning_rate": 3.7106827076387885e-07, "loss": 0.338, "step": 34250 }, { "epoch": 4.965757147516034, "grad_norm": 1.3441957235336304, "learning_rate": 3.565734164371648e-07, "loss": 0.3876, "step": 34260 }, { "epoch": 4.967206580425408, "grad_norm": 1.3486835956573486, "learning_rate": 3.4207856211045085e-07, "loss": 0.339, "step": 34270 }, { "epoch": 4.968656013334783, "grad_norm": 2.366464853286743, "learning_rate": 3.275837077837368e-07, "loss": 0.4093, "step": 34280 }, { "epoch": 4.9701054462441565, "grad_norm": 2.5643863677978516, "learning_rate": 3.130888534570228e-07, "loss": 0.3802, "step": 34290 }, { "epoch": 4.971554879153532, "grad_norm": 1.02238130569458, "learning_rate": 2.9859399913030874e-07, "loss": 0.3807, "step": 34300 }, { "epoch": 4.973004312062906, "grad_norm": 3.854581594467163, "learning_rate": 2.8409914480359474e-07, "loss": 0.3604, "step": 34310 }, { "epoch": 4.97445374497228, "grad_norm": 2.844658374786377, "learning_rate": 2.6960429047688074e-07, "loss": 0.3959, "step": 34320 }, { "epoch": 4.975903177881654, "grad_norm": 1.9395471811294556, "learning_rate": 2.551094361501667e-07, "loss": 0.3671, "step": 34330 }, { "epoch": 4.977352610791028, "grad_norm": 1.0512335300445557, "learning_rate": 2.420640672561241e-07, "loss": 0.3511, "step": 34340 }, { "epoch": 4.978802043700402, "grad_norm": 1.3229668140411377, "learning_rate": 2.275692129294101e-07, "loss": 0.4025, "step": 34350 }, { "epoch": 4.9802514766097765, "grad_norm": 0.8195211291313171, "learning_rate": 2.1307435860269607e-07, "loss": 0.3992, "step": 34360 }, { "epoch": 4.981700909519151, "grad_norm": 1.1004067659378052, "learning_rate": 1.9857950427598204e-07, "loss": 0.3877, "step": 34370 }, { "epoch": 4.983150342428525, "grad_norm": 1.3482115268707275, "learning_rate": 1.8408464994926802e-07, "loss": 0.3713, "step": 34380 }, { "epoch": 4.984599775337899, "grad_norm": 1.212549090385437, "learning_rate": 1.69589795622554e-07, "loss": 0.4233, "step": 34390 }, { "epoch": 4.986049208247273, "grad_norm": 3.041564702987671, "learning_rate": 1.5509494129583997e-07, "loss": 0.3125, "step": 34400 }, { "epoch": 4.987498641156647, "grad_norm": 2.7439684867858887, "learning_rate": 1.4060008696912597e-07, "loss": 0.3424, "step": 34410 }, { "epoch": 4.9889480740660215, "grad_norm": 2.762303590774536, "learning_rate": 1.2610523264241194e-07, "loss": 0.3415, "step": 34420 }, { "epoch": 4.990397506975396, "grad_norm": 2.809271812438965, "learning_rate": 1.1161037831569793e-07, "loss": 0.3471, "step": 34430 }, { "epoch": 4.99184693988477, "grad_norm": 2.674682855606079, "learning_rate": 9.711552398898391e-08, "loss": 0.359, "step": 34440 }, { "epoch": 4.993296372794144, "grad_norm": 1.3303627967834473, "learning_rate": 8.26206696622699e-08, "loss": 0.3938, "step": 34450 }, { "epoch": 4.994745805703518, "grad_norm": 1.5275235176086426, "learning_rate": 6.812581533555588e-08, "loss": 0.4174, "step": 34460 }, { "epoch": 4.996195238612893, "grad_norm": 1.2861908674240112, "learning_rate": 5.363096100884186e-08, "loss": 0.3857, "step": 34470 }, { "epoch": 4.997644671522266, "grad_norm": 3.7214603424072266, "learning_rate": 3.913610668212785e-08, "loss": 0.3666, "step": 34480 }, { "epoch": 4.9990941044316415, "grad_norm": 4.302470684051514, "learning_rate": 2.464125235541383e-08, "loss": 0.3139, "step": 34490 } ], "logging_steps": 10, "max_steps": 34495, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.471826509366821e+18, "train_batch_size": 10, "trial_name": null, "trial_params": null }