{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 18.537590113285273, "eval_steps": 500, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010298661174047374, "grad_norm": 28.08726692199707, "learning_rate": 1.0000000000000002e-06, "loss": 2.0981, "step": 10 }, { "epoch": 0.02059732234809475, "grad_norm": 16.46196746826172, "learning_rate": 2.1111111111111114e-06, "loss": 1.9619, "step": 20 }, { "epoch": 0.030895983522142123, "grad_norm": 13.453218460083008, "learning_rate": 3.2222222222222222e-06, "loss": 1.5883, "step": 30 }, { "epoch": 0.0411946446961895, "grad_norm": 3.0111494064331055, "learning_rate": 4.333333333333334e-06, "loss": 0.8443, "step": 40 }, { "epoch": 0.05149330587023687, "grad_norm": 1.8473039865493774, "learning_rate": 5.444444444444445e-06, "loss": 0.4851, "step": 50 }, { "epoch": 0.061791967044284246, "grad_norm": 1.983799695968628, "learning_rate": 6.555555555555556e-06, "loss": 0.4895, "step": 60 }, { "epoch": 0.07209062821833162, "grad_norm": 1.359467625617981, "learning_rate": 7.666666666666667e-06, "loss": 0.3476, "step": 70 }, { "epoch": 0.082389289392379, "grad_norm": 1.6559157371520996, "learning_rate": 8.777777777777778e-06, "loss": 0.3161, "step": 80 }, { "epoch": 0.09268795056642637, "grad_norm": 1.4577065706253052, "learning_rate": 9.888888888888889e-06, "loss": 0.2894, "step": 90 }, { "epoch": 0.10298661174047374, "grad_norm": 1.9685674905776978, "learning_rate": 1.1000000000000001e-05, "loss": 0.2669, "step": 100 }, { "epoch": 0.11328527291452112, "grad_norm": 1.0735788345336914, "learning_rate": 1.2111111111111112e-05, "loss": 0.2384, "step": 110 }, { "epoch": 0.12358393408856849, "grad_norm": 2.065934419631958, "learning_rate": 1.3222222222222221e-05, "loss": 0.2315, "step": 120 }, { "epoch": 0.13388259526261587, "grad_norm": 1.3160645961761475, "learning_rate": 1.4333333333333334e-05, "loss": 0.2212, "step": 130 }, { "epoch": 0.14418125643666324, "grad_norm": 1.132812738418579, "learning_rate": 1.5444444444444446e-05, "loss": 0.2107, "step": 140 }, { "epoch": 0.15447991761071062, "grad_norm": 0.8556684851646423, "learning_rate": 1.655555555555556e-05, "loss": 0.1983, "step": 150 }, { "epoch": 0.164778578784758, "grad_norm": 1.1401009559631348, "learning_rate": 1.7666666666666668e-05, "loss": 0.1821, "step": 160 }, { "epoch": 0.17507723995880536, "grad_norm": 0.9898369312286377, "learning_rate": 1.8777777777777777e-05, "loss": 0.1725, "step": 170 }, { "epoch": 0.18537590113285274, "grad_norm": 1.2845979928970337, "learning_rate": 1.988888888888889e-05, "loss": 0.1798, "step": 180 }, { "epoch": 0.1956745623069001, "grad_norm": 0.7349956631660461, "learning_rate": 2.1e-05, "loss": 0.1553, "step": 190 }, { "epoch": 0.2059732234809475, "grad_norm": 1.0893903970718384, "learning_rate": 2.211111111111111e-05, "loss": 0.161, "step": 200 }, { "epoch": 0.21627188465499486, "grad_norm": 1.4773167371749878, "learning_rate": 2.3222222222222224e-05, "loss": 0.1687, "step": 210 }, { "epoch": 0.22657054582904224, "grad_norm": 0.7343375086784363, "learning_rate": 2.4333333333333336e-05, "loss": 0.1541, "step": 220 }, { "epoch": 0.2368692070030896, "grad_norm": 1.459641456604004, "learning_rate": 2.5444444444444442e-05, "loss": 0.1546, "step": 230 }, { "epoch": 0.24716786817713698, "grad_norm": 1.007576823234558, "learning_rate": 2.6555555555555555e-05, "loss": 0.1397, "step": 240 }, { "epoch": 0.25746652935118436, "grad_norm": 0.7707590460777283, "learning_rate": 2.7666666666666667e-05, "loss": 0.1395, "step": 250 }, { "epoch": 0.26776519052523173, "grad_norm": 0.8418192863464355, "learning_rate": 2.877777777777778e-05, "loss": 0.1367, "step": 260 }, { "epoch": 0.2780638516992791, "grad_norm": 1.433361291885376, "learning_rate": 2.988888888888889e-05, "loss": 0.1443, "step": 270 }, { "epoch": 0.2883625128733265, "grad_norm": 1.6851385831832886, "learning_rate": 3.1e-05, "loss": 0.1412, "step": 280 }, { "epoch": 0.29866117404737386, "grad_norm": 1.0967495441436768, "learning_rate": 3.2111111111111114e-05, "loss": 0.1465, "step": 290 }, { "epoch": 0.30895983522142123, "grad_norm": 0.9680765867233276, "learning_rate": 3.322222222222222e-05, "loss": 0.1409, "step": 300 }, { "epoch": 0.3192584963954686, "grad_norm": 0.8024266362190247, "learning_rate": 3.433333333333333e-05, "loss": 0.151, "step": 310 }, { "epoch": 0.329557157569516, "grad_norm": 1.2099324464797974, "learning_rate": 3.5444444444444445e-05, "loss": 0.1276, "step": 320 }, { "epoch": 0.33985581874356335, "grad_norm": 1.553401231765747, "learning_rate": 3.655555555555556e-05, "loss": 0.1407, "step": 330 }, { "epoch": 0.35015447991761073, "grad_norm": 0.9965718388557434, "learning_rate": 3.766666666666667e-05, "loss": 0.1193, "step": 340 }, { "epoch": 0.3604531410916581, "grad_norm": 1.0881636142730713, "learning_rate": 3.877777777777778e-05, "loss": 0.1161, "step": 350 }, { "epoch": 0.3707518022657055, "grad_norm": 0.7971917986869812, "learning_rate": 3.9888888888888895e-05, "loss": 0.1153, "step": 360 }, { "epoch": 0.38105046343975285, "grad_norm": 0.6419103741645813, "learning_rate": 4.1e-05, "loss": 0.1268, "step": 370 }, { "epoch": 0.3913491246138002, "grad_norm": 0.8467381596565247, "learning_rate": 4.211111111111111e-05, "loss": 0.1089, "step": 380 }, { "epoch": 0.4016477857878476, "grad_norm": 0.7437835335731506, "learning_rate": 4.3222222222222226e-05, "loss": 0.1196, "step": 390 }, { "epoch": 0.411946446961895, "grad_norm": 1.1879000663757324, "learning_rate": 4.433333333333334e-05, "loss": 0.1104, "step": 400 }, { "epoch": 0.42224510813594235, "grad_norm": 1.103964924812317, "learning_rate": 4.5444444444444444e-05, "loss": 0.1154, "step": 410 }, { "epoch": 0.4325437693099897, "grad_norm": 1.20859956741333, "learning_rate": 4.6555555555555556e-05, "loss": 0.1151, "step": 420 }, { "epoch": 0.4428424304840371, "grad_norm": 1.3592861890792847, "learning_rate": 4.766666666666667e-05, "loss": 0.1221, "step": 430 }, { "epoch": 0.45314109165808447, "grad_norm": 0.7694193720817566, "learning_rate": 4.8777777777777775e-05, "loss": 0.1081, "step": 440 }, { "epoch": 0.46343975283213185, "grad_norm": 0.8526501655578613, "learning_rate": 4.9888888888888894e-05, "loss": 0.1071, "step": 450 }, { "epoch": 0.4737384140061792, "grad_norm": 0.8666425943374634, "learning_rate": 5.1000000000000006e-05, "loss": 0.1125, "step": 460 }, { "epoch": 0.4840370751802266, "grad_norm": 1.0404722690582275, "learning_rate": 5.211111111111111e-05, "loss": 0.1235, "step": 470 }, { "epoch": 0.49433573635427397, "grad_norm": 0.8314346671104431, "learning_rate": 5.322222222222223e-05, "loss": 0.1156, "step": 480 }, { "epoch": 0.5046343975283213, "grad_norm": 0.8053165674209595, "learning_rate": 5.433333333333334e-05, "loss": 0.0963, "step": 490 }, { "epoch": 0.5149330587023687, "grad_norm": 0.9703218340873718, "learning_rate": 5.544444444444444e-05, "loss": 0.1081, "step": 500 }, { "epoch": 0.525231719876416, "grad_norm": 1.0357967615127563, "learning_rate": 5.655555555555556e-05, "loss": 0.1053, "step": 510 }, { "epoch": 0.5355303810504635, "grad_norm": 0.6202366948127747, "learning_rate": 5.766666666666667e-05, "loss": 0.1161, "step": 520 }, { "epoch": 0.5458290422245108, "grad_norm": 0.9413891434669495, "learning_rate": 5.8777777777777774e-05, "loss": 0.1109, "step": 530 }, { "epoch": 0.5561277033985582, "grad_norm": 0.9725326299667358, "learning_rate": 5.988888888888889e-05, "loss": 0.1087, "step": 540 }, { "epoch": 0.5664263645726055, "grad_norm": 1.1372697353363037, "learning_rate": 6.1e-05, "loss": 0.0934, "step": 550 }, { "epoch": 0.576725025746653, "grad_norm": 0.9730582237243652, "learning_rate": 6.21111111111111e-05, "loss": 0.089, "step": 560 }, { "epoch": 0.5870236869207003, "grad_norm": 1.031986117362976, "learning_rate": 6.322222222222223e-05, "loss": 0.0921, "step": 570 }, { "epoch": 0.5973223480947477, "grad_norm": 0.9803087115287781, "learning_rate": 6.433333333333333e-05, "loss": 0.109, "step": 580 }, { "epoch": 0.607621009268795, "grad_norm": 1.2565224170684814, "learning_rate": 6.544444444444446e-05, "loss": 0.1075, "step": 590 }, { "epoch": 0.6179196704428425, "grad_norm": 0.6035177707672119, "learning_rate": 6.655555555555555e-05, "loss": 0.1069, "step": 600 }, { "epoch": 0.6282183316168898, "grad_norm": 0.6485044360160828, "learning_rate": 6.766666666666667e-05, "loss": 0.1041, "step": 610 }, { "epoch": 0.6385169927909372, "grad_norm": 0.9063082337379456, "learning_rate": 6.877777777777778e-05, "loss": 0.087, "step": 620 }, { "epoch": 0.6488156539649845, "grad_norm": 0.7508301734924316, "learning_rate": 6.988888888888889e-05, "loss": 0.0993, "step": 630 }, { "epoch": 0.659114315139032, "grad_norm": 0.7371131777763367, "learning_rate": 7.1e-05, "loss": 0.0965, "step": 640 }, { "epoch": 0.6694129763130793, "grad_norm": 0.9033893942832947, "learning_rate": 7.211111111111112e-05, "loss": 0.0927, "step": 650 }, { "epoch": 0.6797116374871267, "grad_norm": 1.0828319787979126, "learning_rate": 7.322222222222223e-05, "loss": 0.1039, "step": 660 }, { "epoch": 0.690010298661174, "grad_norm": 0.7973754405975342, "learning_rate": 7.433333333333333e-05, "loss": 0.0942, "step": 670 }, { "epoch": 0.7003089598352215, "grad_norm": 0.9999275803565979, "learning_rate": 7.544444444444445e-05, "loss": 0.0938, "step": 680 }, { "epoch": 0.7106076210092688, "grad_norm": 0.7432506680488586, "learning_rate": 7.655555555555555e-05, "loss": 0.0822, "step": 690 }, { "epoch": 0.7209062821833162, "grad_norm": 0.7960357069969177, "learning_rate": 7.766666666666667e-05, "loss": 0.0885, "step": 700 }, { "epoch": 0.7312049433573635, "grad_norm": 0.6295223236083984, "learning_rate": 7.877777777777778e-05, "loss": 0.0984, "step": 710 }, { "epoch": 0.741503604531411, "grad_norm": 0.6425987482070923, "learning_rate": 7.988888888888889e-05, "loss": 0.0851, "step": 720 }, { "epoch": 0.7518022657054583, "grad_norm": 0.7241719961166382, "learning_rate": 8.1e-05, "loss": 0.0818, "step": 730 }, { "epoch": 0.7621009268795057, "grad_norm": 0.6875414252281189, "learning_rate": 8.211111111111112e-05, "loss": 0.0776, "step": 740 }, { "epoch": 0.772399588053553, "grad_norm": 0.7593461275100708, "learning_rate": 8.322222222222223e-05, "loss": 0.0862, "step": 750 }, { "epoch": 0.7826982492276005, "grad_norm": 1.1254090070724487, "learning_rate": 8.433333333333334e-05, "loss": 0.0831, "step": 760 }, { "epoch": 0.7929969104016478, "grad_norm": 0.6563543677330017, "learning_rate": 8.544444444444445e-05, "loss": 0.0756, "step": 770 }, { "epoch": 0.8032955715756952, "grad_norm": 0.500499963760376, "learning_rate": 8.655555555555555e-05, "loss": 0.09, "step": 780 }, { "epoch": 0.8135942327497425, "grad_norm": 0.6962169408798218, "learning_rate": 8.766666666666668e-05, "loss": 0.0913, "step": 790 }, { "epoch": 0.82389289392379, "grad_norm": 0.8879425525665283, "learning_rate": 8.877777777777778e-05, "loss": 0.094, "step": 800 }, { "epoch": 0.8341915550978373, "grad_norm": 0.7109111547470093, "learning_rate": 8.988888888888889e-05, "loss": 0.0899, "step": 810 }, { "epoch": 0.8444902162718847, "grad_norm": 0.6895614266395569, "learning_rate": 9.1e-05, "loss": 0.0899, "step": 820 }, { "epoch": 0.854788877445932, "grad_norm": 0.5885145664215088, "learning_rate": 9.211111111111112e-05, "loss": 0.0894, "step": 830 }, { "epoch": 0.8650875386199794, "grad_norm": 0.6228615641593933, "learning_rate": 9.322222222222223e-05, "loss": 0.0826, "step": 840 }, { "epoch": 0.8753861997940268, "grad_norm": 0.6920461654663086, "learning_rate": 9.433333333333334e-05, "loss": 0.0926, "step": 850 }, { "epoch": 0.8856848609680742, "grad_norm": 0.8142651319503784, "learning_rate": 9.544444444444445e-05, "loss": 0.0769, "step": 860 }, { "epoch": 0.8959835221421215, "grad_norm": 0.8525772094726562, "learning_rate": 9.655555555555555e-05, "loss": 0.0775, "step": 870 }, { "epoch": 0.9062821833161689, "grad_norm": 0.6274034976959229, "learning_rate": 9.766666666666668e-05, "loss": 0.0793, "step": 880 }, { "epoch": 0.9165808444902163, "grad_norm": 0.7031662464141846, "learning_rate": 9.877777777777778e-05, "loss": 0.081, "step": 890 }, { "epoch": 0.9268795056642637, "grad_norm": 0.542312741279602, "learning_rate": 9.98888888888889e-05, "loss": 0.0878, "step": 900 }, { "epoch": 0.937178166838311, "grad_norm": 0.5504183173179626, "learning_rate": 9.999993165095463e-05, "loss": 0.0711, "step": 910 }, { "epoch": 0.9474768280123584, "grad_norm": 0.6083622574806213, "learning_rate": 9.999969538288952e-05, "loss": 0.0774, "step": 920 }, { "epoch": 0.9577754891864058, "grad_norm": 0.7640944123268127, "learning_rate": 9.999929035278659e-05, "loss": 0.0711, "step": 930 }, { "epoch": 0.9680741503604532, "grad_norm": 0.34581655263900757, "learning_rate": 9.999871656201292e-05, "loss": 0.0716, "step": 940 }, { "epoch": 0.9783728115345005, "grad_norm": 0.6435947418212891, "learning_rate": 9.999797401250521e-05, "loss": 0.0833, "step": 950 }, { "epoch": 0.9886714727085479, "grad_norm": 0.6153683662414551, "learning_rate": 9.999706270676973e-05, "loss": 0.0683, "step": 960 }, { "epoch": 0.9989701338825953, "grad_norm": 0.5145250558853149, "learning_rate": 9.999598264788241e-05, "loss": 0.0679, "step": 970 }, { "epoch": 1.0092687950566426, "grad_norm": 0.5474639534950256, "learning_rate": 9.999473383948872e-05, "loss": 0.0652, "step": 980 }, { "epoch": 1.01956745623069, "grad_norm": 0.4673866331577301, "learning_rate": 9.99933162858037e-05, "loss": 0.0806, "step": 990 }, { "epoch": 1.0298661174047374, "grad_norm": 0.500733494758606, "learning_rate": 9.999172999161198e-05, "loss": 0.0746, "step": 1000 }, { "epoch": 1.0401647785787849, "grad_norm": 0.6277178525924683, "learning_rate": 9.998997496226772e-05, "loss": 0.0691, "step": 1010 }, { "epoch": 1.050463439752832, "grad_norm": 0.34232184290885925, "learning_rate": 9.998805120369458e-05, "loss": 0.069, "step": 1020 }, { "epoch": 1.0607621009268795, "grad_norm": 0.6583809852600098, "learning_rate": 9.998595872238577e-05, "loss": 0.0646, "step": 1030 }, { "epoch": 1.071060762100927, "grad_norm": 0.5400450825691223, "learning_rate": 9.998369752540395e-05, "loss": 0.0709, "step": 1040 }, { "epoch": 1.0813594232749741, "grad_norm": 0.716460645198822, "learning_rate": 9.998126762038126e-05, "loss": 0.0659, "step": 1050 }, { "epoch": 1.0916580844490216, "grad_norm": 0.7969040274620056, "learning_rate": 9.997866901551926e-05, "loss": 0.0834, "step": 1060 }, { "epoch": 1.101956745623069, "grad_norm": 0.6805360317230225, "learning_rate": 9.997590171958892e-05, "loss": 0.0661, "step": 1070 }, { "epoch": 1.1122554067971164, "grad_norm": 0.6645709872245789, "learning_rate": 9.997296574193058e-05, "loss": 0.0719, "step": 1080 }, { "epoch": 1.1225540679711639, "grad_norm": 0.9983972311019897, "learning_rate": 9.996986109245395e-05, "loss": 0.063, "step": 1090 }, { "epoch": 1.132852729145211, "grad_norm": 0.47811999917030334, "learning_rate": 9.996658778163802e-05, "loss": 0.0812, "step": 1100 }, { "epoch": 1.1431513903192585, "grad_norm": 0.9598459601402283, "learning_rate": 9.996314582053106e-05, "loss": 0.0797, "step": 1110 }, { "epoch": 1.153450051493306, "grad_norm": 0.8147891759872437, "learning_rate": 9.995953522075061e-05, "loss": 0.076, "step": 1120 }, { "epoch": 1.1637487126673531, "grad_norm": 0.36551281809806824, "learning_rate": 9.995575599448336e-05, "loss": 0.0689, "step": 1130 }, { "epoch": 1.1740473738414006, "grad_norm": 0.41024380922317505, "learning_rate": 9.995180815448523e-05, "loss": 0.091, "step": 1140 }, { "epoch": 1.184346035015448, "grad_norm": 0.5559478998184204, "learning_rate": 9.994769171408118e-05, "loss": 0.0783, "step": 1150 }, { "epoch": 1.1946446961894954, "grad_norm": 0.39498281478881836, "learning_rate": 9.994340668716527e-05, "loss": 0.0655, "step": 1160 }, { "epoch": 1.2049433573635429, "grad_norm": 0.7332147359848022, "learning_rate": 9.993895308820058e-05, "loss": 0.0739, "step": 1170 }, { "epoch": 1.21524201853759, "grad_norm": 0.5935864448547363, "learning_rate": 9.99343309322192e-05, "loss": 0.0651, "step": 1180 }, { "epoch": 1.2255406797116375, "grad_norm": 0.5222606658935547, "learning_rate": 9.99295402348221e-05, "loss": 0.0676, "step": 1190 }, { "epoch": 1.235839340885685, "grad_norm": 0.5474528670310974, "learning_rate": 9.992458101217912e-05, "loss": 0.0775, "step": 1200 }, { "epoch": 1.2461380020597321, "grad_norm": 0.7393515110015869, "learning_rate": 9.991945328102897e-05, "loss": 0.0679, "step": 1210 }, { "epoch": 1.2564366632337796, "grad_norm": 0.48135286569595337, "learning_rate": 9.991415705867903e-05, "loss": 0.0627, "step": 1220 }, { "epoch": 1.266735324407827, "grad_norm": 0.40880492329597473, "learning_rate": 9.990869236300546e-05, "loss": 0.0621, "step": 1230 }, { "epoch": 1.2770339855818744, "grad_norm": 0.4522377550601959, "learning_rate": 9.990305921245306e-05, "loss": 0.0629, "step": 1240 }, { "epoch": 1.2873326467559219, "grad_norm": 0.5431732535362244, "learning_rate": 9.989725762603515e-05, "loss": 0.0711, "step": 1250 }, { "epoch": 1.297631307929969, "grad_norm": 0.4390816390514374, "learning_rate": 9.989128762333362e-05, "loss": 0.058, "step": 1260 }, { "epoch": 1.3079299691040165, "grad_norm": 0.5823209881782532, "learning_rate": 9.988514922449879e-05, "loss": 0.0742, "step": 1270 }, { "epoch": 1.318228630278064, "grad_norm": 0.6167677044868469, "learning_rate": 9.987884245024934e-05, "loss": 0.0698, "step": 1280 }, { "epoch": 1.3285272914521111, "grad_norm": 0.470501184463501, "learning_rate": 9.98723673218723e-05, "loss": 0.0669, "step": 1290 }, { "epoch": 1.3388259526261586, "grad_norm": 0.3435496985912323, "learning_rate": 9.986572386122291e-05, "loss": 0.0655, "step": 1300 }, { "epoch": 1.349124613800206, "grad_norm": 0.5990545749664307, "learning_rate": 9.98589120907246e-05, "loss": 0.0653, "step": 1310 }, { "epoch": 1.3594232749742534, "grad_norm": 0.7209518551826477, "learning_rate": 9.985193203336886e-05, "loss": 0.0654, "step": 1320 }, { "epoch": 1.3697219361483008, "grad_norm": 0.6588581800460815, "learning_rate": 9.984478371271521e-05, "loss": 0.066, "step": 1330 }, { "epoch": 1.380020597322348, "grad_norm": 0.5437431931495667, "learning_rate": 9.98374671528911e-05, "loss": 0.0685, "step": 1340 }, { "epoch": 1.3903192584963955, "grad_norm": 0.4081268012523651, "learning_rate": 9.982998237859184e-05, "loss": 0.0649, "step": 1350 }, { "epoch": 1.400617919670443, "grad_norm": 0.5363196134567261, "learning_rate": 9.98223294150805e-05, "loss": 0.0614, "step": 1360 }, { "epoch": 1.4109165808444901, "grad_norm": 0.5327999591827393, "learning_rate": 9.981450828818783e-05, "loss": 0.058, "step": 1370 }, { "epoch": 1.4212152420185376, "grad_norm": 0.39524152874946594, "learning_rate": 9.980651902431216e-05, "loss": 0.0606, "step": 1380 }, { "epoch": 1.431513903192585, "grad_norm": 0.5942156910896301, "learning_rate": 9.979836165041936e-05, "loss": 0.0589, "step": 1390 }, { "epoch": 1.4418125643666324, "grad_norm": 0.6506125330924988, "learning_rate": 9.97900361940427e-05, "loss": 0.0618, "step": 1400 }, { "epoch": 1.4521112255406798, "grad_norm": 0.43637052178382874, "learning_rate": 9.978154268328276e-05, "loss": 0.0728, "step": 1410 }, { "epoch": 1.462409886714727, "grad_norm": 0.5816675424575806, "learning_rate": 9.977288114680737e-05, "loss": 0.0738, "step": 1420 }, { "epoch": 1.4727085478887745, "grad_norm": 0.3983500301837921, "learning_rate": 9.976405161385147e-05, "loss": 0.0674, "step": 1430 }, { "epoch": 1.483007209062822, "grad_norm": 0.41254571080207825, "learning_rate": 9.975505411421704e-05, "loss": 0.066, "step": 1440 }, { "epoch": 1.4933058702368691, "grad_norm": 0.4647277593612671, "learning_rate": 9.974588867827301e-05, "loss": 0.0646, "step": 1450 }, { "epoch": 1.5036045314109165, "grad_norm": 0.4378807544708252, "learning_rate": 9.97365553369551e-05, "loss": 0.0589, "step": 1460 }, { "epoch": 1.513903192584964, "grad_norm": 0.6178969144821167, "learning_rate": 9.972705412176577e-05, "loss": 0.0621, "step": 1470 }, { "epoch": 1.5242018537590112, "grad_norm": 0.5825141072273254, "learning_rate": 9.971738506477414e-05, "loss": 0.0644, "step": 1480 }, { "epoch": 1.5345005149330588, "grad_norm": 0.5849868655204773, "learning_rate": 9.970754819861577e-05, "loss": 0.0669, "step": 1490 }, { "epoch": 1.544799176107106, "grad_norm": 0.5067623853683472, "learning_rate": 9.969754355649268e-05, "loss": 0.071, "step": 1500 }, { "epoch": 1.5550978372811535, "grad_norm": 0.5842755436897278, "learning_rate": 9.968737117217313e-05, "loss": 0.0713, "step": 1510 }, { "epoch": 1.565396498455201, "grad_norm": 0.3868110179901123, "learning_rate": 9.967703107999158e-05, "loss": 0.0635, "step": 1520 }, { "epoch": 1.575695159629248, "grad_norm": 0.4535583257675171, "learning_rate": 9.966652331484853e-05, "loss": 0.0587, "step": 1530 }, { "epoch": 1.5859938208032955, "grad_norm": 0.38644909858703613, "learning_rate": 9.965584791221048e-05, "loss": 0.0708, "step": 1540 }, { "epoch": 1.596292481977343, "grad_norm": 0.460753858089447, "learning_rate": 9.964500490810966e-05, "loss": 0.0645, "step": 1550 }, { "epoch": 1.6065911431513902, "grad_norm": 0.5585173964500427, "learning_rate": 9.963399433914405e-05, "loss": 0.0587, "step": 1560 }, { "epoch": 1.6168898043254378, "grad_norm": 0.6196934580802917, "learning_rate": 9.962281624247722e-05, "loss": 0.0663, "step": 1570 }, { "epoch": 1.627188465499485, "grad_norm": 0.440153568983078, "learning_rate": 9.961147065583813e-05, "loss": 0.0568, "step": 1580 }, { "epoch": 1.6374871266735325, "grad_norm": 0.49740493297576904, "learning_rate": 9.959995761752112e-05, "loss": 0.0616, "step": 1590 }, { "epoch": 1.64778578784758, "grad_norm": 0.7940653562545776, "learning_rate": 9.958827716638572e-05, "loss": 0.0656, "step": 1600 }, { "epoch": 1.658084449021627, "grad_norm": 0.39363256096839905, "learning_rate": 9.957642934185648e-05, "loss": 0.059, "step": 1610 }, { "epoch": 1.6683831101956745, "grad_norm": 0.5798192620277405, "learning_rate": 9.95644141839229e-05, "loss": 0.057, "step": 1620 }, { "epoch": 1.678681771369722, "grad_norm": 0.43519875407218933, "learning_rate": 9.955223173313931e-05, "loss": 0.0547, "step": 1630 }, { "epoch": 1.6889804325437692, "grad_norm": 0.5713900327682495, "learning_rate": 9.953988203062463e-05, "loss": 0.0655, "step": 1640 }, { "epoch": 1.6992790937178168, "grad_norm": 0.8694477677345276, "learning_rate": 9.952736511806236e-05, "loss": 0.0793, "step": 1650 }, { "epoch": 1.709577754891864, "grad_norm": 0.344855397939682, "learning_rate": 9.951468103770032e-05, "loss": 0.0654, "step": 1660 }, { "epoch": 1.7198764160659115, "grad_norm": 0.747203528881073, "learning_rate": 9.950182983235063e-05, "loss": 0.0694, "step": 1670 }, { "epoch": 1.730175077239959, "grad_norm": 0.44555550813674927, "learning_rate": 9.948881154538945e-05, "loss": 0.0729, "step": 1680 }, { "epoch": 1.740473738414006, "grad_norm": 0.4354792535305023, "learning_rate": 9.94756262207569e-05, "loss": 0.0739, "step": 1690 }, { "epoch": 1.7507723995880535, "grad_norm": 0.4117138683795929, "learning_rate": 9.946227390295689e-05, "loss": 0.0648, "step": 1700 }, { "epoch": 1.761071060762101, "grad_norm": 0.5352147221565247, "learning_rate": 9.9448754637057e-05, "loss": 0.0614, "step": 1710 }, { "epoch": 1.7713697219361482, "grad_norm": 0.3937685787677765, "learning_rate": 9.943506846868826e-05, "loss": 0.0668, "step": 1720 }, { "epoch": 1.7816683831101958, "grad_norm": 0.510313868522644, "learning_rate": 9.942121544404509e-05, "loss": 0.0564, "step": 1730 }, { "epoch": 1.791967044284243, "grad_norm": 0.43196746706962585, "learning_rate": 9.940719560988505e-05, "loss": 0.0515, "step": 1740 }, { "epoch": 1.8022657054582905, "grad_norm": 0.4649578928947449, "learning_rate": 9.939300901352876e-05, "loss": 0.0681, "step": 1750 }, { "epoch": 1.8125643666323379, "grad_norm": 0.6281247735023499, "learning_rate": 9.937865570285967e-05, "loss": 0.0721, "step": 1760 }, { "epoch": 1.822863027806385, "grad_norm": 0.6799906492233276, "learning_rate": 9.936413572632397e-05, "loss": 0.0565, "step": 1770 }, { "epoch": 1.8331616889804325, "grad_norm": 0.4169757068157196, "learning_rate": 9.934944913293038e-05, "loss": 0.0626, "step": 1780 }, { "epoch": 1.84346035015448, "grad_norm": 0.42282024025917053, "learning_rate": 9.933459597224997e-05, "loss": 0.0654, "step": 1790 }, { "epoch": 1.8537590113285272, "grad_norm": 0.34127193689346313, "learning_rate": 9.931957629441607e-05, "loss": 0.0572, "step": 1800 }, { "epoch": 1.8640576725025748, "grad_norm": 0.3683079183101654, "learning_rate": 9.930439015012396e-05, "loss": 0.0621, "step": 1810 }, { "epoch": 1.874356333676622, "grad_norm": 0.5137266516685486, "learning_rate": 9.92890375906309e-05, "loss": 0.0554, "step": 1820 }, { "epoch": 1.8846549948506695, "grad_norm": 0.4121856391429901, "learning_rate": 9.927351866775578e-05, "loss": 0.0631, "step": 1830 }, { "epoch": 1.8949536560247169, "grad_norm": 0.5225406289100647, "learning_rate": 9.925783343387903e-05, "loss": 0.0557, "step": 1840 }, { "epoch": 1.905252317198764, "grad_norm": 0.3983275294303894, "learning_rate": 9.924198194194237e-05, "loss": 0.0631, "step": 1850 }, { "epoch": 1.9155509783728115, "grad_norm": 0.49256351590156555, "learning_rate": 9.922596424544876e-05, "loss": 0.0661, "step": 1860 }, { "epoch": 1.925849639546859, "grad_norm": 0.5363610982894897, "learning_rate": 9.92097803984621e-05, "loss": 0.0706, "step": 1870 }, { "epoch": 1.9361483007209062, "grad_norm": 0.4455360472202301, "learning_rate": 9.919343045560712e-05, "loss": 0.0698, "step": 1880 }, { "epoch": 1.9464469618949538, "grad_norm": 0.5394087433815002, "learning_rate": 9.917691447206913e-05, "loss": 0.0616, "step": 1890 }, { "epoch": 1.956745623069001, "grad_norm": 0.3595924377441406, "learning_rate": 9.91602325035939e-05, "loss": 0.067, "step": 1900 }, { "epoch": 1.9670442842430484, "grad_norm": 0.2918682396411896, "learning_rate": 9.914338460648743e-05, "loss": 0.0732, "step": 1910 }, { "epoch": 1.9773429454170959, "grad_norm": 0.41418296098709106, "learning_rate": 9.912637083761578e-05, "loss": 0.0635, "step": 1920 }, { "epoch": 1.987641606591143, "grad_norm": 0.5165850520133972, "learning_rate": 9.910919125440485e-05, "loss": 0.069, "step": 1930 }, { "epoch": 1.9979402677651905, "grad_norm": 0.3793902099132538, "learning_rate": 9.909184591484027e-05, "loss": 0.0717, "step": 1940 }, { "epoch": 2.008238928939238, "grad_norm": 0.6616620421409607, "learning_rate": 9.907433487746702e-05, "loss": 0.0586, "step": 1950 }, { "epoch": 2.018537590113285, "grad_norm": 0.5687305331230164, "learning_rate": 9.905665820138949e-05, "loss": 0.0569, "step": 1960 }, { "epoch": 2.028836251287333, "grad_norm": 0.49890944361686707, "learning_rate": 9.903881594627105e-05, "loss": 0.0668, "step": 1970 }, { "epoch": 2.03913491246138, "grad_norm": 0.5814046859741211, "learning_rate": 9.902080817233398e-05, "loss": 0.0644, "step": 1980 }, { "epoch": 2.049433573635427, "grad_norm": 0.32920873165130615, "learning_rate": 9.900263494035921e-05, "loss": 0.0611, "step": 1990 }, { "epoch": 2.059732234809475, "grad_norm": 0.5075499415397644, "learning_rate": 9.898429631168619e-05, "loss": 0.0586, "step": 2000 }, { "epoch": 2.070030895983522, "grad_norm": 0.4823492169380188, "learning_rate": 9.896579234821253e-05, "loss": 0.0468, "step": 2010 }, { "epoch": 2.0803295571575697, "grad_norm": 0.5481283068656921, "learning_rate": 9.894712311239398e-05, "loss": 0.0611, "step": 2020 }, { "epoch": 2.090628218331617, "grad_norm": 0.4776170551776886, "learning_rate": 9.892828866724406e-05, "loss": 0.0657, "step": 2030 }, { "epoch": 2.100926879505664, "grad_norm": 0.5601367354393005, "learning_rate": 9.8909289076334e-05, "loss": 0.0665, "step": 2040 }, { "epoch": 2.111225540679712, "grad_norm": 0.3499130308628082, "learning_rate": 9.88901244037923e-05, "loss": 0.0563, "step": 2050 }, { "epoch": 2.121524201853759, "grad_norm": 0.4545436501502991, "learning_rate": 9.88707947143048e-05, "loss": 0.0557, "step": 2060 }, { "epoch": 2.131822863027806, "grad_norm": 0.46852630376815796, "learning_rate": 9.885130007311423e-05, "loss": 0.0522, "step": 2070 }, { "epoch": 2.142121524201854, "grad_norm": 0.308856338262558, "learning_rate": 9.883164054602012e-05, "loss": 0.058, "step": 2080 }, { "epoch": 2.152420185375901, "grad_norm": 0.7965716123580933, "learning_rate": 9.881181619937848e-05, "loss": 0.0535, "step": 2090 }, { "epoch": 2.1627188465499483, "grad_norm": 0.3949962556362152, "learning_rate": 9.879182710010169e-05, "loss": 0.0536, "step": 2100 }, { "epoch": 2.173017507723996, "grad_norm": 0.40669289231300354, "learning_rate": 9.877167331565816e-05, "loss": 0.0598, "step": 2110 }, { "epoch": 2.183316168898043, "grad_norm": 0.6267198324203491, "learning_rate": 9.875135491407217e-05, "loss": 0.0647, "step": 2120 }, { "epoch": 2.193614830072091, "grad_norm": 0.3919011950492859, "learning_rate": 9.873087196392368e-05, "loss": 0.063, "step": 2130 }, { "epoch": 2.203913491246138, "grad_norm": 0.3769017457962036, "learning_rate": 9.871022453434798e-05, "loss": 0.0558, "step": 2140 }, { "epoch": 2.214212152420185, "grad_norm": 0.382344126701355, "learning_rate": 9.868941269503551e-05, "loss": 0.0615, "step": 2150 }, { "epoch": 2.224510813594233, "grad_norm": 0.7266145348548889, "learning_rate": 9.86684365162317e-05, "loss": 0.0611, "step": 2160 }, { "epoch": 2.23480947476828, "grad_norm": 0.5791377425193787, "learning_rate": 9.864729606873663e-05, "loss": 0.0575, "step": 2170 }, { "epoch": 2.2451081359423277, "grad_norm": 0.40031886100769043, "learning_rate": 9.862599142390482e-05, "loss": 0.0559, "step": 2180 }, { "epoch": 2.255406797116375, "grad_norm": 0.34372609853744507, "learning_rate": 9.860452265364502e-05, "loss": 0.0623, "step": 2190 }, { "epoch": 2.265705458290422, "grad_norm": 0.5310713052749634, "learning_rate": 9.858288983041996e-05, "loss": 0.0628, "step": 2200 }, { "epoch": 2.27600411946447, "grad_norm": 0.4002261459827423, "learning_rate": 9.856109302724603e-05, "loss": 0.0528, "step": 2210 }, { "epoch": 2.286302780638517, "grad_norm": 0.3995415270328522, "learning_rate": 9.853913231769318e-05, "loss": 0.0603, "step": 2220 }, { "epoch": 2.296601441812564, "grad_norm": 0.5082608461380005, "learning_rate": 9.851700777588453e-05, "loss": 0.0555, "step": 2230 }, { "epoch": 2.306900102986612, "grad_norm": 0.3878387212753296, "learning_rate": 9.849471947649617e-05, "loss": 0.054, "step": 2240 }, { "epoch": 2.317198764160659, "grad_norm": 0.44272416830062866, "learning_rate": 9.847226749475695e-05, "loss": 0.067, "step": 2250 }, { "epoch": 2.3274974253347063, "grad_norm": 0.38929831981658936, "learning_rate": 9.844965190644817e-05, "loss": 0.0518, "step": 2260 }, { "epoch": 2.337796086508754, "grad_norm": 0.3083374798297882, "learning_rate": 9.842687278790337e-05, "loss": 0.0484, "step": 2270 }, { "epoch": 2.348094747682801, "grad_norm": 0.41075581312179565, "learning_rate": 9.8403930216008e-05, "loss": 0.0635, "step": 2280 }, { "epoch": 2.358393408856849, "grad_norm": 0.2911306917667389, "learning_rate": 9.838082426819926e-05, "loss": 0.0599, "step": 2290 }, { "epoch": 2.368692070030896, "grad_norm": 0.524851381778717, "learning_rate": 9.835755502246575e-05, "loss": 0.0542, "step": 2300 }, { "epoch": 2.378990731204943, "grad_norm": 0.45933887362480164, "learning_rate": 9.833412255734724e-05, "loss": 0.0671, "step": 2310 }, { "epoch": 2.389289392378991, "grad_norm": 0.38324400782585144, "learning_rate": 9.831052695193445e-05, "loss": 0.0596, "step": 2320 }, { "epoch": 2.399588053553038, "grad_norm": 0.7916087508201599, "learning_rate": 9.828676828586871e-05, "loss": 0.0722, "step": 2330 }, { "epoch": 2.4098867147270857, "grad_norm": 0.4739670157432556, "learning_rate": 9.826284663934171e-05, "loss": 0.0596, "step": 2340 }, { "epoch": 2.420185375901133, "grad_norm": 0.37064895033836365, "learning_rate": 9.823876209309527e-05, "loss": 0.062, "step": 2350 }, { "epoch": 2.43048403707518, "grad_norm": 0.6001970171928406, "learning_rate": 9.821451472842102e-05, "loss": 0.0623, "step": 2360 }, { "epoch": 2.4407826982492278, "grad_norm": 0.40998250246047974, "learning_rate": 9.819010462716016e-05, "loss": 0.0586, "step": 2370 }, { "epoch": 2.451081359423275, "grad_norm": 0.4756927490234375, "learning_rate": 9.816553187170317e-05, "loss": 0.0522, "step": 2380 }, { "epoch": 2.461380020597322, "grad_norm": 0.47659242153167725, "learning_rate": 9.814079654498949e-05, "loss": 0.0573, "step": 2390 }, { "epoch": 2.47167868177137, "grad_norm": 0.4043289124965668, "learning_rate": 9.811589873050735e-05, "loss": 0.0654, "step": 2400 }, { "epoch": 2.481977342945417, "grad_norm": 0.7355890870094299, "learning_rate": 9.809083851229335e-05, "loss": 0.0523, "step": 2410 }, { "epoch": 2.4922760041194643, "grad_norm": 0.4957990348339081, "learning_rate": 9.806561597493228e-05, "loss": 0.0566, "step": 2420 }, { "epoch": 2.502574665293512, "grad_norm": 0.3758098781108856, "learning_rate": 9.80402312035568e-05, "loss": 0.0509, "step": 2430 }, { "epoch": 2.512873326467559, "grad_norm": 0.4361479878425598, "learning_rate": 9.801468428384716e-05, "loss": 0.0566, "step": 2440 }, { "epoch": 2.5231719876416063, "grad_norm": 0.4788246750831604, "learning_rate": 9.798897530203087e-05, "loss": 0.0577, "step": 2450 }, { "epoch": 2.533470648815654, "grad_norm": 0.3828676640987396, "learning_rate": 9.796310434488248e-05, "loss": 0.0552, "step": 2460 }, { "epoch": 2.543769309989701, "grad_norm": 0.34888461232185364, "learning_rate": 9.79370714997232e-05, "loss": 0.0562, "step": 2470 }, { "epoch": 2.554067971163749, "grad_norm": 0.5660400986671448, "learning_rate": 9.791087685442071e-05, "loss": 0.0593, "step": 2480 }, { "epoch": 2.564366632337796, "grad_norm": 0.3883237838745117, "learning_rate": 9.788452049738879e-05, "loss": 0.0567, "step": 2490 }, { "epoch": 2.5746652935118437, "grad_norm": 0.34366926550865173, "learning_rate": 9.785800251758701e-05, "loss": 0.055, "step": 2500 }, { "epoch": 2.584963954685891, "grad_norm": 0.2992055416107178, "learning_rate": 9.783132300452049e-05, "loss": 0.053, "step": 2510 }, { "epoch": 2.595262615859938, "grad_norm": 0.3543379306793213, "learning_rate": 9.780448204823958e-05, "loss": 0.0587, "step": 2520 }, { "epoch": 2.6055612770339858, "grad_norm": 0.32997754216194153, "learning_rate": 9.777747973933948e-05, "loss": 0.0483, "step": 2530 }, { "epoch": 2.615859938208033, "grad_norm": 0.4290192425251007, "learning_rate": 9.775031616896008e-05, "loss": 0.0565, "step": 2540 }, { "epoch": 2.62615859938208, "grad_norm": 0.39540722966194153, "learning_rate": 9.772299142878549e-05, "loss": 0.0567, "step": 2550 }, { "epoch": 2.636457260556128, "grad_norm": 0.46537721157073975, "learning_rate": 9.769550561104388e-05, "loss": 0.0511, "step": 2560 }, { "epoch": 2.646755921730175, "grad_norm": 0.4019800126552582, "learning_rate": 9.766785880850707e-05, "loss": 0.0576, "step": 2570 }, { "epoch": 2.6570545829042223, "grad_norm": 0.3543599545955658, "learning_rate": 9.764005111449021e-05, "loss": 0.0561, "step": 2580 }, { "epoch": 2.66735324407827, "grad_norm": 0.459049791097641, "learning_rate": 9.761208262285155e-05, "loss": 0.0626, "step": 2590 }, { "epoch": 2.677651905252317, "grad_norm": 0.4867796003818512, "learning_rate": 9.758395342799206e-05, "loss": 0.0504, "step": 2600 }, { "epoch": 2.6879505664263643, "grad_norm": 0.42788106203079224, "learning_rate": 9.755566362485512e-05, "loss": 0.0578, "step": 2610 }, { "epoch": 2.698249227600412, "grad_norm": 0.3226776719093323, "learning_rate": 9.752721330892624e-05, "loss": 0.0552, "step": 2620 }, { "epoch": 2.708547888774459, "grad_norm": 0.4271225333213806, "learning_rate": 9.749860257623263e-05, "loss": 0.0549, "step": 2630 }, { "epoch": 2.718846549948507, "grad_norm": 0.39057081937789917, "learning_rate": 9.7469831523343e-05, "loss": 0.0558, "step": 2640 }, { "epoch": 2.729145211122554, "grad_norm": 0.4585021436214447, "learning_rate": 9.744090024736719e-05, "loss": 0.0481, "step": 2650 }, { "epoch": 2.7394438722966017, "grad_norm": 0.4004554748535156, "learning_rate": 9.741180884595578e-05, "loss": 0.0671, "step": 2660 }, { "epoch": 2.749742533470649, "grad_norm": 0.3565993010997772, "learning_rate": 9.738255741729987e-05, "loss": 0.0623, "step": 2670 }, { "epoch": 2.760041194644696, "grad_norm": 0.30855366587638855, "learning_rate": 9.735314606013068e-05, "loss": 0.0588, "step": 2680 }, { "epoch": 2.7703398558187438, "grad_norm": 0.4170495271682739, "learning_rate": 9.732357487371924e-05, "loss": 0.056, "step": 2690 }, { "epoch": 2.780638516992791, "grad_norm": 0.5667279362678528, "learning_rate": 9.729384395787602e-05, "loss": 0.0612, "step": 2700 }, { "epoch": 2.790937178166838, "grad_norm": 0.27353501319885254, "learning_rate": 9.726395341295062e-05, "loss": 0.0493, "step": 2710 }, { "epoch": 2.801235839340886, "grad_norm": 0.5288174152374268, "learning_rate": 9.723390333983144e-05, "loss": 0.0629, "step": 2720 }, { "epoch": 2.811534500514933, "grad_norm": 0.4831124544143677, "learning_rate": 9.720369383994535e-05, "loss": 0.0549, "step": 2730 }, { "epoch": 2.8218331616889802, "grad_norm": 0.3807002902030945, "learning_rate": 9.717332501525729e-05, "loss": 0.0561, "step": 2740 }, { "epoch": 2.832131822863028, "grad_norm": 0.6944444179534912, "learning_rate": 9.714279696826998e-05, "loss": 0.0564, "step": 2750 }, { "epoch": 2.842430484037075, "grad_norm": 0.3146667778491974, "learning_rate": 9.711210980202354e-05, "loss": 0.0544, "step": 2760 }, { "epoch": 2.8527291452111223, "grad_norm": 0.4342884421348572, "learning_rate": 9.708126362009522e-05, "loss": 0.0541, "step": 2770 }, { "epoch": 2.86302780638517, "grad_norm": 0.4473581612110138, "learning_rate": 9.70502585265989e-05, "loss": 0.0567, "step": 2780 }, { "epoch": 2.873326467559217, "grad_norm": 0.34954315423965454, "learning_rate": 9.70190946261849e-05, "loss": 0.0508, "step": 2790 }, { "epoch": 2.883625128733265, "grad_norm": 0.37677961587905884, "learning_rate": 9.698777202403953e-05, "loss": 0.0555, "step": 2800 }, { "epoch": 2.893923789907312, "grad_norm": 0.3924347460269928, "learning_rate": 9.695629082588473e-05, "loss": 0.0607, "step": 2810 }, { "epoch": 2.9042224510813597, "grad_norm": 0.34362998604774475, "learning_rate": 9.69246511379778e-05, "loss": 0.0479, "step": 2820 }, { "epoch": 2.914521112255407, "grad_norm": 0.48478758335113525, "learning_rate": 9.689285306711094e-05, "loss": 0.0564, "step": 2830 }, { "epoch": 2.924819773429454, "grad_norm": 0.39429691433906555, "learning_rate": 9.686089672061094e-05, "loss": 0.0552, "step": 2840 }, { "epoch": 2.9351184346035017, "grad_norm": 0.27760738134384155, "learning_rate": 9.682878220633885e-05, "loss": 0.0507, "step": 2850 }, { "epoch": 2.945417095777549, "grad_norm": 0.3564143180847168, "learning_rate": 9.679650963268951e-05, "loss": 0.0529, "step": 2860 }, { "epoch": 2.955715756951596, "grad_norm": 0.3425343930721283, "learning_rate": 9.676407910859131e-05, "loss": 0.05, "step": 2870 }, { "epoch": 2.966014418125644, "grad_norm": 0.3504887819290161, "learning_rate": 9.673149074350573e-05, "loss": 0.0529, "step": 2880 }, { "epoch": 2.976313079299691, "grad_norm": 0.432216078042984, "learning_rate": 9.669874464742705e-05, "loss": 0.0582, "step": 2890 }, { "epoch": 2.9866117404737382, "grad_norm": 0.4117823541164398, "learning_rate": 9.666584093088189e-05, "loss": 0.0516, "step": 2900 }, { "epoch": 2.996910401647786, "grad_norm": 0.4118179380893707, "learning_rate": 9.663277970492886e-05, "loss": 0.0664, "step": 2910 }, { "epoch": 3.007209062821833, "grad_norm": 0.31822094321250916, "learning_rate": 9.659956108115827e-05, "loss": 0.0607, "step": 2920 }, { "epoch": 3.0175077239958807, "grad_norm": 0.34220412373542786, "learning_rate": 9.656618517169164e-05, "loss": 0.0523, "step": 2930 }, { "epoch": 3.027806385169928, "grad_norm": 0.33871203660964966, "learning_rate": 9.65326520891814e-05, "loss": 0.0486, "step": 2940 }, { "epoch": 3.038105046343975, "grad_norm": 0.4035494327545166, "learning_rate": 9.649896194681045e-05, "loss": 0.0497, "step": 2950 }, { "epoch": 3.048403707518023, "grad_norm": 0.36851248145103455, "learning_rate": 9.646511485829186e-05, "loss": 0.062, "step": 2960 }, { "epoch": 3.05870236869207, "grad_norm": 0.3193969428539276, "learning_rate": 9.643111093786835e-05, "loss": 0.0514, "step": 2970 }, { "epoch": 3.0690010298661172, "grad_norm": 0.331909716129303, "learning_rate": 9.639695030031204e-05, "loss": 0.0488, "step": 2980 }, { "epoch": 3.079299691040165, "grad_norm": 0.35757410526275635, "learning_rate": 9.636263306092406e-05, "loss": 0.0576, "step": 2990 }, { "epoch": 3.089598352214212, "grad_norm": 0.4217674434185028, "learning_rate": 9.6328159335534e-05, "loss": 0.0554, "step": 3000 }, { "epoch": 3.0998970133882597, "grad_norm": 0.3531946539878845, "learning_rate": 9.629352924049975e-05, "loss": 0.059, "step": 3010 }, { "epoch": 3.110195674562307, "grad_norm": 0.39479324221611023, "learning_rate": 9.625874289270688e-05, "loss": 0.0621, "step": 3020 }, { "epoch": 3.120494335736354, "grad_norm": 0.29987436532974243, "learning_rate": 9.622380040956842e-05, "loss": 0.0511, "step": 3030 }, { "epoch": 3.130792996910402, "grad_norm": 0.5292258262634277, "learning_rate": 9.61887019090244e-05, "loss": 0.0564, "step": 3040 }, { "epoch": 3.141091658084449, "grad_norm": 0.33128613233566284, "learning_rate": 9.615344750954141e-05, "loss": 0.0548, "step": 3050 }, { "epoch": 3.151390319258496, "grad_norm": 0.43356847763061523, "learning_rate": 9.611803733011229e-05, "loss": 0.0557, "step": 3060 }, { "epoch": 3.161688980432544, "grad_norm": 0.4408741295337677, "learning_rate": 9.60824714902556e-05, "loss": 0.0582, "step": 3070 }, { "epoch": 3.171987641606591, "grad_norm": 0.307669460773468, "learning_rate": 9.604675011001538e-05, "loss": 0.0442, "step": 3080 }, { "epoch": 3.1822863027806383, "grad_norm": 0.49202683568000793, "learning_rate": 9.601087330996061e-05, "loss": 0.0599, "step": 3090 }, { "epoch": 3.192584963954686, "grad_norm": 0.3430628180503845, "learning_rate": 9.597484121118487e-05, "loss": 0.0501, "step": 3100 }, { "epoch": 3.202883625128733, "grad_norm": 0.45715686678886414, "learning_rate": 9.593865393530592e-05, "loss": 0.0533, "step": 3110 }, { "epoch": 3.213182286302781, "grad_norm": 0.29405537247657776, "learning_rate": 9.590231160446526e-05, "loss": 0.0579, "step": 3120 }, { "epoch": 3.223480947476828, "grad_norm": 0.4138418436050415, "learning_rate": 9.586581434132775e-05, "loss": 0.0553, "step": 3130 }, { "epoch": 3.233779608650875, "grad_norm": 0.2747637927532196, "learning_rate": 9.582916226908118e-05, "loss": 0.0534, "step": 3140 }, { "epoch": 3.244078269824923, "grad_norm": 0.3608400821685791, "learning_rate": 9.57923555114359e-05, "loss": 0.0512, "step": 3150 }, { "epoch": 3.25437693099897, "grad_norm": 0.4042729437351227, "learning_rate": 9.575539419262434e-05, "loss": 0.0445, "step": 3160 }, { "epoch": 3.2646755921730177, "grad_norm": 0.35471370816230774, "learning_rate": 9.571827843740057e-05, "loss": 0.0542, "step": 3170 }, { "epoch": 3.274974253347065, "grad_norm": 0.2936842441558838, "learning_rate": 9.568100837104e-05, "loss": 0.0505, "step": 3180 }, { "epoch": 3.285272914521112, "grad_norm": 0.2880595028400421, "learning_rate": 9.56435841193388e-05, "loss": 0.0458, "step": 3190 }, { "epoch": 3.29557157569516, "grad_norm": 0.33003637194633484, "learning_rate": 9.560600580861365e-05, "loss": 0.0576, "step": 3200 }, { "epoch": 3.305870236869207, "grad_norm": 0.4025996923446655, "learning_rate": 9.556827356570116e-05, "loss": 0.0598, "step": 3210 }, { "epoch": 3.316168898043254, "grad_norm": 0.5448514819145203, "learning_rate": 9.553038751795746e-05, "loss": 0.0503, "step": 3220 }, { "epoch": 3.326467559217302, "grad_norm": 0.39959079027175903, "learning_rate": 9.549234779325792e-05, "loss": 0.0581, "step": 3230 }, { "epoch": 3.336766220391349, "grad_norm": 0.31689804792404175, "learning_rate": 9.545415451999653e-05, "loss": 0.054, "step": 3240 }, { "epoch": 3.3470648815653963, "grad_norm": 0.5861422419548035, "learning_rate": 9.541580782708557e-05, "loss": 0.0498, "step": 3250 }, { "epoch": 3.357363542739444, "grad_norm": 0.36639899015426636, "learning_rate": 9.537730784395514e-05, "loss": 0.0625, "step": 3260 }, { "epoch": 3.367662203913491, "grad_norm": 0.3032686710357666, "learning_rate": 9.533865470055275e-05, "loss": 0.0543, "step": 3270 }, { "epoch": 3.377960865087539, "grad_norm": 0.4109341502189636, "learning_rate": 9.529984852734285e-05, "loss": 0.0582, "step": 3280 }, { "epoch": 3.388259526261586, "grad_norm": 0.38670700788497925, "learning_rate": 9.526088945530645e-05, "loss": 0.0547, "step": 3290 }, { "epoch": 3.398558187435633, "grad_norm": 0.30283281207084656, "learning_rate": 9.522177761594057e-05, "loss": 0.0434, "step": 3300 }, { "epoch": 3.408856848609681, "grad_norm": 0.3940243721008301, "learning_rate": 9.518251314125788e-05, "loss": 0.0548, "step": 3310 }, { "epoch": 3.419155509783728, "grad_norm": 0.6107800006866455, "learning_rate": 9.514309616378626e-05, "loss": 0.0453, "step": 3320 }, { "epoch": 3.4294541709577757, "grad_norm": 0.3535449802875519, "learning_rate": 9.510352681656832e-05, "loss": 0.0509, "step": 3330 }, { "epoch": 3.439752832131823, "grad_norm": 0.4279785454273224, "learning_rate": 9.50638052331609e-05, "loss": 0.0511, "step": 3340 }, { "epoch": 3.45005149330587, "grad_norm": 0.5184943675994873, "learning_rate": 9.502393154763478e-05, "loss": 0.0553, "step": 3350 }, { "epoch": 3.460350154479918, "grad_norm": 0.6247850656509399, "learning_rate": 9.498390589457404e-05, "loss": 0.0485, "step": 3360 }, { "epoch": 3.470648815653965, "grad_norm": 0.4810273349285126, "learning_rate": 9.494372840907572e-05, "loss": 0.0646, "step": 3370 }, { "epoch": 3.480947476828012, "grad_norm": 0.31024450063705444, "learning_rate": 9.490339922674934e-05, "loss": 0.0506, "step": 3380 }, { "epoch": 3.49124613800206, "grad_norm": 0.3408045172691345, "learning_rate": 9.486291848371643e-05, "loss": 0.0598, "step": 3390 }, { "epoch": 3.501544799176107, "grad_norm": 0.3190326988697052, "learning_rate": 9.482228631661005e-05, "loss": 0.0569, "step": 3400 }, { "epoch": 3.5118434603501543, "grad_norm": 0.3894359767436981, "learning_rate": 9.478150286257443e-05, "loss": 0.048, "step": 3410 }, { "epoch": 3.522142121524202, "grad_norm": 0.33339062333106995, "learning_rate": 9.474056825926434e-05, "loss": 0.0533, "step": 3420 }, { "epoch": 3.532440782698249, "grad_norm": 0.4688987731933594, "learning_rate": 9.46994826448448e-05, "loss": 0.0495, "step": 3430 }, { "epoch": 3.5427394438722963, "grad_norm": 0.24669192731380463, "learning_rate": 9.465824615799046e-05, "loss": 0.0487, "step": 3440 }, { "epoch": 3.553038105046344, "grad_norm": 0.43672746419906616, "learning_rate": 9.461685893788526e-05, "loss": 0.0529, "step": 3450 }, { "epoch": 3.563336766220391, "grad_norm": 0.3806833028793335, "learning_rate": 9.457532112422187e-05, "loss": 0.0644, "step": 3460 }, { "epoch": 3.573635427394439, "grad_norm": 0.43160000443458557, "learning_rate": 9.453363285720129e-05, "loss": 0.046, "step": 3470 }, { "epoch": 3.583934088568486, "grad_norm": 0.3873897194862366, "learning_rate": 9.44917942775323e-05, "loss": 0.0561, "step": 3480 }, { "epoch": 3.5942327497425337, "grad_norm": 0.420244425535202, "learning_rate": 9.444980552643103e-05, "loss": 0.0544, "step": 3490 }, { "epoch": 3.604531410916581, "grad_norm": 0.2572662830352783, "learning_rate": 9.44076667456205e-05, "loss": 0.0609, "step": 3500 }, { "epoch": 3.614830072090628, "grad_norm": 0.5829557776451111, "learning_rate": 9.43653780773301e-05, "loss": 0.0683, "step": 3510 }, { "epoch": 3.6251287332646758, "grad_norm": 0.5830304622650146, "learning_rate": 9.432293966429514e-05, "loss": 0.067, "step": 3520 }, { "epoch": 3.635427394438723, "grad_norm": 0.38021519780158997, "learning_rate": 9.428035164975636e-05, "loss": 0.0498, "step": 3530 }, { "epoch": 3.64572605561277, "grad_norm": 0.4201594591140747, "learning_rate": 9.423761417745942e-05, "loss": 0.0569, "step": 3540 }, { "epoch": 3.656024716786818, "grad_norm": 0.5576770305633545, "learning_rate": 9.419472739165449e-05, "loss": 0.0667, "step": 3550 }, { "epoch": 3.666323377960865, "grad_norm": 0.34150251746177673, "learning_rate": 9.415169143709565e-05, "loss": 0.0539, "step": 3560 }, { "epoch": 3.6766220391349123, "grad_norm": 0.5191327333450317, "learning_rate": 9.410850645904049e-05, "loss": 0.0609, "step": 3570 }, { "epoch": 3.68692070030896, "grad_norm": 0.3418954610824585, "learning_rate": 9.40651726032496e-05, "loss": 0.0485, "step": 3580 }, { "epoch": 3.697219361483007, "grad_norm": 0.44254234433174133, "learning_rate": 9.402169001598611e-05, "loss": 0.0552, "step": 3590 }, { "epoch": 3.7075180226570543, "grad_norm": 0.549349308013916, "learning_rate": 9.397805884401504e-05, "loss": 0.0601, "step": 3600 }, { "epoch": 3.717816683831102, "grad_norm": 0.4500453472137451, "learning_rate": 9.393427923460308e-05, "loss": 0.0496, "step": 3610 }, { "epoch": 3.728115345005149, "grad_norm": 0.5540750622749329, "learning_rate": 9.389035133551778e-05, "loss": 0.0563, "step": 3620 }, { "epoch": 3.738414006179197, "grad_norm": 0.28786641359329224, "learning_rate": 9.38462752950273e-05, "loss": 0.0532, "step": 3630 }, { "epoch": 3.748712667353244, "grad_norm": 0.3725302219390869, "learning_rate": 9.380205126189983e-05, "loss": 0.0558, "step": 3640 }, { "epoch": 3.7590113285272917, "grad_norm": 0.47449609637260437, "learning_rate": 9.375767938540299e-05, "loss": 0.0559, "step": 3650 }, { "epoch": 3.769309989701339, "grad_norm": 0.5294702649116516, "learning_rate": 9.371315981530349e-05, "loss": 0.0534, "step": 3660 }, { "epoch": 3.779608650875386, "grad_norm": 0.29216107726097107, "learning_rate": 9.366849270186649e-05, "loss": 0.0519, "step": 3670 }, { "epoch": 3.7899073120494338, "grad_norm": 0.28166675567626953, "learning_rate": 9.362367819585518e-05, "loss": 0.0532, "step": 3680 }, { "epoch": 3.800205973223481, "grad_norm": 0.5699660778045654, "learning_rate": 9.357871644853024e-05, "loss": 0.0533, "step": 3690 }, { "epoch": 3.810504634397528, "grad_norm": 0.44877076148986816, "learning_rate": 9.353360761164931e-05, "loss": 0.0569, "step": 3700 }, { "epoch": 3.820803295571576, "grad_norm": 0.4341685175895691, "learning_rate": 9.348835183746649e-05, "loss": 0.0579, "step": 3710 }, { "epoch": 3.831101956745623, "grad_norm": 0.37804657220840454, "learning_rate": 9.344294927873188e-05, "loss": 0.0535, "step": 3720 }, { "epoch": 3.8414006179196702, "grad_norm": 0.47172001004219055, "learning_rate": 9.339740008869092e-05, "loss": 0.049, "step": 3730 }, { "epoch": 3.851699279093718, "grad_norm": 0.29430967569351196, "learning_rate": 9.335170442108408e-05, "loss": 0.0547, "step": 3740 }, { "epoch": 3.861997940267765, "grad_norm": 0.40547069907188416, "learning_rate": 9.330586243014617e-05, "loss": 0.0486, "step": 3750 }, { "epoch": 3.8722966014418123, "grad_norm": 0.3896206319332123, "learning_rate": 9.325987427060586e-05, "loss": 0.0585, "step": 3760 }, { "epoch": 3.88259526261586, "grad_norm": 0.29565155506134033, "learning_rate": 9.321374009768525e-05, "loss": 0.0508, "step": 3770 }, { "epoch": 3.892893923789907, "grad_norm": 0.5239169597625732, "learning_rate": 9.316746006709919e-05, "loss": 0.0608, "step": 3780 }, { "epoch": 3.903192584963955, "grad_norm": 0.2817414402961731, "learning_rate": 9.31210343350549e-05, "loss": 0.0465, "step": 3790 }, { "epoch": 3.913491246138002, "grad_norm": 0.4744998514652252, "learning_rate": 9.307446305825135e-05, "loss": 0.0616, "step": 3800 }, { "epoch": 3.9237899073120497, "grad_norm": 0.4715334475040436, "learning_rate": 9.302774639387877e-05, "loss": 0.0557, "step": 3810 }, { "epoch": 3.934088568486097, "grad_norm": 0.5753309726715088, "learning_rate": 9.298088449961813e-05, "loss": 0.0592, "step": 3820 }, { "epoch": 3.944387229660144, "grad_norm": 0.318158358335495, "learning_rate": 9.293387753364052e-05, "loss": 0.0604, "step": 3830 }, { "epoch": 3.9546858908341918, "grad_norm": 0.4752749800682068, "learning_rate": 9.288672565460679e-05, "loss": 0.049, "step": 3840 }, { "epoch": 3.964984552008239, "grad_norm": 0.284682035446167, "learning_rate": 9.283942902166681e-05, "loss": 0.0491, "step": 3850 }, { "epoch": 3.975283213182286, "grad_norm": 0.4126709997653961, "learning_rate": 9.27919877944591e-05, "loss": 0.0508, "step": 3860 }, { "epoch": 3.985581874356334, "grad_norm": 0.34126409888267517, "learning_rate": 9.27444021331102e-05, "loss": 0.0545, "step": 3870 }, { "epoch": 3.995880535530381, "grad_norm": 0.5670478343963623, "learning_rate": 9.269667219823412e-05, "loss": 0.0483, "step": 3880 }, { "epoch": 4.006179196704428, "grad_norm": 0.3084736466407776, "learning_rate": 9.264879815093191e-05, "loss": 0.0499, "step": 3890 }, { "epoch": 4.016477857878476, "grad_norm": 0.4823373258113861, "learning_rate": 9.260078015279096e-05, "loss": 0.0558, "step": 3900 }, { "epoch": 4.0267765190525235, "grad_norm": 0.2889825105667114, "learning_rate": 9.255261836588458e-05, "loss": 0.0561, "step": 3910 }, { "epoch": 4.03707518022657, "grad_norm": 0.28834298253059387, "learning_rate": 9.250431295277137e-05, "loss": 0.0498, "step": 3920 }, { "epoch": 4.047373841400618, "grad_norm": 0.40643489360809326, "learning_rate": 9.245586407649473e-05, "loss": 0.0479, "step": 3930 }, { "epoch": 4.057672502574666, "grad_norm": 0.3214862644672394, "learning_rate": 9.240727190058227e-05, "loss": 0.0498, "step": 3940 }, { "epoch": 4.067971163748712, "grad_norm": 0.40402647852897644, "learning_rate": 9.235853658904529e-05, "loss": 0.0522, "step": 3950 }, { "epoch": 4.07826982492276, "grad_norm": 0.3338010311126709, "learning_rate": 9.230965830637821e-05, "loss": 0.0506, "step": 3960 }, { "epoch": 4.088568486096808, "grad_norm": 0.42742258310317993, "learning_rate": 9.226063721755799e-05, "loss": 0.053, "step": 3970 }, { "epoch": 4.098867147270854, "grad_norm": 0.3947793245315552, "learning_rate": 9.221147348804362e-05, "loss": 0.0541, "step": 3980 }, { "epoch": 4.109165808444902, "grad_norm": 0.4395465552806854, "learning_rate": 9.216216728377554e-05, "loss": 0.0509, "step": 3990 }, { "epoch": 4.11946446961895, "grad_norm": 0.28796476125717163, "learning_rate": 9.211271877117507e-05, "loss": 0.0501, "step": 4000 }, { "epoch": 4.1297631307929965, "grad_norm": 0.31560418009757996, "learning_rate": 9.206312811714386e-05, "loss": 0.0502, "step": 4010 }, { "epoch": 4.140061791967044, "grad_norm": 0.45714765787124634, "learning_rate": 9.201339548906332e-05, "loss": 0.0579, "step": 4020 }, { "epoch": 4.150360453141092, "grad_norm": 0.3373541831970215, "learning_rate": 9.196352105479409e-05, "loss": 0.0504, "step": 4030 }, { "epoch": 4.1606591143151395, "grad_norm": 0.5105737447738647, "learning_rate": 9.19135049826754e-05, "loss": 0.0619, "step": 4040 }, { "epoch": 4.170957775489186, "grad_norm": 0.3023523688316345, "learning_rate": 9.186334744152458e-05, "loss": 0.0499, "step": 4050 }, { "epoch": 4.181256436663234, "grad_norm": 0.3311084508895874, "learning_rate": 9.18130486006364e-05, "loss": 0.0484, "step": 4060 }, { "epoch": 4.1915550978372815, "grad_norm": 0.3167574405670166, "learning_rate": 9.176260862978263e-05, "loss": 0.0605, "step": 4070 }, { "epoch": 4.201853759011328, "grad_norm": 0.3764163553714752, "learning_rate": 9.171202769921134e-05, "loss": 0.0521, "step": 4080 }, { "epoch": 4.212152420185376, "grad_norm": 0.325210303068161, "learning_rate": 9.16613059796464e-05, "loss": 0.0471, "step": 4090 }, { "epoch": 4.222451081359424, "grad_norm": 0.3970625102519989, "learning_rate": 9.161044364228683e-05, "loss": 0.0545, "step": 4100 }, { "epoch": 4.23274974253347, "grad_norm": 0.306384414434433, "learning_rate": 9.155944085880637e-05, "loss": 0.0539, "step": 4110 }, { "epoch": 4.243048403707518, "grad_norm": 0.4230334162712097, "learning_rate": 9.150829780135269e-05, "loss": 0.0456, "step": 4120 }, { "epoch": 4.253347064881566, "grad_norm": 0.29097849130630493, "learning_rate": 9.145701464254698e-05, "loss": 0.0511, "step": 4130 }, { "epoch": 4.263645726055612, "grad_norm": 0.390979140996933, "learning_rate": 9.140559155548333e-05, "loss": 0.0461, "step": 4140 }, { "epoch": 4.27394438722966, "grad_norm": 0.2566828429698944, "learning_rate": 9.135402871372808e-05, "loss": 0.0508, "step": 4150 }, { "epoch": 4.284243048403708, "grad_norm": 0.4710136651992798, "learning_rate": 9.130232629131932e-05, "loss": 0.0503, "step": 4160 }, { "epoch": 4.2945417095777545, "grad_norm": 0.4374995827674866, "learning_rate": 9.125048446276618e-05, "loss": 0.0599, "step": 4170 }, { "epoch": 4.304840370751802, "grad_norm": 0.43765076994895935, "learning_rate": 9.119850340304843e-05, "loss": 0.0531, "step": 4180 }, { "epoch": 4.31513903192585, "grad_norm": 0.45118576288223267, "learning_rate": 9.114638328761571e-05, "loss": 0.0527, "step": 4190 }, { "epoch": 4.325437693099897, "grad_norm": 0.3243924379348755, "learning_rate": 9.109412429238704e-05, "loss": 0.0431, "step": 4200 }, { "epoch": 4.335736354273944, "grad_norm": 0.33518919348716736, "learning_rate": 9.104172659375017e-05, "loss": 0.0491, "step": 4210 }, { "epoch": 4.346035015447992, "grad_norm": 0.6875081062316895, "learning_rate": 9.098919036856102e-05, "loss": 0.0488, "step": 4220 }, { "epoch": 4.3563336766220395, "grad_norm": 0.5093826055526733, "learning_rate": 9.093651579414311e-05, "loss": 0.0487, "step": 4230 }, { "epoch": 4.366632337796086, "grad_norm": 0.37270835041999817, "learning_rate": 9.088370304828685e-05, "loss": 0.0559, "step": 4240 }, { "epoch": 4.376930998970134, "grad_norm": 0.4596996307373047, "learning_rate": 9.083075230924913e-05, "loss": 0.0578, "step": 4250 }, { "epoch": 4.387229660144182, "grad_norm": 0.3775595426559448, "learning_rate": 9.077766375575246e-05, "loss": 0.0562, "step": 4260 }, { "epoch": 4.397528321318228, "grad_norm": 0.3252449333667755, "learning_rate": 9.072443756698459e-05, "loss": 0.0558, "step": 4270 }, { "epoch": 4.407826982492276, "grad_norm": 0.42610299587249756, "learning_rate": 9.067107392259783e-05, "loss": 0.0455, "step": 4280 }, { "epoch": 4.418125643666324, "grad_norm": 0.36227330565452576, "learning_rate": 9.061757300270845e-05, "loss": 0.0498, "step": 4290 }, { "epoch": 4.42842430484037, "grad_norm": 0.4343869686126709, "learning_rate": 9.056393498789602e-05, "loss": 0.0504, "step": 4300 }, { "epoch": 4.438722966014418, "grad_norm": 0.4492502808570862, "learning_rate": 9.051016005920282e-05, "loss": 0.0526, "step": 4310 }, { "epoch": 4.449021627188466, "grad_norm": 0.2649560868740082, "learning_rate": 9.045624839813334e-05, "loss": 0.0488, "step": 4320 }, { "epoch": 4.4593202883625125, "grad_norm": 0.2290182262659073, "learning_rate": 9.040220018665347e-05, "loss": 0.0427, "step": 4330 }, { "epoch": 4.46961894953656, "grad_norm": 0.37687376141548157, "learning_rate": 9.034801560719011e-05, "loss": 0.0437, "step": 4340 }, { "epoch": 4.479917610710608, "grad_norm": 0.21943651139736176, "learning_rate": 9.029369484263033e-05, "loss": 0.047, "step": 4350 }, { "epoch": 4.490216271884655, "grad_norm": 0.32304951548576355, "learning_rate": 9.02392380763209e-05, "loss": 0.0461, "step": 4360 }, { "epoch": 4.500514933058702, "grad_norm": 0.21305856108665466, "learning_rate": 9.018464549206769e-05, "loss": 0.0461, "step": 4370 }, { "epoch": 4.51081359423275, "grad_norm": 0.6847507953643799, "learning_rate": 9.012991727413487e-05, "loss": 0.0475, "step": 4380 }, { "epoch": 4.521112255406797, "grad_norm": 0.3444644808769226, "learning_rate": 9.007505360724453e-05, "loss": 0.0423, "step": 4390 }, { "epoch": 4.531410916580844, "grad_norm": 0.3524458110332489, "learning_rate": 9.002005467657586e-05, "loss": 0.058, "step": 4400 }, { "epoch": 4.541709577754892, "grad_norm": 0.4131333529949188, "learning_rate": 8.996492066776464e-05, "loss": 0.0462, "step": 4410 }, { "epoch": 4.55200823892894, "grad_norm": 0.35865673422813416, "learning_rate": 8.990965176690252e-05, "loss": 0.0493, "step": 4420 }, { "epoch": 4.562306900102986, "grad_norm": 0.3511912524700165, "learning_rate": 8.985424816053651e-05, "loss": 0.0561, "step": 4430 }, { "epoch": 4.572605561277034, "grad_norm": 0.2704029083251953, "learning_rate": 8.979871003566826e-05, "loss": 0.0526, "step": 4440 }, { "epoch": 4.582904222451082, "grad_norm": 0.3202318847179413, "learning_rate": 8.974303757975345e-05, "loss": 0.0532, "step": 4450 }, { "epoch": 4.593202883625128, "grad_norm": 0.31483763456344604, "learning_rate": 8.968723098070117e-05, "loss": 0.051, "step": 4460 }, { "epoch": 4.603501544799176, "grad_norm": 0.3457460403442383, "learning_rate": 8.963129042687329e-05, "loss": 0.0507, "step": 4470 }, { "epoch": 4.613800205973224, "grad_norm": 0.31409910321235657, "learning_rate": 8.957521610708375e-05, "loss": 0.0503, "step": 4480 }, { "epoch": 4.6240988671472705, "grad_norm": 0.2827114164829254, "learning_rate": 8.951900821059809e-05, "loss": 0.0494, "step": 4490 }, { "epoch": 4.634397528321318, "grad_norm": 0.31604471802711487, "learning_rate": 8.946266692713261e-05, "loss": 0.0483, "step": 4500 }, { "epoch": 4.644696189495366, "grad_norm": 0.3118681311607361, "learning_rate": 8.940619244685388e-05, "loss": 0.0553, "step": 4510 }, { "epoch": 4.6549948506694125, "grad_norm": 0.2974856197834015, "learning_rate": 8.934958496037802e-05, "loss": 0.051, "step": 4520 }, { "epoch": 4.66529351184346, "grad_norm": 0.3584068715572357, "learning_rate": 8.92928446587701e-05, "loss": 0.0459, "step": 4530 }, { "epoch": 4.675592173017508, "grad_norm": 0.36687174439430237, "learning_rate": 8.923597173354345e-05, "loss": 0.0483, "step": 4540 }, { "epoch": 4.6858908341915555, "grad_norm": 0.35569944977760315, "learning_rate": 8.917896637665908e-05, "loss": 0.05, "step": 4550 }, { "epoch": 4.696189495365602, "grad_norm": 0.38467368483543396, "learning_rate": 8.912182878052495e-05, "loss": 0.0421, "step": 4560 }, { "epoch": 4.70648815653965, "grad_norm": 0.36783739924430847, "learning_rate": 8.906455913799538e-05, "loss": 0.0509, "step": 4570 }, { "epoch": 4.716786817713698, "grad_norm": 0.2462991178035736, "learning_rate": 8.900715764237037e-05, "loss": 0.0469, "step": 4580 }, { "epoch": 4.727085478887744, "grad_norm": 0.3449934720993042, "learning_rate": 8.894962448739499e-05, "loss": 0.0467, "step": 4590 }, { "epoch": 4.737384140061792, "grad_norm": 0.38251376152038574, "learning_rate": 8.889195986725865e-05, "loss": 0.049, "step": 4600 }, { "epoch": 4.74768280123584, "grad_norm": 0.30399325489997864, "learning_rate": 8.883416397659452e-05, "loss": 0.0532, "step": 4610 }, { "epoch": 4.757981462409886, "grad_norm": 0.4609906077384949, "learning_rate": 8.877623701047885e-05, "loss": 0.0511, "step": 4620 }, { "epoch": 4.768280123583934, "grad_norm": 0.40049266815185547, "learning_rate": 8.871817916443025e-05, "loss": 0.0567, "step": 4630 }, { "epoch": 4.778578784757982, "grad_norm": 0.5834691524505615, "learning_rate": 8.865999063440916e-05, "loss": 0.0491, "step": 4640 }, { "epoch": 4.7888774459320285, "grad_norm": 0.4367988705635071, "learning_rate": 8.860167161681707e-05, "loss": 0.0573, "step": 4650 }, { "epoch": 4.799176107106076, "grad_norm": 0.33364230394363403, "learning_rate": 8.854322230849588e-05, "loss": 0.0604, "step": 4660 }, { "epoch": 4.809474768280124, "grad_norm": 0.42235320806503296, "learning_rate": 8.848464290672729e-05, "loss": 0.0518, "step": 4670 }, { "epoch": 4.819773429454171, "grad_norm": 0.32555538415908813, "learning_rate": 8.84259336092321e-05, "loss": 0.0457, "step": 4680 }, { "epoch": 4.830072090628218, "grad_norm": 0.34331732988357544, "learning_rate": 8.836709461416952e-05, "loss": 0.0558, "step": 4690 }, { "epoch": 4.840370751802266, "grad_norm": 0.6019324064254761, "learning_rate": 8.830812612013655e-05, "loss": 0.0573, "step": 4700 }, { "epoch": 4.850669412976313, "grad_norm": 0.2844030261039734, "learning_rate": 8.824902832616723e-05, "loss": 0.0571, "step": 4710 }, { "epoch": 4.86096807415036, "grad_norm": 0.47788453102111816, "learning_rate": 8.818980143173213e-05, "loss": 0.0565, "step": 4720 }, { "epoch": 4.871266735324408, "grad_norm": 0.24314385652542114, "learning_rate": 8.81304456367374e-05, "loss": 0.046, "step": 4730 }, { "epoch": 4.8815653964984556, "grad_norm": 0.3316558301448822, "learning_rate": 8.807096114152442e-05, "loss": 0.0519, "step": 4740 }, { "epoch": 4.891864057672502, "grad_norm": 0.4027853012084961, "learning_rate": 8.801134814686891e-05, "loss": 0.0495, "step": 4750 }, { "epoch": 4.90216271884655, "grad_norm": 0.3290289342403412, "learning_rate": 8.795160685398027e-05, "loss": 0.0449, "step": 4760 }, { "epoch": 4.912461380020598, "grad_norm": 0.3217390775680542, "learning_rate": 8.789173746450101e-05, "loss": 0.0578, "step": 4770 }, { "epoch": 4.922760041194644, "grad_norm": 0.43397730588912964, "learning_rate": 8.783174018050594e-05, "loss": 0.0483, "step": 4780 }, { "epoch": 4.933058702368692, "grad_norm": 0.38298988342285156, "learning_rate": 8.777161520450158e-05, "loss": 0.0479, "step": 4790 }, { "epoch": 4.94335736354274, "grad_norm": 0.36208289861679077, "learning_rate": 8.771136273942544e-05, "loss": 0.0525, "step": 4800 }, { "epoch": 4.9536560247167865, "grad_norm": 0.3291323482990265, "learning_rate": 8.765098298864533e-05, "loss": 0.0469, "step": 4810 }, { "epoch": 4.963954685890834, "grad_norm": 0.23334382474422455, "learning_rate": 8.759047615595869e-05, "loss": 0.0478, "step": 4820 }, { "epoch": 4.974253347064882, "grad_norm": 0.3632581830024719, "learning_rate": 8.752984244559188e-05, "loss": 0.0558, "step": 4830 }, { "epoch": 4.9845520082389285, "grad_norm": 0.3983827531337738, "learning_rate": 8.746908206219955e-05, "loss": 0.0584, "step": 4840 }, { "epoch": 4.994850669412976, "grad_norm": 0.5021440982818604, "learning_rate": 8.740819521086383e-05, "loss": 0.0522, "step": 4850 }, { "epoch": 5.005149330587024, "grad_norm": 0.4782863259315491, "learning_rate": 8.734718209709377e-05, "loss": 0.0503, "step": 4860 }, { "epoch": 5.0154479917610715, "grad_norm": 0.3124346435070038, "learning_rate": 8.728604292682459e-05, "loss": 0.0523, "step": 4870 }, { "epoch": 5.025746652935118, "grad_norm": 0.46991485357284546, "learning_rate": 8.722477790641694e-05, "loss": 0.0507, "step": 4880 }, { "epoch": 5.036045314109166, "grad_norm": 0.381569504737854, "learning_rate": 8.71633872426563e-05, "loss": 0.0473, "step": 4890 }, { "epoch": 5.0463439752832135, "grad_norm": 0.4210774004459381, "learning_rate": 8.710187114275219e-05, "loss": 0.0521, "step": 4900 }, { "epoch": 5.05664263645726, "grad_norm": 0.3999352753162384, "learning_rate": 8.70402298143375e-05, "loss": 0.0548, "step": 4910 }, { "epoch": 5.066941297631308, "grad_norm": 0.32023027539253235, "learning_rate": 8.697846346546787e-05, "loss": 0.0508, "step": 4920 }, { "epoch": 5.077239958805356, "grad_norm": 0.38814589381217957, "learning_rate": 8.691657230462083e-05, "loss": 0.0484, "step": 4930 }, { "epoch": 5.087538619979402, "grad_norm": 0.3033084571361542, "learning_rate": 8.685455654069523e-05, "loss": 0.0432, "step": 4940 }, { "epoch": 5.09783728115345, "grad_norm": 0.39010483026504517, "learning_rate": 8.679241638301049e-05, "loss": 0.0506, "step": 4950 }, { "epoch": 5.108135942327498, "grad_norm": 0.28835776448249817, "learning_rate": 8.673015204130586e-05, "loss": 0.0543, "step": 4960 }, { "epoch": 5.1184346035015444, "grad_norm": 0.5217164754867554, "learning_rate": 8.66677637257398e-05, "loss": 0.0501, "step": 4970 }, { "epoch": 5.128733264675592, "grad_norm": 0.4083517789840698, "learning_rate": 8.660525164688913e-05, "loss": 0.0572, "step": 4980 }, { "epoch": 5.13903192584964, "grad_norm": 0.5034805536270142, "learning_rate": 8.654261601574849e-05, "loss": 0.0541, "step": 4990 }, { "epoch": 5.1493305870236865, "grad_norm": 0.3255571126937866, "learning_rate": 8.647985704372948e-05, "loss": 0.0539, "step": 5000 }, { "epoch": 5.159629248197734, "grad_norm": 0.589500367641449, "learning_rate": 8.641697494266006e-05, "loss": 0.0497, "step": 5010 }, { "epoch": 5.169927909371782, "grad_norm": 0.3600839674472809, "learning_rate": 8.635396992478371e-05, "loss": 0.0564, "step": 5020 }, { "epoch": 5.1802265705458295, "grad_norm": 0.3535096049308777, "learning_rate": 8.629084220275887e-05, "loss": 0.0528, "step": 5030 }, { "epoch": 5.190525231719876, "grad_norm": 0.3266212046146393, "learning_rate": 8.622759198965809e-05, "loss": 0.0476, "step": 5040 }, { "epoch": 5.200823892893924, "grad_norm": 0.4038067162036896, "learning_rate": 8.616421949896734e-05, "loss": 0.0517, "step": 5050 }, { "epoch": 5.2111225540679715, "grad_norm": 0.3460542857646942, "learning_rate": 8.610072494458535e-05, "loss": 0.0474, "step": 5060 }, { "epoch": 5.221421215242018, "grad_norm": 0.41362518072128296, "learning_rate": 8.603710854082286e-05, "loss": 0.0515, "step": 5070 }, { "epoch": 5.231719876416066, "grad_norm": 0.2805697023868561, "learning_rate": 8.597337050240184e-05, "loss": 0.0519, "step": 5080 }, { "epoch": 5.242018537590114, "grad_norm": 0.4825451374053955, "learning_rate": 8.590951104445482e-05, "loss": 0.0504, "step": 5090 }, { "epoch": 5.25231719876416, "grad_norm": 0.3441821038722992, "learning_rate": 8.584553038252414e-05, "loss": 0.0581, "step": 5100 }, { "epoch": 5.262615859938208, "grad_norm": 0.39510828256607056, "learning_rate": 8.578142873256129e-05, "loss": 0.0532, "step": 5110 }, { "epoch": 5.272914521112256, "grad_norm": 0.3733309805393219, "learning_rate": 8.571720631092609e-05, "loss": 0.057, "step": 5120 }, { "epoch": 5.283213182286302, "grad_norm": 0.3860830068588257, "learning_rate": 8.565286333438594e-05, "loss": 0.049, "step": 5130 }, { "epoch": 5.29351184346035, "grad_norm": 0.3507029414176941, "learning_rate": 8.558840002011528e-05, "loss": 0.0542, "step": 5140 }, { "epoch": 5.303810504634398, "grad_norm": 0.30535757541656494, "learning_rate": 8.552381658569457e-05, "loss": 0.0584, "step": 5150 }, { "epoch": 5.3141091658084445, "grad_norm": 0.3580070734024048, "learning_rate": 8.545911324910982e-05, "loss": 0.0509, "step": 5160 }, { "epoch": 5.324407826982492, "grad_norm": 0.21992090344429016, "learning_rate": 8.539429022875169e-05, "loss": 0.0412, "step": 5170 }, { "epoch": 5.33470648815654, "grad_norm": 0.6406000852584839, "learning_rate": 8.532934774341483e-05, "loss": 0.0518, "step": 5180 }, { "epoch": 5.3450051493305875, "grad_norm": 0.43300265073776245, "learning_rate": 8.526428601229706e-05, "loss": 0.0539, "step": 5190 }, { "epoch": 5.355303810504634, "grad_norm": 0.5168215036392212, "learning_rate": 8.519910525499874e-05, "loss": 0.0552, "step": 5200 }, { "epoch": 5.365602471678682, "grad_norm": 0.2501913905143738, "learning_rate": 8.513380569152196e-05, "loss": 0.0506, "step": 5210 }, { "epoch": 5.3759011328527295, "grad_norm": 0.2757486402988434, "learning_rate": 8.506838754226982e-05, "loss": 0.0565, "step": 5220 }, { "epoch": 5.386199794026776, "grad_norm": 0.47264114022254944, "learning_rate": 8.500285102804568e-05, "loss": 0.0519, "step": 5230 }, { "epoch": 5.396498455200824, "grad_norm": 0.30214348435401917, "learning_rate": 8.493719637005237e-05, "loss": 0.0424, "step": 5240 }, { "epoch": 5.406797116374872, "grad_norm": 0.4345119893550873, "learning_rate": 8.487142378989152e-05, "loss": 0.0412, "step": 5250 }, { "epoch": 5.417095777548918, "grad_norm": 0.33627235889434814, "learning_rate": 8.480553350956282e-05, "loss": 0.0481, "step": 5260 }, { "epoch": 5.427394438722966, "grad_norm": 0.3047385811805725, "learning_rate": 8.473952575146312e-05, "loss": 0.0481, "step": 5270 }, { "epoch": 5.437693099897014, "grad_norm": 0.4447433352470398, "learning_rate": 8.46734007383859e-05, "loss": 0.046, "step": 5280 }, { "epoch": 5.44799176107106, "grad_norm": 0.4087453782558441, "learning_rate": 8.460715869352035e-05, "loss": 0.0487, "step": 5290 }, { "epoch": 5.458290422245108, "grad_norm": 0.3321467339992523, "learning_rate": 8.454079984045065e-05, "loss": 0.0413, "step": 5300 }, { "epoch": 5.468589083419156, "grad_norm": 0.356514036655426, "learning_rate": 8.447432440315533e-05, "loss": 0.049, "step": 5310 }, { "epoch": 5.4788877445932025, "grad_norm": 0.37567445635795593, "learning_rate": 8.44077326060063e-05, "loss": 0.0461, "step": 5320 }, { "epoch": 5.48918640576725, "grad_norm": 0.3040042519569397, "learning_rate": 8.434102467376832e-05, "loss": 0.0401, "step": 5330 }, { "epoch": 5.499485066941298, "grad_norm": 0.39934873580932617, "learning_rate": 8.427420083159807e-05, "loss": 0.0493, "step": 5340 }, { "epoch": 5.509783728115345, "grad_norm": 0.4000271260738373, "learning_rate": 8.420726130504351e-05, "loss": 0.0541, "step": 5350 }, { "epoch": 5.520082389289392, "grad_norm": 0.2750590443611145, "learning_rate": 8.414020632004299e-05, "loss": 0.0481, "step": 5360 }, { "epoch": 5.53038105046344, "grad_norm": 0.4174776077270508, "learning_rate": 8.407303610292462e-05, "loss": 0.0501, "step": 5370 }, { "epoch": 5.5406797116374875, "grad_norm": 0.2651192247867584, "learning_rate": 8.400575088040548e-05, "loss": 0.0491, "step": 5380 }, { "epoch": 5.550978372811534, "grad_norm": 0.49490901827812195, "learning_rate": 8.393835087959072e-05, "loss": 0.0488, "step": 5390 }, { "epoch": 5.561277033985582, "grad_norm": 0.6012644171714783, "learning_rate": 8.387083632797299e-05, "loss": 0.05, "step": 5400 }, { "epoch": 5.57157569515963, "grad_norm": 0.4538785219192505, "learning_rate": 8.380320745343153e-05, "loss": 0.0479, "step": 5410 }, { "epoch": 5.581874356333676, "grad_norm": 0.358992338180542, "learning_rate": 8.373546448423147e-05, "loss": 0.05, "step": 5420 }, { "epoch": 5.592173017507724, "grad_norm": 0.3814113736152649, "learning_rate": 8.366760764902304e-05, "loss": 0.0415, "step": 5430 }, { "epoch": 5.602471678681772, "grad_norm": 0.6442550420761108, "learning_rate": 8.359963717684077e-05, "loss": 0.0495, "step": 5440 }, { "epoch": 5.612770339855818, "grad_norm": 0.34561294317245483, "learning_rate": 8.353155329710279e-05, "loss": 0.0507, "step": 5450 }, { "epoch": 5.623069001029866, "grad_norm": 0.333892822265625, "learning_rate": 8.346335623960998e-05, "loss": 0.0406, "step": 5460 }, { "epoch": 5.633367662203914, "grad_norm": 0.21642594039440155, "learning_rate": 8.339504623454521e-05, "loss": 0.05, "step": 5470 }, { "epoch": 5.6436663233779605, "grad_norm": 0.21974137425422668, "learning_rate": 8.332662351247262e-05, "loss": 0.0497, "step": 5480 }, { "epoch": 5.653964984552008, "grad_norm": 0.35917189717292786, "learning_rate": 8.325808830433679e-05, "loss": 0.041, "step": 5490 }, { "epoch": 5.664263645726056, "grad_norm": 0.2640712857246399, "learning_rate": 8.318944084146192e-05, "loss": 0.047, "step": 5500 }, { "epoch": 5.674562306900103, "grad_norm": 6.280691623687744, "learning_rate": 8.312068135555115e-05, "loss": 0.0481, "step": 5510 }, { "epoch": 5.68486096807415, "grad_norm": 0.269490122795105, "learning_rate": 8.305181007868572e-05, "loss": 0.0416, "step": 5520 }, { "epoch": 5.695159629248198, "grad_norm": 0.408123254776001, "learning_rate": 8.298282724332419e-05, "loss": 0.049, "step": 5530 }, { "epoch": 5.705458290422245, "grad_norm": 0.2983226478099823, "learning_rate": 8.291373308230165e-05, "loss": 0.0497, "step": 5540 }, { "epoch": 5.715756951596292, "grad_norm": 0.35842761397361755, "learning_rate": 8.284452782882894e-05, "loss": 0.0477, "step": 5550 }, { "epoch": 5.72605561277034, "grad_norm": 0.2742210328578949, "learning_rate": 8.277521171649189e-05, "loss": 0.052, "step": 5560 }, { "epoch": 5.736354273944388, "grad_norm": 0.2822439968585968, "learning_rate": 8.27057849792505e-05, "loss": 0.0491, "step": 5570 }, { "epoch": 5.746652935118434, "grad_norm": 0.3104664385318756, "learning_rate": 8.263624785143812e-05, "loss": 0.0493, "step": 5580 }, { "epoch": 5.756951596292482, "grad_norm": 0.32532253861427307, "learning_rate": 8.256660056776076e-05, "loss": 0.0581, "step": 5590 }, { "epoch": 5.76725025746653, "grad_norm": 0.3366002142429352, "learning_rate": 8.249684336329617e-05, "loss": 0.043, "step": 5600 }, { "epoch": 5.777548918640576, "grad_norm": 0.25842759013175964, "learning_rate": 8.242697647349317e-05, "loss": 0.0485, "step": 5610 }, { "epoch": 5.787847579814624, "grad_norm": 0.302432656288147, "learning_rate": 8.235700013417076e-05, "loss": 0.0521, "step": 5620 }, { "epoch": 5.798146240988672, "grad_norm": 0.3358532190322876, "learning_rate": 8.228691458151738e-05, "loss": 0.0441, "step": 5630 }, { "epoch": 5.8084449021627185, "grad_norm": 0.4343230724334717, "learning_rate": 8.221672005209008e-05, "loss": 0.0521, "step": 5640 }, { "epoch": 5.818743563336766, "grad_norm": 0.30650976300239563, "learning_rate": 8.214641678281374e-05, "loss": 0.0538, "step": 5650 }, { "epoch": 5.829042224510814, "grad_norm": 0.3401453197002411, "learning_rate": 8.207600501098026e-05, "loss": 0.0428, "step": 5660 }, { "epoch": 5.8393408856848605, "grad_norm": 0.45636221766471863, "learning_rate": 8.200548497424778e-05, "loss": 0.0582, "step": 5670 }, { "epoch": 5.849639546858908, "grad_norm": 0.2774709165096283, "learning_rate": 8.193485691063985e-05, "loss": 0.048, "step": 5680 }, { "epoch": 5.859938208032956, "grad_norm": 0.29194507002830505, "learning_rate": 8.186412105854463e-05, "loss": 0.0534, "step": 5690 }, { "epoch": 5.8702368692070035, "grad_norm": 0.36549675464630127, "learning_rate": 8.17932776567141e-05, "loss": 0.0571, "step": 5700 }, { "epoch": 5.88053553038105, "grad_norm": 0.302418977022171, "learning_rate": 8.172232694426329e-05, "loss": 0.0423, "step": 5710 }, { "epoch": 5.890834191555098, "grad_norm": 0.27770909667015076, "learning_rate": 8.165126916066936e-05, "loss": 0.0487, "step": 5720 }, { "epoch": 5.901132852729146, "grad_norm": 0.3784064054489136, "learning_rate": 8.158010454577093e-05, "loss": 0.0504, "step": 5730 }, { "epoch": 5.911431513903192, "grad_norm": 0.29943570494651794, "learning_rate": 8.150883333976713e-05, "loss": 0.0458, "step": 5740 }, { "epoch": 5.92173017507724, "grad_norm": 0.26842376589775085, "learning_rate": 8.143745578321695e-05, "loss": 0.0523, "step": 5750 }, { "epoch": 5.932028836251288, "grad_norm": 0.19866850972175598, "learning_rate": 8.136597211703827e-05, "loss": 0.0429, "step": 5760 }, { "epoch": 5.942327497425334, "grad_norm": 0.30413612723350525, "learning_rate": 8.129438258250712e-05, "loss": 0.0441, "step": 5770 }, { "epoch": 5.952626158599382, "grad_norm": 0.2791491746902466, "learning_rate": 8.122268742125695e-05, "loss": 0.047, "step": 5780 }, { "epoch": 5.96292481977343, "grad_norm": 0.34201282262802124, "learning_rate": 8.115088687527761e-05, "loss": 0.0501, "step": 5790 }, { "epoch": 5.9732234809474765, "grad_norm": 0.39383724331855774, "learning_rate": 8.107898118691473e-05, "loss": 0.0497, "step": 5800 }, { "epoch": 5.983522142121524, "grad_norm": 0.3670088052749634, "learning_rate": 8.100697059886879e-05, "loss": 0.0428, "step": 5810 }, { "epoch": 5.993820803295572, "grad_norm": 0.3595752716064453, "learning_rate": 8.093485535419434e-05, "loss": 0.0467, "step": 5820 }, { "epoch": 6.0041194644696185, "grad_norm": 0.403352290391922, "learning_rate": 8.086263569629919e-05, "loss": 0.0441, "step": 5830 }, { "epoch": 6.014418125643666, "grad_norm": 0.18506278097629547, "learning_rate": 8.079031186894354e-05, "loss": 0.0508, "step": 5840 }, { "epoch": 6.024716786817714, "grad_norm": 0.5713401436805725, "learning_rate": 8.071788411623922e-05, "loss": 0.0491, "step": 5850 }, { "epoch": 6.0350154479917615, "grad_norm": 0.20415346324443817, "learning_rate": 8.064535268264883e-05, "loss": 0.0502, "step": 5860 }, { "epoch": 6.045314109165808, "grad_norm": 0.28075137734413147, "learning_rate": 8.057271781298489e-05, "loss": 0.0512, "step": 5870 }, { "epoch": 6.055612770339856, "grad_norm": 0.3114660680294037, "learning_rate": 8.049997975240909e-05, "loss": 0.0508, "step": 5880 }, { "epoch": 6.0659114315139036, "grad_norm": 0.3134065866470337, "learning_rate": 8.042713874643136e-05, "loss": 0.0531, "step": 5890 }, { "epoch": 6.07621009268795, "grad_norm": 0.24600578844547272, "learning_rate": 8.035419504090915e-05, "loss": 0.0478, "step": 5900 }, { "epoch": 6.086508753861998, "grad_norm": 0.34766799211502075, "learning_rate": 8.028114888204653e-05, "loss": 0.0486, "step": 5910 }, { "epoch": 6.096807415036046, "grad_norm": 0.3067956268787384, "learning_rate": 8.020800051639337e-05, "loss": 0.0452, "step": 5920 }, { "epoch": 6.107106076210092, "grad_norm": 0.3019874691963196, "learning_rate": 8.013475019084453e-05, "loss": 0.0458, "step": 5930 }, { "epoch": 6.11740473738414, "grad_norm": 0.3271634578704834, "learning_rate": 8.006139815263898e-05, "loss": 0.0561, "step": 5940 }, { "epoch": 6.127703398558188, "grad_norm": 0.2930561304092407, "learning_rate": 7.998794464935904e-05, "loss": 0.0407, "step": 5950 }, { "epoch": 6.1380020597322344, "grad_norm": 0.37962770462036133, "learning_rate": 7.991438992892946e-05, "loss": 0.048, "step": 5960 }, { "epoch": 6.148300720906282, "grad_norm": 0.36476749181747437, "learning_rate": 7.984073423961664e-05, "loss": 0.0439, "step": 5970 }, { "epoch": 6.15859938208033, "grad_norm": 0.31208914518356323, "learning_rate": 7.97669778300278e-05, "loss": 0.0431, "step": 5980 }, { "epoch": 6.1688980432543765, "grad_norm": 0.758002758026123, "learning_rate": 7.969312094911007e-05, "loss": 0.0481, "step": 5990 }, { "epoch": 6.179196704428424, "grad_norm": 1.8981136083602905, "learning_rate": 7.961916384614975e-05, "loss": 0.0621, "step": 6000 }, { "epoch": 6.189495365602472, "grad_norm": 0.277136892080307, "learning_rate": 7.954510677077138e-05, "loss": 0.0586, "step": 6010 }, { "epoch": 6.1997940267765195, "grad_norm": 0.27095285058021545, "learning_rate": 7.947094997293695e-05, "loss": 0.0484, "step": 6020 }, { "epoch": 6.210092687950566, "grad_norm": 0.2608092427253723, "learning_rate": 7.9396693702945e-05, "loss": 0.0457, "step": 6030 }, { "epoch": 6.220391349124614, "grad_norm": 0.5210095643997192, "learning_rate": 7.932233821142987e-05, "loss": 0.0473, "step": 6040 }, { "epoch": 6.2306900102986615, "grad_norm": 0.254302978515625, "learning_rate": 7.924788374936078e-05, "loss": 0.045, "step": 6050 }, { "epoch": 6.240988671472708, "grad_norm": 0.343322217464447, "learning_rate": 7.917333056804097e-05, "loss": 0.054, "step": 6060 }, { "epoch": 6.251287332646756, "grad_norm": 0.4098043143749237, "learning_rate": 7.909867891910694e-05, "loss": 0.0435, "step": 6070 }, { "epoch": 6.261585993820804, "grad_norm": 0.34776240587234497, "learning_rate": 7.902392905452749e-05, "loss": 0.0538, "step": 6080 }, { "epoch": 6.27188465499485, "grad_norm": 0.5250643491744995, "learning_rate": 7.894908122660296e-05, "loss": 0.0431, "step": 6090 }, { "epoch": 6.282183316168898, "grad_norm": 0.37657663226127625, "learning_rate": 7.887413568796433e-05, "loss": 0.0532, "step": 6100 }, { "epoch": 6.292481977342946, "grad_norm": 0.28036069869995117, "learning_rate": 7.879909269157236e-05, "loss": 0.0382, "step": 6110 }, { "epoch": 6.302780638516992, "grad_norm": 0.4012965261936188, "learning_rate": 7.87239524907168e-05, "loss": 0.0472, "step": 6120 }, { "epoch": 6.31307929969104, "grad_norm": 0.4002419412136078, "learning_rate": 7.864871533901544e-05, "loss": 0.051, "step": 6130 }, { "epoch": 6.323377960865088, "grad_norm": 0.3897566795349121, "learning_rate": 7.857338149041332e-05, "loss": 0.0487, "step": 6140 }, { "epoch": 6.3336766220391345, "grad_norm": 0.4365810751914978, "learning_rate": 7.849795119918191e-05, "loss": 0.0486, "step": 6150 }, { "epoch": 6.343975283213182, "grad_norm": 0.38556814193725586, "learning_rate": 7.842242471991809e-05, "loss": 0.0509, "step": 6160 }, { "epoch": 6.35427394438723, "grad_norm": 0.3570299744606018, "learning_rate": 7.834680230754353e-05, "loss": 0.0485, "step": 6170 }, { "epoch": 6.364572605561277, "grad_norm": 0.25796523690223694, "learning_rate": 7.82710842173036e-05, "loss": 0.0474, "step": 6180 }, { "epoch": 6.374871266735324, "grad_norm": 0.4013979732990265, "learning_rate": 7.819527070476665e-05, "loss": 0.0453, "step": 6190 }, { "epoch": 6.385169927909372, "grad_norm": 0.2755083739757538, "learning_rate": 7.811936202582306e-05, "loss": 0.0407, "step": 6200 }, { "epoch": 6.3954685890834195, "grad_norm": 0.8050864338874817, "learning_rate": 7.80433584366845e-05, "loss": 0.0468, "step": 6210 }, { "epoch": 6.405767250257466, "grad_norm": 0.5987268686294556, "learning_rate": 7.796726019388295e-05, "loss": 0.0445, "step": 6220 }, { "epoch": 6.416065911431514, "grad_norm": 0.31688612699508667, "learning_rate": 7.789106755426985e-05, "loss": 0.0414, "step": 6230 }, { "epoch": 6.426364572605562, "grad_norm": 0.2687252163887024, "learning_rate": 7.781478077501525e-05, "loss": 0.0381, "step": 6240 }, { "epoch": 6.436663233779608, "grad_norm": 0.31859585642814636, "learning_rate": 7.773840011360698e-05, "loss": 0.0486, "step": 6250 }, { "epoch": 6.446961894953656, "grad_norm": 0.39176130294799805, "learning_rate": 7.766192582784974e-05, "loss": 0.0492, "step": 6260 }, { "epoch": 6.457260556127704, "grad_norm": 0.4192884862422943, "learning_rate": 7.758535817586424e-05, "loss": 0.0524, "step": 6270 }, { "epoch": 6.46755921730175, "grad_norm": 0.41165101528167725, "learning_rate": 7.750869741608628e-05, "loss": 0.0459, "step": 6280 }, { "epoch": 6.477857878475798, "grad_norm": 0.37704214453697205, "learning_rate": 7.7431943807266e-05, "loss": 0.0555, "step": 6290 }, { "epoch": 6.488156539649846, "grad_norm": 0.4949089586734772, "learning_rate": 7.735509760846682e-05, "loss": 0.0493, "step": 6300 }, { "epoch": 6.4984552008238925, "grad_norm": 0.27363213896751404, "learning_rate": 7.727815907906481e-05, "loss": 0.0498, "step": 6310 }, { "epoch": 6.50875386199794, "grad_norm": 0.32286787033081055, "learning_rate": 7.720112847874759e-05, "loss": 0.0445, "step": 6320 }, { "epoch": 6.519052523171988, "grad_norm": 0.2211546152830124, "learning_rate": 7.712400606751356e-05, "loss": 0.0475, "step": 6330 }, { "epoch": 6.5293511843460355, "grad_norm": 0.2400301843881607, "learning_rate": 7.7046792105671e-05, "loss": 0.0459, "step": 6340 }, { "epoch": 6.539649845520082, "grad_norm": 0.3111647069454193, "learning_rate": 7.696948685383725e-05, "loss": 0.0492, "step": 6350 }, { "epoch": 6.54994850669413, "grad_norm": 0.3468630313873291, "learning_rate": 7.68920905729377e-05, "loss": 0.0422, "step": 6360 }, { "epoch": 6.5602471678681775, "grad_norm": 0.4992178678512573, "learning_rate": 7.6814603524205e-05, "loss": 0.0489, "step": 6370 }, { "epoch": 6.570545829042224, "grad_norm": 0.33954063057899475, "learning_rate": 7.673702596917824e-05, "loss": 0.0483, "step": 6380 }, { "epoch": 6.580844490216272, "grad_norm": 0.3721350133419037, "learning_rate": 7.665935816970193e-05, "loss": 0.0415, "step": 6390 }, { "epoch": 6.59114315139032, "grad_norm": 0.30230167508125305, "learning_rate": 7.658160038792518e-05, "loss": 0.0431, "step": 6400 }, { "epoch": 6.601441812564366, "grad_norm": 0.2966795861721039, "learning_rate": 7.650375288630083e-05, "loss": 0.0431, "step": 6410 }, { "epoch": 6.611740473738414, "grad_norm": 0.28090888261795044, "learning_rate": 7.642581592758453e-05, "loss": 0.0413, "step": 6420 }, { "epoch": 6.622039134912462, "grad_norm": 0.3371041715145111, "learning_rate": 7.634778977483389e-05, "loss": 0.0469, "step": 6430 }, { "epoch": 6.632337796086508, "grad_norm": 0.28260523080825806, "learning_rate": 7.626967469140754e-05, "loss": 0.0437, "step": 6440 }, { "epoch": 6.642636457260556, "grad_norm": 0.2734527289867401, "learning_rate": 7.619147094096434e-05, "loss": 0.043, "step": 6450 }, { "epoch": 6.652935118434604, "grad_norm": 0.3294004797935486, "learning_rate": 7.611317878746238e-05, "loss": 0.0414, "step": 6460 }, { "epoch": 6.663233779608651, "grad_norm": 0.45815443992614746, "learning_rate": 7.60347984951581e-05, "loss": 0.0496, "step": 6470 }, { "epoch": 6.673532440782698, "grad_norm": 0.24537749588489532, "learning_rate": 7.59563303286055e-05, "loss": 0.0425, "step": 6480 }, { "epoch": 6.683831101956746, "grad_norm": 0.32262513041496277, "learning_rate": 7.587777455265515e-05, "loss": 0.042, "step": 6490 }, { "epoch": 6.6941297631307926, "grad_norm": 0.19561485946178436, "learning_rate": 7.579913143245328e-05, "loss": 0.0424, "step": 6500 }, { "epoch": 6.70442842430484, "grad_norm": 0.29754048585891724, "learning_rate": 7.572040123344103e-05, "loss": 0.0466, "step": 6510 }, { "epoch": 6.714727085478888, "grad_norm": 0.33084553480148315, "learning_rate": 7.564158422135337e-05, "loss": 0.0496, "step": 6520 }, { "epoch": 6.7250257466529355, "grad_norm": 0.40858951210975647, "learning_rate": 7.55626806622183e-05, "loss": 0.0481, "step": 6530 }, { "epoch": 6.735324407826982, "grad_norm": 0.9231746792793274, "learning_rate": 7.548369082235595e-05, "loss": 0.0512, "step": 6540 }, { "epoch": 6.74562306900103, "grad_norm": 0.4263251721858978, "learning_rate": 7.54046149683777e-05, "loss": 0.0429, "step": 6550 }, { "epoch": 6.755921730175078, "grad_norm": 0.2868654131889343, "learning_rate": 7.532545336718521e-05, "loss": 0.048, "step": 6560 }, { "epoch": 6.766220391349124, "grad_norm": 0.250887930393219, "learning_rate": 7.524620628596954e-05, "loss": 0.0477, "step": 6570 }, { "epoch": 6.776519052523172, "grad_norm": 0.3410227298736572, "learning_rate": 7.516687399221037e-05, "loss": 0.0474, "step": 6580 }, { "epoch": 6.78681771369722, "grad_norm": 0.42289555072784424, "learning_rate": 7.508745675367483e-05, "loss": 0.0445, "step": 6590 }, { "epoch": 6.797116374871266, "grad_norm": 0.3723140358924866, "learning_rate": 7.500795483841692e-05, "loss": 0.0473, "step": 6600 }, { "epoch": 6.807415036045314, "grad_norm": 0.5165073275566101, "learning_rate": 7.492836851477636e-05, "loss": 0.0502, "step": 6610 }, { "epoch": 6.817713697219362, "grad_norm": 0.3081056773662567, "learning_rate": 7.484869805137778e-05, "loss": 0.0478, "step": 6620 }, { "epoch": 6.8280123583934085, "grad_norm": 0.39798182249069214, "learning_rate": 7.476894371712982e-05, "loss": 0.0516, "step": 6630 }, { "epoch": 6.838311019567456, "grad_norm": 0.3031449615955353, "learning_rate": 7.468910578122418e-05, "loss": 0.0458, "step": 6640 }, { "epoch": 6.848609680741504, "grad_norm": 0.40421777963638306, "learning_rate": 7.460918451313481e-05, "loss": 0.0464, "step": 6650 }, { "epoch": 6.858908341915551, "grad_norm": 0.3347015976905823, "learning_rate": 7.452918018261684e-05, "loss": 0.0427, "step": 6660 }, { "epoch": 6.869207003089598, "grad_norm": 0.46592167019844055, "learning_rate": 7.444909305970578e-05, "loss": 0.0395, "step": 6670 }, { "epoch": 6.879505664263646, "grad_norm": 0.31017211079597473, "learning_rate": 7.436892341471663e-05, "loss": 0.052, "step": 6680 }, { "epoch": 6.889804325437693, "grad_norm": 0.575901210308075, "learning_rate": 7.428867151824287e-05, "loss": 0.0489, "step": 6690 }, { "epoch": 6.90010298661174, "grad_norm": 0.372746080160141, "learning_rate": 7.420833764115561e-05, "loss": 0.0428, "step": 6700 }, { "epoch": 6.910401647785788, "grad_norm": 0.37451857328414917, "learning_rate": 7.41279220546027e-05, "loss": 0.0432, "step": 6710 }, { "epoch": 6.920700308959836, "grad_norm": 0.3189006447792053, "learning_rate": 7.404742503000776e-05, "loss": 0.0519, "step": 6720 }, { "epoch": 6.930998970133882, "grad_norm": 0.22485186159610748, "learning_rate": 7.396684683906928e-05, "loss": 0.0507, "step": 6730 }, { "epoch": 6.94129763130793, "grad_norm": 0.3649514615535736, "learning_rate": 7.38861877537597e-05, "loss": 0.0485, "step": 6740 }, { "epoch": 6.951596292481978, "grad_norm": 0.37899455428123474, "learning_rate": 7.380544804632453e-05, "loss": 0.0454, "step": 6750 }, { "epoch": 6.961894953656024, "grad_norm": 0.4623110294342041, "learning_rate": 7.372462798928137e-05, "loss": 0.0446, "step": 6760 }, { "epoch": 6.972193614830072, "grad_norm": 0.41896483302116394, "learning_rate": 7.364372785541902e-05, "loss": 0.0432, "step": 6770 }, { "epoch": 6.98249227600412, "grad_norm": 0.28001904487609863, "learning_rate": 7.356274791779661e-05, "loss": 0.0447, "step": 6780 }, { "epoch": 6.9927909371781665, "grad_norm": 0.35105225443840027, "learning_rate": 7.348168844974254e-05, "loss": 0.0445, "step": 6790 }, { "epoch": 7.003089598352214, "grad_norm": 0.41556599736213684, "learning_rate": 7.340054972485371e-05, "loss": 0.0512, "step": 6800 }, { "epoch": 7.013388259526262, "grad_norm": 0.4035722017288208, "learning_rate": 7.331933201699457e-05, "loss": 0.0423, "step": 6810 }, { "epoch": 7.0236869207003085, "grad_norm": 0.4090428352355957, "learning_rate": 7.323803560029605e-05, "loss": 0.0514, "step": 6820 }, { "epoch": 7.033985581874356, "grad_norm": 0.3787795901298523, "learning_rate": 7.315666074915481e-05, "loss": 0.0402, "step": 6830 }, { "epoch": 7.044284243048404, "grad_norm": 0.32284408807754517, "learning_rate": 7.307520773823227e-05, "loss": 0.0466, "step": 6840 }, { "epoch": 7.0545829042224515, "grad_norm": 0.35008612275123596, "learning_rate": 7.299367684245362e-05, "loss": 0.0451, "step": 6850 }, { "epoch": 7.064881565396498, "grad_norm": 0.38151565194129944, "learning_rate": 7.29120683370069e-05, "loss": 0.0364, "step": 6860 }, { "epoch": 7.075180226570546, "grad_norm": 0.21700677275657654, "learning_rate": 7.283038249734217e-05, "loss": 0.0504, "step": 6870 }, { "epoch": 7.085478887744594, "grad_norm": 0.3018152415752411, "learning_rate": 7.27486195991705e-05, "loss": 0.0519, "step": 6880 }, { "epoch": 7.09577754891864, "grad_norm": 0.2052696943283081, "learning_rate": 7.266677991846301e-05, "loss": 0.042, "step": 6890 }, { "epoch": 7.106076210092688, "grad_norm": 0.39970454573631287, "learning_rate": 7.258486373144999e-05, "loss": 0.0409, "step": 6900 }, { "epoch": 7.116374871266736, "grad_norm": 0.22980281710624695, "learning_rate": 7.250287131462004e-05, "loss": 0.0445, "step": 6910 }, { "epoch": 7.126673532440782, "grad_norm": 0.3403468430042267, "learning_rate": 7.242080294471895e-05, "loss": 0.0565, "step": 6920 }, { "epoch": 7.13697219361483, "grad_norm": 0.25713488459587097, "learning_rate": 7.233865889874891e-05, "loss": 0.0456, "step": 6930 }, { "epoch": 7.147270854788878, "grad_norm": 0.3376232981681824, "learning_rate": 7.225643945396757e-05, "loss": 0.0378, "step": 6940 }, { "epoch": 7.1575695159629245, "grad_norm": 0.255604088306427, "learning_rate": 7.217414488788702e-05, "loss": 0.041, "step": 6950 }, { "epoch": 7.167868177136972, "grad_norm": 0.2713391184806824, "learning_rate": 7.209177547827294e-05, "loss": 0.0527, "step": 6960 }, { "epoch": 7.17816683831102, "grad_norm": 0.2645740509033203, "learning_rate": 7.20093315031436e-05, "loss": 0.0432, "step": 6970 }, { "epoch": 7.1884654994850665, "grad_norm": 0.3499581515789032, "learning_rate": 7.192681324076896e-05, "loss": 0.0516, "step": 6980 }, { "epoch": 7.198764160659114, "grad_norm": 0.24416272342205048, "learning_rate": 7.184422096966971e-05, "loss": 0.0435, "step": 6990 }, { "epoch": 7.209062821833162, "grad_norm": 0.3371264338493347, "learning_rate": 7.176155496861638e-05, "loss": 0.0463, "step": 7000 }, { "epoch": 7.2193614830072095, "grad_norm": 0.3851630687713623, "learning_rate": 7.167881551662831e-05, "loss": 0.0407, "step": 7010 }, { "epoch": 7.229660144181256, "grad_norm": 0.2070106714963913, "learning_rate": 7.159600289297276e-05, "loss": 0.0386, "step": 7020 }, { "epoch": 7.239958805355304, "grad_norm": 0.3137363791465759, "learning_rate": 7.151311737716397e-05, "loss": 0.0411, "step": 7030 }, { "epoch": 7.2502574665293515, "grad_norm": 0.3703240752220154, "learning_rate": 7.143015924896226e-05, "loss": 0.0426, "step": 7040 }, { "epoch": 7.260556127703398, "grad_norm": 0.3365670144557953, "learning_rate": 7.134712878837294e-05, "loss": 0.0506, "step": 7050 }, { "epoch": 7.270854788877446, "grad_norm": 0.2538038194179535, "learning_rate": 7.126402627564555e-05, "loss": 0.0466, "step": 7060 }, { "epoch": 7.281153450051494, "grad_norm": 0.43290919065475464, "learning_rate": 7.118085199127276e-05, "loss": 0.0463, "step": 7070 }, { "epoch": 7.29145211122554, "grad_norm": 0.2167598456144333, "learning_rate": 7.109760621598952e-05, "loss": 0.0421, "step": 7080 }, { "epoch": 7.301750772399588, "grad_norm": 0.24321898818016052, "learning_rate": 7.101428923077209e-05, "loss": 0.0382, "step": 7090 }, { "epoch": 7.312049433573636, "grad_norm": 0.31298938393592834, "learning_rate": 7.093090131683704e-05, "loss": 0.0401, "step": 7100 }, { "epoch": 7.3223480947476824, "grad_norm": 0.38020390272140503, "learning_rate": 7.08474427556404e-05, "loss": 0.0454, "step": 7110 }, { "epoch": 7.33264675592173, "grad_norm": 0.37544867396354675, "learning_rate": 7.076391382887661e-05, "loss": 0.0408, "step": 7120 }, { "epoch": 7.342945417095778, "grad_norm": 0.2992228865623474, "learning_rate": 7.068031481847762e-05, "loss": 0.0454, "step": 7130 }, { "epoch": 7.3532440782698245, "grad_norm": 0.48509418964385986, "learning_rate": 7.059664600661196e-05, "loss": 0.044, "step": 7140 }, { "epoch": 7.363542739443872, "grad_norm": 0.4964796304702759, "learning_rate": 7.051290767568371e-05, "loss": 0.0526, "step": 7150 }, { "epoch": 7.37384140061792, "grad_norm": 0.22935813665390015, "learning_rate": 7.042910010833163e-05, "loss": 0.0416, "step": 7160 }, { "epoch": 7.3841400617919675, "grad_norm": 0.2570447325706482, "learning_rate": 7.034522358742816e-05, "loss": 0.0488, "step": 7170 }, { "epoch": 7.394438722966014, "grad_norm": 0.23174193501472473, "learning_rate": 7.026127839607847e-05, "loss": 0.0423, "step": 7180 }, { "epoch": 7.404737384140062, "grad_norm": 0.33260369300842285, "learning_rate": 7.017726481761951e-05, "loss": 0.0464, "step": 7190 }, { "epoch": 7.4150360453141095, "grad_norm": 0.4475546181201935, "learning_rate": 7.009318313561908e-05, "loss": 0.0475, "step": 7200 }, { "epoch": 7.425334706488156, "grad_norm": 0.2761160731315613, "learning_rate": 7.000903363387482e-05, "loss": 0.0448, "step": 7210 }, { "epoch": 7.435633367662204, "grad_norm": 0.39867162704467773, "learning_rate": 6.99248165964133e-05, "loss": 0.0455, "step": 7220 }, { "epoch": 7.445932028836252, "grad_norm": 0.3500315546989441, "learning_rate": 6.9840532307489e-05, "loss": 0.0452, "step": 7230 }, { "epoch": 7.456230690010298, "grad_norm": 0.30247119069099426, "learning_rate": 6.975618105158346e-05, "loss": 0.0458, "step": 7240 }, { "epoch": 7.466529351184346, "grad_norm": 0.357147753238678, "learning_rate": 6.967176311340418e-05, "loss": 0.0401, "step": 7250 }, { "epoch": 7.476828012358394, "grad_norm": 0.36390820145606995, "learning_rate": 6.958727877788378e-05, "loss": 0.0432, "step": 7260 }, { "epoch": 7.48712667353244, "grad_norm": 0.3110693395137787, "learning_rate": 6.950272833017896e-05, "loss": 0.0413, "step": 7270 }, { "epoch": 7.497425334706488, "grad_norm": 0.26132798194885254, "learning_rate": 6.941811205566957e-05, "loss": 0.0448, "step": 7280 }, { "epoch": 7.507723995880536, "grad_norm": 0.2721041142940521, "learning_rate": 6.933343023995767e-05, "loss": 0.0358, "step": 7290 }, { "epoch": 7.518022657054583, "grad_norm": 0.26367267966270447, "learning_rate": 6.924868316886649e-05, "loss": 0.0515, "step": 7300 }, { "epoch": 7.52832131822863, "grad_norm": 0.4417518377304077, "learning_rate": 6.916387112843957e-05, "loss": 0.054, "step": 7310 }, { "epoch": 7.538619979402678, "grad_norm": 0.3166719079017639, "learning_rate": 6.907899440493968e-05, "loss": 0.0485, "step": 7320 }, { "epoch": 7.548918640576725, "grad_norm": 0.330705463886261, "learning_rate": 6.899405328484794e-05, "loss": 0.0444, "step": 7330 }, { "epoch": 7.559217301750772, "grad_norm": 0.22663088142871857, "learning_rate": 6.890904805486286e-05, "loss": 0.0424, "step": 7340 }, { "epoch": 7.56951596292482, "grad_norm": 0.3720453083515167, "learning_rate": 6.88239790018993e-05, "loss": 0.043, "step": 7350 }, { "epoch": 7.5798146240988675, "grad_norm": 0.2161106914281845, "learning_rate": 6.873884641308752e-05, "loss": 0.042, "step": 7360 }, { "epoch": 7.590113285272914, "grad_norm": 0.3371187448501587, "learning_rate": 6.865365057577227e-05, "loss": 0.0463, "step": 7370 }, { "epoch": 7.600411946446962, "grad_norm": 0.3055129945278168, "learning_rate": 6.856839177751176e-05, "loss": 0.0474, "step": 7380 }, { "epoch": 7.61071060762101, "grad_norm": 0.3375736474990845, "learning_rate": 6.84830703060767e-05, "loss": 0.0439, "step": 7390 }, { "epoch": 7.621009268795056, "grad_norm": 0.3460111916065216, "learning_rate": 6.839768644944937e-05, "loss": 0.0464, "step": 7400 }, { "epoch": 7.631307929969104, "grad_norm": 0.3610309660434723, "learning_rate": 6.83122404958226e-05, "loss": 0.0441, "step": 7410 }, { "epoch": 7.641606591143152, "grad_norm": 0.32009249925613403, "learning_rate": 6.82267327335988e-05, "loss": 0.0405, "step": 7420 }, { "epoch": 7.651905252317198, "grad_norm": 0.532019853591919, "learning_rate": 6.814116345138902e-05, "loss": 0.0401, "step": 7430 }, { "epoch": 7.662203913491246, "grad_norm": 0.25246256589889526, "learning_rate": 6.805553293801196e-05, "loss": 0.0476, "step": 7440 }, { "epoch": 7.672502574665294, "grad_norm": 0.2576782703399658, "learning_rate": 6.796984148249295e-05, "loss": 0.0456, "step": 7450 }, { "epoch": 7.6828012358393405, "grad_norm": 0.4437432885169983, "learning_rate": 6.788408937406307e-05, "loss": 0.0434, "step": 7460 }, { "epoch": 7.693099897013388, "grad_norm": 0.3884623050689697, "learning_rate": 6.77982769021581e-05, "loss": 0.0433, "step": 7470 }, { "epoch": 7.703398558187436, "grad_norm": 0.30564385652542114, "learning_rate": 6.771240435641754e-05, "loss": 0.0419, "step": 7480 }, { "epoch": 7.7136972193614834, "grad_norm": 0.29946035146713257, "learning_rate": 6.762647202668366e-05, "loss": 0.0481, "step": 7490 }, { "epoch": 7.72399588053553, "grad_norm": 0.270355761051178, "learning_rate": 6.754048020300054e-05, "loss": 0.0432, "step": 7500 }, { "epoch": 7.734294541709578, "grad_norm": 0.3664805293083191, "learning_rate": 6.745442917561309e-05, "loss": 0.0379, "step": 7510 }, { "epoch": 7.7445932028836255, "grad_norm": 0.788110077381134, "learning_rate": 6.736831923496596e-05, "loss": 0.0521, "step": 7520 }, { "epoch": 7.754891864057672, "grad_norm": 0.46117472648620605, "learning_rate": 6.728215067170273e-05, "loss": 0.0487, "step": 7530 }, { "epoch": 7.76519052523172, "grad_norm": 0.18957702815532684, "learning_rate": 6.719592377666483e-05, "loss": 0.0479, "step": 7540 }, { "epoch": 7.775489186405768, "grad_norm": 0.4086840748786926, "learning_rate": 6.710963884089054e-05, "loss": 0.0426, "step": 7550 }, { "epoch": 7.785787847579814, "grad_norm": 0.21845366060733795, "learning_rate": 6.70232961556141e-05, "loss": 0.0402, "step": 7560 }, { "epoch": 7.796086508753862, "grad_norm": 0.18775074183940887, "learning_rate": 6.693689601226458e-05, "loss": 0.04, "step": 7570 }, { "epoch": 7.80638516992791, "grad_norm": 0.30147698521614075, "learning_rate": 6.685043870246507e-05, "loss": 0.0434, "step": 7580 }, { "epoch": 7.816683831101956, "grad_norm": 0.366470068693161, "learning_rate": 6.676392451803161e-05, "loss": 0.0463, "step": 7590 }, { "epoch": 7.826982492276004, "grad_norm": 0.3885975778102875, "learning_rate": 6.667735375097214e-05, "loss": 0.0453, "step": 7600 }, { "epoch": 7.837281153450052, "grad_norm": 0.29683852195739746, "learning_rate": 6.659072669348564e-05, "loss": 0.0419, "step": 7610 }, { "epoch": 7.8475798146240985, "grad_norm": 0.29188981652259827, "learning_rate": 6.650404363796108e-05, "loss": 0.0371, "step": 7620 }, { "epoch": 7.857878475798146, "grad_norm": 0.40961870551109314, "learning_rate": 6.641730487697639e-05, "loss": 0.0435, "step": 7630 }, { "epoch": 7.868177136972194, "grad_norm": 0.33139774203300476, "learning_rate": 6.633051070329759e-05, "loss": 0.0413, "step": 7640 }, { "epoch": 7.8784757981462405, "grad_norm": 0.28173500299453735, "learning_rate": 6.624366140987768e-05, "loss": 0.0452, "step": 7650 }, { "epoch": 7.888774459320288, "grad_norm": 0.2889021039009094, "learning_rate": 6.615675728985572e-05, "loss": 0.0423, "step": 7660 }, { "epoch": 7.899073120494336, "grad_norm": 0.6384182572364807, "learning_rate": 6.606979863655583e-05, "loss": 0.0379, "step": 7670 }, { "epoch": 7.9093717816683835, "grad_norm": 0.4132192134857178, "learning_rate": 6.598278574348619e-05, "loss": 0.0391, "step": 7680 }, { "epoch": 7.91967044284243, "grad_norm": 0.3432478606700897, "learning_rate": 6.589571890433803e-05, "loss": 0.0473, "step": 7690 }, { "epoch": 7.929969104016478, "grad_norm": 0.3030139207839966, "learning_rate": 6.580859841298471e-05, "loss": 0.0374, "step": 7700 }, { "epoch": 7.940267765190526, "grad_norm": 0.27307939529418945, "learning_rate": 6.572142456348065e-05, "loss": 0.0402, "step": 7710 }, { "epoch": 7.950566426364572, "grad_norm": 0.2667880952358246, "learning_rate": 6.563419765006038e-05, "loss": 0.0463, "step": 7720 }, { "epoch": 7.96086508753862, "grad_norm": 0.37028032541275024, "learning_rate": 6.55469179671375e-05, "loss": 0.038, "step": 7730 }, { "epoch": 7.971163748712668, "grad_norm": 0.3381376266479492, "learning_rate": 6.545958580930377e-05, "loss": 0.0455, "step": 7740 }, { "epoch": 7.981462409886714, "grad_norm": 0.28161460161209106, "learning_rate": 6.537220147132805e-05, "loss": 0.0396, "step": 7750 }, { "epoch": 7.991761071060762, "grad_norm": 0.26298263669013977, "learning_rate": 6.528476524815528e-05, "loss": 0.0424, "step": 7760 }, { "epoch": 8.002059732234809, "grad_norm": 0.2671511769294739, "learning_rate": 6.519727743490561e-05, "loss": 0.0384, "step": 7770 }, { "epoch": 8.012358393408856, "grad_norm": 0.3101862967014313, "learning_rate": 6.510973832687323e-05, "loss": 0.0465, "step": 7780 }, { "epoch": 8.022657054582904, "grad_norm": 0.3037969768047333, "learning_rate": 6.502214821952555e-05, "loss": 0.0473, "step": 7790 }, { "epoch": 8.032955715756952, "grad_norm": 0.45323264598846436, "learning_rate": 6.493450740850203e-05, "loss": 0.0432, "step": 7800 }, { "epoch": 8.043254376931, "grad_norm": 0.41797924041748047, "learning_rate": 6.484681618961331e-05, "loss": 0.048, "step": 7810 }, { "epoch": 8.053553038105047, "grad_norm": 0.4865727424621582, "learning_rate": 6.47590748588402e-05, "loss": 0.0512, "step": 7820 }, { "epoch": 8.063851699279093, "grad_norm": 0.3105076849460602, "learning_rate": 6.46712837123326e-05, "loss": 0.0448, "step": 7830 }, { "epoch": 8.07415036045314, "grad_norm": 0.25625815987586975, "learning_rate": 6.458344304640858e-05, "loss": 0.0416, "step": 7840 }, { "epoch": 8.084449021627188, "grad_norm": 0.31119033694267273, "learning_rate": 6.449555315755333e-05, "loss": 0.041, "step": 7850 }, { "epoch": 8.094747682801236, "grad_norm": 0.39366838335990906, "learning_rate": 6.440761434241821e-05, "loss": 0.0404, "step": 7860 }, { "epoch": 8.105046343975284, "grad_norm": 0.31691083312034607, "learning_rate": 6.431962689781969e-05, "loss": 0.0392, "step": 7870 }, { "epoch": 8.115345005149331, "grad_norm": 0.23836584389209747, "learning_rate": 6.423159112073838e-05, "loss": 0.0455, "step": 7880 }, { "epoch": 8.125643666323377, "grad_norm": 0.2766348719596863, "learning_rate": 6.414350730831805e-05, "loss": 0.0405, "step": 7890 }, { "epoch": 8.135942327497425, "grad_norm": 0.3610820174217224, "learning_rate": 6.405537575786456e-05, "loss": 0.0459, "step": 7900 }, { "epoch": 8.146240988671472, "grad_norm": 0.4069831669330597, "learning_rate": 6.396719676684494e-05, "loss": 0.0449, "step": 7910 }, { "epoch": 8.15653964984552, "grad_norm": 0.38294172286987305, "learning_rate": 6.387897063288635e-05, "loss": 0.0495, "step": 7920 }, { "epoch": 8.166838311019568, "grad_norm": 0.3302978575229645, "learning_rate": 6.3790697653775e-05, "loss": 0.0453, "step": 7930 }, { "epoch": 8.177136972193615, "grad_norm": 0.26982101798057556, "learning_rate": 6.37023781274553e-05, "loss": 0.0463, "step": 7940 }, { "epoch": 8.187435633367663, "grad_norm": 0.23370954394340515, "learning_rate": 6.361401235202872e-05, "loss": 0.0465, "step": 7950 }, { "epoch": 8.197734294541709, "grad_norm": 0.3092534840106964, "learning_rate": 6.352560062575284e-05, "loss": 0.055, "step": 7960 }, { "epoch": 8.208032955715757, "grad_norm": 0.36051103472709656, "learning_rate": 6.343714324704034e-05, "loss": 0.0551, "step": 7970 }, { "epoch": 8.218331616889804, "grad_norm": 0.33508798480033875, "learning_rate": 6.3348640514458e-05, "loss": 0.0462, "step": 7980 }, { "epoch": 8.228630278063852, "grad_norm": 0.9673136472702026, "learning_rate": 6.326009272672564e-05, "loss": 0.0442, "step": 7990 }, { "epoch": 8.2389289392379, "grad_norm": 1.469125509262085, "learning_rate": 6.317150018271522e-05, "loss": 0.0465, "step": 8000 }, { "epoch": 8.249227600411947, "grad_norm": 0.3022879660129547, "learning_rate": 6.308286318144971e-05, "loss": 0.052, "step": 8010 }, { "epoch": 8.259526261585993, "grad_norm": 0.240738183259964, "learning_rate": 6.299418202210214e-05, "loss": 0.044, "step": 8020 }, { "epoch": 8.26982492276004, "grad_norm": 0.3125, "learning_rate": 6.290545700399462e-05, "loss": 0.0413, "step": 8030 }, { "epoch": 8.280123583934088, "grad_norm": 0.3256394565105438, "learning_rate": 6.281668842659725e-05, "loss": 0.0381, "step": 8040 }, { "epoch": 8.290422245108136, "grad_norm": 0.3764393925666809, "learning_rate": 6.27278765895272e-05, "loss": 0.0412, "step": 8050 }, { "epoch": 8.300720906282184, "grad_norm": 0.28021517395973206, "learning_rate": 6.263902179254762e-05, "loss": 0.0392, "step": 8060 }, { "epoch": 8.311019567456231, "grad_norm": 0.3545322120189667, "learning_rate": 6.255012433556665e-05, "loss": 0.039, "step": 8070 }, { "epoch": 8.321318228630279, "grad_norm": 0.33872804045677185, "learning_rate": 6.246118451863646e-05, "loss": 0.0417, "step": 8080 }, { "epoch": 8.331616889804325, "grad_norm": 0.9136466383934021, "learning_rate": 6.237220264195216e-05, "loss": 0.0429, "step": 8090 }, { "epoch": 8.341915550978372, "grad_norm": 0.31747815012931824, "learning_rate": 6.228317900585083e-05, "loss": 0.0425, "step": 8100 }, { "epoch": 8.35221421215242, "grad_norm": 0.3648073375225067, "learning_rate": 6.219411391081055e-05, "loss": 0.0384, "step": 8110 }, { "epoch": 8.362512873326468, "grad_norm": 0.26562437415122986, "learning_rate": 6.210500765744925e-05, "loss": 0.036, "step": 8120 }, { "epoch": 8.372811534500515, "grad_norm": 0.2761411666870117, "learning_rate": 6.201586054652379e-05, "loss": 0.0466, "step": 8130 }, { "epoch": 8.383110195674563, "grad_norm": 0.46033117175102234, "learning_rate": 6.192667287892905e-05, "loss": 0.0432, "step": 8140 }, { "epoch": 8.393408856848609, "grad_norm": 0.3292730450630188, "learning_rate": 6.183744495569666e-05, "loss": 0.0426, "step": 8150 }, { "epoch": 8.403707518022657, "grad_norm": 0.2943620979785919, "learning_rate": 6.174817707799417e-05, "loss": 0.0483, "step": 8160 }, { "epoch": 8.414006179196704, "grad_norm": 0.3903990685939789, "learning_rate": 6.165886954712401e-05, "loss": 0.043, "step": 8170 }, { "epoch": 8.424304840370752, "grad_norm": 0.41772767901420593, "learning_rate": 6.156952266452247e-05, "loss": 0.0407, "step": 8180 }, { "epoch": 8.4346035015448, "grad_norm": 0.5899285078048706, "learning_rate": 6.148013673175857e-05, "loss": 0.0434, "step": 8190 }, { "epoch": 8.444902162718847, "grad_norm": 0.22386884689331055, "learning_rate": 6.13907120505332e-05, "loss": 0.042, "step": 8200 }, { "epoch": 8.455200823892893, "grad_norm": 0.3034772276878357, "learning_rate": 6.130124892267806e-05, "loss": 0.0365, "step": 8210 }, { "epoch": 8.46549948506694, "grad_norm": 0.37777379155158997, "learning_rate": 6.121174765015455e-05, "loss": 0.0419, "step": 8220 }, { "epoch": 8.475798146240988, "grad_norm": 0.30282172560691833, "learning_rate": 6.112220853505288e-05, "loss": 0.0418, "step": 8230 }, { "epoch": 8.486096807415036, "grad_norm": 0.5801701545715332, "learning_rate": 6.103263187959095e-05, "loss": 0.049, "step": 8240 }, { "epoch": 8.496395468589084, "grad_norm": 0.32179057598114014, "learning_rate": 6.094301798611338e-05, "loss": 0.0396, "step": 8250 }, { "epoch": 8.506694129763131, "grad_norm": 0.2766133248806, "learning_rate": 6.085336715709049e-05, "loss": 0.0484, "step": 8260 }, { "epoch": 8.516992790937179, "grad_norm": 0.2891679108142853, "learning_rate": 6.076367969511725e-05, "loss": 0.0483, "step": 8270 }, { "epoch": 8.527291452111225, "grad_norm": 0.35707661509513855, "learning_rate": 6.067395590291226e-05, "loss": 0.0468, "step": 8280 }, { "epoch": 8.537590113285273, "grad_norm": 0.29469162225723267, "learning_rate": 6.0584196083316794e-05, "loss": 0.0441, "step": 8290 }, { "epoch": 8.54788877445932, "grad_norm": 0.29220518469810486, "learning_rate": 6.0494400539293675e-05, "loss": 0.0389, "step": 8300 }, { "epoch": 8.558187435633368, "grad_norm": 0.3941989243030548, "learning_rate": 6.040456957392635e-05, "loss": 0.0389, "step": 8310 }, { "epoch": 8.568486096807415, "grad_norm": 0.2707824409008026, "learning_rate": 6.03147034904178e-05, "loss": 0.0471, "step": 8320 }, { "epoch": 8.578784757981463, "grad_norm": 0.35828855633735657, "learning_rate": 6.0224802592089513e-05, "loss": 0.0453, "step": 8330 }, { "epoch": 8.589083419155509, "grad_norm": 0.2687852382659912, "learning_rate": 6.013486718238055e-05, "loss": 0.041, "step": 8340 }, { "epoch": 8.599382080329557, "grad_norm": 0.25436437129974365, "learning_rate": 6.004489756484641e-05, "loss": 0.0411, "step": 8350 }, { "epoch": 8.609680741503604, "grad_norm": 0.22475087642669678, "learning_rate": 5.995489404315806e-05, "loss": 0.0409, "step": 8360 }, { "epoch": 8.619979402677652, "grad_norm": 0.32723718881607056, "learning_rate": 5.98648569211009e-05, "loss": 0.0477, "step": 8370 }, { "epoch": 8.6302780638517, "grad_norm": 0.2676869034767151, "learning_rate": 5.977478650257374e-05, "loss": 0.0363, "step": 8380 }, { "epoch": 8.640576725025747, "grad_norm": 0.6640805006027222, "learning_rate": 5.9684683091587804e-05, "loss": 0.0396, "step": 8390 }, { "epoch": 8.650875386199793, "grad_norm": 0.29109275341033936, "learning_rate": 5.959454699226562e-05, "loss": 0.0452, "step": 8400 }, { "epoch": 8.66117404737384, "grad_norm": 0.39319050312042236, "learning_rate": 5.95043785088401e-05, "loss": 0.0359, "step": 8410 }, { "epoch": 8.671472708547888, "grad_norm": 0.2134009450674057, "learning_rate": 5.941417794565343e-05, "loss": 0.0387, "step": 8420 }, { "epoch": 8.681771369721936, "grad_norm": 0.21827584505081177, "learning_rate": 5.9323945607156076e-05, "loss": 0.0382, "step": 8430 }, { "epoch": 8.692070030895984, "grad_norm": 0.41963616013526917, "learning_rate": 5.9233681797905785e-05, "loss": 0.0404, "step": 8440 }, { "epoch": 8.702368692070031, "grad_norm": 0.21744829416275024, "learning_rate": 5.914338682256647e-05, "loss": 0.0437, "step": 8450 }, { "epoch": 8.712667353244079, "grad_norm": 0.27720943093299866, "learning_rate": 5.905306098590728e-05, "loss": 0.0403, "step": 8460 }, { "epoch": 8.722966014418125, "grad_norm": 0.30195143818855286, "learning_rate": 5.896270459280153e-05, "loss": 0.0374, "step": 8470 }, { "epoch": 8.733264675592173, "grad_norm": 0.32989758253097534, "learning_rate": 5.8872317948225644e-05, "loss": 0.0368, "step": 8480 }, { "epoch": 8.74356333676622, "grad_norm": 0.22078627347946167, "learning_rate": 5.8781901357258165e-05, "loss": 0.0467, "step": 8490 }, { "epoch": 8.753861997940268, "grad_norm": 0.5876451134681702, "learning_rate": 5.869145512507872e-05, "loss": 0.0407, "step": 8500 }, { "epoch": 8.764160659114316, "grad_norm": 0.44796323776245117, "learning_rate": 5.860097955696698e-05, "loss": 0.0382, "step": 8510 }, { "epoch": 8.774459320288363, "grad_norm": 0.35779476165771484, "learning_rate": 5.851047495830163e-05, "loss": 0.0438, "step": 8520 }, { "epoch": 8.784757981462409, "grad_norm": 0.28585049510002136, "learning_rate": 5.841994163455934e-05, "loss": 0.0376, "step": 8530 }, { "epoch": 8.795056642636457, "grad_norm": 0.26791223883628845, "learning_rate": 5.832937989131374e-05, "loss": 0.0387, "step": 8540 }, { "epoch": 8.805355303810504, "grad_norm": 0.5671482086181641, "learning_rate": 5.823879003423438e-05, "loss": 0.0366, "step": 8550 }, { "epoch": 8.815653964984552, "grad_norm": 0.1565544456243515, "learning_rate": 5.8148172369085686e-05, "loss": 0.0369, "step": 8560 }, { "epoch": 8.8259526261586, "grad_norm": 0.46639129519462585, "learning_rate": 5.8057527201725984e-05, "loss": 0.0398, "step": 8570 }, { "epoch": 8.836251287332647, "grad_norm": 0.8469918370246887, "learning_rate": 5.796685483810637e-05, "loss": 0.047, "step": 8580 }, { "epoch": 8.846549948506695, "grad_norm": 0.1878482550382614, "learning_rate": 5.7876155584269785e-05, "loss": 0.0386, "step": 8590 }, { "epoch": 8.85684860968074, "grad_norm": 0.26714402437210083, "learning_rate": 5.7785429746349905e-05, "loss": 0.049, "step": 8600 }, { "epoch": 8.867147270854788, "grad_norm": 0.35005736351013184, "learning_rate": 5.7694677630570146e-05, "loss": 0.0435, "step": 8610 }, { "epoch": 8.877445932028836, "grad_norm": 0.48994550108909607, "learning_rate": 5.760389954324261e-05, "loss": 0.049, "step": 8620 }, { "epoch": 8.887744593202884, "grad_norm": 0.24901621043682098, "learning_rate": 5.7513095790767066e-05, "loss": 0.0445, "step": 8630 }, { "epoch": 8.898043254376931, "grad_norm": 0.32309484481811523, "learning_rate": 5.742226667962991e-05, "loss": 0.0471, "step": 8640 }, { "epoch": 8.908341915550979, "grad_norm": 0.30904820561408997, "learning_rate": 5.733141251640315e-05, "loss": 0.0377, "step": 8650 }, { "epoch": 8.918640576725025, "grad_norm": 0.30617690086364746, "learning_rate": 5.724053360774327e-05, "loss": 0.0378, "step": 8660 }, { "epoch": 8.928939237899073, "grad_norm": 0.19513899087905884, "learning_rate": 5.7149630260390384e-05, "loss": 0.0315, "step": 8670 }, { "epoch": 8.93923789907312, "grad_norm": 0.5502423048019409, "learning_rate": 5.705870278116703e-05, "loss": 0.0422, "step": 8680 }, { "epoch": 8.949536560247168, "grad_norm": 0.3435225486755371, "learning_rate": 5.6967751476977215e-05, "loss": 0.0406, "step": 8690 }, { "epoch": 8.959835221421216, "grad_norm": 0.28045403957366943, "learning_rate": 5.687677665480533e-05, "loss": 0.0473, "step": 8700 }, { "epoch": 8.970133882595263, "grad_norm": 0.2749752700328827, "learning_rate": 5.6785778621715225e-05, "loss": 0.0378, "step": 8710 }, { "epoch": 8.98043254376931, "grad_norm": 0.39981475472450256, "learning_rate": 5.669475768484901e-05, "loss": 0.0406, "step": 8720 }, { "epoch": 8.990731204943357, "grad_norm": 0.28953787684440613, "learning_rate": 5.660371415142611e-05, "loss": 0.0379, "step": 8730 }, { "epoch": 9.001029866117404, "grad_norm": 0.17452044785022736, "learning_rate": 5.65126483287423e-05, "loss": 0.0412, "step": 8740 }, { "epoch": 9.011328527291452, "grad_norm": 0.3600793182849884, "learning_rate": 5.642156052416849e-05, "loss": 0.041, "step": 8750 }, { "epoch": 9.0216271884655, "grad_norm": 0.2760295569896698, "learning_rate": 5.633045104514982e-05, "loss": 0.0435, "step": 8760 }, { "epoch": 9.031925849639547, "grad_norm": 0.3825409710407257, "learning_rate": 5.6239320199204616e-05, "loss": 0.0408, "step": 8770 }, { "epoch": 9.042224510813595, "grad_norm": 0.374891072511673, "learning_rate": 5.614816829392328e-05, "loss": 0.0383, "step": 8780 }, { "epoch": 9.052523171987641, "grad_norm": 0.27747559547424316, "learning_rate": 5.60569956369673e-05, "loss": 0.0464, "step": 8790 }, { "epoch": 9.062821833161689, "grad_norm": 0.28678062558174133, "learning_rate": 5.596580253606824e-05, "loss": 0.0487, "step": 8800 }, { "epoch": 9.073120494335736, "grad_norm": 0.4970363676548004, "learning_rate": 5.587458929902664e-05, "loss": 0.051, "step": 8810 }, { "epoch": 9.083419155509784, "grad_norm": 0.30037108063697815, "learning_rate": 5.5783356233711005e-05, "loss": 0.0383, "step": 8820 }, { "epoch": 9.093717816683832, "grad_norm": 0.2640860676765442, "learning_rate": 5.569210364805677e-05, "loss": 0.0462, "step": 8830 }, { "epoch": 9.10401647785788, "grad_norm": 0.30006083846092224, "learning_rate": 5.5600831850065274e-05, "loss": 0.0362, "step": 8840 }, { "epoch": 9.114315139031925, "grad_norm": 0.3721349537372589, "learning_rate": 5.550954114780269e-05, "loss": 0.0399, "step": 8850 }, { "epoch": 9.124613800205973, "grad_norm": 0.336732417345047, "learning_rate": 5.541823184939896e-05, "loss": 0.0421, "step": 8860 }, { "epoch": 9.13491246138002, "grad_norm": 0.26279309391975403, "learning_rate": 5.532690426304685e-05, "loss": 0.0433, "step": 8870 }, { "epoch": 9.145211122554068, "grad_norm": 0.2945043742656708, "learning_rate": 5.5235558697000836e-05, "loss": 0.0439, "step": 8880 }, { "epoch": 9.155509783728116, "grad_norm": 0.47877517342567444, "learning_rate": 5.514419545957606e-05, "loss": 0.0431, "step": 8890 }, { "epoch": 9.165808444902163, "grad_norm": 0.3854601979255676, "learning_rate": 5.5052814859147315e-05, "loss": 0.0365, "step": 8900 }, { "epoch": 9.176107106076211, "grad_norm": 0.3006962835788727, "learning_rate": 5.496141720414804e-05, "loss": 0.0427, "step": 8910 }, { "epoch": 9.186405767250257, "grad_norm": 0.5065596699714661, "learning_rate": 5.487000280306917e-05, "loss": 0.0395, "step": 8920 }, { "epoch": 9.196704428424304, "grad_norm": 0.4032178521156311, "learning_rate": 5.4778571964458214e-05, "loss": 0.0341, "step": 8930 }, { "epoch": 9.207003089598352, "grad_norm": 0.357695609331131, "learning_rate": 5.468712499691816e-05, "loss": 0.0427, "step": 8940 }, { "epoch": 9.2173017507724, "grad_norm": 0.6212796568870544, "learning_rate": 5.45956622091064e-05, "loss": 0.0444, "step": 8950 }, { "epoch": 9.227600411946447, "grad_norm": 0.29458391666412354, "learning_rate": 5.4504183909733734e-05, "loss": 0.0402, "step": 8960 }, { "epoch": 9.237899073120495, "grad_norm": 0.309467613697052, "learning_rate": 5.441269040756334e-05, "loss": 0.0412, "step": 8970 }, { "epoch": 9.248197734294541, "grad_norm": 0.17707674205303192, "learning_rate": 5.43211820114097e-05, "loss": 0.0423, "step": 8980 }, { "epoch": 9.258496395468589, "grad_norm": 0.4098307490348816, "learning_rate": 5.422965903013757e-05, "loss": 0.0421, "step": 8990 }, { "epoch": 9.268795056642636, "grad_norm": 0.31290164589881897, "learning_rate": 5.41381217726609e-05, "loss": 0.0402, "step": 9000 }, { "epoch": 9.279093717816684, "grad_norm": 0.20957662165164948, "learning_rate": 5.404657054794189e-05, "loss": 0.0426, "step": 9010 }, { "epoch": 9.289392378990732, "grad_norm": 0.2308698147535324, "learning_rate": 5.3955005664989834e-05, "loss": 0.0389, "step": 9020 }, { "epoch": 9.29969104016478, "grad_norm": 0.2409774512052536, "learning_rate": 5.3863427432860125e-05, "loss": 0.0352, "step": 9030 }, { "epoch": 9.309989701338825, "grad_norm": 0.24483443796634674, "learning_rate": 5.3771836160653254e-05, "loss": 0.0406, "step": 9040 }, { "epoch": 9.320288362512873, "grad_norm": 0.2869531810283661, "learning_rate": 5.368023215751369e-05, "loss": 0.0379, "step": 9050 }, { "epoch": 9.33058702368692, "grad_norm": 0.27807915210723877, "learning_rate": 5.3588615732628854e-05, "loss": 0.0451, "step": 9060 }, { "epoch": 9.340885684860968, "grad_norm": 0.33199331164360046, "learning_rate": 5.3496987195228156e-05, "loss": 0.034, "step": 9070 }, { "epoch": 9.351184346035016, "grad_norm": 0.2562348246574402, "learning_rate": 5.340534685458185e-05, "loss": 0.0413, "step": 9080 }, { "epoch": 9.361483007209063, "grad_norm": 0.3097791075706482, "learning_rate": 5.3313695020000024e-05, "loss": 0.039, "step": 9090 }, { "epoch": 9.371781668383111, "grad_norm": 0.3079645037651062, "learning_rate": 5.322203200083154e-05, "loss": 0.0349, "step": 9100 }, { "epoch": 9.382080329557157, "grad_norm": 0.4117037057876587, "learning_rate": 5.3130358106463104e-05, "loss": 0.0407, "step": 9110 }, { "epoch": 9.392378990731205, "grad_norm": 0.4133201539516449, "learning_rate": 5.303867364631804e-05, "loss": 0.045, "step": 9120 }, { "epoch": 9.402677651905252, "grad_norm": 0.2096584141254425, "learning_rate": 5.294697892985534e-05, "loss": 0.0335, "step": 9130 }, { "epoch": 9.4129763130793, "grad_norm": 0.28559908270835876, "learning_rate": 5.285527426656865e-05, "loss": 0.0398, "step": 9140 }, { "epoch": 9.423274974253347, "grad_norm": 0.3598606288433075, "learning_rate": 5.2763559965985184e-05, "loss": 0.0419, "step": 9150 }, { "epoch": 9.433573635427395, "grad_norm": 0.35209372639656067, "learning_rate": 5.2671836337664634e-05, "loss": 0.0405, "step": 9160 }, { "epoch": 9.443872296601441, "grad_norm": 0.23415158689022064, "learning_rate": 5.2580103691198255e-05, "loss": 0.0366, "step": 9170 }, { "epoch": 9.454170957775489, "grad_norm": 0.2906668484210968, "learning_rate": 5.24883623362077e-05, "loss": 0.0493, "step": 9180 }, { "epoch": 9.464469618949536, "grad_norm": 0.21137650310993195, "learning_rate": 5.2396612582343986e-05, "loss": 0.0423, "step": 9190 }, { "epoch": 9.474768280123584, "grad_norm": 0.23499812185764313, "learning_rate": 5.230485473928651e-05, "loss": 0.0416, "step": 9200 }, { "epoch": 9.485066941297632, "grad_norm": 0.372158020734787, "learning_rate": 5.221308911674201e-05, "loss": 0.0407, "step": 9210 }, { "epoch": 9.49536560247168, "grad_norm": 0.2552221119403839, "learning_rate": 5.2121316024443415e-05, "loss": 0.0408, "step": 9220 }, { "epoch": 9.505664263645727, "grad_norm": 0.27116450667381287, "learning_rate": 5.202953577214889e-05, "loss": 0.0375, "step": 9230 }, { "epoch": 9.515962924819773, "grad_norm": 1.0216639041900635, "learning_rate": 5.1937748669640776e-05, "loss": 0.0412, "step": 9240 }, { "epoch": 9.52626158599382, "grad_norm": 0.39132076501846313, "learning_rate": 5.1845955026724535e-05, "loss": 0.0408, "step": 9250 }, { "epoch": 9.536560247167868, "grad_norm": 0.3046022653579712, "learning_rate": 5.175415515322768e-05, "loss": 0.0349, "step": 9260 }, { "epoch": 9.546858908341916, "grad_norm": 0.5317039489746094, "learning_rate": 5.1662349358998796e-05, "loss": 0.0377, "step": 9270 }, { "epoch": 9.557157569515963, "grad_norm": 0.308902382850647, "learning_rate": 5.157053795390642e-05, "loss": 0.0416, "step": 9280 }, { "epoch": 9.567456230690011, "grad_norm": 0.1709175854921341, "learning_rate": 5.147872124783805e-05, "loss": 0.0367, "step": 9290 }, { "epoch": 9.577754891864057, "grad_norm": 0.35447025299072266, "learning_rate": 5.138689955069902e-05, "loss": 0.0339, "step": 9300 }, { "epoch": 9.588053553038105, "grad_norm": 0.20557384192943573, "learning_rate": 5.12950731724116e-05, "loss": 0.0435, "step": 9310 }, { "epoch": 9.598352214212152, "grad_norm": 0.27278539538383484, "learning_rate": 5.12032424229138e-05, "loss": 0.0399, "step": 9320 }, { "epoch": 9.6086508753862, "grad_norm": 0.3033859133720398, "learning_rate": 5.111140761215839e-05, "loss": 0.0376, "step": 9330 }, { "epoch": 9.618949536560248, "grad_norm": 0.3543021082878113, "learning_rate": 5.101956905011185e-05, "loss": 0.0427, "step": 9340 }, { "epoch": 9.629248197734295, "grad_norm": 0.2944181561470032, "learning_rate": 5.0927727046753336e-05, "loss": 0.0371, "step": 9350 }, { "epoch": 9.639546858908343, "grad_norm": 0.3597414493560791, "learning_rate": 5.08358819120736e-05, "loss": 0.0373, "step": 9360 }, { "epoch": 9.649845520082389, "grad_norm": 0.33194977045059204, "learning_rate": 5.074403395607399e-05, "loss": 0.0424, "step": 9370 }, { "epoch": 9.660144181256436, "grad_norm": 0.21433711051940918, "learning_rate": 5.0652183488765335e-05, "loss": 0.0407, "step": 9380 }, { "epoch": 9.670442842430484, "grad_norm": 0.3961849808692932, "learning_rate": 5.056033082016699e-05, "loss": 0.0419, "step": 9390 }, { "epoch": 9.680741503604532, "grad_norm": 0.9774559140205383, "learning_rate": 5.046847626030569e-05, "loss": 0.041, "step": 9400 }, { "epoch": 9.69104016477858, "grad_norm": 0.36883220076560974, "learning_rate": 5.037662011921459e-05, "loss": 0.0377, "step": 9410 }, { "epoch": 9.701338825952627, "grad_norm": 0.37542909383773804, "learning_rate": 5.028476270693217e-05, "loss": 0.0408, "step": 9420 }, { "epoch": 9.711637487126673, "grad_norm": 0.45353376865386963, "learning_rate": 5.0192904333501214e-05, "loss": 0.0419, "step": 9430 }, { "epoch": 9.72193614830072, "grad_norm": 0.27116161584854126, "learning_rate": 5.010104530896771e-05, "loss": 0.0447, "step": 9440 }, { "epoch": 9.732234809474768, "grad_norm": 0.26916906237602234, "learning_rate": 5.000918594337989e-05, "loss": 0.0461, "step": 9450 }, { "epoch": 9.742533470648816, "grad_norm": 0.3069358766078949, "learning_rate": 4.991732654678709e-05, "loss": 0.0458, "step": 9460 }, { "epoch": 9.752832131822863, "grad_norm": 0.42274564504623413, "learning_rate": 4.9825467429238834e-05, "loss": 0.0401, "step": 9470 }, { "epoch": 9.763130792996911, "grad_norm": 0.17982327938079834, "learning_rate": 4.973360890078358e-05, "loss": 0.0427, "step": 9480 }, { "epoch": 9.773429454170957, "grad_norm": 0.23251447081565857, "learning_rate": 4.96417512714679e-05, "loss": 0.0326, "step": 9490 }, { "epoch": 9.783728115345005, "grad_norm": 0.2869229018688202, "learning_rate": 4.954989485133533e-05, "loss": 0.0507, "step": 9500 }, { "epoch": 9.794026776519052, "grad_norm": 1.0959696769714355, "learning_rate": 4.9458039950425224e-05, "loss": 0.0518, "step": 9510 }, { "epoch": 9.8043254376931, "grad_norm": 0.3641543686389923, "learning_rate": 4.9366186878771926e-05, "loss": 0.0434, "step": 9520 }, { "epoch": 9.814624098867148, "grad_norm": 0.5896167159080505, "learning_rate": 4.927433594640354e-05, "loss": 0.0409, "step": 9530 }, { "epoch": 9.824922760041195, "grad_norm": 0.24302540719509125, "learning_rate": 4.918248746334096e-05, "loss": 0.0451, "step": 9540 }, { "epoch": 9.835221421215241, "grad_norm": 0.2889201045036316, "learning_rate": 4.909064173959681e-05, "loss": 0.0384, "step": 9550 }, { "epoch": 9.845520082389289, "grad_norm": 0.37873101234436035, "learning_rate": 4.8998799085174455e-05, "loss": 0.0404, "step": 9560 }, { "epoch": 9.855818743563336, "grad_norm": 0.4369457960128784, "learning_rate": 4.89069598100668e-05, "loss": 0.0431, "step": 9570 }, { "epoch": 9.866117404737384, "grad_norm": 0.37580832839012146, "learning_rate": 4.881512422425541e-05, "loss": 0.044, "step": 9580 }, { "epoch": 9.876416065911432, "grad_norm": 0.46920913457870483, "learning_rate": 4.872329263770942e-05, "loss": 0.0469, "step": 9590 }, { "epoch": 9.88671472708548, "grad_norm": 0.24571798741817474, "learning_rate": 4.8631465360384385e-05, "loss": 0.0398, "step": 9600 }, { "epoch": 9.897013388259527, "grad_norm": 0.3728749454021454, "learning_rate": 4.85396427022214e-05, "loss": 0.0352, "step": 9610 }, { "epoch": 9.907312049433573, "grad_norm": 0.301878958940506, "learning_rate": 4.844782497314591e-05, "loss": 0.0432, "step": 9620 }, { "epoch": 9.91761071060762, "grad_norm": 0.26632949709892273, "learning_rate": 4.835601248306675e-05, "loss": 0.0439, "step": 9630 }, { "epoch": 9.927909371781668, "grad_norm": 0.31497064232826233, "learning_rate": 4.826420554187506e-05, "loss": 0.0399, "step": 9640 }, { "epoch": 9.938208032955716, "grad_norm": 0.26114657521247864, "learning_rate": 4.817240445944327e-05, "loss": 0.0408, "step": 9650 }, { "epoch": 9.948506694129764, "grad_norm": 0.2729547619819641, "learning_rate": 4.8080609545624004e-05, "loss": 0.0392, "step": 9660 }, { "epoch": 9.958805355303811, "grad_norm": 0.22712601721286774, "learning_rate": 4.798882111024912e-05, "loss": 0.0363, "step": 9670 }, { "epoch": 9.969104016477857, "grad_norm": 0.47241315245628357, "learning_rate": 4.7897039463128524e-05, "loss": 0.0369, "step": 9680 }, { "epoch": 9.979402677651905, "grad_norm": 0.3929249048233032, "learning_rate": 4.780526491404929e-05, "loss": 0.0436, "step": 9690 }, { "epoch": 9.989701338825952, "grad_norm": 0.32324254512786865, "learning_rate": 4.771349777277452e-05, "loss": 0.0418, "step": 9700 }, { "epoch": 10.0, "grad_norm": 0.4991161525249481, "learning_rate": 4.762173834904225e-05, "loss": 0.0352, "step": 9710 }, { "epoch": 10.010298661174048, "grad_norm": 0.2615014612674713, "learning_rate": 4.752998695256455e-05, "loss": 0.0412, "step": 9720 }, { "epoch": 10.020597322348095, "grad_norm": 0.29027608036994934, "learning_rate": 4.743824389302635e-05, "loss": 0.035, "step": 9730 }, { "epoch": 10.030895983522143, "grad_norm": 0.3496328294277191, "learning_rate": 4.734650948008445e-05, "loss": 0.038, "step": 9740 }, { "epoch": 10.041194644696189, "grad_norm": 0.25003111362457275, "learning_rate": 4.7254784023366444e-05, "loss": 0.0408, "step": 9750 }, { "epoch": 10.051493305870236, "grad_norm": 0.28183093667030334, "learning_rate": 4.716306783246977e-05, "loss": 0.0415, "step": 9760 }, { "epoch": 10.061791967044284, "grad_norm": 0.3574424386024475, "learning_rate": 4.707136121696048e-05, "loss": 0.0394, "step": 9770 }, { "epoch": 10.072090628218332, "grad_norm": 0.2761897146701813, "learning_rate": 4.69796644863724e-05, "loss": 0.034, "step": 9780 }, { "epoch": 10.08238928939238, "grad_norm": 0.2602722644805908, "learning_rate": 4.688797795020597e-05, "loss": 0.0354, "step": 9790 }, { "epoch": 10.092687950566427, "grad_norm": 0.2515560984611511, "learning_rate": 4.6796301917927166e-05, "loss": 0.0402, "step": 9800 }, { "epoch": 10.102986611740473, "grad_norm": 0.24942000210285187, "learning_rate": 4.670463669896659e-05, "loss": 0.0406, "step": 9810 }, { "epoch": 10.11328527291452, "grad_norm": 0.29609471559524536, "learning_rate": 4.66129826027183e-05, "loss": 0.0397, "step": 9820 }, { "epoch": 10.123583934088568, "grad_norm": 0.3640936613082886, "learning_rate": 4.652133993853883e-05, "loss": 0.0456, "step": 9830 }, { "epoch": 10.133882595262616, "grad_norm": 0.2724517285823822, "learning_rate": 4.64297090157461e-05, "loss": 0.0371, "step": 9840 }, { "epoch": 10.144181256436664, "grad_norm": 0.33307430148124695, "learning_rate": 4.633809014361843e-05, "loss": 0.0438, "step": 9850 }, { "epoch": 10.154479917610711, "grad_norm": 0.45976462960243225, "learning_rate": 4.624648363139344e-05, "loss": 0.0479, "step": 9860 }, { "epoch": 10.164778578784759, "grad_norm": 0.24571570754051208, "learning_rate": 4.615488978826709e-05, "loss": 0.0375, "step": 9870 }, { "epoch": 10.175077239958805, "grad_norm": 0.4202505052089691, "learning_rate": 4.6063308923392485e-05, "loss": 0.0446, "step": 9880 }, { "epoch": 10.185375901132852, "grad_norm": 0.30180397629737854, "learning_rate": 4.5971741345879e-05, "loss": 0.0372, "step": 9890 }, { "epoch": 10.1956745623069, "grad_norm": 0.39542245864868164, "learning_rate": 4.588018736479115e-05, "loss": 0.0407, "step": 9900 }, { "epoch": 10.205973223480948, "grad_norm": 0.5576333403587341, "learning_rate": 4.5788647289147516e-05, "loss": 0.0372, "step": 9910 }, { "epoch": 10.216271884654995, "grad_norm": 0.2639693319797516, "learning_rate": 4.56971214279198e-05, "loss": 0.0463, "step": 9920 }, { "epoch": 10.226570545829043, "grad_norm": 0.26938265562057495, "learning_rate": 4.56056100900317e-05, "loss": 0.0367, "step": 9930 }, { "epoch": 10.236869207003089, "grad_norm": 0.27783456444740295, "learning_rate": 4.5514113584357873e-05, "loss": 0.0369, "step": 9940 }, { "epoch": 10.247167868177137, "grad_norm": 0.27680081129074097, "learning_rate": 4.542263221972295e-05, "loss": 0.0393, "step": 9950 }, { "epoch": 10.257466529351184, "grad_norm": 0.2161240130662918, "learning_rate": 4.5331166304900464e-05, "loss": 0.042, "step": 9960 }, { "epoch": 10.267765190525232, "grad_norm": 0.27455902099609375, "learning_rate": 4.5239716148611724e-05, "loss": 0.0434, "step": 9970 }, { "epoch": 10.27806385169928, "grad_norm": 0.3013168275356293, "learning_rate": 4.514828205952495e-05, "loss": 0.0395, "step": 9980 }, { "epoch": 10.288362512873327, "grad_norm": 0.2296813279390335, "learning_rate": 4.505686434625409e-05, "loss": 0.0368, "step": 9990 }, { "epoch": 10.298661174047373, "grad_norm": 0.19806218147277832, "learning_rate": 4.496546331735778e-05, "loss": 0.0391, "step": 10000 }, { "epoch": 10.30895983522142, "grad_norm": 0.24850870668888092, "learning_rate": 4.4874079281338416e-05, "loss": 0.0407, "step": 10010 }, { "epoch": 10.319258496395468, "grad_norm": 0.16531158983707428, "learning_rate": 4.478271254664097e-05, "loss": 0.0359, "step": 10020 }, { "epoch": 10.329557157569516, "grad_norm": 0.5394207835197449, "learning_rate": 4.469136342165207e-05, "loss": 0.0375, "step": 10030 }, { "epoch": 10.339855818743564, "grad_norm": 0.4204263687133789, "learning_rate": 4.460003221469886e-05, "loss": 0.042, "step": 10040 }, { "epoch": 10.350154479917611, "grad_norm": 2.313096284866333, "learning_rate": 4.450871923404806e-05, "loss": 0.0465, "step": 10050 }, { "epoch": 10.360453141091659, "grad_norm": 0.6360970735549927, "learning_rate": 4.441742478790481e-05, "loss": 0.0421, "step": 10060 }, { "epoch": 10.370751802265705, "grad_norm": 0.23286186158657074, "learning_rate": 4.432614918441175e-05, "loss": 0.0352, "step": 10070 }, { "epoch": 10.381050463439752, "grad_norm": 0.3724748194217682, "learning_rate": 4.4234892731647866e-05, "loss": 0.0434, "step": 10080 }, { "epoch": 10.3913491246138, "grad_norm": 0.212792307138443, "learning_rate": 4.414365573762755e-05, "loss": 0.0357, "step": 10090 }, { "epoch": 10.401647785787848, "grad_norm": 0.22442536056041718, "learning_rate": 4.4052438510299515e-05, "loss": 0.0398, "step": 10100 }, { "epoch": 10.411946446961895, "grad_norm": 0.3250674307346344, "learning_rate": 4.3961241357545706e-05, "loss": 0.0377, "step": 10110 }, { "epoch": 10.422245108135943, "grad_norm": 0.2997426986694336, "learning_rate": 4.387006458718037e-05, "loss": 0.0385, "step": 10120 }, { "epoch": 10.432543769309989, "grad_norm": 0.26953554153442383, "learning_rate": 4.377890850694893e-05, "loss": 0.0352, "step": 10130 }, { "epoch": 10.442842430484037, "grad_norm": 0.3824928402900696, "learning_rate": 4.368777342452697e-05, "loss": 0.038, "step": 10140 }, { "epoch": 10.453141091658084, "grad_norm": 0.33039042353630066, "learning_rate": 4.35966596475192e-05, "loss": 0.0354, "step": 10150 }, { "epoch": 10.463439752832132, "grad_norm": 0.665787935256958, "learning_rate": 4.3505567483458456e-05, "loss": 0.0393, "step": 10160 }, { "epoch": 10.47373841400618, "grad_norm": 0.25892671942710876, "learning_rate": 4.341449723980457e-05, "loss": 0.0403, "step": 10170 }, { "epoch": 10.484037075180227, "grad_norm": 0.8381480574607849, "learning_rate": 4.3323449223943416e-05, "loss": 0.0403, "step": 10180 }, { "epoch": 10.494335736354273, "grad_norm": 0.2520352303981781, "learning_rate": 4.323242374318586e-05, "loss": 0.0376, "step": 10190 }, { "epoch": 10.50463439752832, "grad_norm": 0.30395472049713135, "learning_rate": 4.314142110476666e-05, "loss": 0.039, "step": 10200 }, { "epoch": 10.514933058702368, "grad_norm": 0.2134946584701538, "learning_rate": 4.305044161584352e-05, "loss": 0.0356, "step": 10210 }, { "epoch": 10.525231719876416, "grad_norm": 0.30410531163215637, "learning_rate": 4.295948558349598e-05, "loss": 0.0399, "step": 10220 }, { "epoch": 10.535530381050464, "grad_norm": 0.3639879524707794, "learning_rate": 4.2868553314724425e-05, "loss": 0.0377, "step": 10230 }, { "epoch": 10.545829042224511, "grad_norm": 0.7833529114723206, "learning_rate": 4.2777645116449004e-05, "loss": 0.042, "step": 10240 }, { "epoch": 10.556127703398559, "grad_norm": 0.3496880829334259, "learning_rate": 4.268676129550869e-05, "loss": 0.043, "step": 10250 }, { "epoch": 10.566426364572605, "grad_norm": 0.24933426082134247, "learning_rate": 4.2595902158660074e-05, "loss": 0.0392, "step": 10260 }, { "epoch": 10.576725025746653, "grad_norm": 0.35013383626937866, "learning_rate": 4.250506801257653e-05, "loss": 0.0403, "step": 10270 }, { "epoch": 10.5870236869207, "grad_norm": 0.5155181884765625, "learning_rate": 4.241425916384699e-05, "loss": 0.0383, "step": 10280 }, { "epoch": 10.597322348094748, "grad_norm": 0.5019784569740295, "learning_rate": 4.2323475918975075e-05, "loss": 0.0412, "step": 10290 }, { "epoch": 10.607621009268795, "grad_norm": 0.38487544655799866, "learning_rate": 4.223271858437799e-05, "loss": 0.0377, "step": 10300 }, { "epoch": 10.617919670442843, "grad_norm": 0.2794114947319031, "learning_rate": 4.21419874663854e-05, "loss": 0.0398, "step": 10310 }, { "epoch": 10.628218331616889, "grad_norm": 0.1784840226173401, "learning_rate": 4.205128287123858e-05, "loss": 0.0375, "step": 10320 }, { "epoch": 10.638516992790937, "grad_norm": 0.19784130156040192, "learning_rate": 4.196060510508922e-05, "loss": 0.0329, "step": 10330 }, { "epoch": 10.648815653964984, "grad_norm": 0.25078096985816956, "learning_rate": 4.186995447399849e-05, "loss": 0.0305, "step": 10340 }, { "epoch": 10.659114315139032, "grad_norm": 0.2800082862377167, "learning_rate": 4.177933128393594e-05, "loss": 0.0386, "step": 10350 }, { "epoch": 10.66941297631308, "grad_norm": 0.2689889073371887, "learning_rate": 4.1688735840778546e-05, "loss": 0.0355, "step": 10360 }, { "epoch": 10.679711637487127, "grad_norm": 0.26448753476142883, "learning_rate": 4.159816845030957e-05, "loss": 0.0357, "step": 10370 }, { "epoch": 10.690010298661175, "grad_norm": 0.2718246579170227, "learning_rate": 4.1507629418217634e-05, "loss": 0.0339, "step": 10380 }, { "epoch": 10.70030895983522, "grad_norm": 0.2607558071613312, "learning_rate": 4.141711905009566e-05, "loss": 0.0397, "step": 10390 }, { "epoch": 10.710607621009268, "grad_norm": 0.324266254901886, "learning_rate": 4.132663765143975e-05, "loss": 0.0355, "step": 10400 }, { "epoch": 10.720906282183316, "grad_norm": 0.31110501289367676, "learning_rate": 4.1236185527648294e-05, "loss": 0.0389, "step": 10410 }, { "epoch": 10.731204943357364, "grad_norm": 0.3010208010673523, "learning_rate": 4.114576298402084e-05, "loss": 0.0384, "step": 10420 }, { "epoch": 10.741503604531411, "grad_norm": 0.42494192719459534, "learning_rate": 4.1055370325757106e-05, "loss": 0.0407, "step": 10430 }, { "epoch": 10.751802265705459, "grad_norm": 0.26597830653190613, "learning_rate": 4.096500785795591e-05, "loss": 0.0351, "step": 10440 }, { "epoch": 10.762100926879505, "grad_norm": 0.3270758092403412, "learning_rate": 4.087467588561424e-05, "loss": 0.0351, "step": 10450 }, { "epoch": 10.772399588053553, "grad_norm": 0.35372480750083923, "learning_rate": 4.0784374713626076e-05, "loss": 0.0431, "step": 10460 }, { "epoch": 10.7826982492276, "grad_norm": 0.3251330256462097, "learning_rate": 4.069410464678148e-05, "loss": 0.0352, "step": 10470 }, { "epoch": 10.792996910401648, "grad_norm": 0.26621249318122864, "learning_rate": 4.0603865989765504e-05, "loss": 0.0432, "step": 10480 }, { "epoch": 10.803295571575696, "grad_norm": 0.3128867745399475, "learning_rate": 4.05136590471572e-05, "loss": 0.0412, "step": 10490 }, { "epoch": 10.813594232749743, "grad_norm": 0.20734545588493347, "learning_rate": 4.042348412342861e-05, "loss": 0.0352, "step": 10500 }, { "epoch": 10.82389289392379, "grad_norm": 0.3195039629936218, "learning_rate": 4.0333341522943614e-05, "loss": 0.0374, "step": 10510 }, { "epoch": 10.834191555097837, "grad_norm": 0.27724260091781616, "learning_rate": 4.024323154995708e-05, "loss": 0.0405, "step": 10520 }, { "epoch": 10.844490216271884, "grad_norm": 0.2909531593322754, "learning_rate": 4.015315450861371e-05, "loss": 0.0364, "step": 10530 }, { "epoch": 10.854788877445932, "grad_norm": 0.28578925132751465, "learning_rate": 4.006311070294702e-05, "loss": 0.0354, "step": 10540 }, { "epoch": 10.86508753861998, "grad_norm": 0.2503175437450409, "learning_rate": 3.997310043687842e-05, "loss": 0.0348, "step": 10550 }, { "epoch": 10.875386199794027, "grad_norm": 0.36039701104164124, "learning_rate": 3.988312401421609e-05, "loss": 0.0414, "step": 10560 }, { "epoch": 10.885684860968075, "grad_norm": 0.45128464698791504, "learning_rate": 3.979318173865393e-05, "loss": 0.04, "step": 10570 }, { "epoch": 10.89598352214212, "grad_norm": 0.35974377393722534, "learning_rate": 3.970327391377064e-05, "loss": 0.0392, "step": 10580 }, { "epoch": 10.906282183316168, "grad_norm": 0.22907008230686188, "learning_rate": 3.9613400843028666e-05, "loss": 0.0342, "step": 10590 }, { "epoch": 10.916580844490216, "grad_norm": 0.3276582956314087, "learning_rate": 3.9523562829773036e-05, "loss": 0.043, "step": 10600 }, { "epoch": 10.926879505664264, "grad_norm": 0.27974191308021545, "learning_rate": 3.943376017723057e-05, "loss": 0.0357, "step": 10610 }, { "epoch": 10.937178166838311, "grad_norm": 0.3858673572540283, "learning_rate": 3.934399318850868e-05, "loss": 0.0369, "step": 10620 }, { "epoch": 10.947476828012359, "grad_norm": 0.29965823888778687, "learning_rate": 3.925426216659438e-05, "loss": 0.0369, "step": 10630 }, { "epoch": 10.957775489186405, "grad_norm": 0.3583829998970032, "learning_rate": 3.916456741435336e-05, "loss": 0.0425, "step": 10640 }, { "epoch": 10.968074150360453, "grad_norm": 0.27793335914611816, "learning_rate": 3.9074909234528826e-05, "loss": 0.0399, "step": 10650 }, { "epoch": 10.9783728115345, "grad_norm": 0.24120087921619415, "learning_rate": 3.898528792974056e-05, "loss": 0.0403, "step": 10660 }, { "epoch": 10.988671472708548, "grad_norm": 0.22013327479362488, "learning_rate": 3.8895703802483916e-05, "loss": 0.034, "step": 10670 }, { "epoch": 10.998970133882596, "grad_norm": 0.2588166296482086, "learning_rate": 3.880615715512868e-05, "loss": 0.0316, "step": 10680 }, { "epoch": 11.009268795056643, "grad_norm": 0.2514420449733734, "learning_rate": 3.871664828991822e-05, "loss": 0.0383, "step": 10690 }, { "epoch": 11.019567456230691, "grad_norm": 0.3404804468154907, "learning_rate": 3.862717750896837e-05, "loss": 0.0352, "step": 10700 }, { "epoch": 11.029866117404737, "grad_norm": 0.9497872591018677, "learning_rate": 3.853774511426634e-05, "loss": 0.0366, "step": 10710 }, { "epoch": 11.040164778578784, "grad_norm": 0.28247174620628357, "learning_rate": 3.844835140766988e-05, "loss": 0.0473, "step": 10720 }, { "epoch": 11.050463439752832, "grad_norm": 0.28879600763320923, "learning_rate": 3.83589966909061e-05, "loss": 0.0344, "step": 10730 }, { "epoch": 11.06076210092688, "grad_norm": 0.23894581198692322, "learning_rate": 3.82696812655705e-05, "loss": 0.0349, "step": 10740 }, { "epoch": 11.071060762100927, "grad_norm": 0.26289770007133484, "learning_rate": 3.818040543312598e-05, "loss": 0.0384, "step": 10750 }, { "epoch": 11.081359423274975, "grad_norm": 0.33045023679733276, "learning_rate": 3.809116949490184e-05, "loss": 0.0331, "step": 10760 }, { "epoch": 11.091658084449021, "grad_norm": 0.46705836057662964, "learning_rate": 3.8001973752092655e-05, "loss": 0.0386, "step": 10770 }, { "epoch": 11.101956745623069, "grad_norm": 0.5863741040229797, "learning_rate": 3.791281850575737e-05, "loss": 0.0415, "step": 10780 }, { "epoch": 11.112255406797116, "grad_norm": 0.24471549689769745, "learning_rate": 3.782370405681828e-05, "loss": 0.0372, "step": 10790 }, { "epoch": 11.122554067971164, "grad_norm": 0.3259426951408386, "learning_rate": 3.773463070605987e-05, "loss": 0.043, "step": 10800 }, { "epoch": 11.132852729145212, "grad_norm": 0.2583596408367157, "learning_rate": 3.764559875412803e-05, "loss": 0.0354, "step": 10810 }, { "epoch": 11.14315139031926, "grad_norm": 0.46032634377479553, "learning_rate": 3.7556608501528846e-05, "loss": 0.0393, "step": 10820 }, { "epoch": 11.153450051493305, "grad_norm": 0.38069912791252136, "learning_rate": 3.7467660248627654e-05, "loss": 0.0398, "step": 10830 }, { "epoch": 11.163748712667353, "grad_norm": 0.28435567021369934, "learning_rate": 3.737875429564807e-05, "loss": 0.0388, "step": 10840 }, { "epoch": 11.1740473738414, "grad_norm": 0.34043052792549133, "learning_rate": 3.7289890942670946e-05, "loss": 0.0296, "step": 10850 }, { "epoch": 11.184346035015448, "grad_norm": 0.3213551938533783, "learning_rate": 3.720107048963327e-05, "loss": 0.0296, "step": 10860 }, { "epoch": 11.194644696189496, "grad_norm": 0.45642250776290894, "learning_rate": 3.711229323632732e-05, "loss": 0.0347, "step": 10870 }, { "epoch": 11.204943357363543, "grad_norm": 0.29973405599594116, "learning_rate": 3.70235594823995e-05, "loss": 0.036, "step": 10880 }, { "epoch": 11.215242018537591, "grad_norm": 0.2634925842285156, "learning_rate": 3.693486952734941e-05, "loss": 0.0337, "step": 10890 }, { "epoch": 11.225540679711637, "grad_norm": 0.25237777829170227, "learning_rate": 3.684622367052887e-05, "loss": 0.0347, "step": 10900 }, { "epoch": 11.235839340885684, "grad_norm": 0.20709861814975739, "learning_rate": 3.675762221114077e-05, "loss": 0.0305, "step": 10910 }, { "epoch": 11.246138002059732, "grad_norm": 0.14299030601978302, "learning_rate": 3.66690654482382e-05, "loss": 0.0334, "step": 10920 }, { "epoch": 11.25643666323378, "grad_norm": 0.2454812377691269, "learning_rate": 3.658055368072339e-05, "loss": 0.0375, "step": 10930 }, { "epoch": 11.266735324407827, "grad_norm": 0.2894679307937622, "learning_rate": 3.6492087207346666e-05, "loss": 0.0416, "step": 10940 }, { "epoch": 11.277033985581875, "grad_norm": 0.2871219217777252, "learning_rate": 3.640366632670549e-05, "loss": 0.034, "step": 10950 }, { "epoch": 11.287332646755921, "grad_norm": 0.30559393763542175, "learning_rate": 3.631529133724348e-05, "loss": 0.0369, "step": 10960 }, { "epoch": 11.297631307929969, "grad_norm": 0.35164326429367065, "learning_rate": 3.622696253724927e-05, "loss": 0.035, "step": 10970 }, { "epoch": 11.307929969104016, "grad_norm": 0.27396318316459656, "learning_rate": 3.613868022485566e-05, "loss": 0.0389, "step": 10980 }, { "epoch": 11.318228630278064, "grad_norm": 0.27721869945526123, "learning_rate": 3.605044469803854e-05, "loss": 0.0365, "step": 10990 }, { "epoch": 11.328527291452112, "grad_norm": 0.2726707458496094, "learning_rate": 3.5962256254615853e-05, "loss": 0.0382, "step": 11000 }, { "epoch": 11.33882595262616, "grad_norm": 0.3522757589817047, "learning_rate": 3.587411519224665e-05, "loss": 0.0432, "step": 11010 }, { "epoch": 11.349124613800207, "grad_norm": 0.2744219899177551, "learning_rate": 3.5786021808430054e-05, "loss": 0.0328, "step": 11020 }, { "epoch": 11.359423274974253, "grad_norm": 0.36627647280693054, "learning_rate": 3.569797640050423e-05, "loss": 0.0407, "step": 11030 }, { "epoch": 11.3697219361483, "grad_norm": 0.20793434977531433, "learning_rate": 3.560997926564545e-05, "loss": 0.0284, "step": 11040 }, { "epoch": 11.380020597322348, "grad_norm": 0.23446743190288544, "learning_rate": 3.552203070086707e-05, "loss": 0.0355, "step": 11050 }, { "epoch": 11.390319258496396, "grad_norm": 0.48527511954307556, "learning_rate": 3.543413100301843e-05, "loss": 0.0378, "step": 11060 }, { "epoch": 11.400617919670443, "grad_norm": 0.39768174290657043, "learning_rate": 3.534628046878403e-05, "loss": 0.0329, "step": 11070 }, { "epoch": 11.410916580844491, "grad_norm": 0.19781740009784698, "learning_rate": 3.525847939468233e-05, "loss": 0.0371, "step": 11080 }, { "epoch": 11.421215242018537, "grad_norm": 0.2503238022327423, "learning_rate": 3.517072807706492e-05, "loss": 0.0363, "step": 11090 }, { "epoch": 11.431513903192585, "grad_norm": 0.3444472849369049, "learning_rate": 3.508302681211546e-05, "loss": 0.0343, "step": 11100 }, { "epoch": 11.441812564366632, "grad_norm": 0.3007254898548126, "learning_rate": 3.499537589584859e-05, "loss": 0.0441, "step": 11110 }, { "epoch": 11.45211122554068, "grad_norm": 0.38914212584495544, "learning_rate": 3.490777562410907e-05, "loss": 0.0331, "step": 11120 }, { "epoch": 11.462409886714727, "grad_norm": 0.3051401674747467, "learning_rate": 3.482022629257074e-05, "loss": 0.0328, "step": 11130 }, { "epoch": 11.472708547888775, "grad_norm": 0.306740403175354, "learning_rate": 3.473272819673542e-05, "loss": 0.039, "step": 11140 }, { "epoch": 11.483007209062821, "grad_norm": 0.42291760444641113, "learning_rate": 3.4645281631932074e-05, "loss": 0.0526, "step": 11150 }, { "epoch": 11.493305870236869, "grad_norm": 0.2984221577644348, "learning_rate": 3.455788689331574e-05, "loss": 0.0345, "step": 11160 }, { "epoch": 11.503604531410916, "grad_norm": 0.19411993026733398, "learning_rate": 3.447054427586644e-05, "loss": 0.0384, "step": 11170 }, { "epoch": 11.513903192584964, "grad_norm": 0.3595150113105774, "learning_rate": 3.438325407438837e-05, "loss": 0.0358, "step": 11180 }, { "epoch": 11.524201853759012, "grad_norm": 0.289594829082489, "learning_rate": 3.4296016583508775e-05, "loss": 0.0314, "step": 11190 }, { "epoch": 11.53450051493306, "grad_norm": 0.3801267743110657, "learning_rate": 3.420883209767697e-05, "loss": 0.0453, "step": 11200 }, { "epoch": 11.544799176107105, "grad_norm": 0.45930567383766174, "learning_rate": 3.4121700911163366e-05, "loss": 0.0418, "step": 11210 }, { "epoch": 11.555097837281153, "grad_norm": 0.2295006662607193, "learning_rate": 3.403462331805852e-05, "loss": 0.0378, "step": 11220 }, { "epoch": 11.5653964984552, "grad_norm": 0.38683414459228516, "learning_rate": 3.394759961227202e-05, "loss": 0.038, "step": 11230 }, { "epoch": 11.575695159629248, "grad_norm": 0.32741764187812805, "learning_rate": 3.386063008753164e-05, "loss": 0.0403, "step": 11240 }, { "epoch": 11.585993820803296, "grad_norm": 0.3826991319656372, "learning_rate": 3.377371503738227e-05, "loss": 0.0408, "step": 11250 }, { "epoch": 11.596292481977343, "grad_norm": 0.5855404138565063, "learning_rate": 3.368685475518488e-05, "loss": 0.0343, "step": 11260 }, { "epoch": 11.606591143151391, "grad_norm": 0.30145469307899475, "learning_rate": 3.360004953411566e-05, "loss": 0.0292, "step": 11270 }, { "epoch": 11.616889804325437, "grad_norm": 1.2090197801589966, "learning_rate": 3.3513299667164864e-05, "loss": 0.0298, "step": 11280 }, { "epoch": 11.627188465499485, "grad_norm": 0.7051903009414673, "learning_rate": 3.3426605447136004e-05, "loss": 0.0366, "step": 11290 }, { "epoch": 11.637487126673532, "grad_norm": 0.3094668984413147, "learning_rate": 3.3339967166644726e-05, "loss": 0.0378, "step": 11300 }, { "epoch": 11.64778578784758, "grad_norm": 0.3277672231197357, "learning_rate": 3.325338511811784e-05, "loss": 0.0407, "step": 11310 }, { "epoch": 11.658084449021628, "grad_norm": 0.27167952060699463, "learning_rate": 3.316685959379241e-05, "loss": 0.0377, "step": 11320 }, { "epoch": 11.668383110195675, "grad_norm": 0.5050401091575623, "learning_rate": 3.308039088571469e-05, "loss": 0.039, "step": 11330 }, { "epoch": 11.678681771369721, "grad_norm": 0.23651434481143951, "learning_rate": 3.2993979285739143e-05, "loss": 0.0339, "step": 11340 }, { "epoch": 11.688980432543769, "grad_norm": 0.3040764331817627, "learning_rate": 3.2907625085527503e-05, "loss": 0.0351, "step": 11350 }, { "epoch": 11.699279093717816, "grad_norm": 0.23311540484428406, "learning_rate": 3.28213285765478e-05, "loss": 0.0347, "step": 11360 }, { "epoch": 11.709577754891864, "grad_norm": 0.21837526559829712, "learning_rate": 3.273509005007327e-05, "loss": 0.0397, "step": 11370 }, { "epoch": 11.719876416065912, "grad_norm": 0.24095067381858826, "learning_rate": 3.264890979718147e-05, "loss": 0.0335, "step": 11380 }, { "epoch": 11.73017507723996, "grad_norm": 0.4714142680168152, "learning_rate": 3.256278810875332e-05, "loss": 0.0355, "step": 11390 }, { "epoch": 11.740473738414007, "grad_norm": 0.3001396059989929, "learning_rate": 3.247672527547197e-05, "loss": 0.0311, "step": 11400 }, { "epoch": 11.750772399588053, "grad_norm": 0.2514890730381012, "learning_rate": 3.239072158782198e-05, "loss": 0.0374, "step": 11410 }, { "epoch": 11.7610710607621, "grad_norm": 0.22603774070739746, "learning_rate": 3.230477733608831e-05, "loss": 0.0368, "step": 11420 }, { "epoch": 11.771369721936148, "grad_norm": 0.22810235619544983, "learning_rate": 3.221889281035522e-05, "loss": 0.0331, "step": 11430 }, { "epoch": 11.781668383110196, "grad_norm": 0.18763025104999542, "learning_rate": 3.2133068300505455e-05, "loss": 0.0328, "step": 11440 }, { "epoch": 11.791967044284243, "grad_norm": 0.32261693477630615, "learning_rate": 3.204730409621917e-05, "loss": 0.0408, "step": 11450 }, { "epoch": 11.802265705458291, "grad_norm": 0.27985504269599915, "learning_rate": 3.196160048697293e-05, "loss": 0.0415, "step": 11460 }, { "epoch": 11.812564366632337, "grad_norm": 0.28317996859550476, "learning_rate": 3.187595776203886e-05, "loss": 0.0413, "step": 11470 }, { "epoch": 11.822863027806385, "grad_norm": 0.2768697440624237, "learning_rate": 3.1790376210483494e-05, "loss": 0.0433, "step": 11480 }, { "epoch": 11.833161688980432, "grad_norm": 0.27718645334243774, "learning_rate": 3.170485612116697e-05, "loss": 0.028, "step": 11490 }, { "epoch": 11.84346035015448, "grad_norm": 0.27956560254096985, "learning_rate": 3.161939778274191e-05, "loss": 0.0318, "step": 11500 }, { "epoch": 11.853759011328528, "grad_norm": 0.25807636976242065, "learning_rate": 3.1534001483652556e-05, "loss": 0.0439, "step": 11510 }, { "epoch": 11.864057672502575, "grad_norm": 0.6703087687492371, "learning_rate": 3.14486675121337e-05, "loss": 0.0298, "step": 11520 }, { "epoch": 11.874356333676623, "grad_norm": 0.46335524320602417, "learning_rate": 3.136339615620985e-05, "loss": 0.0481, "step": 11530 }, { "epoch": 11.884654994850669, "grad_norm": 0.250967800617218, "learning_rate": 3.127818770369406e-05, "loss": 0.0337, "step": 11540 }, { "epoch": 11.894953656024716, "grad_norm": 0.2240300476551056, "learning_rate": 3.119304244218715e-05, "loss": 0.0327, "step": 11550 }, { "epoch": 11.905252317198764, "grad_norm": 0.2884691655635834, "learning_rate": 3.110796065907665e-05, "loss": 0.0363, "step": 11560 }, { "epoch": 11.915550978372812, "grad_norm": 0.28418871760368347, "learning_rate": 3.102294264153577e-05, "loss": 0.0325, "step": 11570 }, { "epoch": 11.92584963954686, "grad_norm": 0.2494005262851715, "learning_rate": 3.093798867652257e-05, "loss": 0.0358, "step": 11580 }, { "epoch": 11.936148300720907, "grad_norm": 0.43249595165252686, "learning_rate": 3.0853099050778854e-05, "loss": 0.0361, "step": 11590 }, { "epoch": 11.946446961894953, "grad_norm": 0.32216548919677734, "learning_rate": 3.0768274050829306e-05, "loss": 0.0359, "step": 11600 }, { "epoch": 11.956745623069, "grad_norm": 0.3839482069015503, "learning_rate": 3.0683513962980456e-05, "loss": 0.0338, "step": 11610 }, { "epoch": 11.967044284243048, "grad_norm": 0.25899192690849304, "learning_rate": 3.059881907331979e-05, "loss": 0.0326, "step": 11620 }, { "epoch": 11.977342945417096, "grad_norm": 0.2512173652648926, "learning_rate": 3.0514189667714632e-05, "loss": 0.0352, "step": 11630 }, { "epoch": 11.987641606591144, "grad_norm": 0.43213722109794617, "learning_rate": 3.042962603181138e-05, "loss": 0.0395, "step": 11640 }, { "epoch": 11.997940267765191, "grad_norm": 0.25386422872543335, "learning_rate": 3.034512845103441e-05, "loss": 0.0314, "step": 11650 }, { "epoch": 12.008238928939237, "grad_norm": 0.35718950629234314, "learning_rate": 3.0260697210585108e-05, "loss": 0.0371, "step": 11660 }, { "epoch": 12.018537590113285, "grad_norm": 0.29993295669555664, "learning_rate": 3.017633259544101e-05, "loss": 0.035, "step": 11670 }, { "epoch": 12.028836251287332, "grad_norm": 0.3331249952316284, "learning_rate": 3.0092034890354694e-05, "loss": 0.0406, "step": 11680 }, { "epoch": 12.03913491246138, "grad_norm": 0.22086752951145172, "learning_rate": 3.0007804379852977e-05, "loss": 0.0252, "step": 11690 }, { "epoch": 12.049433573635428, "grad_norm": 0.22861167788505554, "learning_rate": 2.9923641348235843e-05, "loss": 0.0426, "step": 11700 }, { "epoch": 12.059732234809475, "grad_norm": 0.26923444867134094, "learning_rate": 2.9839546079575497e-05, "loss": 0.0454, "step": 11710 }, { "epoch": 12.070030895983523, "grad_norm": 0.23918205499649048, "learning_rate": 2.9755518857715448e-05, "loss": 0.0402, "step": 11720 }, { "epoch": 12.080329557157569, "grad_norm": 0.23139654099941254, "learning_rate": 2.967155996626956e-05, "loss": 0.0303, "step": 11730 }, { "epoch": 12.090628218331616, "grad_norm": 0.38359567523002625, "learning_rate": 2.9587669688620988e-05, "loss": 0.0398, "step": 11740 }, { "epoch": 12.100926879505664, "grad_norm": 0.23274274170398712, "learning_rate": 2.950384830792136e-05, "loss": 0.0283, "step": 11750 }, { "epoch": 12.111225540679712, "grad_norm": 0.29843324422836304, "learning_rate": 2.942009610708976e-05, "loss": 0.0339, "step": 11760 }, { "epoch": 12.12152420185376, "grad_norm": 0.2866639494895935, "learning_rate": 2.9336413368811723e-05, "loss": 0.0325, "step": 11770 }, { "epoch": 12.131822863027807, "grad_norm": 0.3042534589767456, "learning_rate": 2.9252800375538368e-05, "loss": 0.0355, "step": 11780 }, { "epoch": 12.142121524201853, "grad_norm": 0.2678833305835724, "learning_rate": 2.9169257409485418e-05, "loss": 0.0329, "step": 11790 }, { "epoch": 12.1524201853759, "grad_norm": 0.19894133508205414, "learning_rate": 2.9085784752632157e-05, "loss": 0.0383, "step": 11800 }, { "epoch": 12.162718846549948, "grad_norm": 0.19369176030158997, "learning_rate": 2.9002382686720676e-05, "loss": 0.0303, "step": 11810 }, { "epoch": 12.173017507723996, "grad_norm": 0.23142315447330475, "learning_rate": 2.8919051493254724e-05, "loss": 0.0404, "step": 11820 }, { "epoch": 12.183316168898044, "grad_norm": 0.2168169468641281, "learning_rate": 2.883579145349884e-05, "loss": 0.0352, "step": 11830 }, { "epoch": 12.193614830072091, "grad_norm": 0.27123361825942993, "learning_rate": 2.8752602848477432e-05, "loss": 0.0358, "step": 11840 }, { "epoch": 12.203913491246137, "grad_norm": 1.34294593334198, "learning_rate": 2.8669485958973775e-05, "loss": 0.0336, "step": 11850 }, { "epoch": 12.214212152420185, "grad_norm": 0.35292431712150574, "learning_rate": 2.858644106552909e-05, "loss": 0.0356, "step": 11860 }, { "epoch": 12.224510813594232, "grad_norm": 0.5437068939208984, "learning_rate": 2.850346844844157e-05, "loss": 0.04, "step": 11870 }, { "epoch": 12.23480947476828, "grad_norm": 0.7077152729034424, "learning_rate": 2.8420568387765557e-05, "loss": 0.0381, "step": 11880 }, { "epoch": 12.245108135942328, "grad_norm": 1.2102924585342407, "learning_rate": 2.8337741163310317e-05, "loss": 0.0316, "step": 11890 }, { "epoch": 12.255406797116375, "grad_norm": 0.22898398339748383, "learning_rate": 2.825498705463947e-05, "loss": 0.0355, "step": 11900 }, { "epoch": 12.265705458290423, "grad_norm": 0.16343450546264648, "learning_rate": 2.8172306341069672e-05, "loss": 0.0333, "step": 11910 }, { "epoch": 12.276004119464469, "grad_norm": 0.2778915762901306, "learning_rate": 2.8089699301670002e-05, "loss": 0.034, "step": 11920 }, { "epoch": 12.286302780638517, "grad_norm": 0.2954021096229553, "learning_rate": 2.800716621526078e-05, "loss": 0.03, "step": 11930 }, { "epoch": 12.296601441812564, "grad_norm": 0.18878135085105896, "learning_rate": 2.7924707360412746e-05, "loss": 0.0322, "step": 11940 }, { "epoch": 12.306900102986612, "grad_norm": 0.25053462386131287, "learning_rate": 2.7842323015446082e-05, "loss": 0.0376, "step": 11950 }, { "epoch": 12.31719876416066, "grad_norm": 0.21085461974143982, "learning_rate": 2.7760013458429475e-05, "loss": 0.0333, "step": 11960 }, { "epoch": 12.327497425334707, "grad_norm": 0.27033373713493347, "learning_rate": 2.767777896717919e-05, "loss": 0.0387, "step": 11970 }, { "epoch": 12.337796086508753, "grad_norm": 0.2603791356086731, "learning_rate": 2.7595619819258116e-05, "loss": 0.0336, "step": 11980 }, { "epoch": 12.3480947476828, "grad_norm": 0.2735675573348999, "learning_rate": 2.7513536291974895e-05, "loss": 0.0367, "step": 11990 }, { "epoch": 12.358393408856848, "grad_norm": 0.2710510790348053, "learning_rate": 2.743152866238281e-05, "loss": 0.0359, "step": 12000 }, { "epoch": 12.368692070030896, "grad_norm": 0.3120410144329071, "learning_rate": 2.7349597207279088e-05, "loss": 0.0353, "step": 12010 }, { "epoch": 12.378990731204944, "grad_norm": 1.238741159439087, "learning_rate": 2.7267742203203795e-05, "loss": 0.0328, "step": 12020 }, { "epoch": 12.389289392378991, "grad_norm": 0.24720178544521332, "learning_rate": 2.718596392643895e-05, "loss": 0.035, "step": 12030 }, { "epoch": 12.399588053553039, "grad_norm": 0.5230728387832642, "learning_rate": 2.7104262653007616e-05, "loss": 0.0385, "step": 12040 }, { "epoch": 12.409886714727085, "grad_norm": 0.30197054147720337, "learning_rate": 2.7022638658672933e-05, "loss": 0.0378, "step": 12050 }, { "epoch": 12.420185375901132, "grad_norm": 0.35036417841911316, "learning_rate": 2.6941092218937214e-05, "loss": 0.0316, "step": 12060 }, { "epoch": 12.43048403707518, "grad_norm": 0.1900859922170639, "learning_rate": 2.6859623609040984e-05, "loss": 0.0416, "step": 12070 }, { "epoch": 12.440782698249228, "grad_norm": 0.3137092888355255, "learning_rate": 2.6778233103962158e-05, "loss": 0.0347, "step": 12080 }, { "epoch": 12.451081359423275, "grad_norm": 0.2586371600627899, "learning_rate": 2.6696920978414862e-05, "loss": 0.0313, "step": 12090 }, { "epoch": 12.461380020597323, "grad_norm": 0.22871264815330505, "learning_rate": 2.6615687506848864e-05, "loss": 0.0384, "step": 12100 }, { "epoch": 12.471678681771369, "grad_norm": 0.500694751739502, "learning_rate": 2.6534532963448274e-05, "loss": 0.0365, "step": 12110 }, { "epoch": 12.481977342945417, "grad_norm": 0.23115640878677368, "learning_rate": 2.645345762213094e-05, "loss": 0.0359, "step": 12120 }, { "epoch": 12.492276004119464, "grad_norm": 0.27199363708496094, "learning_rate": 2.6372461756547306e-05, "loss": 0.0367, "step": 12130 }, { "epoch": 12.502574665293512, "grad_norm": 0.4970080256462097, "learning_rate": 2.6291545640079583e-05, "loss": 0.038, "step": 12140 }, { "epoch": 12.51287332646756, "grad_norm": 0.31872427463531494, "learning_rate": 2.6210709545840816e-05, "loss": 0.0349, "step": 12150 }, { "epoch": 12.523171987641607, "grad_norm": 0.543602705001831, "learning_rate": 2.612995374667394e-05, "loss": 0.0456, "step": 12160 }, { "epoch": 12.533470648815655, "grad_norm": 0.24425791203975677, "learning_rate": 2.6049278515150888e-05, "loss": 0.0343, "step": 12170 }, { "epoch": 12.5437693099897, "grad_norm": 0.32970938086509705, "learning_rate": 2.5968684123571625e-05, "loss": 0.0358, "step": 12180 }, { "epoch": 12.554067971163748, "grad_norm": 0.24140028655529022, "learning_rate": 2.5888170843963332e-05, "loss": 0.0415, "step": 12190 }, { "epoch": 12.564366632337796, "grad_norm": 0.1907021552324295, "learning_rate": 2.5807738948079307e-05, "loss": 0.0332, "step": 12200 }, { "epoch": 12.574665293511844, "grad_norm": 0.2994469404220581, "learning_rate": 2.572738870739827e-05, "loss": 0.0332, "step": 12210 }, { "epoch": 12.584963954685891, "grad_norm": 0.3281172811985016, "learning_rate": 2.5647120393123246e-05, "loss": 0.0355, "step": 12220 }, { "epoch": 12.595262615859939, "grad_norm": 0.222566619515419, "learning_rate": 2.5566934276180792e-05, "loss": 0.0299, "step": 12230 }, { "epoch": 12.605561277033985, "grad_norm": 0.38741955161094666, "learning_rate": 2.5486830627219993e-05, "loss": 0.0369, "step": 12240 }, { "epoch": 12.615859938208033, "grad_norm": 0.24740222096443176, "learning_rate": 2.540680971661161e-05, "loss": 0.034, "step": 12250 }, { "epoch": 12.62615859938208, "grad_norm": 0.2917155623435974, "learning_rate": 2.5326871814447116e-05, "loss": 0.0325, "step": 12260 }, { "epoch": 12.636457260556128, "grad_norm": 0.3306695818901062, "learning_rate": 2.5247017190537802e-05, "loss": 0.0314, "step": 12270 }, { "epoch": 12.646755921730175, "grad_norm": 0.3189143240451813, "learning_rate": 2.5167246114413956e-05, "loss": 0.0406, "step": 12280 }, { "epoch": 12.657054582904223, "grad_norm": 0.27937018871307373, "learning_rate": 2.5087558855323718e-05, "loss": 0.037, "step": 12290 }, { "epoch": 12.667353244078269, "grad_norm": 0.23929426074028015, "learning_rate": 2.5007955682232498e-05, "loss": 0.0366, "step": 12300 }, { "epoch": 12.677651905252317, "grad_norm": 0.38764917850494385, "learning_rate": 2.4928436863821725e-05, "loss": 0.0357, "step": 12310 }, { "epoch": 12.687950566426364, "grad_norm": 0.22392131388187408, "learning_rate": 2.4849002668488245e-05, "loss": 0.031, "step": 12320 }, { "epoch": 12.698249227600412, "grad_norm": 0.35927116870880127, "learning_rate": 2.4769653364343222e-05, "loss": 0.0355, "step": 12330 }, { "epoch": 12.70854788877446, "grad_norm": 0.3391915261745453, "learning_rate": 2.4690389219211273e-05, "loss": 0.0346, "step": 12340 }, { "epoch": 12.718846549948507, "grad_norm": 0.21950756013393402, "learning_rate": 2.4611210500629618e-05, "loss": 0.0339, "step": 12350 }, { "epoch": 12.729145211122553, "grad_norm": 0.22874067723751068, "learning_rate": 2.453211747584711e-05, "loss": 0.0347, "step": 12360 }, { "epoch": 12.7394438722966, "grad_norm": 0.5297624468803406, "learning_rate": 2.4453110411823382e-05, "loss": 0.0308, "step": 12370 }, { "epoch": 12.749742533470648, "grad_norm": 0.31514862179756165, "learning_rate": 2.4374189575227902e-05, "loss": 0.032, "step": 12380 }, { "epoch": 12.760041194644696, "grad_norm": 0.26266971230506897, "learning_rate": 2.429535523243917e-05, "loss": 0.0357, "step": 12390 }, { "epoch": 12.770339855818744, "grad_norm": 0.18397288024425507, "learning_rate": 2.4216607649543628e-05, "loss": 0.0307, "step": 12400 }, { "epoch": 12.780638516992791, "grad_norm": 0.26537027955055237, "learning_rate": 2.4137947092334994e-05, "loss": 0.0363, "step": 12410 }, { "epoch": 12.790937178166839, "grad_norm": 0.28661102056503296, "learning_rate": 2.4059373826313185e-05, "loss": 0.0306, "step": 12420 }, { "epoch": 12.801235839340885, "grad_norm": 0.26964297890663147, "learning_rate": 2.3980888116683515e-05, "loss": 0.0324, "step": 12430 }, { "epoch": 12.811534500514933, "grad_norm": 0.2776640057563782, "learning_rate": 2.3902490228355756e-05, "loss": 0.0329, "step": 12440 }, { "epoch": 12.82183316168898, "grad_norm": 0.4814803898334503, "learning_rate": 2.3824180425943277e-05, "loss": 0.0303, "step": 12450 }, { "epoch": 12.832131822863028, "grad_norm": 0.22867955267429352, "learning_rate": 2.374595897376211e-05, "loss": 0.0288, "step": 12460 }, { "epoch": 12.842430484037076, "grad_norm": 0.21567359566688538, "learning_rate": 2.366782613583009e-05, "loss": 0.0325, "step": 12470 }, { "epoch": 12.852729145211123, "grad_norm": 0.290703684091568, "learning_rate": 2.3589782175866015e-05, "loss": 0.0298, "step": 12480 }, { "epoch": 12.863027806385169, "grad_norm": 0.3255325257778168, "learning_rate": 2.3511827357288575e-05, "loss": 0.0363, "step": 12490 }, { "epoch": 12.873326467559217, "grad_norm": 0.44946736097335815, "learning_rate": 2.343396194321572e-05, "loss": 0.0332, "step": 12500 }, { "epoch": 12.883625128733264, "grad_norm": 0.25294211506843567, "learning_rate": 2.33561861964635e-05, "loss": 0.0348, "step": 12510 }, { "epoch": 12.893923789907312, "grad_norm": 0.18743322789669037, "learning_rate": 2.3278500379545436e-05, "loss": 0.0336, "step": 12520 }, { "epoch": 12.90422245108136, "grad_norm": 0.16629280149936676, "learning_rate": 2.3200904754671453e-05, "loss": 0.0381, "step": 12530 }, { "epoch": 12.914521112255407, "grad_norm": 0.1841958910226822, "learning_rate": 2.312339958374705e-05, "loss": 0.0273, "step": 12540 }, { "epoch": 12.924819773429455, "grad_norm": 0.3820919096469879, "learning_rate": 2.3045985128372442e-05, "loss": 0.0354, "step": 12550 }, { "epoch": 12.9351184346035, "grad_norm": 0.22891731560230255, "learning_rate": 2.2968661649841643e-05, "loss": 0.0393, "step": 12560 }, { "epoch": 12.945417095777549, "grad_norm": 0.21805356442928314, "learning_rate": 2.2891429409141594e-05, "loss": 0.0312, "step": 12570 }, { "epoch": 12.955715756951596, "grad_norm": 0.29530712962150574, "learning_rate": 2.281428866695128e-05, "loss": 0.034, "step": 12580 }, { "epoch": 12.966014418125644, "grad_norm": 0.3417767286300659, "learning_rate": 2.2737239683640908e-05, "loss": 0.0291, "step": 12590 }, { "epoch": 12.976313079299691, "grad_norm": 0.36338862776756287, "learning_rate": 2.266028271927087e-05, "loss": 0.0288, "step": 12600 }, { "epoch": 12.98661174047374, "grad_norm": 0.18803521990776062, "learning_rate": 2.258341803359108e-05, "loss": 0.035, "step": 12610 }, { "epoch": 12.996910401647785, "grad_norm": 0.2204011231660843, "learning_rate": 2.2506645886039918e-05, "loss": 0.0331, "step": 12620 }, { "epoch": 13.007209062821833, "grad_norm": 0.23867210745811462, "learning_rate": 2.242996653574345e-05, "loss": 0.0327, "step": 12630 }, { "epoch": 13.01750772399588, "grad_norm": 0.22372329235076904, "learning_rate": 2.2353380241514515e-05, "loss": 0.0313, "step": 12640 }, { "epoch": 13.027806385169928, "grad_norm": 0.2398245483636856, "learning_rate": 2.2276887261851875e-05, "loss": 0.0405, "step": 12650 }, { "epoch": 13.038105046343976, "grad_norm": 0.20746667683124542, "learning_rate": 2.2200487854939322e-05, "loss": 0.0332, "step": 12660 }, { "epoch": 13.048403707518023, "grad_norm": 0.23980452120304108, "learning_rate": 2.21241822786448e-05, "loss": 0.0331, "step": 12670 }, { "epoch": 13.058702368692071, "grad_norm": 0.2431352734565735, "learning_rate": 2.204797079051962e-05, "loss": 0.0337, "step": 12680 }, { "epoch": 13.069001029866117, "grad_norm": 0.21622303128242493, "learning_rate": 2.1971853647797415e-05, "loss": 0.0369, "step": 12690 }, { "epoch": 13.079299691040164, "grad_norm": 0.17636331915855408, "learning_rate": 2.1895831107393484e-05, "loss": 0.0385, "step": 12700 }, { "epoch": 13.089598352214212, "grad_norm": 0.3212912976741791, "learning_rate": 2.181990342590371e-05, "loss": 0.0388, "step": 12710 }, { "epoch": 13.09989701338826, "grad_norm": 0.4048994183540344, "learning_rate": 2.1744070859603897e-05, "loss": 0.0314, "step": 12720 }, { "epoch": 13.110195674562307, "grad_norm": 0.2608017921447754, "learning_rate": 2.1668333664448776e-05, "loss": 0.0348, "step": 12730 }, { "epoch": 13.120494335736355, "grad_norm": 0.22120167315006256, "learning_rate": 2.1592692096071153e-05, "loss": 0.0282, "step": 12740 }, { "epoch": 13.130792996910401, "grad_norm": 0.22117048501968384, "learning_rate": 2.1517146409781103e-05, "loss": 0.0346, "step": 12750 }, { "epoch": 13.141091658084449, "grad_norm": 0.2921169102191925, "learning_rate": 2.1441696860565048e-05, "loss": 0.0342, "step": 12760 }, { "epoch": 13.151390319258496, "grad_norm": 0.22612257301807404, "learning_rate": 2.1366343703084936e-05, "loss": 0.0312, "step": 12770 }, { "epoch": 13.161688980432544, "grad_norm": 0.27955397963523865, "learning_rate": 2.1291087191677343e-05, "loss": 0.0332, "step": 12780 }, { "epoch": 13.171987641606592, "grad_norm": 0.2641075849533081, "learning_rate": 2.121592758035273e-05, "loss": 0.0368, "step": 12790 }, { "epoch": 13.18228630278064, "grad_norm": 0.26150405406951904, "learning_rate": 2.114086512279434e-05, "loss": 0.0355, "step": 12800 }, { "epoch": 13.192584963954685, "grad_norm": 0.2792717218399048, "learning_rate": 2.1065900072357635e-05, "loss": 0.029, "step": 12810 }, { "epoch": 13.202883625128733, "grad_norm": 0.21909286081790924, "learning_rate": 2.0991032682069246e-05, "loss": 0.0379, "step": 12820 }, { "epoch": 13.21318228630278, "grad_norm": 0.2866324782371521, "learning_rate": 2.0916263204626162e-05, "loss": 0.0282, "step": 12830 }, { "epoch": 13.223480947476828, "grad_norm": 0.28694427013397217, "learning_rate": 2.0841591892394925e-05, "loss": 0.0399, "step": 12840 }, { "epoch": 13.233779608650876, "grad_norm": 0.31920716166496277, "learning_rate": 2.0767018997410713e-05, "loss": 0.0365, "step": 12850 }, { "epoch": 13.244078269824923, "grad_norm": 0.35022082924842834, "learning_rate": 2.0692544771376543e-05, "loss": 0.0264, "step": 12860 }, { "epoch": 13.254376930998971, "grad_norm": 0.25149139761924744, "learning_rate": 2.0618169465662364e-05, "loss": 0.0302, "step": 12870 }, { "epoch": 13.264675592173017, "grad_norm": 0.2645907402038574, "learning_rate": 2.0543893331304333e-05, "loss": 0.0328, "step": 12880 }, { "epoch": 13.274974253347064, "grad_norm": 0.17596539855003357, "learning_rate": 2.0469716619003725e-05, "loss": 0.0328, "step": 12890 }, { "epoch": 13.285272914521112, "grad_norm": 0.2291368991136551, "learning_rate": 2.039563957912642e-05, "loss": 0.0318, "step": 12900 }, { "epoch": 13.29557157569516, "grad_norm": 0.21256229281425476, "learning_rate": 2.0321662461701696e-05, "loss": 0.0334, "step": 12910 }, { "epoch": 13.305870236869207, "grad_norm": 0.30739450454711914, "learning_rate": 2.024778551642172e-05, "loss": 0.0321, "step": 12920 }, { "epoch": 13.316168898043255, "grad_norm": 0.2791813015937805, "learning_rate": 2.017400899264047e-05, "loss": 0.0302, "step": 12930 }, { "epoch": 13.326467559217301, "grad_norm": 0.3258625864982605, "learning_rate": 2.0100333139372985e-05, "loss": 0.0361, "step": 12940 }, { "epoch": 13.336766220391349, "grad_norm": 0.2523643672466278, "learning_rate": 2.0026758205294533e-05, "loss": 0.0322, "step": 12950 }, { "epoch": 13.347064881565396, "grad_norm": 0.2704935073852539, "learning_rate": 1.9953284438739733e-05, "loss": 0.0321, "step": 12960 }, { "epoch": 13.357363542739444, "grad_norm": 0.45123302936553955, "learning_rate": 1.9879912087701753e-05, "loss": 0.0331, "step": 12970 }, { "epoch": 13.367662203913492, "grad_norm": 1.1362191438674927, "learning_rate": 1.9806641399831433e-05, "loss": 0.0352, "step": 12980 }, { "epoch": 13.37796086508754, "grad_norm": 0.3239549398422241, "learning_rate": 1.9733472622436544e-05, "loss": 0.0317, "step": 12990 }, { "epoch": 13.388259526261585, "grad_norm": 0.20692795515060425, "learning_rate": 1.9660406002480765e-05, "loss": 0.0328, "step": 13000 }, { "epoch": 13.398558187435633, "grad_norm": 0.24428331851959229, "learning_rate": 1.9587441786583076e-05, "loss": 0.0344, "step": 13010 }, { "epoch": 13.40885684860968, "grad_norm": 0.17566567659378052, "learning_rate": 1.951458022101676e-05, "loss": 0.0346, "step": 13020 }, { "epoch": 13.419155509783728, "grad_norm": 0.2601017951965332, "learning_rate": 1.944182155170864e-05, "loss": 0.0413, "step": 13030 }, { "epoch": 13.429454170957776, "grad_norm": 0.22690336406230927, "learning_rate": 1.9369166024238232e-05, "loss": 0.039, "step": 13040 }, { "epoch": 13.439752832131823, "grad_norm": 0.34189629554748535, "learning_rate": 1.9296613883836945e-05, "loss": 0.0297, "step": 13050 }, { "epoch": 13.450051493305871, "grad_norm": 0.39015287160873413, "learning_rate": 1.9224165375387193e-05, "loss": 0.0352, "step": 13060 }, { "epoch": 13.460350154479917, "grad_norm": 0.16422075033187866, "learning_rate": 1.9151820743421617e-05, "loss": 0.0298, "step": 13070 }, { "epoch": 13.470648815653965, "grad_norm": 0.20099236071109772, "learning_rate": 1.9079580232122303e-05, "loss": 0.0271, "step": 13080 }, { "epoch": 13.480947476828012, "grad_norm": 0.37444478273391724, "learning_rate": 1.9007444085319786e-05, "loss": 0.0382, "step": 13090 }, { "epoch": 13.49124613800206, "grad_norm": 0.24139359593391418, "learning_rate": 1.8935412546492486e-05, "loss": 0.0334, "step": 13100 }, { "epoch": 13.501544799176108, "grad_norm": 0.3007052540779114, "learning_rate": 1.88634858587656e-05, "loss": 0.0341, "step": 13110 }, { "epoch": 13.511843460350155, "grad_norm": 0.30898720026016235, "learning_rate": 1.8791664264910537e-05, "loss": 0.0324, "step": 13120 }, { "epoch": 13.522142121524201, "grad_norm": 0.3256855905056, "learning_rate": 1.8719948007343936e-05, "loss": 0.0376, "step": 13130 }, { "epoch": 13.532440782698249, "grad_norm": 0.2092374563217163, "learning_rate": 1.8648337328126906e-05, "loss": 0.0298, "step": 13140 }, { "epoch": 13.542739443872296, "grad_norm": 0.34433215856552124, "learning_rate": 1.85768324689642e-05, "loss": 0.0371, "step": 13150 }, { "epoch": 13.553038105046344, "grad_norm": 0.47145530581474304, "learning_rate": 1.850543367120341e-05, "loss": 0.0389, "step": 13160 }, { "epoch": 13.563336766220392, "grad_norm": 1.9276230335235596, "learning_rate": 1.8434141175834125e-05, "loss": 0.0356, "step": 13170 }, { "epoch": 13.57363542739444, "grad_norm": 0.1196000725030899, "learning_rate": 1.8362955223487143e-05, "loss": 0.0292, "step": 13180 }, { "epoch": 13.583934088568487, "grad_norm": 0.21239057183265686, "learning_rate": 1.8291876054433693e-05, "loss": 0.0314, "step": 13190 }, { "epoch": 13.594232749742533, "grad_norm": 0.27161744236946106, "learning_rate": 1.8220903908584492e-05, "loss": 0.0323, "step": 13200 }, { "epoch": 13.60453141091658, "grad_norm": 0.23213060200214386, "learning_rate": 1.8150039025489113e-05, "loss": 0.0335, "step": 13210 }, { "epoch": 13.614830072090628, "grad_norm": 0.26432856917381287, "learning_rate": 1.8079281644335055e-05, "loss": 0.0348, "step": 13220 }, { "epoch": 13.625128733264676, "grad_norm": 0.24627777934074402, "learning_rate": 1.8008632003946957e-05, "loss": 0.0308, "step": 13230 }, { "epoch": 13.635427394438723, "grad_norm": 0.3506312966346741, "learning_rate": 1.7938090342785817e-05, "loss": 0.0379, "step": 13240 }, { "epoch": 13.645726055612771, "grad_norm": 0.20565661787986755, "learning_rate": 1.7867656898948187e-05, "loss": 0.0338, "step": 13250 }, { "epoch": 13.656024716786817, "grad_norm": 0.2677291929721832, "learning_rate": 1.7797331910165336e-05, "loss": 0.0325, "step": 13260 }, { "epoch": 13.666323377960865, "grad_norm": 0.30942559242248535, "learning_rate": 1.7727115613802465e-05, "loss": 0.0365, "step": 13270 }, { "epoch": 13.676622039134912, "grad_norm": 0.23922519385814667, "learning_rate": 1.765700824685797e-05, "loss": 0.0366, "step": 13280 }, { "epoch": 13.68692070030896, "grad_norm": 0.18366648256778717, "learning_rate": 1.758701004596247e-05, "loss": 0.0305, "step": 13290 }, { "epoch": 13.697219361483008, "grad_norm": 0.2875716984272003, "learning_rate": 1.751712124737826e-05, "loss": 0.0363, "step": 13300 }, { "epoch": 13.707518022657055, "grad_norm": 0.3050890564918518, "learning_rate": 1.744734208699822e-05, "loss": 0.037, "step": 13310 }, { "epoch": 13.717816683831103, "grad_norm": 0.24879583716392517, "learning_rate": 1.7377672800345302e-05, "loss": 0.0285, "step": 13320 }, { "epoch": 13.728115345005149, "grad_norm": 0.22065865993499756, "learning_rate": 1.7308113622571544e-05, "loss": 0.0299, "step": 13330 }, { "epoch": 13.738414006179196, "grad_norm": 0.1869887113571167, "learning_rate": 1.7238664788457342e-05, "loss": 0.0344, "step": 13340 }, { "epoch": 13.748712667353244, "grad_norm": 0.21137484908103943, "learning_rate": 1.7169326532410663e-05, "loss": 0.0332, "step": 13350 }, { "epoch": 13.759011328527292, "grad_norm": 0.3234722912311554, "learning_rate": 1.7100099088466242e-05, "loss": 0.0345, "step": 13360 }, { "epoch": 13.76930998970134, "grad_norm": 0.2264581024646759, "learning_rate": 1.7030982690284792e-05, "loss": 0.0291, "step": 13370 }, { "epoch": 13.779608650875387, "grad_norm": 0.29631558060646057, "learning_rate": 1.69619775711522e-05, "loss": 0.0361, "step": 13380 }, { "epoch": 13.789907312049433, "grad_norm": 0.292219340801239, "learning_rate": 1.689308396397882e-05, "loss": 0.0256, "step": 13390 }, { "epoch": 13.80020597322348, "grad_norm": 0.17191918194293976, "learning_rate": 1.6824302101298526e-05, "loss": 0.0349, "step": 13400 }, { "epoch": 13.810504634397528, "grad_norm": 0.22219271957874298, "learning_rate": 1.6755632215268118e-05, "loss": 0.0316, "step": 13410 }, { "epoch": 13.820803295571576, "grad_norm": 0.18818335235118866, "learning_rate": 1.6687074537666398e-05, "loss": 0.0325, "step": 13420 }, { "epoch": 13.831101956745623, "grad_norm": 0.2848359942436218, "learning_rate": 1.6618629299893434e-05, "loss": 0.0327, "step": 13430 }, { "epoch": 13.841400617919671, "grad_norm": 0.26240599155426025, "learning_rate": 1.6550296732969795e-05, "loss": 0.0321, "step": 13440 }, { "epoch": 13.851699279093717, "grad_norm": 0.166743665933609, "learning_rate": 1.648207706753575e-05, "loss": 0.0361, "step": 13450 }, { "epoch": 13.861997940267765, "grad_norm": 0.2783146798610687, "learning_rate": 1.6413970533850498e-05, "loss": 0.0395, "step": 13460 }, { "epoch": 13.872296601441812, "grad_norm": 0.2442004680633545, "learning_rate": 1.6345977361791366e-05, "loss": 0.0385, "step": 13470 }, { "epoch": 13.88259526261586, "grad_norm": 0.16581279039382935, "learning_rate": 1.6278097780853136e-05, "loss": 0.0356, "step": 13480 }, { "epoch": 13.892893923789908, "grad_norm": 0.37210017442703247, "learning_rate": 1.6210332020147055e-05, "loss": 0.0363, "step": 13490 }, { "epoch": 13.903192584963955, "grad_norm": 0.18403227627277374, "learning_rate": 1.6142680308400338e-05, "loss": 0.0389, "step": 13500 }, { "epoch": 13.913491246138001, "grad_norm": 0.283448189496994, "learning_rate": 1.6075142873955164e-05, "loss": 0.0318, "step": 13510 }, { "epoch": 13.923789907312049, "grad_norm": 0.24017812311649323, "learning_rate": 1.6007719944768025e-05, "loss": 0.035, "step": 13520 }, { "epoch": 13.934088568486096, "grad_norm": 0.14648008346557617, "learning_rate": 1.594041174840894e-05, "loss": 0.0276, "step": 13530 }, { "epoch": 13.944387229660144, "grad_norm": 0.31949880719184875, "learning_rate": 1.587321851206061e-05, "loss": 0.0312, "step": 13540 }, { "epoch": 13.954685890834192, "grad_norm": 0.27566295862197876, "learning_rate": 1.5806140462517828e-05, "loss": 0.0308, "step": 13550 }, { "epoch": 13.96498455200824, "grad_norm": 0.221617192029953, "learning_rate": 1.573917782618651e-05, "loss": 0.033, "step": 13560 }, { "epoch": 13.975283213182287, "grad_norm": 0.15257342159748077, "learning_rate": 1.567233082908306e-05, "loss": 0.0272, "step": 13570 }, { "epoch": 13.985581874356333, "grad_norm": 0.31881460547447205, "learning_rate": 1.5605599696833544e-05, "loss": 0.036, "step": 13580 }, { "epoch": 13.99588053553038, "grad_norm": 0.21161913871765137, "learning_rate": 1.5538984654673016e-05, "loss": 0.0272, "step": 13590 }, { "epoch": 14.006179196704428, "grad_norm": 0.22538325190544128, "learning_rate": 1.5472485927444597e-05, "loss": 0.023, "step": 13600 }, { "epoch": 14.016477857878476, "grad_norm": 0.2999170422554016, "learning_rate": 1.5406103739598903e-05, "loss": 0.032, "step": 13610 }, { "epoch": 14.026776519052524, "grad_norm": 0.26565343141555786, "learning_rate": 1.5339838315193156e-05, "loss": 0.031, "step": 13620 }, { "epoch": 14.037075180226571, "grad_norm": 0.3137536942958832, "learning_rate": 1.5273689877890485e-05, "loss": 0.0302, "step": 13630 }, { "epoch": 14.047373841400617, "grad_norm": 0.1854087859392166, "learning_rate": 1.5207658650959138e-05, "loss": 0.0345, "step": 13640 }, { "epoch": 14.057672502574665, "grad_norm": 0.2928926348686218, "learning_rate": 1.5141744857271778e-05, "loss": 0.0334, "step": 13650 }, { "epoch": 14.067971163748712, "grad_norm": 0.42930635809898376, "learning_rate": 1.5075948719304672e-05, "loss": 0.0272, "step": 13660 }, { "epoch": 14.07826982492276, "grad_norm": 0.20846472680568695, "learning_rate": 1.5010270459136966e-05, "loss": 0.0331, "step": 13670 }, { "epoch": 14.088568486096808, "grad_norm": 0.2335253208875656, "learning_rate": 1.4944710298449999e-05, "loss": 0.0312, "step": 13680 }, { "epoch": 14.098867147270855, "grad_norm": 0.18406903743743896, "learning_rate": 1.4879268458526379e-05, "loss": 0.033, "step": 13690 }, { "epoch": 14.109165808444903, "grad_norm": 0.26444944739341736, "learning_rate": 1.481394516024947e-05, "loss": 0.0282, "step": 13700 }, { "epoch": 14.119464469618949, "grad_norm": 0.19681231677532196, "learning_rate": 1.4748740624102459e-05, "loss": 0.0354, "step": 13710 }, { "epoch": 14.129763130792997, "grad_norm": 0.22566291689872742, "learning_rate": 1.468365507016769e-05, "loss": 0.0327, "step": 13720 }, { "epoch": 14.140061791967044, "grad_norm": 0.24647872149944305, "learning_rate": 1.4618688718125929e-05, "loss": 0.0301, "step": 13730 }, { "epoch": 14.150360453141092, "grad_norm": 0.2727005183696747, "learning_rate": 1.455384178725555e-05, "loss": 0.0261, "step": 13740 }, { "epoch": 14.16065911431514, "grad_norm": 0.2636515200138092, "learning_rate": 1.4489114496431938e-05, "loss": 0.0362, "step": 13750 }, { "epoch": 14.170957775489187, "grad_norm": 0.24423463642597198, "learning_rate": 1.4424507064126597e-05, "loss": 0.0308, "step": 13760 }, { "epoch": 14.181256436663233, "grad_norm": 0.2822682559490204, "learning_rate": 1.4360019708406487e-05, "loss": 0.038, "step": 13770 }, { "epoch": 14.19155509783728, "grad_norm": 0.19930243492126465, "learning_rate": 1.4295652646933277e-05, "loss": 0.0291, "step": 13780 }, { "epoch": 14.201853759011328, "grad_norm": 0.1978948414325714, "learning_rate": 1.4231406096962669e-05, "loss": 0.0302, "step": 13790 }, { "epoch": 14.212152420185376, "grad_norm": 0.17142613232135773, "learning_rate": 1.4167280275343492e-05, "loss": 0.0257, "step": 13800 }, { "epoch": 14.222451081359424, "grad_norm": 0.2695595622062683, "learning_rate": 1.4103275398517197e-05, "loss": 0.0349, "step": 13810 }, { "epoch": 14.232749742533471, "grad_norm": 0.23960620164871216, "learning_rate": 1.4039391682516972e-05, "loss": 0.0307, "step": 13820 }, { "epoch": 14.243048403707519, "grad_norm": 0.279876172542572, "learning_rate": 1.3975629342967001e-05, "loss": 0.0334, "step": 13830 }, { "epoch": 14.253347064881565, "grad_norm": 0.260696142911911, "learning_rate": 1.3911988595081893e-05, "loss": 0.0316, "step": 13840 }, { "epoch": 14.263645726055612, "grad_norm": 0.24109739065170288, "learning_rate": 1.3848469653665786e-05, "loss": 0.0306, "step": 13850 }, { "epoch": 14.27394438722966, "grad_norm": 0.3289351165294647, "learning_rate": 1.378507273311171e-05, "loss": 0.0362, "step": 13860 }, { "epoch": 14.284243048403708, "grad_norm": 0.33488863706588745, "learning_rate": 1.3721798047400813e-05, "loss": 0.0408, "step": 13870 }, { "epoch": 14.294541709577755, "grad_norm": 3.9080820083618164, "learning_rate": 1.3658645810101755e-05, "loss": 0.0278, "step": 13880 }, { "epoch": 14.304840370751803, "grad_norm": 0.2996270954608917, "learning_rate": 1.3595616234369762e-05, "loss": 0.0277, "step": 13890 }, { "epoch": 14.315139031925849, "grad_norm": 0.2796926498413086, "learning_rate": 1.3532709532946186e-05, "loss": 0.0328, "step": 13900 }, { "epoch": 14.325437693099897, "grad_norm": 0.24468347430229187, "learning_rate": 1.3469925918157567e-05, "loss": 0.0327, "step": 13910 }, { "epoch": 14.335736354273944, "grad_norm": 0.23212593793869019, "learning_rate": 1.3407265601914976e-05, "loss": 0.0317, "step": 13920 }, { "epoch": 14.346035015447992, "grad_norm": 0.23879218101501465, "learning_rate": 1.3344728795713413e-05, "loss": 0.0365, "step": 13930 }, { "epoch": 14.35633367662204, "grad_norm": 0.2575908303260803, "learning_rate": 1.3282315710630882e-05, "loss": 0.0385, "step": 13940 }, { "epoch": 14.366632337796087, "grad_norm": 0.3186909556388855, "learning_rate": 1.3220026557327898e-05, "loss": 0.0403, "step": 13950 }, { "epoch": 14.376930998970133, "grad_norm": 0.2613557279109955, "learning_rate": 1.3157861546046613e-05, "loss": 0.0328, "step": 13960 }, { "epoch": 14.38722966014418, "grad_norm": 0.3558288514614105, "learning_rate": 1.3095820886610188e-05, "loss": 0.0293, "step": 13970 }, { "epoch": 14.397528321318228, "grad_norm": 0.2622450292110443, "learning_rate": 1.3033904788422047e-05, "loss": 0.0261, "step": 13980 }, { "epoch": 14.407826982492276, "grad_norm": 0.23433591425418854, "learning_rate": 1.2972113460465246e-05, "loss": 0.0286, "step": 13990 }, { "epoch": 14.418125643666324, "grad_norm": 0.2427792251110077, "learning_rate": 1.2910447111301604e-05, "loss": 0.0316, "step": 14000 }, { "epoch": 14.428424304840371, "grad_norm": 0.3044346570968628, "learning_rate": 1.284890594907121e-05, "loss": 0.0273, "step": 14010 }, { "epoch": 14.438722966014419, "grad_norm": 0.16404663026332855, "learning_rate": 1.2787490181491568e-05, "loss": 0.0257, "step": 14020 }, { "epoch": 14.449021627188465, "grad_norm": 0.26250144839286804, "learning_rate": 1.2726200015856892e-05, "loss": 0.0328, "step": 14030 }, { "epoch": 14.459320288362512, "grad_norm": 0.7278460264205933, "learning_rate": 1.2665035659037561e-05, "loss": 0.0297, "step": 14040 }, { "epoch": 14.46961894953656, "grad_norm": 0.34996357560157776, "learning_rate": 1.2603997317479238e-05, "loss": 0.0324, "step": 14050 }, { "epoch": 14.479917610710608, "grad_norm": 0.44799286127090454, "learning_rate": 1.2543085197202287e-05, "loss": 0.036, "step": 14060 }, { "epoch": 14.490216271884655, "grad_norm": 0.24697241187095642, "learning_rate": 1.2482299503801016e-05, "loss": 0.0315, "step": 14070 }, { "epoch": 14.500514933058703, "grad_norm": 0.3266669511795044, "learning_rate": 1.2421640442443055e-05, "loss": 0.0351, "step": 14080 }, { "epoch": 14.510813594232749, "grad_norm": 0.42595696449279785, "learning_rate": 1.2361108217868544e-05, "loss": 0.029, "step": 14090 }, { "epoch": 14.521112255406797, "grad_norm": 0.28600630164146423, "learning_rate": 1.23007030343896e-05, "loss": 0.0288, "step": 14100 }, { "epoch": 14.531410916580844, "grad_norm": 0.32830336689949036, "learning_rate": 1.2240425095889495e-05, "loss": 0.0323, "step": 14110 }, { "epoch": 14.541709577754892, "grad_norm": 0.23947954177856445, "learning_rate": 1.2180274605821989e-05, "loss": 0.0301, "step": 14120 }, { "epoch": 14.55200823892894, "grad_norm": 0.14854808151721954, "learning_rate": 1.2120251767210755e-05, "loss": 0.0305, "step": 14130 }, { "epoch": 14.562306900102987, "grad_norm": 0.4753403961658478, "learning_rate": 1.2060356782648503e-05, "loss": 0.0333, "step": 14140 }, { "epoch": 14.572605561277033, "grad_norm": 0.15201760828495026, "learning_rate": 1.2000589854296507e-05, "loss": 0.0348, "step": 14150 }, { "epoch": 14.58290422245108, "grad_norm": 0.36805441975593567, "learning_rate": 1.1940951183883742e-05, "loss": 0.0315, "step": 14160 }, { "epoch": 14.593202883625128, "grad_norm": 0.22207669913768768, "learning_rate": 1.1881440972706315e-05, "loss": 0.0299, "step": 14170 }, { "epoch": 14.603501544799176, "grad_norm": 0.27251651883125305, "learning_rate": 1.1822059421626724e-05, "loss": 0.0364, "step": 14180 }, { "epoch": 14.613800205973224, "grad_norm": 0.2771929204463959, "learning_rate": 1.1762806731073261e-05, "loss": 0.0272, "step": 14190 }, { "epoch": 14.624098867147271, "grad_norm": 0.2667066156864166, "learning_rate": 1.1703683101039197e-05, "loss": 0.0271, "step": 14200 }, { "epoch": 14.634397528321319, "grad_norm": 0.2355891466140747, "learning_rate": 1.1644688731082242e-05, "loss": 0.0299, "step": 14210 }, { "epoch": 14.644696189495365, "grad_norm": 0.39315053820610046, "learning_rate": 1.1585823820323843e-05, "loss": 0.0334, "step": 14220 }, { "epoch": 14.654994850669413, "grad_norm": 0.298880010843277, "learning_rate": 1.1527088567448407e-05, "loss": 0.0309, "step": 14230 }, { "epoch": 14.66529351184346, "grad_norm": 0.21369227766990662, "learning_rate": 1.1468483170702805e-05, "loss": 0.0271, "step": 14240 }, { "epoch": 14.675592173017508, "grad_norm": 0.21962594985961914, "learning_rate": 1.141000782789554e-05, "loss": 0.0296, "step": 14250 }, { "epoch": 14.685890834191555, "grad_norm": 0.3962979316711426, "learning_rate": 1.135166273639619e-05, "loss": 0.0361, "step": 14260 }, { "epoch": 14.696189495365603, "grad_norm": 0.2696010172367096, "learning_rate": 1.1293448093134656e-05, "loss": 0.0317, "step": 14270 }, { "epoch": 14.706488156539649, "grad_norm": 0.16473254561424255, "learning_rate": 1.1235364094600632e-05, "loss": 0.0259, "step": 14280 }, { "epoch": 14.716786817713697, "grad_norm": 0.18638800084590912, "learning_rate": 1.1177410936842719e-05, "loss": 0.0236, "step": 14290 }, { "epoch": 14.727085478887744, "grad_norm": 0.35101962089538574, "learning_rate": 1.1119588815468012e-05, "loss": 0.0266, "step": 14300 }, { "epoch": 14.737384140061792, "grad_norm": 0.2792340815067291, "learning_rate": 1.1061897925641296e-05, "loss": 0.0318, "step": 14310 }, { "epoch": 14.74768280123584, "grad_norm": 0.19751253724098206, "learning_rate": 1.100433846208434e-05, "loss": 0.0294, "step": 14320 }, { "epoch": 14.757981462409887, "grad_norm": 0.2783863842487335, "learning_rate": 1.094691061907544e-05, "loss": 0.0359, "step": 14330 }, { "epoch": 14.768280123583935, "grad_norm": 0.2864331305027008, "learning_rate": 1.088961459044852e-05, "loss": 0.0289, "step": 14340 }, { "epoch": 14.77857878475798, "grad_norm": 0.19958889484405518, "learning_rate": 1.0832450569592684e-05, "loss": 0.0296, "step": 14350 }, { "epoch": 14.788877445932028, "grad_norm": 0.2572004199028015, "learning_rate": 1.0775418749451427e-05, "loss": 0.0299, "step": 14360 }, { "epoch": 14.799176107106076, "grad_norm": 0.24685412645339966, "learning_rate": 1.0718519322522053e-05, "loss": 0.0346, "step": 14370 }, { "epoch": 14.809474768280124, "grad_norm": 0.2643430829048157, "learning_rate": 1.0661752480854975e-05, "loss": 0.0253, "step": 14380 }, { "epoch": 14.819773429454171, "grad_norm": 0.2792705297470093, "learning_rate": 1.0605118416053162e-05, "loss": 0.0295, "step": 14390 }, { "epoch": 14.830072090628219, "grad_norm": 0.4018799662590027, "learning_rate": 1.0548617319271342e-05, "loss": 0.034, "step": 14400 }, { "epoch": 14.840370751802265, "grad_norm": 0.20562392473220825, "learning_rate": 1.049224938121548e-05, "loss": 0.0386, "step": 14410 }, { "epoch": 14.850669412976313, "grad_norm": 0.2107439637184143, "learning_rate": 1.043601479214214e-05, "loss": 0.038, "step": 14420 }, { "epoch": 14.86096807415036, "grad_norm": 0.2785644829273224, "learning_rate": 1.0379913741857699e-05, "loss": 0.0308, "step": 14430 }, { "epoch": 14.871266735324408, "grad_norm": 0.23650747537612915, "learning_rate": 1.03239464197179e-05, "loss": 0.0312, "step": 14440 }, { "epoch": 14.881565396498456, "grad_norm": 0.2766387462615967, "learning_rate": 1.0268113014627073e-05, "loss": 0.0265, "step": 14450 }, { "epoch": 14.891864057672503, "grad_norm": 0.2568782567977905, "learning_rate": 1.021241371503755e-05, "loss": 0.037, "step": 14460 }, { "epoch": 14.90216271884655, "grad_norm": 0.18696804344654083, "learning_rate": 1.0156848708949006e-05, "loss": 0.0266, "step": 14470 }, { "epoch": 14.912461380020597, "grad_norm": 0.23785705864429474, "learning_rate": 1.0101418183907896e-05, "loss": 0.0304, "step": 14480 }, { "epoch": 14.922760041194644, "grad_norm": 0.2720486521720886, "learning_rate": 1.004612232700669e-05, "loss": 0.0359, "step": 14490 }, { "epoch": 14.933058702368692, "grad_norm": 0.21330799162387848, "learning_rate": 9.990961324883358e-06, "loss": 0.0288, "step": 14500 }, { "epoch": 14.94335736354274, "grad_norm": 0.24091622233390808, "learning_rate": 9.935935363720728e-06, "loss": 0.0275, "step": 14510 }, { "epoch": 14.953656024716787, "grad_norm": 0.34269654750823975, "learning_rate": 9.88104462924575e-06, "loss": 0.0323, "step": 14520 }, { "epoch": 14.963954685890835, "grad_norm": 0.23459886014461517, "learning_rate": 9.826289306729052e-06, "loss": 0.0293, "step": 14530 }, { "epoch": 14.97425334706488, "grad_norm": 0.27133437991142273, "learning_rate": 9.7716695809841e-06, "loss": 0.0329, "step": 14540 }, { "epoch": 14.984552008238929, "grad_norm": 0.24615567922592163, "learning_rate": 9.717185636366783e-06, "loss": 0.0317, "step": 14550 }, { "epoch": 14.994850669412976, "grad_norm": 0.26164570450782776, "learning_rate": 9.662837656774632e-06, "loss": 0.031, "step": 14560 }, { "epoch": 15.005149330587024, "grad_norm": 0.18910399079322815, "learning_rate": 9.608625825646288e-06, "loss": 0.0349, "step": 14570 }, { "epoch": 15.015447991761071, "grad_norm": 0.3117832541465759, "learning_rate": 9.554550325960853e-06, "loss": 0.032, "step": 14580 }, { "epoch": 15.02574665293512, "grad_norm": 0.22034838795661926, "learning_rate": 9.500611340237258e-06, "loss": 0.0301, "step": 14590 }, { "epoch": 15.036045314109165, "grad_norm": 0.2756035029888153, "learning_rate": 9.446809050533678e-06, "loss": 0.0272, "step": 14600 }, { "epoch": 15.046343975283213, "grad_norm": 0.3038906157016754, "learning_rate": 9.393143638446889e-06, "loss": 0.0327, "step": 14610 }, { "epoch": 15.05664263645726, "grad_norm": 0.22907866537570953, "learning_rate": 9.33961528511172e-06, "loss": 0.0307, "step": 14620 }, { "epoch": 15.066941297631308, "grad_norm": 0.4842381775379181, "learning_rate": 9.286224171200297e-06, "loss": 0.0284, "step": 14630 }, { "epoch": 15.077239958805356, "grad_norm": 0.8235160112380981, "learning_rate": 9.232970476921626e-06, "loss": 0.0336, "step": 14640 }, { "epoch": 15.087538619979403, "grad_norm": 0.4762952923774719, "learning_rate": 9.17985438202082e-06, "loss": 0.0315, "step": 14650 }, { "epoch": 15.097837281153451, "grad_norm": 0.20582009851932526, "learning_rate": 9.12687606577859e-06, "loss": 0.0283, "step": 14660 }, { "epoch": 15.108135942327497, "grad_norm": 0.20658078789710999, "learning_rate": 9.074035707010575e-06, "loss": 0.0277, "step": 14670 }, { "epoch": 15.118434603501544, "grad_norm": 0.2650274336338043, "learning_rate": 9.02133348406684e-06, "loss": 0.031, "step": 14680 }, { "epoch": 15.128733264675592, "grad_norm": 0.26044949889183044, "learning_rate": 8.968769574831115e-06, "loss": 0.0287, "step": 14690 }, { "epoch": 15.13903192584964, "grad_norm": 0.25187498331069946, "learning_rate": 8.916344156720335e-06, "loss": 0.0301, "step": 14700 }, { "epoch": 15.149330587023687, "grad_norm": 0.4505482017993927, "learning_rate": 8.864057406684023e-06, "loss": 0.0264, "step": 14710 }, { "epoch": 15.159629248197735, "grad_norm": 0.2146962434053421, "learning_rate": 8.81190950120357e-06, "loss": 0.0386, "step": 14720 }, { "epoch": 15.169927909371781, "grad_norm": 0.17643073201179504, "learning_rate": 8.759900616291834e-06, "loss": 0.0271, "step": 14730 }, { "epoch": 15.180226570545829, "grad_norm": 0.3004768192768097, "learning_rate": 8.708030927492345e-06, "loss": 0.034, "step": 14740 }, { "epoch": 15.190525231719876, "grad_norm": 0.33159592747688293, "learning_rate": 8.656300609878898e-06, "loss": 0.033, "step": 14750 }, { "epoch": 15.200823892893924, "grad_norm": 0.2567281126976013, "learning_rate": 8.604709838054813e-06, "loss": 0.0325, "step": 14760 }, { "epoch": 15.211122554067972, "grad_norm": 0.20799218118190765, "learning_rate": 8.55325878615244e-06, "loss": 0.0317, "step": 14770 }, { "epoch": 15.22142121524202, "grad_norm": 0.2914055585861206, "learning_rate": 8.501947627832507e-06, "loss": 0.0308, "step": 14780 }, { "epoch": 15.231719876416065, "grad_norm": 0.24458810687065125, "learning_rate": 8.450776536283594e-06, "loss": 0.0359, "step": 14790 }, { "epoch": 15.242018537590113, "grad_norm": 0.30409494042396545, "learning_rate": 8.399745684221499e-06, "loss": 0.0357, "step": 14800 }, { "epoch": 15.25231719876416, "grad_norm": 0.2720089852809906, "learning_rate": 8.348855243888681e-06, "loss": 0.0344, "step": 14810 }, { "epoch": 15.262615859938208, "grad_norm": 0.25461846590042114, "learning_rate": 8.2981053870537e-06, "loss": 0.0325, "step": 14820 }, { "epoch": 15.272914521112256, "grad_norm": 0.2355855405330658, "learning_rate": 8.247496285010548e-06, "loss": 0.0276, "step": 14830 }, { "epoch": 15.283213182286303, "grad_norm": 0.1807708442211151, "learning_rate": 8.197028108578197e-06, "loss": 0.03, "step": 14840 }, { "epoch": 15.293511843460351, "grad_norm": 0.21903660893440247, "learning_rate": 8.146701028099917e-06, "loss": 0.0254, "step": 14850 }, { "epoch": 15.303810504634397, "grad_norm": 0.5081159472465515, "learning_rate": 8.096515213442762e-06, "loss": 0.0276, "step": 14860 }, { "epoch": 15.314109165808445, "grad_norm": 0.22669517993927002, "learning_rate": 8.046470833996973e-06, "loss": 0.0272, "step": 14870 }, { "epoch": 15.324407826982492, "grad_norm": 0.2578093409538269, "learning_rate": 7.996568058675402e-06, "loss": 0.0304, "step": 14880 }, { "epoch": 15.33470648815654, "grad_norm": 0.20256255567073822, "learning_rate": 7.946807055912959e-06, "loss": 0.0292, "step": 14890 }, { "epoch": 15.345005149330587, "grad_norm": 0.2500031888484955, "learning_rate": 7.897187993666022e-06, "loss": 0.0315, "step": 14900 }, { "epoch": 15.355303810504635, "grad_norm": 0.2907675802707672, "learning_rate": 7.84771103941192e-06, "loss": 0.0341, "step": 14910 }, { "epoch": 15.365602471678681, "grad_norm": 0.1547321081161499, "learning_rate": 7.79837636014827e-06, "loss": 0.0249, "step": 14920 }, { "epoch": 15.375901132852729, "grad_norm": 0.2814120054244995, "learning_rate": 7.749184122392539e-06, "loss": 0.0365, "step": 14930 }, { "epoch": 15.386199794026776, "grad_norm": 0.37319841980934143, "learning_rate": 7.700134492181344e-06, "loss": 0.0274, "step": 14940 }, { "epoch": 15.396498455200824, "grad_norm": 0.24200180172920227, "learning_rate": 7.651227635070041e-06, "loss": 0.0306, "step": 14950 }, { "epoch": 15.406797116374872, "grad_norm": 0.6322610378265381, "learning_rate": 7.602463716132041e-06, "loss": 0.0279, "step": 14960 }, { "epoch": 15.41709577754892, "grad_norm": 0.43964508175849915, "learning_rate": 7.553842899958308e-06, "loss": 0.032, "step": 14970 }, { "epoch": 15.427394438722967, "grad_norm": 0.3598411977291107, "learning_rate": 7.505365350656812e-06, "loss": 0.0275, "step": 14980 }, { "epoch": 15.437693099897013, "grad_norm": 0.19508050382137299, "learning_rate": 7.457031231851941e-06, "loss": 0.034, "step": 14990 }, { "epoch": 15.44799176107106, "grad_norm": 0.29256248474121094, "learning_rate": 7.4088407066839784e-06, "loss": 0.0387, "step": 15000 }, { "epoch": 15.458290422245108, "grad_norm": 0.2301289290189743, "learning_rate": 7.36079393780853e-06, "loss": 0.0311, "step": 15010 }, { "epoch": 15.468589083419156, "grad_norm": 0.29095834493637085, "learning_rate": 7.312891087396034e-06, "loss": 0.0259, "step": 15020 }, { "epoch": 15.478887744593203, "grad_norm": 0.2932276129722595, "learning_rate": 7.2651323171310795e-06, "loss": 0.0293, "step": 15030 }, { "epoch": 15.489186405767251, "grad_norm": 0.24277035892009735, "learning_rate": 7.217517788212025e-06, "loss": 0.0334, "step": 15040 }, { "epoch": 15.499485066941297, "grad_norm": 0.23208442330360413, "learning_rate": 7.170047661350349e-06, "loss": 0.0296, "step": 15050 }, { "epoch": 15.509783728115345, "grad_norm": 0.1625526398420334, "learning_rate": 7.122722096770123e-06, "loss": 0.0283, "step": 15060 }, { "epoch": 15.520082389289392, "grad_norm": 0.29437604546546936, "learning_rate": 7.075541254207502e-06, "loss": 0.0284, "step": 15070 }, { "epoch": 15.53038105046344, "grad_norm": 0.3337920308113098, "learning_rate": 7.028505292910154e-06, "loss": 0.0235, "step": 15080 }, { "epoch": 15.540679711637488, "grad_norm": 0.16761137545108795, "learning_rate": 6.981614371636747e-06, "loss": 0.0261, "step": 15090 }, { "epoch": 15.550978372811535, "grad_norm": 0.18191471695899963, "learning_rate": 6.934868648656373e-06, "loss": 0.0273, "step": 15100 }, { "epoch": 15.561277033985581, "grad_norm": 0.2083984911441803, "learning_rate": 6.8882682817481006e-06, "loss": 0.0339, "step": 15110 }, { "epoch": 15.571575695159629, "grad_norm": 0.33254730701446533, "learning_rate": 6.841813428200306e-06, "loss": 0.0335, "step": 15120 }, { "epoch": 15.581874356333676, "grad_norm": 0.22721487283706665, "learning_rate": 6.795504244810285e-06, "loss": 0.0284, "step": 15130 }, { "epoch": 15.592173017507724, "grad_norm": 0.3968798816204071, "learning_rate": 6.749340887883626e-06, "loss": 0.0326, "step": 15140 }, { "epoch": 15.602471678681772, "grad_norm": 0.1721322387456894, "learning_rate": 6.7033235132337225e-06, "loss": 0.0267, "step": 15150 }, { "epoch": 15.61277033985582, "grad_norm": 0.3585062026977539, "learning_rate": 6.6574522761812366e-06, "loss": 0.0297, "step": 15160 }, { "epoch": 15.623069001029865, "grad_norm": 0.45918750762939453, "learning_rate": 6.611727331553586e-06, "loss": 0.0275, "step": 15170 }, { "epoch": 15.633367662203913, "grad_norm": 0.3067721724510193, "learning_rate": 6.566148833684399e-06, "loss": 0.0287, "step": 15180 }, { "epoch": 15.64366632337796, "grad_norm": 0.2751639187335968, "learning_rate": 6.520716936413018e-06, "loss": 0.0295, "step": 15190 }, { "epoch": 15.653964984552008, "grad_norm": 0.21889840066432953, "learning_rate": 6.475431793083974e-06, "loss": 0.0321, "step": 15200 }, { "epoch": 15.664263645726056, "grad_norm": 0.3290077745914459, "learning_rate": 6.4302935565464514e-06, "loss": 0.031, "step": 15210 }, { "epoch": 15.674562306900103, "grad_norm": 0.5243391394615173, "learning_rate": 6.385302379153818e-06, "loss": 0.0248, "step": 15220 }, { "epoch": 15.684860968074151, "grad_norm": 1.0162177085876465, "learning_rate": 6.3404584127630115e-06, "loss": 0.0243, "step": 15230 }, { "epoch": 15.695159629248197, "grad_norm": 0.33608901500701904, "learning_rate": 6.295761808734174e-06, "loss": 0.0307, "step": 15240 }, { "epoch": 15.705458290422245, "grad_norm": 0.2736285626888275, "learning_rate": 6.251212717930017e-06, "loss": 0.0341, "step": 15250 }, { "epoch": 15.715756951596292, "grad_norm": 0.3048650920391083, "learning_rate": 6.206811290715353e-06, "loss": 0.035, "step": 15260 }, { "epoch": 15.72605561277034, "grad_norm": 0.2898007929325104, "learning_rate": 6.16255767695661e-06, "loss": 0.0304, "step": 15270 }, { "epoch": 15.736354273944388, "grad_norm": 0.2866269052028656, "learning_rate": 6.118452026021299e-06, "loss": 0.0344, "step": 15280 }, { "epoch": 15.746652935118435, "grad_norm": 0.29790258407592773, "learning_rate": 6.07449448677751e-06, "loss": 0.0333, "step": 15290 }, { "epoch": 15.756951596292481, "grad_norm": 0.33838725090026855, "learning_rate": 6.030685207593423e-06, "loss": 0.0345, "step": 15300 }, { "epoch": 15.767250257466529, "grad_norm": 0.28657403588294983, "learning_rate": 5.9870243363368275e-06, "loss": 0.0321, "step": 15310 }, { "epoch": 15.777548918640576, "grad_norm": 0.34499257802963257, "learning_rate": 5.943512020374537e-06, "loss": 0.0367, "step": 15320 }, { "epoch": 15.787847579814624, "grad_norm": 0.2314077764749527, "learning_rate": 5.90014840657202e-06, "loss": 0.0351, "step": 15330 }, { "epoch": 15.798146240988672, "grad_norm": 0.40013644099235535, "learning_rate": 5.856933641292789e-06, "loss": 0.0305, "step": 15340 }, { "epoch": 15.80844490216272, "grad_norm": 0.6308583617210388, "learning_rate": 5.813867870397977e-06, "loss": 0.0331, "step": 15350 }, { "epoch": 15.818743563336767, "grad_norm": 0.3136028051376343, "learning_rate": 5.770951239245803e-06, "loss": 0.0313, "step": 15360 }, { "epoch": 15.829042224510813, "grad_norm": 0.18756185472011566, "learning_rate": 5.72818389269113e-06, "loss": 0.0261, "step": 15370 }, { "epoch": 15.83934088568486, "grad_norm": 0.22854579985141754, "learning_rate": 5.685565975084911e-06, "loss": 0.0307, "step": 15380 }, { "epoch": 15.849639546858908, "grad_norm": 0.18659406900405884, "learning_rate": 5.643097630273769e-06, "loss": 0.0293, "step": 15390 }, { "epoch": 15.859938208032956, "grad_norm": 0.2682023048400879, "learning_rate": 5.600779001599455e-06, "loss": 0.0339, "step": 15400 }, { "epoch": 15.870236869207003, "grad_norm": 0.29009154438972473, "learning_rate": 5.558610231898393e-06, "loss": 0.037, "step": 15410 }, { "epoch": 15.880535530381051, "grad_norm": 0.32601863145828247, "learning_rate": 5.516591463501231e-06, "loss": 0.0322, "step": 15420 }, { "epoch": 15.890834191555097, "grad_norm": 0.25241759419441223, "learning_rate": 5.474722838232254e-06, "loss": 0.0335, "step": 15430 }, { "epoch": 15.901132852729145, "grad_norm": 0.34431523084640503, "learning_rate": 5.433004497409039e-06, "loss": 0.027, "step": 15440 }, { "epoch": 15.911431513903192, "grad_norm": 0.24490360915660858, "learning_rate": 5.391436581841886e-06, "loss": 0.0287, "step": 15450 }, { "epoch": 15.92173017507724, "grad_norm": 0.25288495421409607, "learning_rate": 5.350019231833364e-06, "loss": 0.0301, "step": 15460 }, { "epoch": 15.932028836251288, "grad_norm": 0.23814049363136292, "learning_rate": 5.3087525871778565e-06, "loss": 0.0291, "step": 15470 }, { "epoch": 15.942327497425335, "grad_norm": 0.2367774397134781, "learning_rate": 5.2676367871610675e-06, "loss": 0.0325, "step": 15480 }, { "epoch": 15.952626158599383, "grad_norm": 0.20925898849964142, "learning_rate": 5.226671970559577e-06, "loss": 0.0307, "step": 15490 }, { "epoch": 15.962924819773429, "grad_norm": 0.36154627799987793, "learning_rate": 5.185858275640332e-06, "loss": 0.0328, "step": 15500 }, { "epoch": 15.973223480947476, "grad_norm": 0.25385522842407227, "learning_rate": 5.145195840160239e-06, "loss": 0.0299, "step": 15510 }, { "epoch": 15.983522142121524, "grad_norm": 0.25496914982795715, "learning_rate": 5.1046848013656165e-06, "loss": 0.0292, "step": 15520 }, { "epoch": 15.993820803295572, "grad_norm": 0.2563509941101074, "learning_rate": 5.064325295991829e-06, "loss": 0.0284, "step": 15530 }, { "epoch": 16.004119464469618, "grad_norm": 0.2616461217403412, "learning_rate": 5.024117460262751e-06, "loss": 0.0439, "step": 15540 }, { "epoch": 16.014418125643665, "grad_norm": 0.3009835481643677, "learning_rate": 4.984061429890324e-06, "loss": 0.0304, "step": 15550 }, { "epoch": 16.024716786817713, "grad_norm": 0.29534780979156494, "learning_rate": 4.94415734007413e-06, "loss": 0.0319, "step": 15560 }, { "epoch": 16.03501544799176, "grad_norm": 0.21110209822654724, "learning_rate": 4.9044053255008935e-06, "loss": 0.0309, "step": 15570 }, { "epoch": 16.04531410916581, "grad_norm": 0.257237046957016, "learning_rate": 4.864805520344051e-06, "loss": 0.0274, "step": 15580 }, { "epoch": 16.055612770339856, "grad_norm": 0.3104022741317749, "learning_rate": 4.8253580582632906e-06, "loss": 0.0294, "step": 15590 }, { "epoch": 16.065911431513904, "grad_norm": 0.1543678343296051, "learning_rate": 4.786063072404112e-06, "loss": 0.0247, "step": 15600 }, { "epoch": 16.07621009268795, "grad_norm": 0.18241259455680847, "learning_rate": 4.7469206953973495e-06, "loss": 0.0245, "step": 15610 }, { "epoch": 16.086508753862, "grad_norm": 0.18561235070228577, "learning_rate": 4.707931059358783e-06, "loss": 0.0282, "step": 15620 }, { "epoch": 16.096807415036047, "grad_norm": 0.36796221137046814, "learning_rate": 4.669094295888588e-06, "loss": 0.0323, "step": 15630 }, { "epoch": 16.107106076210094, "grad_norm": 0.21030554175376892, "learning_rate": 4.630410536071006e-06, "loss": 0.0271, "step": 15640 }, { "epoch": 16.117404737384142, "grad_norm": 0.23774808645248413, "learning_rate": 4.59187991047384e-06, "loss": 0.0319, "step": 15650 }, { "epoch": 16.127703398558186, "grad_norm": 0.16403083503246307, "learning_rate": 4.553502549148009e-06, "loss": 0.0339, "step": 15660 }, { "epoch": 16.138002059732234, "grad_norm": 0.23186904191970825, "learning_rate": 4.515278581627141e-06, "loss": 0.0301, "step": 15670 }, { "epoch": 16.14830072090628, "grad_norm": 0.24327369034290314, "learning_rate": 4.477208136927119e-06, "loss": 0.0308, "step": 15680 }, { "epoch": 16.15859938208033, "grad_norm": 0.2953716814517975, "learning_rate": 4.439291343545643e-06, "loss": 0.0281, "step": 15690 }, { "epoch": 16.168898043254377, "grad_norm": 0.24078382551670074, "learning_rate": 4.401528329461779e-06, "loss": 0.0304, "step": 15700 }, { "epoch": 16.179196704428424, "grad_norm": 0.3598305583000183, "learning_rate": 4.363919222135604e-06, "loss": 0.0279, "step": 15710 }, { "epoch": 16.189495365602472, "grad_norm": 0.18711034953594208, "learning_rate": 4.326464148507647e-06, "loss": 0.0289, "step": 15720 }, { "epoch": 16.19979402677652, "grad_norm": 0.3203088045120239, "learning_rate": 4.289163234998589e-06, "loss": 0.0334, "step": 15730 }, { "epoch": 16.210092687950567, "grad_norm": 0.2985017001628876, "learning_rate": 4.2520166075087635e-06, "loss": 0.0246, "step": 15740 }, { "epoch": 16.220391349124615, "grad_norm": 0.25471287965774536, "learning_rate": 4.2150243914177325e-06, "loss": 0.029, "step": 15750 }, { "epoch": 16.230690010298662, "grad_norm": 0.22707876563072205, "learning_rate": 4.178186711583904e-06, "loss": 0.0258, "step": 15760 }, { "epoch": 16.24098867147271, "grad_norm": 0.2530466914176941, "learning_rate": 4.141503692344062e-06, "loss": 0.0324, "step": 15770 }, { "epoch": 16.251287332646754, "grad_norm": 0.23593966662883759, "learning_rate": 4.1049754575129935e-06, "loss": 0.0299, "step": 15780 }, { "epoch": 16.261585993820802, "grad_norm": 0.26746660470962524, "learning_rate": 4.068602130383031e-06, "loss": 0.025, "step": 15790 }, { "epoch": 16.27188465499485, "grad_norm": 0.3687654733657837, "learning_rate": 4.032383833723657e-06, "loss": 0.0344, "step": 15800 }, { "epoch": 16.282183316168897, "grad_norm": 0.26962026953697205, "learning_rate": 3.99632068978108e-06, "loss": 0.0315, "step": 15810 }, { "epoch": 16.292481977342945, "grad_norm": 0.3096659779548645, "learning_rate": 3.960412820277865e-06, "loss": 0.0241, "step": 15820 }, { "epoch": 16.302780638516992, "grad_norm": 0.3644077777862549, "learning_rate": 3.924660346412418e-06, "loss": 0.0348, "step": 15830 }, { "epoch": 16.31307929969104, "grad_norm": 0.2755933701992035, "learning_rate": 3.8890633888587046e-06, "loss": 0.0309, "step": 15840 }, { "epoch": 16.323377960865088, "grad_norm": 0.5915675163269043, "learning_rate": 3.8536220677657495e-06, "loss": 0.0314, "step": 15850 }, { "epoch": 16.333676622039135, "grad_norm": 0.2403060346841812, "learning_rate": 3.8183365027572805e-06, "loss": 0.0304, "step": 15860 }, { "epoch": 16.343975283213183, "grad_norm": 0.24288389086723328, "learning_rate": 3.783206812931289e-06, "loss": 0.0291, "step": 15870 }, { "epoch": 16.35427394438723, "grad_norm": 0.3532700836658478, "learning_rate": 3.7482331168596675e-06, "loss": 0.0289, "step": 15880 }, { "epoch": 16.36457260556128, "grad_norm": 0.18153394758701324, "learning_rate": 3.7134155325877772e-06, "loss": 0.0329, "step": 15890 }, { "epoch": 16.374871266735326, "grad_norm": 0.4066762924194336, "learning_rate": 3.678754177634053e-06, "loss": 0.0293, "step": 15900 }, { "epoch": 16.38516992790937, "grad_norm": 0.33672627806663513, "learning_rate": 3.64424916898965e-06, "loss": 0.0303, "step": 15910 }, { "epoch": 16.395468589083418, "grad_norm": 0.273366242647171, "learning_rate": 3.6099006231179622e-06, "loss": 0.0307, "step": 15920 }, { "epoch": 16.405767250257465, "grad_norm": 0.22325216233730316, "learning_rate": 3.575708655954324e-06, "loss": 0.0327, "step": 15930 }, { "epoch": 16.416065911431513, "grad_norm": 0.18643653392791748, "learning_rate": 3.541673382905558e-06, "loss": 0.0346, "step": 15940 }, { "epoch": 16.42636457260556, "grad_norm": 0.2503977119922638, "learning_rate": 3.5077949188495996e-06, "loss": 0.033, "step": 15950 }, { "epoch": 16.43666323377961, "grad_norm": 0.29063940048217773, "learning_rate": 3.474073378135123e-06, "loss": 0.0286, "step": 15960 }, { "epoch": 16.446961894953656, "grad_norm": 0.2275126725435257, "learning_rate": 3.440508874581139e-06, "loss": 0.0321, "step": 15970 }, { "epoch": 16.457260556127704, "grad_norm": 0.24945175647735596, "learning_rate": 3.4071015214766134e-06, "loss": 0.0312, "step": 15980 }, { "epoch": 16.46755921730175, "grad_norm": 0.4091668725013733, "learning_rate": 3.3738514315800995e-06, "loss": 0.0351, "step": 15990 }, { "epoch": 16.4778578784758, "grad_norm": 0.20869703590869904, "learning_rate": 3.3407587171193354e-06, "loss": 0.0262, "step": 16000 }, { "epoch": 16.488156539649847, "grad_norm": 0.19803866744041443, "learning_rate": 3.3078234897908788e-06, "loss": 0.0293, "step": 16010 }, { "epoch": 16.498455200823894, "grad_norm": 0.24785685539245605, "learning_rate": 3.2750458607597457e-06, "loss": 0.0295, "step": 16020 }, { "epoch": 16.508753861997942, "grad_norm": 0.23679105937480927, "learning_rate": 3.2424259406589664e-06, "loss": 0.0269, "step": 16030 }, { "epoch": 16.519052523171986, "grad_norm": 0.21375852823257446, "learning_rate": 3.209963839589325e-06, "loss": 0.0236, "step": 16040 }, { "epoch": 16.529351184346034, "grad_norm": 0.1723773181438446, "learning_rate": 3.177659667118882e-06, "loss": 0.0312, "step": 16050 }, { "epoch": 16.53964984552008, "grad_norm": 0.24385997653007507, "learning_rate": 3.1455135322826678e-06, "loss": 0.0301, "step": 16060 }, { "epoch": 16.54994850669413, "grad_norm": 0.2073340266942978, "learning_rate": 3.1135255435822796e-06, "loss": 0.0286, "step": 16070 }, { "epoch": 16.560247167868177, "grad_norm": 0.2794674336910248, "learning_rate": 3.0816958089855462e-06, "loss": 0.0265, "step": 16080 }, { "epoch": 16.570545829042224, "grad_norm": 0.2308894544839859, "learning_rate": 3.0500244359261355e-06, "loss": 0.0284, "step": 16090 }, { "epoch": 16.580844490216272, "grad_norm": 0.2674751579761505, "learning_rate": 3.018511531303203e-06, "loss": 0.0282, "step": 16100 }, { "epoch": 16.59114315139032, "grad_norm": 0.20278188586235046, "learning_rate": 2.9871572014810555e-06, "loss": 0.0272, "step": 16110 }, { "epoch": 16.601441812564367, "grad_norm": 0.20840872824192047, "learning_rate": 2.9559615522887273e-06, "loss": 0.0358, "step": 16120 }, { "epoch": 16.611740473738415, "grad_norm": 0.26591232419013977, "learning_rate": 2.924924689019698e-06, "loss": 0.0262, "step": 16130 }, { "epoch": 16.622039134912463, "grad_norm": 0.22082144021987915, "learning_rate": 2.8940467164314924e-06, "loss": 0.0321, "step": 16140 }, { "epoch": 16.63233779608651, "grad_norm": 0.2413538098335266, "learning_rate": 2.8633277387453308e-06, "loss": 0.0377, "step": 16150 }, { "epoch": 16.642636457260558, "grad_norm": 0.2731287479400635, "learning_rate": 2.8327678596457963e-06, "loss": 0.031, "step": 16160 }, { "epoch": 16.652935118434602, "grad_norm": 0.18613195419311523, "learning_rate": 2.802367182280463e-06, "loss": 0.0367, "step": 16170 }, { "epoch": 16.66323377960865, "grad_norm": 0.19616888463497162, "learning_rate": 2.7721258092595627e-06, "loss": 0.0265, "step": 16180 }, { "epoch": 16.673532440782697, "grad_norm": 0.20527370274066925, "learning_rate": 2.7420438426556338e-06, "loss": 0.0331, "step": 16190 }, { "epoch": 16.683831101956745, "grad_norm": 0.21385008096694946, "learning_rate": 2.712121384003169e-06, "loss": 0.0271, "step": 16200 }, { "epoch": 16.694129763130793, "grad_norm": 0.2785768210887909, "learning_rate": 2.682358534298285e-06, "loss": 0.0365, "step": 16210 }, { "epoch": 16.70442842430484, "grad_norm": 0.2710186243057251, "learning_rate": 2.652755393998396e-06, "loss": 0.0245, "step": 16220 }, { "epoch": 16.714727085478888, "grad_norm": 0.2453254610300064, "learning_rate": 2.6233120630218045e-06, "loss": 0.0327, "step": 16230 }, { "epoch": 16.725025746652936, "grad_norm": 0.2788352072238922, "learning_rate": 2.594028640747476e-06, "loss": 0.0292, "step": 16240 }, { "epoch": 16.735324407826983, "grad_norm": 0.4019950032234192, "learning_rate": 2.564905226014597e-06, "loss": 0.029, "step": 16250 }, { "epoch": 16.74562306900103, "grad_norm": 0.2551436424255371, "learning_rate": 2.5359419171223086e-06, "loss": 0.0296, "step": 16260 }, { "epoch": 16.75592173017508, "grad_norm": 0.2889397442340851, "learning_rate": 2.507138811829346e-06, "loss": 0.033, "step": 16270 }, { "epoch": 16.766220391349126, "grad_norm": 0.25674816966056824, "learning_rate": 2.4784960073537143e-06, "loss": 0.0267, "step": 16280 }, { "epoch": 16.77651905252317, "grad_norm": 0.21177352964878082, "learning_rate": 2.4500136003723638e-06, "loss": 0.0262, "step": 16290 }, { "epoch": 16.786817713697218, "grad_norm": 0.21103815734386444, "learning_rate": 2.421691687020855e-06, "loss": 0.0295, "step": 16300 }, { "epoch": 16.797116374871266, "grad_norm": 0.26780322194099426, "learning_rate": 2.3935303628930707e-06, "loss": 0.0327, "step": 16310 }, { "epoch": 16.807415036045313, "grad_norm": 0.49311545491218567, "learning_rate": 2.3655297230408045e-06, "loss": 0.03, "step": 16320 }, { "epoch": 16.81771369721936, "grad_norm": 0.2364225834608078, "learning_rate": 2.3376898619735577e-06, "loss": 0.0276, "step": 16330 }, { "epoch": 16.82801235839341, "grad_norm": 0.29716435074806213, "learning_rate": 2.3100108736581305e-06, "loss": 0.027, "step": 16340 }, { "epoch": 16.838311019567456, "grad_norm": 0.20759916305541992, "learning_rate": 2.282492851518342e-06, "loss": 0.0275, "step": 16350 }, { "epoch": 16.848609680741504, "grad_norm": 0.1657613217830658, "learning_rate": 2.2551358884347007e-06, "loss": 0.0273, "step": 16360 }, { "epoch": 16.85890834191555, "grad_norm": 0.16528256237506866, "learning_rate": 2.227940076744117e-06, "loss": 0.0309, "step": 16370 }, { "epoch": 16.8692070030896, "grad_norm": 0.28386402130126953, "learning_rate": 2.2009055082395537e-06, "loss": 0.0324, "step": 16380 }, { "epoch": 16.879505664263647, "grad_norm": 0.23188601434230804, "learning_rate": 2.174032274169746e-06, "loss": 0.0283, "step": 16390 }, { "epoch": 16.889804325437694, "grad_norm": 0.34195181727409363, "learning_rate": 2.1473204652388834e-06, "loss": 0.031, "step": 16400 }, { "epoch": 16.900102986611742, "grad_norm": 0.19225898385047913, "learning_rate": 2.1207701716062956e-06, "loss": 0.0374, "step": 16410 }, { "epoch": 16.910401647785786, "grad_norm": 0.4472239911556244, "learning_rate": 2.0943814828861762e-06, "loss": 0.0304, "step": 16420 }, { "epoch": 16.920700308959834, "grad_norm": 0.26532843708992004, "learning_rate": 2.0681544881472283e-06, "loss": 0.0291, "step": 16430 }, { "epoch": 16.93099897013388, "grad_norm": 0.27116134762763977, "learning_rate": 2.0420892759124176e-06, "loss": 0.0224, "step": 16440 }, { "epoch": 16.94129763130793, "grad_norm": 0.3424379825592041, "learning_rate": 2.0161859341586597e-06, "loss": 0.0274, "step": 16450 }, { "epoch": 16.951596292481977, "grad_norm": 0.23772460222244263, "learning_rate": 1.9904445503164838e-06, "loss": 0.0308, "step": 16460 }, { "epoch": 16.961894953656024, "grad_norm": 0.23013190925121307, "learning_rate": 1.964865211269801e-06, "loss": 0.0265, "step": 16470 }, { "epoch": 16.972193614830072, "grad_norm": 0.2528025805950165, "learning_rate": 1.939448003355554e-06, "loss": 0.0342, "step": 16480 }, { "epoch": 16.98249227600412, "grad_norm": 0.39106324315071106, "learning_rate": 1.914193012363469e-06, "loss": 0.0326, "step": 16490 }, { "epoch": 16.992790937178167, "grad_norm": 0.4082978069782257, "learning_rate": 1.8891003235357308e-06, "loss": 0.0321, "step": 16500 }, { "epoch": 17.003089598352215, "grad_norm": 0.1785215586423874, "learning_rate": 1.8641700215667413e-06, "loss": 0.0265, "step": 16510 }, { "epoch": 17.013388259526263, "grad_norm": 0.5540566444396973, "learning_rate": 1.839402190602757e-06, "loss": 0.0281, "step": 16520 }, { "epoch": 17.02368692070031, "grad_norm": 0.2588430941104889, "learning_rate": 1.8147969142417066e-06, "loss": 0.0284, "step": 16530 }, { "epoch": 17.033985581874358, "grad_norm": 0.3563145399093628, "learning_rate": 1.7903542755328073e-06, "loss": 0.0308, "step": 16540 }, { "epoch": 17.044284243048402, "grad_norm": 0.303353488445282, "learning_rate": 1.766074356976366e-06, "loss": 0.0302, "step": 16550 }, { "epoch": 17.05458290422245, "grad_norm": 0.24329645931720734, "learning_rate": 1.7419572405234453e-06, "loss": 0.0282, "step": 16560 }, { "epoch": 17.064881565396497, "grad_norm": 0.212374746799469, "learning_rate": 1.7180030075756136e-06, "loss": 0.0298, "step": 16570 }, { "epoch": 17.075180226570545, "grad_norm": 0.22339214384555817, "learning_rate": 1.6942117389846746e-06, "loss": 0.0314, "step": 16580 }, { "epoch": 17.085478887744593, "grad_norm": 0.2897525131702423, "learning_rate": 1.6705835150523707e-06, "loss": 0.0331, "step": 16590 }, { "epoch": 17.09577754891864, "grad_norm": 0.20139732956886292, "learning_rate": 1.6471184155301355e-06, "loss": 0.0271, "step": 16600 }, { "epoch": 17.106076210092688, "grad_norm": 0.30817776918411255, "learning_rate": 1.6238165196188039e-06, "loss": 0.0288, "step": 16610 }, { "epoch": 17.116374871266736, "grad_norm": 0.23742049932479858, "learning_rate": 1.6006779059683784e-06, "loss": 0.0317, "step": 16620 }, { "epoch": 17.126673532440783, "grad_norm": 0.2712803781032562, "learning_rate": 1.5777026526777094e-06, "loss": 0.029, "step": 16630 }, { "epoch": 17.13697219361483, "grad_norm": 0.19828765094280243, "learning_rate": 1.5548908372942983e-06, "loss": 0.0315, "step": 16640 }, { "epoch": 17.14727085478888, "grad_norm": 0.27912184596061707, "learning_rate": 1.5322425368139714e-06, "loss": 0.0293, "step": 16650 }, { "epoch": 17.157569515962926, "grad_norm": 0.41649627685546875, "learning_rate": 1.5097578276806633e-06, "loss": 0.0299, "step": 16660 }, { "epoch": 17.167868177136974, "grad_norm": 0.20297054946422577, "learning_rate": 1.487436785786145e-06, "loss": 0.0313, "step": 16670 }, { "epoch": 17.178166838311018, "grad_norm": 0.38883742690086365, "learning_rate": 1.4652794864697671e-06, "loss": 0.0293, "step": 16680 }, { "epoch": 17.188465499485066, "grad_norm": 0.2401762455701828, "learning_rate": 1.4432860045182017e-06, "loss": 0.0282, "step": 16690 }, { "epoch": 17.198764160659113, "grad_norm": 0.3450429141521454, "learning_rate": 1.4214564141651898e-06, "loss": 0.0249, "step": 16700 }, { "epoch": 17.20906282183316, "grad_norm": 0.17480014264583588, "learning_rate": 1.3997907890913265e-06, "loss": 0.0271, "step": 16710 }, { "epoch": 17.21936148300721, "grad_norm": 0.2633569538593292, "learning_rate": 1.3782892024237327e-06, "loss": 0.0282, "step": 16720 }, { "epoch": 17.229660144181256, "grad_norm": 0.22684310376644135, "learning_rate": 1.3569517267359e-06, "loss": 0.0325, "step": 16730 }, { "epoch": 17.239958805355304, "grad_norm": 0.30432412028312683, "learning_rate": 1.33577843404738e-06, "loss": 0.027, "step": 16740 }, { "epoch": 17.25025746652935, "grad_norm": 0.3308713734149933, "learning_rate": 1.3147693958235618e-06, "loss": 0.0296, "step": 16750 }, { "epoch": 17.2605561277034, "grad_norm": 0.2591300904750824, "learning_rate": 1.2939246829754503e-06, "loss": 0.0191, "step": 16760 }, { "epoch": 17.270854788877447, "grad_norm": 0.3229091763496399, "learning_rate": 1.2732443658593884e-06, "loss": 0.0278, "step": 16770 }, { "epoch": 17.281153450051495, "grad_norm": 0.3232883810997009, "learning_rate": 1.2527285142768574e-06, "loss": 0.0308, "step": 16780 }, { "epoch": 17.291452111225542, "grad_norm": 0.16374994814395905, "learning_rate": 1.2323771974742104e-06, "loss": 0.0285, "step": 16790 }, { "epoch": 17.301750772399586, "grad_norm": 0.4016587734222412, "learning_rate": 1.212190484142467e-06, "loss": 0.0287, "step": 16800 }, { "epoch": 17.312049433573634, "grad_norm": 0.7468344569206238, "learning_rate": 1.192168442417052e-06, "loss": 0.0318, "step": 16810 }, { "epoch": 17.32234809474768, "grad_norm": 0.62845778465271, "learning_rate": 1.1723111398776077e-06, "loss": 0.0307, "step": 16820 }, { "epoch": 17.33264675592173, "grad_norm": 0.29316961765289307, "learning_rate": 1.1526186435476927e-06, "loss": 0.0322, "step": 16830 }, { "epoch": 17.342945417095777, "grad_norm": 0.2891688942909241, "learning_rate": 1.1330910198946442e-06, "loss": 0.0274, "step": 16840 }, { "epoch": 17.353244078269825, "grad_norm": 0.28778383135795593, "learning_rate": 1.1137283348292892e-06, "loss": 0.0341, "step": 16850 }, { "epoch": 17.363542739443872, "grad_norm": 0.17100463807582855, "learning_rate": 1.0945306537057555e-06, "loss": 0.0334, "step": 16860 }, { "epoch": 17.37384140061792, "grad_norm": 0.17976661026477814, "learning_rate": 1.0754980413212268e-06, "loss": 0.0299, "step": 16870 }, { "epoch": 17.384140061791967, "grad_norm": 0.2614526152610779, "learning_rate": 1.0566305619157502e-06, "loss": 0.0278, "step": 16880 }, { "epoch": 17.394438722966015, "grad_norm": 0.195588618516922, "learning_rate": 1.0379282791719958e-06, "loss": 0.028, "step": 16890 }, { "epoch": 17.404737384140063, "grad_norm": 1.0282113552093506, "learning_rate": 1.0193912562150464e-06, "loss": 0.0291, "step": 16900 }, { "epoch": 17.41503604531411, "grad_norm": 0.2868080735206604, "learning_rate": 1.0010195556122203e-06, "loss": 0.0329, "step": 16910 }, { "epoch": 17.425334706488158, "grad_norm": 0.2227233201265335, "learning_rate": 9.828132393727875e-07, "loss": 0.0262, "step": 16920 }, { "epoch": 17.435633367662202, "grad_norm": 0.20315021276474, "learning_rate": 9.647723689478305e-07, "loss": 0.0324, "step": 16930 }, { "epoch": 17.44593202883625, "grad_norm": 0.6371609568595886, "learning_rate": 9.468970052300019e-07, "loss": 0.0318, "step": 16940 }, { "epoch": 17.456230690010297, "grad_norm": 0.18564990162849426, "learning_rate": 9.291872085533227e-07, "loss": 0.0289, "step": 16950 }, { "epoch": 17.466529351184345, "grad_norm": 0.22705796360969543, "learning_rate": 9.116430386929886e-07, "loss": 0.0249, "step": 16960 }, { "epoch": 17.476828012358393, "grad_norm": 0.2133428156375885, "learning_rate": 8.942645548651541e-07, "loss": 0.0376, "step": 16970 }, { "epoch": 17.48712667353244, "grad_norm": 0.19329524040222168, "learning_rate": 8.770518157267482e-07, "loss": 0.0308, "step": 16980 }, { "epoch": 17.497425334706488, "grad_norm": 0.2410387098789215, "learning_rate": 8.60004879375259e-07, "loss": 0.0273, "step": 16990 }, { "epoch": 17.507723995880536, "grad_norm": 0.20141083002090454, "learning_rate": 8.4312380334855e-07, "loss": 0.0336, "step": 17000 }, { "epoch": 17.518022657054583, "grad_norm": 0.27098795771598816, "learning_rate": 8.264086446246655e-07, "loss": 0.0313, "step": 17010 }, { "epoch": 17.52832131822863, "grad_norm": 0.35340428352355957, "learning_rate": 8.098594596216424e-07, "loss": 0.0348, "step": 17020 }, { "epoch": 17.53861997940268, "grad_norm": 0.3264867663383484, "learning_rate": 7.934763041972937e-07, "loss": 0.0302, "step": 17030 }, { "epoch": 17.548918640576726, "grad_norm": 0.2895232141017914, "learning_rate": 7.772592336490525e-07, "loss": 0.0325, "step": 17040 }, { "epoch": 17.559217301750774, "grad_norm": 0.24770499765872955, "learning_rate": 7.612083027137728e-07, "loss": 0.0319, "step": 17050 }, { "epoch": 17.569515962924818, "grad_norm": 0.4487510323524475, "learning_rate": 7.453235655675406e-07, "loss": 0.0258, "step": 17060 }, { "epoch": 17.579814624098866, "grad_norm": 0.38243043422698975, "learning_rate": 7.296050758254957e-07, "loss": 0.0308, "step": 17070 }, { "epoch": 17.590113285272913, "grad_norm": 0.5216277837753296, "learning_rate": 7.140528865416441e-07, "loss": 0.0268, "step": 17080 }, { "epoch": 17.60041194644696, "grad_norm": 0.300006240606308, "learning_rate": 6.986670502086901e-07, "loss": 0.0324, "step": 17090 }, { "epoch": 17.61071060762101, "grad_norm": 0.22057189047336578, "learning_rate": 6.834476187578543e-07, "loss": 0.0282, "step": 17100 }, { "epoch": 17.621009268795056, "grad_norm": 0.26959654688835144, "learning_rate": 6.683946435586952e-07, "loss": 0.0307, "step": 17110 }, { "epoch": 17.631307929969104, "grad_norm": 0.28995075821876526, "learning_rate": 6.535081754189321e-07, "loss": 0.0318, "step": 17120 }, { "epoch": 17.64160659114315, "grad_norm": 0.3135945200920105, "learning_rate": 6.387882645842947e-07, "loss": 0.0287, "step": 17130 }, { "epoch": 17.6519052523172, "grad_norm": 0.26953238248825073, "learning_rate": 6.24234960738318e-07, "loss": 0.0292, "step": 17140 }, { "epoch": 17.662203913491247, "grad_norm": 0.2764807343482971, "learning_rate": 6.098483130022148e-07, "loss": 0.027, "step": 17150 }, { "epoch": 17.672502574665295, "grad_norm": 0.3281687796115875, "learning_rate": 5.956283699346754e-07, "loss": 0.0254, "step": 17160 }, { "epoch": 17.682801235839342, "grad_norm": 0.17730310559272766, "learning_rate": 5.815751795317237e-07, "loss": 0.0277, "step": 17170 }, { "epoch": 17.69309989701339, "grad_norm": 0.43514519929885864, "learning_rate": 5.676887892265559e-07, "loss": 0.0238, "step": 17180 }, { "epoch": 17.703398558187434, "grad_norm": 0.31942808628082275, "learning_rate": 5.539692458893575e-07, "loss": 0.027, "step": 17190 }, { "epoch": 17.71369721936148, "grad_norm": 1.2527509927749634, "learning_rate": 5.404165958271811e-07, "loss": 0.029, "step": 17200 }, { "epoch": 17.72399588053553, "grad_norm": 0.2568182051181793, "learning_rate": 5.270308847837579e-07, "loss": 0.0316, "step": 17210 }, { "epoch": 17.734294541709577, "grad_norm": 0.32886284589767456, "learning_rate": 5.13812157939364e-07, "loss": 0.0341, "step": 17220 }, { "epoch": 17.744593202883625, "grad_norm": 0.1350669264793396, "learning_rate": 5.007604599106486e-07, "loss": 0.0279, "step": 17230 }, { "epoch": 17.754891864057672, "grad_norm": 0.24451610445976257, "learning_rate": 4.878758347505175e-07, "loss": 0.0261, "step": 17240 }, { "epoch": 17.76519052523172, "grad_norm": 0.23091380298137665, "learning_rate": 4.751583259479331e-07, "loss": 0.031, "step": 17250 }, { "epoch": 17.775489186405768, "grad_norm": 0.311443030834198, "learning_rate": 4.6260797642782014e-07, "loss": 0.032, "step": 17260 }, { "epoch": 17.785787847579815, "grad_norm": 0.2045062929391861, "learning_rate": 4.5022482855088255e-07, "loss": 0.0256, "step": 17270 }, { "epoch": 17.796086508753863, "grad_norm": 0.339093953371048, "learning_rate": 4.380089241134866e-07, "loss": 0.0306, "step": 17280 }, { "epoch": 17.80638516992791, "grad_norm": 0.3019813597202301, "learning_rate": 4.259603043475002e-07, "loss": 0.0302, "step": 17290 }, { "epoch": 17.816683831101958, "grad_norm": 0.21195490658283234, "learning_rate": 4.1407900992015414e-07, "loss": 0.0318, "step": 17300 }, { "epoch": 17.826982492276002, "grad_norm": 0.2570505142211914, "learning_rate": 4.023650809339363e-07, "loss": 0.0387, "step": 17310 }, { "epoch": 17.83728115345005, "grad_norm": 0.36077165603637695, "learning_rate": 3.9081855692640333e-07, "loss": 0.0281, "step": 17320 }, { "epoch": 17.847579814624098, "grad_norm": 0.24089422821998596, "learning_rate": 3.7943947687010816e-07, "loss": 0.0265, "step": 17330 }, { "epoch": 17.857878475798145, "grad_norm": 0.3065880835056305, "learning_rate": 3.6822787917240587e-07, "loss": 0.0265, "step": 17340 }, { "epoch": 17.868177136972193, "grad_norm": 0.20888155698776245, "learning_rate": 3.571838016753759e-07, "loss": 0.0345, "step": 17350 }, { "epoch": 17.87847579814624, "grad_norm": 0.42461952567100525, "learning_rate": 3.4630728165566117e-07, "loss": 0.0334, "step": 17360 }, { "epoch": 17.888774459320288, "grad_norm": 0.36267679929733276, "learning_rate": 3.3559835582435695e-07, "loss": 0.0306, "step": 17370 }, { "epoch": 17.899073120494336, "grad_norm": 0.1654314249753952, "learning_rate": 3.250570603268943e-07, "loss": 0.0247, "step": 17380 }, { "epoch": 17.909371781668384, "grad_norm": 0.2670270800590515, "learning_rate": 3.1468343074290143e-07, "loss": 0.032, "step": 17390 }, { "epoch": 17.91967044284243, "grad_norm": 0.2694757878780365, "learning_rate": 3.0447750208607573e-07, "loss": 0.0269, "step": 17400 }, { "epoch": 17.92996910401648, "grad_norm": 0.34293317794799805, "learning_rate": 2.944393088041009e-07, "loss": 0.0234, "step": 17410 }, { "epoch": 17.940267765190526, "grad_norm": 0.25010308623313904, "learning_rate": 2.8456888477850776e-07, "loss": 0.0294, "step": 17420 }, { "epoch": 17.950566426364574, "grad_norm": 0.34105420112609863, "learning_rate": 2.7486626332455245e-07, "loss": 0.0292, "step": 17430 }, { "epoch": 17.96086508753862, "grad_norm": 0.2277262657880783, "learning_rate": 2.653314771911108e-07, "loss": 0.0398, "step": 17440 }, { "epoch": 17.971163748712666, "grad_norm": 0.3880465030670166, "learning_rate": 2.5596455856058963e-07, "loss": 0.0323, "step": 17450 }, { "epoch": 17.981462409886714, "grad_norm": 0.1923012137413025, "learning_rate": 2.467655390487822e-07, "loss": 0.0227, "step": 17460 }, { "epoch": 17.99176107106076, "grad_norm": 0.24936918914318085, "learning_rate": 2.3773444970477955e-07, "loss": 0.0249, "step": 17470 }, { "epoch": 18.00205973223481, "grad_norm": 0.2869769334793091, "learning_rate": 2.2887132101087615e-07, "loss": 0.0248, "step": 17480 }, { "epoch": 18.012358393408856, "grad_norm": 0.25350290536880493, "learning_rate": 2.201761828824367e-07, "loss": 0.0327, "step": 17490 }, { "epoch": 18.022657054582904, "grad_norm": 0.27213600277900696, "learning_rate": 2.1164906466783485e-07, "loss": 0.0285, "step": 17500 }, { "epoch": 18.03295571575695, "grad_norm": 0.257794588804245, "learning_rate": 2.032899951483147e-07, "loss": 0.0281, "step": 17510 }, { "epoch": 18.043254376931, "grad_norm": 0.2469080537557602, "learning_rate": 1.9509900253792955e-07, "loss": 0.0259, "step": 17520 }, { "epoch": 18.053553038105047, "grad_norm": 0.2920747995376587, "learning_rate": 1.870761144834088e-07, "loss": 0.0287, "step": 17530 }, { "epoch": 18.063851699279095, "grad_norm": 0.2282969057559967, "learning_rate": 1.7922135806410778e-07, "loss": 0.0277, "step": 17540 }, { "epoch": 18.074150360453142, "grad_norm": 0.28502708673477173, "learning_rate": 1.7153475979186927e-07, "loss": 0.0345, "step": 17550 }, { "epoch": 18.08444902162719, "grad_norm": 0.23902451992034912, "learning_rate": 1.6401634561098444e-07, "loss": 0.0335, "step": 17560 }, { "epoch": 18.094747682801234, "grad_norm": 0.3159581124782562, "learning_rate": 1.566661408980541e-07, "loss": 0.0299, "step": 17570 }, { "epoch": 18.105046343975282, "grad_norm": 0.12344943732023239, "learning_rate": 1.4948417046194985e-07, "loss": 0.0272, "step": 17580 }, { "epoch": 18.11534500514933, "grad_norm": 0.3794369101524353, "learning_rate": 1.42470458543692e-07, "loss": 0.0338, "step": 17590 }, { "epoch": 18.125643666323377, "grad_norm": 0.1987241804599762, "learning_rate": 1.3562502881639404e-07, "loss": 0.0223, "step": 17600 }, { "epoch": 18.135942327497425, "grad_norm": 0.21883957087993622, "learning_rate": 1.2894790438516824e-07, "loss": 0.0275, "step": 17610 }, { "epoch": 18.146240988671472, "grad_norm": 0.2665363550186157, "learning_rate": 1.2243910778705348e-07, "loss": 0.033, "step": 17620 }, { "epoch": 18.15653964984552, "grad_norm": 0.15010571479797363, "learning_rate": 1.1609866099094313e-07, "loss": 0.0227, "step": 17630 }, { "epoch": 18.166838311019568, "grad_norm": 0.19142857193946838, "learning_rate": 1.0992658539750178e-07, "loss": 0.0279, "step": 17640 }, { "epoch": 18.177136972193615, "grad_norm": 0.2638980746269226, "learning_rate": 1.0392290183909304e-07, "loss": 0.0265, "step": 17650 }, { "epoch": 18.187435633367663, "grad_norm": 0.19933411478996277, "learning_rate": 9.808763057971849e-08, "loss": 0.0294, "step": 17660 }, { "epoch": 18.19773429454171, "grad_norm": 0.32049107551574707, "learning_rate": 9.242079131495107e-08, "loss": 0.0268, "step": 17670 }, { "epoch": 18.20803295571576, "grad_norm": 0.22636005282402039, "learning_rate": 8.69224031718463e-08, "loss": 0.0359, "step": 17680 }, { "epoch": 18.218331616889806, "grad_norm": 0.19072987139225006, "learning_rate": 8.159248470890334e-08, "loss": 0.0272, "step": 17690 }, { "epoch": 18.22863027806385, "grad_norm": 0.5597253441810608, "learning_rate": 7.643105391598737e-08, "loss": 0.0296, "step": 17700 }, { "epoch": 18.238928939237898, "grad_norm": 0.20172372460365295, "learning_rate": 7.143812821427953e-08, "loss": 0.0321, "step": 17710 }, { "epoch": 18.249227600411945, "grad_norm": 0.49044567346572876, "learning_rate": 6.661372445621039e-08, "loss": 0.0284, "step": 17720 }, { "epoch": 18.259526261585993, "grad_norm": 0.2032887190580368, "learning_rate": 6.19578589253933e-08, "loss": 0.03, "step": 17730 }, { "epoch": 18.26982492276004, "grad_norm": 0.30425992608070374, "learning_rate": 5.747054733660773e-08, "loss": 0.0301, "step": 17740 }, { "epoch": 18.28012358393409, "grad_norm": 0.2486412227153778, "learning_rate": 5.3151804835688267e-08, "loss": 0.0261, "step": 17750 }, { "epoch": 18.290422245108136, "grad_norm": 0.21091780066490173, "learning_rate": 4.9001645999524613e-08, "loss": 0.0276, "step": 17760 }, { "epoch": 18.300720906282184, "grad_norm": 0.36458486318588257, "learning_rate": 4.502008483598941e-08, "loss": 0.0277, "step": 17770 }, { "epoch": 18.31101956745623, "grad_norm": 0.21798443794250488, "learning_rate": 4.1207134783888265e-08, "loss": 0.0307, "step": 17780 }, { "epoch": 18.32131822863028, "grad_norm": 0.27093908190727234, "learning_rate": 3.756280871293205e-08, "loss": 0.0328, "step": 17790 }, { "epoch": 18.331616889804327, "grad_norm": 0.1765187829732895, "learning_rate": 3.4087118923659125e-08, "loss": 0.0305, "step": 17800 }, { "epoch": 18.341915550978374, "grad_norm": 0.9125376343727112, "learning_rate": 3.078007714744646e-08, "loss": 0.0408, "step": 17810 }, { "epoch": 18.352214212152422, "grad_norm": 0.1739547997713089, "learning_rate": 2.7641694546409746e-08, "loss": 0.0282, "step": 17820 }, { "epoch": 18.362512873326466, "grad_norm": 0.2467593103647232, "learning_rate": 2.467198171342e-08, "loss": 0.0266, "step": 17830 }, { "epoch": 18.372811534500514, "grad_norm": 0.7820371389389038, "learning_rate": 2.1870948672036984e-08, "loss": 0.0263, "step": 17840 }, { "epoch": 18.38311019567456, "grad_norm": 0.30878883600234985, "learning_rate": 1.9238604876470334e-08, "loss": 0.03, "step": 17850 }, { "epoch": 18.39340885684861, "grad_norm": 0.2729048728942871, "learning_rate": 1.6774959211568465e-08, "loss": 0.035, "step": 17860 }, { "epoch": 18.403707518022657, "grad_norm": 0.33503258228302, "learning_rate": 1.4480019992785254e-08, "loss": 0.0261, "step": 17870 }, { "epoch": 18.414006179196704, "grad_norm": 0.24983762204647064, "learning_rate": 1.2353794966135646e-08, "loss": 0.0265, "step": 17880 }, { "epoch": 18.424304840370752, "grad_norm": 0.24591587483882904, "learning_rate": 1.0396291308190087e-08, "loss": 0.0248, "step": 17890 }, { "epoch": 18.4346035015448, "grad_norm": 0.24605391919612885, "learning_rate": 8.607515626030128e-09, "loss": 0.0289, "step": 17900 }, { "epoch": 18.444902162718847, "grad_norm": 0.2520316541194916, "learning_rate": 6.987473957242863e-09, "loss": 0.0307, "step": 17910 }, { "epoch": 18.455200823892895, "grad_norm": 0.46191495656967163, "learning_rate": 5.536171769887632e-09, "loss": 0.0303, "step": 17920 }, { "epoch": 18.465499485066942, "grad_norm": 0.26452863216400146, "learning_rate": 4.253613962496017e-09, "loss": 0.0329, "step": 17930 }, { "epoch": 18.47579814624099, "grad_norm": 0.3968678116798401, "learning_rate": 3.1398048640385315e-09, "loss": 0.0356, "step": 17940 }, { "epoch": 18.486096807415038, "grad_norm": 0.19242151081562042, "learning_rate": 2.1947482338968705e-09, "loss": 0.0265, "step": 17950 }, { "epoch": 18.496395468589082, "grad_norm": 0.20866911113262177, "learning_rate": 1.4184472618972154e-09, "loss": 0.0251, "step": 17960 }, { "epoch": 18.50669412976313, "grad_norm": 0.17729917168617249, "learning_rate": 8.109045682547223e-10, "loss": 0.0264, "step": 17970 }, { "epoch": 18.516992790937177, "grad_norm": 0.19232727587223053, "learning_rate": 3.721222035846239e-10, "loss": 0.0366, "step": 17980 }, { "epoch": 18.527291452111225, "grad_norm": 0.41915977001190186, "learning_rate": 1.0210164889112861e-10, "loss": 0.0288, "step": 17990 }, { "epoch": 18.537590113285273, "grad_norm": 0.742242693901062, "learning_rate": 8.438155674195258e-13, "loss": 0.0335, "step": 18000 }, { "epoch": 18.537590113285273, "step": 18000, "total_flos": 0.0, "train_loss": 0.05001054983586073, "train_runtime": 5749.6082, "train_samples_per_second": 100.181, "train_steps_per_second": 3.131 } ], "logging_steps": 10, "max_steps": 18000, "num_input_tokens_seen": 0, "num_train_epochs": 19, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }