diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12643 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.537590113285273, + "eval_steps": 500, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010298661174047374, + "grad_norm": 28.08726692199707, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.0981, + "step": 10 + }, + { + "epoch": 0.02059732234809475, + "grad_norm": 16.46196746826172, + "learning_rate": 2.1111111111111114e-06, + "loss": 1.9619, + "step": 20 + }, + { + "epoch": 0.030895983522142123, + "grad_norm": 13.453218460083008, + "learning_rate": 3.2222222222222222e-06, + "loss": 1.5883, + "step": 30 + }, + { + "epoch": 0.0411946446961895, + "grad_norm": 3.0111494064331055, + "learning_rate": 4.333333333333334e-06, + "loss": 0.8443, + "step": 40 + }, + { + "epoch": 0.05149330587023687, + "grad_norm": 1.8473039865493774, + "learning_rate": 5.444444444444445e-06, + "loss": 0.4851, + "step": 50 + }, + { + "epoch": 0.061791967044284246, + "grad_norm": 1.983799695968628, + "learning_rate": 6.555555555555556e-06, + "loss": 0.4895, + "step": 60 + }, + { + "epoch": 0.07209062821833162, + "grad_norm": 1.359467625617981, + "learning_rate": 7.666666666666667e-06, + "loss": 0.3476, + "step": 70 + }, + { + "epoch": 0.082389289392379, + "grad_norm": 1.6559157371520996, + "learning_rate": 8.777777777777778e-06, + "loss": 0.3161, + "step": 80 + }, + { + "epoch": 0.09268795056642637, + "grad_norm": 1.4577065706253052, + "learning_rate": 9.888888888888889e-06, + "loss": 0.2894, + "step": 90 + }, + { + "epoch": 0.10298661174047374, + "grad_norm": 1.9685674905776978, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.2669, + "step": 100 + }, + { + "epoch": 0.11328527291452112, + "grad_norm": 1.0735788345336914, + "learning_rate": 1.2111111111111112e-05, + "loss": 0.2384, + "step": 110 + }, + { + "epoch": 0.12358393408856849, + "grad_norm": 2.065934419631958, + "learning_rate": 1.3222222222222221e-05, + "loss": 0.2315, + "step": 120 + }, + { + "epoch": 0.13388259526261587, + "grad_norm": 1.3160645961761475, + "learning_rate": 1.4333333333333334e-05, + "loss": 0.2212, + "step": 130 + }, + { + "epoch": 0.14418125643666324, + "grad_norm": 1.132812738418579, + "learning_rate": 1.5444444444444446e-05, + "loss": 0.2107, + "step": 140 + }, + { + "epoch": 0.15447991761071062, + "grad_norm": 0.8556684851646423, + "learning_rate": 1.655555555555556e-05, + "loss": 0.1983, + "step": 150 + }, + { + "epoch": 0.164778578784758, + "grad_norm": 1.1401009559631348, + "learning_rate": 1.7666666666666668e-05, + "loss": 0.1821, + "step": 160 + }, + { + "epoch": 0.17507723995880536, + "grad_norm": 0.9898369312286377, + "learning_rate": 1.8777777777777777e-05, + "loss": 0.1725, + "step": 170 + }, + { + "epoch": 0.18537590113285274, + "grad_norm": 1.2845979928970337, + "learning_rate": 1.988888888888889e-05, + "loss": 0.1798, + "step": 180 + }, + { + "epoch": 0.1956745623069001, + "grad_norm": 0.7349956631660461, + "learning_rate": 2.1e-05, + "loss": 0.1553, + "step": 190 + }, + { + "epoch": 0.2059732234809475, + "grad_norm": 1.0893903970718384, + "learning_rate": 2.211111111111111e-05, + "loss": 0.161, + "step": 200 + }, + { + "epoch": 0.21627188465499486, + "grad_norm": 1.4773167371749878, + "learning_rate": 2.3222222222222224e-05, + "loss": 0.1687, + "step": 210 + }, + { + "epoch": 0.22657054582904224, + "grad_norm": 0.7343375086784363, + "learning_rate": 2.4333333333333336e-05, + "loss": 0.1541, + "step": 220 + }, + { + "epoch": 0.2368692070030896, + "grad_norm": 1.459641456604004, + "learning_rate": 2.5444444444444442e-05, + "loss": 0.1546, + "step": 230 + }, + { + "epoch": 0.24716786817713698, + "grad_norm": 1.007576823234558, + "learning_rate": 2.6555555555555555e-05, + "loss": 0.1397, + "step": 240 + }, + { + "epoch": 0.25746652935118436, + "grad_norm": 0.7707590460777283, + "learning_rate": 2.7666666666666667e-05, + "loss": 0.1395, + "step": 250 + }, + { + "epoch": 0.26776519052523173, + "grad_norm": 0.8418192863464355, + "learning_rate": 2.877777777777778e-05, + "loss": 0.1367, + "step": 260 + }, + { + "epoch": 0.2780638516992791, + "grad_norm": 1.433361291885376, + "learning_rate": 2.988888888888889e-05, + "loss": 0.1443, + "step": 270 + }, + { + "epoch": 0.2883625128733265, + "grad_norm": 1.6851385831832886, + "learning_rate": 3.1e-05, + "loss": 0.1412, + "step": 280 + }, + { + "epoch": 0.29866117404737386, + "grad_norm": 1.0967495441436768, + "learning_rate": 3.2111111111111114e-05, + "loss": 0.1465, + "step": 290 + }, + { + "epoch": 0.30895983522142123, + "grad_norm": 0.9680765867233276, + "learning_rate": 3.322222222222222e-05, + "loss": 0.1409, + "step": 300 + }, + { + "epoch": 0.3192584963954686, + "grad_norm": 0.8024266362190247, + "learning_rate": 3.433333333333333e-05, + "loss": 0.151, + "step": 310 + }, + { + "epoch": 0.329557157569516, + "grad_norm": 1.2099324464797974, + "learning_rate": 3.5444444444444445e-05, + "loss": 0.1276, + "step": 320 + }, + { + "epoch": 0.33985581874356335, + "grad_norm": 1.553401231765747, + "learning_rate": 3.655555555555556e-05, + "loss": 0.1407, + "step": 330 + }, + { + "epoch": 0.35015447991761073, + "grad_norm": 0.9965718388557434, + "learning_rate": 3.766666666666667e-05, + "loss": 0.1193, + "step": 340 + }, + { + "epoch": 0.3604531410916581, + "grad_norm": 1.0881636142730713, + "learning_rate": 3.877777777777778e-05, + "loss": 0.1161, + "step": 350 + }, + { + "epoch": 0.3707518022657055, + "grad_norm": 0.7971917986869812, + "learning_rate": 3.9888888888888895e-05, + "loss": 0.1153, + "step": 360 + }, + { + "epoch": 0.38105046343975285, + "grad_norm": 0.6419103741645813, + "learning_rate": 4.1e-05, + "loss": 0.1268, + "step": 370 + }, + { + "epoch": 0.3913491246138002, + "grad_norm": 0.8467381596565247, + "learning_rate": 4.211111111111111e-05, + "loss": 0.1089, + "step": 380 + }, + { + "epoch": 0.4016477857878476, + "grad_norm": 0.7437835335731506, + "learning_rate": 4.3222222222222226e-05, + "loss": 0.1196, + "step": 390 + }, + { + "epoch": 0.411946446961895, + "grad_norm": 1.1879000663757324, + "learning_rate": 4.433333333333334e-05, + "loss": 0.1104, + "step": 400 + }, + { + "epoch": 0.42224510813594235, + "grad_norm": 1.103964924812317, + "learning_rate": 4.5444444444444444e-05, + "loss": 0.1154, + "step": 410 + }, + { + "epoch": 0.4325437693099897, + "grad_norm": 1.20859956741333, + "learning_rate": 4.6555555555555556e-05, + "loss": 0.1151, + "step": 420 + }, + { + "epoch": 0.4428424304840371, + "grad_norm": 1.3592861890792847, + "learning_rate": 4.766666666666667e-05, + "loss": 0.1221, + "step": 430 + }, + { + "epoch": 0.45314109165808447, + "grad_norm": 0.7694193720817566, + "learning_rate": 4.8777777777777775e-05, + "loss": 0.1081, + "step": 440 + }, + { + "epoch": 0.46343975283213185, + "grad_norm": 0.8526501655578613, + "learning_rate": 4.9888888888888894e-05, + "loss": 0.1071, + "step": 450 + }, + { + "epoch": 0.4737384140061792, + "grad_norm": 0.8666425943374634, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.1125, + "step": 460 + }, + { + "epoch": 0.4840370751802266, + "grad_norm": 1.0404722690582275, + "learning_rate": 5.211111111111111e-05, + "loss": 0.1235, + "step": 470 + }, + { + "epoch": 0.49433573635427397, + "grad_norm": 0.8314346671104431, + "learning_rate": 5.322222222222223e-05, + "loss": 0.1156, + "step": 480 + }, + { + "epoch": 0.5046343975283213, + "grad_norm": 0.8053165674209595, + "learning_rate": 5.433333333333334e-05, + "loss": 0.0963, + "step": 490 + }, + { + "epoch": 0.5149330587023687, + "grad_norm": 0.9703218340873718, + "learning_rate": 5.544444444444444e-05, + "loss": 0.1081, + "step": 500 + }, + { + "epoch": 0.525231719876416, + "grad_norm": 1.0357967615127563, + "learning_rate": 5.655555555555556e-05, + "loss": 0.1053, + "step": 510 + }, + { + "epoch": 0.5355303810504635, + "grad_norm": 0.6202366948127747, + "learning_rate": 5.766666666666667e-05, + "loss": 0.1161, + "step": 520 + }, + { + "epoch": 0.5458290422245108, + "grad_norm": 0.9413891434669495, + "learning_rate": 5.8777777777777774e-05, + "loss": 0.1109, + "step": 530 + }, + { + "epoch": 0.5561277033985582, + "grad_norm": 0.9725326299667358, + "learning_rate": 5.988888888888889e-05, + "loss": 0.1087, + "step": 540 + }, + { + "epoch": 0.5664263645726055, + "grad_norm": 1.1372697353363037, + "learning_rate": 6.1e-05, + "loss": 0.0934, + "step": 550 + }, + { + "epoch": 0.576725025746653, + "grad_norm": 0.9730582237243652, + "learning_rate": 6.21111111111111e-05, + "loss": 0.089, + "step": 560 + }, + { + "epoch": 0.5870236869207003, + "grad_norm": 1.031986117362976, + "learning_rate": 6.322222222222223e-05, + "loss": 0.0921, + "step": 570 + }, + { + "epoch": 0.5973223480947477, + "grad_norm": 0.9803087115287781, + "learning_rate": 6.433333333333333e-05, + "loss": 0.109, + "step": 580 + }, + { + "epoch": 0.607621009268795, + "grad_norm": 1.2565224170684814, + "learning_rate": 6.544444444444446e-05, + "loss": 0.1075, + "step": 590 + }, + { + "epoch": 0.6179196704428425, + "grad_norm": 0.6035177707672119, + "learning_rate": 6.655555555555555e-05, + "loss": 0.1069, + "step": 600 + }, + { + "epoch": 0.6282183316168898, + "grad_norm": 0.6485044360160828, + "learning_rate": 6.766666666666667e-05, + "loss": 0.1041, + "step": 610 + }, + { + "epoch": 0.6385169927909372, + "grad_norm": 0.9063082337379456, + "learning_rate": 6.877777777777778e-05, + "loss": 0.087, + "step": 620 + }, + { + "epoch": 0.6488156539649845, + "grad_norm": 0.7508301734924316, + "learning_rate": 6.988888888888889e-05, + "loss": 0.0993, + "step": 630 + }, + { + "epoch": 0.659114315139032, + "grad_norm": 0.7371131777763367, + "learning_rate": 7.1e-05, + "loss": 0.0965, + "step": 640 + }, + { + "epoch": 0.6694129763130793, + "grad_norm": 0.9033893942832947, + "learning_rate": 7.211111111111112e-05, + "loss": 0.0927, + "step": 650 + }, + { + "epoch": 0.6797116374871267, + "grad_norm": 1.0828319787979126, + "learning_rate": 7.322222222222223e-05, + "loss": 0.1039, + "step": 660 + }, + { + "epoch": 0.690010298661174, + "grad_norm": 0.7973754405975342, + "learning_rate": 7.433333333333333e-05, + "loss": 0.0942, + "step": 670 + }, + { + "epoch": 0.7003089598352215, + "grad_norm": 0.9999275803565979, + "learning_rate": 7.544444444444445e-05, + "loss": 0.0938, + "step": 680 + }, + { + "epoch": 0.7106076210092688, + "grad_norm": 0.7432506680488586, + "learning_rate": 7.655555555555555e-05, + "loss": 0.0822, + "step": 690 + }, + { + "epoch": 0.7209062821833162, + "grad_norm": 0.7960357069969177, + "learning_rate": 7.766666666666667e-05, + "loss": 0.0885, + "step": 700 + }, + { + "epoch": 0.7312049433573635, + "grad_norm": 0.6295223236083984, + "learning_rate": 7.877777777777778e-05, + "loss": 0.0984, + "step": 710 + }, + { + "epoch": 0.741503604531411, + "grad_norm": 0.6425987482070923, + "learning_rate": 7.988888888888889e-05, + "loss": 0.0851, + "step": 720 + }, + { + "epoch": 0.7518022657054583, + "grad_norm": 0.7241719961166382, + "learning_rate": 8.1e-05, + "loss": 0.0818, + "step": 730 + }, + { + "epoch": 0.7621009268795057, + "grad_norm": 0.6875414252281189, + "learning_rate": 8.211111111111112e-05, + "loss": 0.0776, + "step": 740 + }, + { + "epoch": 0.772399588053553, + "grad_norm": 0.7593461275100708, + "learning_rate": 8.322222222222223e-05, + "loss": 0.0862, + "step": 750 + }, + { + "epoch": 0.7826982492276005, + "grad_norm": 1.1254090070724487, + "learning_rate": 8.433333333333334e-05, + "loss": 0.0831, + "step": 760 + }, + { + "epoch": 0.7929969104016478, + "grad_norm": 0.6563543677330017, + "learning_rate": 8.544444444444445e-05, + "loss": 0.0756, + "step": 770 + }, + { + "epoch": 0.8032955715756952, + "grad_norm": 0.500499963760376, + "learning_rate": 8.655555555555555e-05, + "loss": 0.09, + "step": 780 + }, + { + "epoch": 0.8135942327497425, + "grad_norm": 0.6962169408798218, + "learning_rate": 8.766666666666668e-05, + "loss": 0.0913, + "step": 790 + }, + { + "epoch": 0.82389289392379, + "grad_norm": 0.8879425525665283, + "learning_rate": 8.877777777777778e-05, + "loss": 0.094, + "step": 800 + }, + { + "epoch": 0.8341915550978373, + "grad_norm": 0.7109111547470093, + "learning_rate": 8.988888888888889e-05, + "loss": 0.0899, + "step": 810 + }, + { + "epoch": 0.8444902162718847, + "grad_norm": 0.6895614266395569, + "learning_rate": 9.1e-05, + "loss": 0.0899, + "step": 820 + }, + { + "epoch": 0.854788877445932, + "grad_norm": 0.5885145664215088, + "learning_rate": 9.211111111111112e-05, + "loss": 0.0894, + "step": 830 + }, + { + "epoch": 0.8650875386199794, + "grad_norm": 0.6228615641593933, + "learning_rate": 9.322222222222223e-05, + "loss": 0.0826, + "step": 840 + }, + { + "epoch": 0.8753861997940268, + "grad_norm": 0.6920461654663086, + "learning_rate": 9.433333333333334e-05, + "loss": 0.0926, + "step": 850 + }, + { + "epoch": 0.8856848609680742, + "grad_norm": 0.8142651319503784, + "learning_rate": 9.544444444444445e-05, + "loss": 0.0769, + "step": 860 + }, + { + "epoch": 0.8959835221421215, + "grad_norm": 0.8525772094726562, + "learning_rate": 9.655555555555555e-05, + "loss": 0.0775, + "step": 870 + }, + { + "epoch": 0.9062821833161689, + "grad_norm": 0.6274034976959229, + "learning_rate": 9.766666666666668e-05, + "loss": 0.0793, + "step": 880 + }, + { + "epoch": 0.9165808444902163, + "grad_norm": 0.7031662464141846, + "learning_rate": 9.877777777777778e-05, + "loss": 0.081, + "step": 890 + }, + { + "epoch": 0.9268795056642637, + "grad_norm": 0.542312741279602, + "learning_rate": 9.98888888888889e-05, + "loss": 0.0878, + "step": 900 + }, + { + "epoch": 0.937178166838311, + "grad_norm": 0.5504183173179626, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0711, + "step": 910 + }, + { + "epoch": 0.9474768280123584, + "grad_norm": 0.6083622574806213, + "learning_rate": 9.999969538288952e-05, + "loss": 0.0774, + "step": 920 + }, + { + "epoch": 0.9577754891864058, + "grad_norm": 0.7640944123268127, + "learning_rate": 9.999929035278659e-05, + "loss": 0.0711, + "step": 930 + }, + { + "epoch": 0.9680741503604532, + "grad_norm": 0.34581655263900757, + "learning_rate": 9.999871656201292e-05, + "loss": 0.0716, + "step": 940 + }, + { + "epoch": 0.9783728115345005, + "grad_norm": 0.6435947418212891, + "learning_rate": 9.999797401250521e-05, + "loss": 0.0833, + "step": 950 + }, + { + "epoch": 0.9886714727085479, + "grad_norm": 0.6153683662414551, + "learning_rate": 9.999706270676973e-05, + "loss": 0.0683, + "step": 960 + }, + { + "epoch": 0.9989701338825953, + "grad_norm": 0.5145250558853149, + "learning_rate": 9.999598264788241e-05, + "loss": 0.0679, + "step": 970 + }, + { + "epoch": 1.0092687950566426, + "grad_norm": 0.5474639534950256, + "learning_rate": 9.999473383948872e-05, + "loss": 0.0652, + "step": 980 + }, + { + "epoch": 1.01956745623069, + "grad_norm": 0.4673866331577301, + "learning_rate": 9.99933162858037e-05, + "loss": 0.0806, + "step": 990 + }, + { + "epoch": 1.0298661174047374, + "grad_norm": 0.500733494758606, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0746, + "step": 1000 + }, + { + "epoch": 1.0401647785787849, + "grad_norm": 0.6277178525924683, + "learning_rate": 9.998997496226772e-05, + "loss": 0.0691, + "step": 1010 + }, + { + "epoch": 1.050463439752832, + "grad_norm": 0.34232184290885925, + "learning_rate": 9.998805120369458e-05, + "loss": 0.069, + "step": 1020 + }, + { + "epoch": 1.0607621009268795, + "grad_norm": 0.6583809852600098, + "learning_rate": 9.998595872238577e-05, + "loss": 0.0646, + "step": 1030 + }, + { + "epoch": 1.071060762100927, + "grad_norm": 0.5400450825691223, + "learning_rate": 9.998369752540395e-05, + "loss": 0.0709, + "step": 1040 + }, + { + "epoch": 1.0813594232749741, + "grad_norm": 0.716460645198822, + "learning_rate": 9.998126762038126e-05, + "loss": 0.0659, + "step": 1050 + }, + { + "epoch": 1.0916580844490216, + "grad_norm": 0.7969040274620056, + "learning_rate": 9.997866901551926e-05, + "loss": 0.0834, + "step": 1060 + }, + { + "epoch": 1.101956745623069, + "grad_norm": 0.6805360317230225, + "learning_rate": 9.997590171958892e-05, + "loss": 0.0661, + "step": 1070 + }, + { + "epoch": 1.1122554067971164, + "grad_norm": 0.6645709872245789, + "learning_rate": 9.997296574193058e-05, + "loss": 0.0719, + "step": 1080 + }, + { + "epoch": 1.1225540679711639, + "grad_norm": 0.9983972311019897, + "learning_rate": 9.996986109245395e-05, + "loss": 0.063, + "step": 1090 + }, + { + "epoch": 1.132852729145211, + "grad_norm": 0.47811999917030334, + "learning_rate": 9.996658778163802e-05, + "loss": 0.0812, + "step": 1100 + }, + { + "epoch": 1.1431513903192585, + "grad_norm": 0.9598459601402283, + "learning_rate": 9.996314582053106e-05, + "loss": 0.0797, + "step": 1110 + }, + { + "epoch": 1.153450051493306, + "grad_norm": 0.8147891759872437, + "learning_rate": 9.995953522075061e-05, + "loss": 0.076, + "step": 1120 + }, + { + "epoch": 1.1637487126673531, + "grad_norm": 0.36551281809806824, + "learning_rate": 9.995575599448336e-05, + "loss": 0.0689, + "step": 1130 + }, + { + "epoch": 1.1740473738414006, + "grad_norm": 0.41024380922317505, + "learning_rate": 9.995180815448523e-05, + "loss": 0.091, + "step": 1140 + }, + { + "epoch": 1.184346035015448, + "grad_norm": 0.5559478998184204, + "learning_rate": 9.994769171408118e-05, + "loss": 0.0783, + "step": 1150 + }, + { + "epoch": 1.1946446961894954, + "grad_norm": 0.39498281478881836, + "learning_rate": 9.994340668716527e-05, + "loss": 0.0655, + "step": 1160 + }, + { + "epoch": 1.2049433573635429, + "grad_norm": 0.7332147359848022, + "learning_rate": 9.993895308820058e-05, + "loss": 0.0739, + "step": 1170 + }, + { + "epoch": 1.21524201853759, + "grad_norm": 0.5935864448547363, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0651, + "step": 1180 + }, + { + "epoch": 1.2255406797116375, + "grad_norm": 0.5222606658935547, + "learning_rate": 9.99295402348221e-05, + "loss": 0.0676, + "step": 1190 + }, + { + "epoch": 1.235839340885685, + "grad_norm": 0.5474528670310974, + "learning_rate": 9.992458101217912e-05, + "loss": 0.0775, + "step": 1200 + }, + { + "epoch": 1.2461380020597321, + "grad_norm": 0.7393515110015869, + "learning_rate": 9.991945328102897e-05, + "loss": 0.0679, + "step": 1210 + }, + { + "epoch": 1.2564366632337796, + "grad_norm": 0.48135286569595337, + "learning_rate": 9.991415705867903e-05, + "loss": 0.0627, + "step": 1220 + }, + { + "epoch": 1.266735324407827, + "grad_norm": 0.40880492329597473, + "learning_rate": 9.990869236300546e-05, + "loss": 0.0621, + "step": 1230 + }, + { + "epoch": 1.2770339855818744, + "grad_norm": 0.4522377550601959, + "learning_rate": 9.990305921245306e-05, + "loss": 0.0629, + "step": 1240 + }, + { + "epoch": 1.2873326467559219, + "grad_norm": 0.5431732535362244, + "learning_rate": 9.989725762603515e-05, + "loss": 0.0711, + "step": 1250 + }, + { + "epoch": 1.297631307929969, + "grad_norm": 0.4390816390514374, + "learning_rate": 9.989128762333362e-05, + "loss": 0.058, + "step": 1260 + }, + { + "epoch": 1.3079299691040165, + "grad_norm": 0.5823209881782532, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0742, + "step": 1270 + }, + { + "epoch": 1.318228630278064, + "grad_norm": 0.6167677044868469, + "learning_rate": 9.987884245024934e-05, + "loss": 0.0698, + "step": 1280 + }, + { + "epoch": 1.3285272914521111, + "grad_norm": 0.470501184463501, + "learning_rate": 9.98723673218723e-05, + "loss": 0.0669, + "step": 1290 + }, + { + "epoch": 1.3388259526261586, + "grad_norm": 0.3435496985912323, + "learning_rate": 9.986572386122291e-05, + "loss": 0.0655, + "step": 1300 + }, + { + "epoch": 1.349124613800206, + "grad_norm": 0.5990545749664307, + "learning_rate": 9.98589120907246e-05, + "loss": 0.0653, + "step": 1310 + }, + { + "epoch": 1.3594232749742534, + "grad_norm": 0.7209518551826477, + "learning_rate": 9.985193203336886e-05, + "loss": 0.0654, + "step": 1320 + }, + { + "epoch": 1.3697219361483008, + "grad_norm": 0.6588581800460815, + "learning_rate": 9.984478371271521e-05, + "loss": 0.066, + "step": 1330 + }, + { + "epoch": 1.380020597322348, + "grad_norm": 0.5437431931495667, + "learning_rate": 9.98374671528911e-05, + "loss": 0.0685, + "step": 1340 + }, + { + "epoch": 1.3903192584963955, + "grad_norm": 0.4081268012523651, + "learning_rate": 9.982998237859184e-05, + "loss": 0.0649, + "step": 1350 + }, + { + "epoch": 1.400617919670443, + "grad_norm": 0.5363196134567261, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0614, + "step": 1360 + }, + { + "epoch": 1.4109165808444901, + "grad_norm": 0.5327999591827393, + "learning_rate": 9.981450828818783e-05, + "loss": 0.058, + "step": 1370 + }, + { + "epoch": 1.4212152420185376, + "grad_norm": 0.39524152874946594, + "learning_rate": 9.980651902431216e-05, + "loss": 0.0606, + "step": 1380 + }, + { + "epoch": 1.431513903192585, + "grad_norm": 0.5942156910896301, + "learning_rate": 9.979836165041936e-05, + "loss": 0.0589, + "step": 1390 + }, + { + "epoch": 1.4418125643666324, + "grad_norm": 0.6506125330924988, + "learning_rate": 9.97900361940427e-05, + "loss": 0.0618, + "step": 1400 + }, + { + "epoch": 1.4521112255406798, + "grad_norm": 0.43637052178382874, + "learning_rate": 9.978154268328276e-05, + "loss": 0.0728, + "step": 1410 + }, + { + "epoch": 1.462409886714727, + "grad_norm": 0.5816675424575806, + "learning_rate": 9.977288114680737e-05, + "loss": 0.0738, + "step": 1420 + }, + { + "epoch": 1.4727085478887745, + "grad_norm": 0.3983500301837921, + "learning_rate": 9.976405161385147e-05, + "loss": 0.0674, + "step": 1430 + }, + { + "epoch": 1.483007209062822, + "grad_norm": 0.41254571080207825, + "learning_rate": 9.975505411421704e-05, + "loss": 0.066, + "step": 1440 + }, + { + "epoch": 1.4933058702368691, + "grad_norm": 0.4647277593612671, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0646, + "step": 1450 + }, + { + "epoch": 1.5036045314109165, + "grad_norm": 0.4378807544708252, + "learning_rate": 9.97365553369551e-05, + "loss": 0.0589, + "step": 1460 + }, + { + "epoch": 1.513903192584964, + "grad_norm": 0.6178969144821167, + "learning_rate": 9.972705412176577e-05, + "loss": 0.0621, + "step": 1470 + }, + { + "epoch": 1.5242018537590112, + "grad_norm": 0.5825141072273254, + "learning_rate": 9.971738506477414e-05, + "loss": 0.0644, + "step": 1480 + }, + { + "epoch": 1.5345005149330588, + "grad_norm": 0.5849868655204773, + "learning_rate": 9.970754819861577e-05, + "loss": 0.0669, + "step": 1490 + }, + { + "epoch": 1.544799176107106, + "grad_norm": 0.5067623853683472, + "learning_rate": 9.969754355649268e-05, + "loss": 0.071, + "step": 1500 + }, + { + "epoch": 1.5550978372811535, + "grad_norm": 0.5842755436897278, + "learning_rate": 9.968737117217313e-05, + "loss": 0.0713, + "step": 1510 + }, + { + "epoch": 1.565396498455201, + "grad_norm": 0.3868110179901123, + "learning_rate": 9.967703107999158e-05, + "loss": 0.0635, + "step": 1520 + }, + { + "epoch": 1.575695159629248, + "grad_norm": 0.4535583257675171, + "learning_rate": 9.966652331484853e-05, + "loss": 0.0587, + "step": 1530 + }, + { + "epoch": 1.5859938208032955, + "grad_norm": 0.38644909858703613, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0708, + "step": 1540 + }, + { + "epoch": 1.596292481977343, + "grad_norm": 0.460753858089447, + "learning_rate": 9.964500490810966e-05, + "loss": 0.0645, + "step": 1550 + }, + { + "epoch": 1.6065911431513902, + "grad_norm": 0.5585173964500427, + "learning_rate": 9.963399433914405e-05, + "loss": 0.0587, + "step": 1560 + }, + { + "epoch": 1.6168898043254378, + "grad_norm": 0.6196934580802917, + "learning_rate": 9.962281624247722e-05, + "loss": 0.0663, + "step": 1570 + }, + { + "epoch": 1.627188465499485, + "grad_norm": 0.440153568983078, + "learning_rate": 9.961147065583813e-05, + "loss": 0.0568, + "step": 1580 + }, + { + "epoch": 1.6374871266735325, + "grad_norm": 0.49740493297576904, + "learning_rate": 9.959995761752112e-05, + "loss": 0.0616, + "step": 1590 + }, + { + "epoch": 1.64778578784758, + "grad_norm": 0.7940653562545776, + "learning_rate": 9.958827716638572e-05, + "loss": 0.0656, + "step": 1600 + }, + { + "epoch": 1.658084449021627, + "grad_norm": 0.39363256096839905, + "learning_rate": 9.957642934185648e-05, + "loss": 0.059, + "step": 1610 + }, + { + "epoch": 1.6683831101956745, + "grad_norm": 0.5798192620277405, + "learning_rate": 9.95644141839229e-05, + "loss": 0.057, + "step": 1620 + }, + { + "epoch": 1.678681771369722, + "grad_norm": 0.43519875407218933, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0547, + "step": 1630 + }, + { + "epoch": 1.6889804325437692, + "grad_norm": 0.5713900327682495, + "learning_rate": 9.953988203062463e-05, + "loss": 0.0655, + "step": 1640 + }, + { + "epoch": 1.6992790937178168, + "grad_norm": 0.8694477677345276, + "learning_rate": 9.952736511806236e-05, + "loss": 0.0793, + "step": 1650 + }, + { + "epoch": 1.709577754891864, + "grad_norm": 0.344855397939682, + "learning_rate": 9.951468103770032e-05, + "loss": 0.0654, + "step": 1660 + }, + { + "epoch": 1.7198764160659115, + "grad_norm": 0.747203528881073, + "learning_rate": 9.950182983235063e-05, + "loss": 0.0694, + "step": 1670 + }, + { + "epoch": 1.730175077239959, + "grad_norm": 0.44555550813674927, + "learning_rate": 9.948881154538945e-05, + "loss": 0.0729, + "step": 1680 + }, + { + "epoch": 1.740473738414006, + "grad_norm": 0.4354792535305023, + "learning_rate": 9.94756262207569e-05, + "loss": 0.0739, + "step": 1690 + }, + { + "epoch": 1.7507723995880535, + "grad_norm": 0.4117138683795929, + "learning_rate": 9.946227390295689e-05, + "loss": 0.0648, + "step": 1700 + }, + { + "epoch": 1.761071060762101, + "grad_norm": 0.5352147221565247, + "learning_rate": 9.9448754637057e-05, + "loss": 0.0614, + "step": 1710 + }, + { + "epoch": 1.7713697219361482, + "grad_norm": 0.3937685787677765, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0668, + "step": 1720 + }, + { + "epoch": 1.7816683831101958, + "grad_norm": 0.510313868522644, + "learning_rate": 9.942121544404509e-05, + "loss": 0.0564, + "step": 1730 + }, + { + "epoch": 1.791967044284243, + "grad_norm": 0.43196746706962585, + "learning_rate": 9.940719560988505e-05, + "loss": 0.0515, + "step": 1740 + }, + { + "epoch": 1.8022657054582905, + "grad_norm": 0.4649578928947449, + "learning_rate": 9.939300901352876e-05, + "loss": 0.0681, + "step": 1750 + }, + { + "epoch": 1.8125643666323379, + "grad_norm": 0.6281247735023499, + "learning_rate": 9.937865570285967e-05, + "loss": 0.0721, + "step": 1760 + }, + { + "epoch": 1.822863027806385, + "grad_norm": 0.6799906492233276, + "learning_rate": 9.936413572632397e-05, + "loss": 0.0565, + "step": 1770 + }, + { + "epoch": 1.8331616889804325, + "grad_norm": 0.4169757068157196, + "learning_rate": 9.934944913293038e-05, + "loss": 0.0626, + "step": 1780 + }, + { + "epoch": 1.84346035015448, + "grad_norm": 0.42282024025917053, + "learning_rate": 9.933459597224997e-05, + "loss": 0.0654, + "step": 1790 + }, + { + "epoch": 1.8537590113285272, + "grad_norm": 0.34127193689346313, + "learning_rate": 9.931957629441607e-05, + "loss": 0.0572, + "step": 1800 + }, + { + "epoch": 1.8640576725025748, + "grad_norm": 0.3683079183101654, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0621, + "step": 1810 + }, + { + "epoch": 1.874356333676622, + "grad_norm": 0.5137266516685486, + "learning_rate": 9.92890375906309e-05, + "loss": 0.0554, + "step": 1820 + }, + { + "epoch": 1.8846549948506695, + "grad_norm": 0.4121856391429901, + "learning_rate": 9.927351866775578e-05, + "loss": 0.0631, + "step": 1830 + }, + { + "epoch": 1.8949536560247169, + "grad_norm": 0.5225406289100647, + "learning_rate": 9.925783343387903e-05, + "loss": 0.0557, + "step": 1840 + }, + { + "epoch": 1.905252317198764, + "grad_norm": 0.3983275294303894, + "learning_rate": 9.924198194194237e-05, + "loss": 0.0631, + "step": 1850 + }, + { + "epoch": 1.9155509783728115, + "grad_norm": 0.49256351590156555, + "learning_rate": 9.922596424544876e-05, + "loss": 0.0661, + "step": 1860 + }, + { + "epoch": 1.925849639546859, + "grad_norm": 0.5363610982894897, + "learning_rate": 9.92097803984621e-05, + "loss": 0.0706, + "step": 1870 + }, + { + "epoch": 1.9361483007209062, + "grad_norm": 0.4455360472202301, + "learning_rate": 9.919343045560712e-05, + "loss": 0.0698, + "step": 1880 + }, + { + "epoch": 1.9464469618949538, + "grad_norm": 0.5394087433815002, + "learning_rate": 9.917691447206913e-05, + "loss": 0.0616, + "step": 1890 + }, + { + "epoch": 1.956745623069001, + "grad_norm": 0.3595924377441406, + "learning_rate": 9.91602325035939e-05, + "loss": 0.067, + "step": 1900 + }, + { + "epoch": 1.9670442842430484, + "grad_norm": 0.2918682396411896, + "learning_rate": 9.914338460648743e-05, + "loss": 0.0732, + "step": 1910 + }, + { + "epoch": 1.9773429454170959, + "grad_norm": 0.41418296098709106, + "learning_rate": 9.912637083761578e-05, + "loss": 0.0635, + "step": 1920 + }, + { + "epoch": 1.987641606591143, + "grad_norm": 0.5165850520133972, + "learning_rate": 9.910919125440485e-05, + "loss": 0.069, + "step": 1930 + }, + { + "epoch": 1.9979402677651905, + "grad_norm": 0.3793902099132538, + "learning_rate": 9.909184591484027e-05, + "loss": 0.0717, + "step": 1940 + }, + { + "epoch": 2.008238928939238, + "grad_norm": 0.6616620421409607, + "learning_rate": 9.907433487746702e-05, + "loss": 0.0586, + "step": 1950 + }, + { + "epoch": 2.018537590113285, + "grad_norm": 0.5687305331230164, + "learning_rate": 9.905665820138949e-05, + "loss": 0.0569, + "step": 1960 + }, + { + "epoch": 2.028836251287333, + "grad_norm": 0.49890944361686707, + "learning_rate": 9.903881594627105e-05, + "loss": 0.0668, + "step": 1970 + }, + { + "epoch": 2.03913491246138, + "grad_norm": 0.5814046859741211, + "learning_rate": 9.902080817233398e-05, + "loss": 0.0644, + "step": 1980 + }, + { + "epoch": 2.049433573635427, + "grad_norm": 0.32920873165130615, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0611, + "step": 1990 + }, + { + "epoch": 2.059732234809475, + "grad_norm": 0.5075499415397644, + "learning_rate": 9.898429631168619e-05, + "loss": 0.0586, + "step": 2000 + }, + { + "epoch": 2.070030895983522, + "grad_norm": 0.4823492169380188, + "learning_rate": 9.896579234821253e-05, + "loss": 0.0468, + "step": 2010 + }, + { + "epoch": 2.0803295571575697, + "grad_norm": 0.5481283068656921, + "learning_rate": 9.894712311239398e-05, + "loss": 0.0611, + "step": 2020 + }, + { + "epoch": 2.090628218331617, + "grad_norm": 0.4776170551776886, + "learning_rate": 9.892828866724406e-05, + "loss": 0.0657, + "step": 2030 + }, + { + "epoch": 2.100926879505664, + "grad_norm": 0.5601367354393005, + "learning_rate": 9.8909289076334e-05, + "loss": 0.0665, + "step": 2040 + }, + { + "epoch": 2.111225540679712, + "grad_norm": 0.3499130308628082, + "learning_rate": 9.88901244037923e-05, + "loss": 0.0563, + "step": 2050 + }, + { + "epoch": 2.121524201853759, + "grad_norm": 0.4545436501502991, + "learning_rate": 9.88707947143048e-05, + "loss": 0.0557, + "step": 2060 + }, + { + "epoch": 2.131822863027806, + "grad_norm": 0.46852630376815796, + "learning_rate": 9.885130007311423e-05, + "loss": 0.0522, + "step": 2070 + }, + { + "epoch": 2.142121524201854, + "grad_norm": 0.308856338262558, + "learning_rate": 9.883164054602012e-05, + "loss": 0.058, + "step": 2080 + }, + { + "epoch": 2.152420185375901, + "grad_norm": 0.7965716123580933, + "learning_rate": 9.881181619937848e-05, + "loss": 0.0535, + "step": 2090 + }, + { + "epoch": 2.1627188465499483, + "grad_norm": 0.3949962556362152, + "learning_rate": 9.879182710010169e-05, + "loss": 0.0536, + "step": 2100 + }, + { + "epoch": 2.173017507723996, + "grad_norm": 0.40669289231300354, + "learning_rate": 9.877167331565816e-05, + "loss": 0.0598, + "step": 2110 + }, + { + "epoch": 2.183316168898043, + "grad_norm": 0.6267198324203491, + "learning_rate": 9.875135491407217e-05, + "loss": 0.0647, + "step": 2120 + }, + { + "epoch": 2.193614830072091, + "grad_norm": 0.3919011950492859, + "learning_rate": 9.873087196392368e-05, + "loss": 0.063, + "step": 2130 + }, + { + "epoch": 2.203913491246138, + "grad_norm": 0.3769017457962036, + "learning_rate": 9.871022453434798e-05, + "loss": 0.0558, + "step": 2140 + }, + { + "epoch": 2.214212152420185, + "grad_norm": 0.382344126701355, + "learning_rate": 9.868941269503551e-05, + "loss": 0.0615, + "step": 2150 + }, + { + "epoch": 2.224510813594233, + "grad_norm": 0.7266145348548889, + "learning_rate": 9.86684365162317e-05, + "loss": 0.0611, + "step": 2160 + }, + { + "epoch": 2.23480947476828, + "grad_norm": 0.5791377425193787, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0575, + "step": 2170 + }, + { + "epoch": 2.2451081359423277, + "grad_norm": 0.40031886100769043, + "learning_rate": 9.862599142390482e-05, + "loss": 0.0559, + "step": 2180 + }, + { + "epoch": 2.255406797116375, + "grad_norm": 0.34372609853744507, + "learning_rate": 9.860452265364502e-05, + "loss": 0.0623, + "step": 2190 + }, + { + "epoch": 2.265705458290422, + "grad_norm": 0.5310713052749634, + "learning_rate": 9.858288983041996e-05, + "loss": 0.0628, + "step": 2200 + }, + { + "epoch": 2.27600411946447, + "grad_norm": 0.4002261459827423, + "learning_rate": 9.856109302724603e-05, + "loss": 0.0528, + "step": 2210 + }, + { + "epoch": 2.286302780638517, + "grad_norm": 0.3995415270328522, + "learning_rate": 9.853913231769318e-05, + "loss": 0.0603, + "step": 2220 + }, + { + "epoch": 2.296601441812564, + "grad_norm": 0.5082608461380005, + "learning_rate": 9.851700777588453e-05, + "loss": 0.0555, + "step": 2230 + }, + { + "epoch": 2.306900102986612, + "grad_norm": 0.3878387212753296, + "learning_rate": 9.849471947649617e-05, + "loss": 0.054, + "step": 2240 + }, + { + "epoch": 2.317198764160659, + "grad_norm": 0.44272416830062866, + "learning_rate": 9.847226749475695e-05, + "loss": 0.067, + "step": 2250 + }, + { + "epoch": 2.3274974253347063, + "grad_norm": 0.38929831981658936, + "learning_rate": 9.844965190644817e-05, + "loss": 0.0518, + "step": 2260 + }, + { + "epoch": 2.337796086508754, + "grad_norm": 0.3083374798297882, + "learning_rate": 9.842687278790337e-05, + "loss": 0.0484, + "step": 2270 + }, + { + "epoch": 2.348094747682801, + "grad_norm": 0.41075581312179565, + "learning_rate": 9.8403930216008e-05, + "loss": 0.0635, + "step": 2280 + }, + { + "epoch": 2.358393408856849, + "grad_norm": 0.2911306917667389, + "learning_rate": 9.838082426819926e-05, + "loss": 0.0599, + "step": 2290 + }, + { + "epoch": 2.368692070030896, + "grad_norm": 0.524851381778717, + "learning_rate": 9.835755502246575e-05, + "loss": 0.0542, + "step": 2300 + }, + { + "epoch": 2.378990731204943, + "grad_norm": 0.45933887362480164, + "learning_rate": 9.833412255734724e-05, + "loss": 0.0671, + "step": 2310 + }, + { + "epoch": 2.389289392378991, + "grad_norm": 0.38324400782585144, + "learning_rate": 9.831052695193445e-05, + "loss": 0.0596, + "step": 2320 + }, + { + "epoch": 2.399588053553038, + "grad_norm": 0.7916087508201599, + "learning_rate": 9.828676828586871e-05, + "loss": 0.0722, + "step": 2330 + }, + { + "epoch": 2.4098867147270857, + "grad_norm": 0.4739670157432556, + "learning_rate": 9.826284663934171e-05, + "loss": 0.0596, + "step": 2340 + }, + { + "epoch": 2.420185375901133, + "grad_norm": 0.37064895033836365, + "learning_rate": 9.823876209309527e-05, + "loss": 0.062, + "step": 2350 + }, + { + "epoch": 2.43048403707518, + "grad_norm": 0.6001970171928406, + "learning_rate": 9.821451472842102e-05, + "loss": 0.0623, + "step": 2360 + }, + { + "epoch": 2.4407826982492278, + "grad_norm": 0.40998250246047974, + "learning_rate": 9.819010462716016e-05, + "loss": 0.0586, + "step": 2370 + }, + { + "epoch": 2.451081359423275, + "grad_norm": 0.4756927490234375, + "learning_rate": 9.816553187170317e-05, + "loss": 0.0522, + "step": 2380 + }, + { + "epoch": 2.461380020597322, + "grad_norm": 0.47659242153167725, + "learning_rate": 9.814079654498949e-05, + "loss": 0.0573, + "step": 2390 + }, + { + "epoch": 2.47167868177137, + "grad_norm": 0.4043289124965668, + "learning_rate": 9.811589873050735e-05, + "loss": 0.0654, + "step": 2400 + }, + { + "epoch": 2.481977342945417, + "grad_norm": 0.7355890870094299, + "learning_rate": 9.809083851229335e-05, + "loss": 0.0523, + "step": 2410 + }, + { + "epoch": 2.4922760041194643, + "grad_norm": 0.4957990348339081, + "learning_rate": 9.806561597493228e-05, + "loss": 0.0566, + "step": 2420 + }, + { + "epoch": 2.502574665293512, + "grad_norm": 0.3758098781108856, + "learning_rate": 9.80402312035568e-05, + "loss": 0.0509, + "step": 2430 + }, + { + "epoch": 2.512873326467559, + "grad_norm": 0.4361479878425598, + "learning_rate": 9.801468428384716e-05, + "loss": 0.0566, + "step": 2440 + }, + { + "epoch": 2.5231719876416063, + "grad_norm": 0.4788246750831604, + "learning_rate": 9.798897530203087e-05, + "loss": 0.0577, + "step": 2450 + }, + { + "epoch": 2.533470648815654, + "grad_norm": 0.3828676640987396, + "learning_rate": 9.796310434488248e-05, + "loss": 0.0552, + "step": 2460 + }, + { + "epoch": 2.543769309989701, + "grad_norm": 0.34888461232185364, + "learning_rate": 9.79370714997232e-05, + "loss": 0.0562, + "step": 2470 + }, + { + "epoch": 2.554067971163749, + "grad_norm": 0.5660400986671448, + "learning_rate": 9.791087685442071e-05, + "loss": 0.0593, + "step": 2480 + }, + { + "epoch": 2.564366632337796, + "grad_norm": 0.3883237838745117, + "learning_rate": 9.788452049738879e-05, + "loss": 0.0567, + "step": 2490 + }, + { + "epoch": 2.5746652935118437, + "grad_norm": 0.34366926550865173, + "learning_rate": 9.785800251758701e-05, + "loss": 0.055, + "step": 2500 + }, + { + "epoch": 2.584963954685891, + "grad_norm": 0.2992055416107178, + "learning_rate": 9.783132300452049e-05, + "loss": 0.053, + "step": 2510 + }, + { + "epoch": 2.595262615859938, + "grad_norm": 0.3543379306793213, + "learning_rate": 9.780448204823958e-05, + "loss": 0.0587, + "step": 2520 + }, + { + "epoch": 2.6055612770339858, + "grad_norm": 0.32997754216194153, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0483, + "step": 2530 + }, + { + "epoch": 2.615859938208033, + "grad_norm": 0.4290192425251007, + "learning_rate": 9.775031616896008e-05, + "loss": 0.0565, + "step": 2540 + }, + { + "epoch": 2.62615859938208, + "grad_norm": 0.39540722966194153, + "learning_rate": 9.772299142878549e-05, + "loss": 0.0567, + "step": 2550 + }, + { + "epoch": 2.636457260556128, + "grad_norm": 0.46537721157073975, + "learning_rate": 9.769550561104388e-05, + "loss": 0.0511, + "step": 2560 + }, + { + "epoch": 2.646755921730175, + "grad_norm": 0.4019800126552582, + "learning_rate": 9.766785880850707e-05, + "loss": 0.0576, + "step": 2570 + }, + { + "epoch": 2.6570545829042223, + "grad_norm": 0.3543599545955658, + "learning_rate": 9.764005111449021e-05, + "loss": 0.0561, + "step": 2580 + }, + { + "epoch": 2.66735324407827, + "grad_norm": 0.459049791097641, + "learning_rate": 9.761208262285155e-05, + "loss": 0.0626, + "step": 2590 + }, + { + "epoch": 2.677651905252317, + "grad_norm": 0.4867796003818512, + "learning_rate": 9.758395342799206e-05, + "loss": 0.0504, + "step": 2600 + }, + { + "epoch": 2.6879505664263643, + "grad_norm": 0.42788106203079224, + "learning_rate": 9.755566362485512e-05, + "loss": 0.0578, + "step": 2610 + }, + { + "epoch": 2.698249227600412, + "grad_norm": 0.3226776719093323, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0552, + "step": 2620 + }, + { + "epoch": 2.708547888774459, + "grad_norm": 0.4271225333213806, + "learning_rate": 9.749860257623263e-05, + "loss": 0.0549, + "step": 2630 + }, + { + "epoch": 2.718846549948507, + "grad_norm": 0.39057081937789917, + "learning_rate": 9.7469831523343e-05, + "loss": 0.0558, + "step": 2640 + }, + { + "epoch": 2.729145211122554, + "grad_norm": 0.4585021436214447, + "learning_rate": 9.744090024736719e-05, + "loss": 0.0481, + "step": 2650 + }, + { + "epoch": 2.7394438722966017, + "grad_norm": 0.4004554748535156, + "learning_rate": 9.741180884595578e-05, + "loss": 0.0671, + "step": 2660 + }, + { + "epoch": 2.749742533470649, + "grad_norm": 0.3565993010997772, + "learning_rate": 9.738255741729987e-05, + "loss": 0.0623, + "step": 2670 + }, + { + "epoch": 2.760041194644696, + "grad_norm": 0.30855366587638855, + "learning_rate": 9.735314606013068e-05, + "loss": 0.0588, + "step": 2680 + }, + { + "epoch": 2.7703398558187438, + "grad_norm": 0.4170495271682739, + "learning_rate": 9.732357487371924e-05, + "loss": 0.056, + "step": 2690 + }, + { + "epoch": 2.780638516992791, + "grad_norm": 0.5667279362678528, + "learning_rate": 9.729384395787602e-05, + "loss": 0.0612, + "step": 2700 + }, + { + "epoch": 2.790937178166838, + "grad_norm": 0.27353501319885254, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0493, + "step": 2710 + }, + { + "epoch": 2.801235839340886, + "grad_norm": 0.5288174152374268, + "learning_rate": 9.723390333983144e-05, + "loss": 0.0629, + "step": 2720 + }, + { + "epoch": 2.811534500514933, + "grad_norm": 0.4831124544143677, + "learning_rate": 9.720369383994535e-05, + "loss": 0.0549, + "step": 2730 + }, + { + "epoch": 2.8218331616889802, + "grad_norm": 0.3807002902030945, + "learning_rate": 9.717332501525729e-05, + "loss": 0.0561, + "step": 2740 + }, + { + "epoch": 2.832131822863028, + "grad_norm": 0.6944444179534912, + "learning_rate": 9.714279696826998e-05, + "loss": 0.0564, + "step": 2750 + }, + { + "epoch": 2.842430484037075, + "grad_norm": 0.3146667778491974, + "learning_rate": 9.711210980202354e-05, + "loss": 0.0544, + "step": 2760 + }, + { + "epoch": 2.8527291452111223, + "grad_norm": 0.4342884421348572, + "learning_rate": 9.708126362009522e-05, + "loss": 0.0541, + "step": 2770 + }, + { + "epoch": 2.86302780638517, + "grad_norm": 0.4473581612110138, + "learning_rate": 9.70502585265989e-05, + "loss": 0.0567, + "step": 2780 + }, + { + "epoch": 2.873326467559217, + "grad_norm": 0.34954315423965454, + "learning_rate": 9.70190946261849e-05, + "loss": 0.0508, + "step": 2790 + }, + { + "epoch": 2.883625128733265, + "grad_norm": 0.37677961587905884, + "learning_rate": 9.698777202403953e-05, + "loss": 0.0555, + "step": 2800 + }, + { + "epoch": 2.893923789907312, + "grad_norm": 0.3924347460269928, + "learning_rate": 9.695629082588473e-05, + "loss": 0.0607, + "step": 2810 + }, + { + "epoch": 2.9042224510813597, + "grad_norm": 0.34362998604774475, + "learning_rate": 9.69246511379778e-05, + "loss": 0.0479, + "step": 2820 + }, + { + "epoch": 2.914521112255407, + "grad_norm": 0.48478758335113525, + "learning_rate": 9.689285306711094e-05, + "loss": 0.0564, + "step": 2830 + }, + { + "epoch": 2.924819773429454, + "grad_norm": 0.39429691433906555, + "learning_rate": 9.686089672061094e-05, + "loss": 0.0552, + "step": 2840 + }, + { + "epoch": 2.9351184346035017, + "grad_norm": 0.27760738134384155, + "learning_rate": 9.682878220633885e-05, + "loss": 0.0507, + "step": 2850 + }, + { + "epoch": 2.945417095777549, + "grad_norm": 0.3564143180847168, + "learning_rate": 9.679650963268951e-05, + "loss": 0.0529, + "step": 2860 + }, + { + "epoch": 2.955715756951596, + "grad_norm": 0.3425343930721283, + "learning_rate": 9.676407910859131e-05, + "loss": 0.05, + "step": 2870 + }, + { + "epoch": 2.966014418125644, + "grad_norm": 0.3504887819290161, + "learning_rate": 9.673149074350573e-05, + "loss": 0.0529, + "step": 2880 + }, + { + "epoch": 2.976313079299691, + "grad_norm": 0.432216078042984, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0582, + "step": 2890 + }, + { + "epoch": 2.9866117404737382, + "grad_norm": 0.4117823541164398, + "learning_rate": 9.666584093088189e-05, + "loss": 0.0516, + "step": 2900 + }, + { + "epoch": 2.996910401647786, + "grad_norm": 0.4118179380893707, + "learning_rate": 9.663277970492886e-05, + "loss": 0.0664, + "step": 2910 + }, + { + "epoch": 3.007209062821833, + "grad_norm": 0.31822094321250916, + "learning_rate": 9.659956108115827e-05, + "loss": 0.0607, + "step": 2920 + }, + { + "epoch": 3.0175077239958807, + "grad_norm": 0.34220412373542786, + "learning_rate": 9.656618517169164e-05, + "loss": 0.0523, + "step": 2930 + }, + { + "epoch": 3.027806385169928, + "grad_norm": 0.33871203660964966, + "learning_rate": 9.65326520891814e-05, + "loss": 0.0486, + "step": 2940 + }, + { + "epoch": 3.038105046343975, + "grad_norm": 0.4035494327545166, + "learning_rate": 9.649896194681045e-05, + "loss": 0.0497, + "step": 2950 + }, + { + "epoch": 3.048403707518023, + "grad_norm": 0.36851248145103455, + "learning_rate": 9.646511485829186e-05, + "loss": 0.062, + "step": 2960 + }, + { + "epoch": 3.05870236869207, + "grad_norm": 0.3193969428539276, + "learning_rate": 9.643111093786835e-05, + "loss": 0.0514, + "step": 2970 + }, + { + "epoch": 3.0690010298661172, + "grad_norm": 0.331909716129303, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0488, + "step": 2980 + }, + { + "epoch": 3.079299691040165, + "grad_norm": 0.35757410526275635, + "learning_rate": 9.636263306092406e-05, + "loss": 0.0576, + "step": 2990 + }, + { + "epoch": 3.089598352214212, + "grad_norm": 0.4217674434185028, + "learning_rate": 9.6328159335534e-05, + "loss": 0.0554, + "step": 3000 + }, + { + "epoch": 3.0998970133882597, + "grad_norm": 0.3531946539878845, + "learning_rate": 9.629352924049975e-05, + "loss": 0.059, + "step": 3010 + }, + { + "epoch": 3.110195674562307, + "grad_norm": 0.39479324221611023, + "learning_rate": 9.625874289270688e-05, + "loss": 0.0621, + "step": 3020 + }, + { + "epoch": 3.120494335736354, + "grad_norm": 0.29987436532974243, + "learning_rate": 9.622380040956842e-05, + "loss": 0.0511, + "step": 3030 + }, + { + "epoch": 3.130792996910402, + "grad_norm": 0.5292258262634277, + "learning_rate": 9.61887019090244e-05, + "loss": 0.0564, + "step": 3040 + }, + { + "epoch": 3.141091658084449, + "grad_norm": 0.33128613233566284, + "learning_rate": 9.615344750954141e-05, + "loss": 0.0548, + "step": 3050 + }, + { + "epoch": 3.151390319258496, + "grad_norm": 0.43356847763061523, + "learning_rate": 9.611803733011229e-05, + "loss": 0.0557, + "step": 3060 + }, + { + "epoch": 3.161688980432544, + "grad_norm": 0.4408741295337677, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0582, + "step": 3070 + }, + { + "epoch": 3.171987641606591, + "grad_norm": 0.307669460773468, + "learning_rate": 9.604675011001538e-05, + "loss": 0.0442, + "step": 3080 + }, + { + "epoch": 3.1822863027806383, + "grad_norm": 0.49202683568000793, + "learning_rate": 9.601087330996061e-05, + "loss": 0.0599, + "step": 3090 + }, + { + "epoch": 3.192584963954686, + "grad_norm": 0.3430628180503845, + "learning_rate": 9.597484121118487e-05, + "loss": 0.0501, + "step": 3100 + }, + { + "epoch": 3.202883625128733, + "grad_norm": 0.45715686678886414, + "learning_rate": 9.593865393530592e-05, + "loss": 0.0533, + "step": 3110 + }, + { + "epoch": 3.213182286302781, + "grad_norm": 0.29405537247657776, + "learning_rate": 9.590231160446526e-05, + "loss": 0.0579, + "step": 3120 + }, + { + "epoch": 3.223480947476828, + "grad_norm": 0.4138418436050415, + "learning_rate": 9.586581434132775e-05, + "loss": 0.0553, + "step": 3130 + }, + { + "epoch": 3.233779608650875, + "grad_norm": 0.2747637927532196, + "learning_rate": 9.582916226908118e-05, + "loss": 0.0534, + "step": 3140 + }, + { + "epoch": 3.244078269824923, + "grad_norm": 0.3608400821685791, + "learning_rate": 9.57923555114359e-05, + "loss": 0.0512, + "step": 3150 + }, + { + "epoch": 3.25437693099897, + "grad_norm": 0.4042729437351227, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0445, + "step": 3160 + }, + { + "epoch": 3.2646755921730177, + "grad_norm": 0.35471370816230774, + "learning_rate": 9.571827843740057e-05, + "loss": 0.0542, + "step": 3170 + }, + { + "epoch": 3.274974253347065, + "grad_norm": 0.2936842441558838, + "learning_rate": 9.568100837104e-05, + "loss": 0.0505, + "step": 3180 + }, + { + "epoch": 3.285272914521112, + "grad_norm": 0.2880595028400421, + "learning_rate": 9.56435841193388e-05, + "loss": 0.0458, + "step": 3190 + }, + { + "epoch": 3.29557157569516, + "grad_norm": 0.33003637194633484, + "learning_rate": 9.560600580861365e-05, + "loss": 0.0576, + "step": 3200 + }, + { + "epoch": 3.305870236869207, + "grad_norm": 0.4025996923446655, + "learning_rate": 9.556827356570116e-05, + "loss": 0.0598, + "step": 3210 + }, + { + "epoch": 3.316168898043254, + "grad_norm": 0.5448514819145203, + "learning_rate": 9.553038751795746e-05, + "loss": 0.0503, + "step": 3220 + }, + { + "epoch": 3.326467559217302, + "grad_norm": 0.39959079027175903, + "learning_rate": 9.549234779325792e-05, + "loss": 0.0581, + "step": 3230 + }, + { + "epoch": 3.336766220391349, + "grad_norm": 0.31689804792404175, + "learning_rate": 9.545415451999653e-05, + "loss": 0.054, + "step": 3240 + }, + { + "epoch": 3.3470648815653963, + "grad_norm": 0.5861422419548035, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0498, + "step": 3250 + }, + { + "epoch": 3.357363542739444, + "grad_norm": 0.36639899015426636, + "learning_rate": 9.537730784395514e-05, + "loss": 0.0625, + "step": 3260 + }, + { + "epoch": 3.367662203913491, + "grad_norm": 0.3032686710357666, + "learning_rate": 9.533865470055275e-05, + "loss": 0.0543, + "step": 3270 + }, + { + "epoch": 3.377960865087539, + "grad_norm": 0.4109341502189636, + "learning_rate": 9.529984852734285e-05, + "loss": 0.0582, + "step": 3280 + }, + { + "epoch": 3.388259526261586, + "grad_norm": 0.38670700788497925, + "learning_rate": 9.526088945530645e-05, + "loss": 0.0547, + "step": 3290 + }, + { + "epoch": 3.398558187435633, + "grad_norm": 0.30283281207084656, + "learning_rate": 9.522177761594057e-05, + "loss": 0.0434, + "step": 3300 + }, + { + "epoch": 3.408856848609681, + "grad_norm": 0.3940243721008301, + "learning_rate": 9.518251314125788e-05, + "loss": 0.0548, + "step": 3310 + }, + { + "epoch": 3.419155509783728, + "grad_norm": 0.6107800006866455, + "learning_rate": 9.514309616378626e-05, + "loss": 0.0453, + "step": 3320 + }, + { + "epoch": 3.4294541709577757, + "grad_norm": 0.3535449802875519, + "learning_rate": 9.510352681656832e-05, + "loss": 0.0509, + "step": 3330 + }, + { + "epoch": 3.439752832131823, + "grad_norm": 0.4279785454273224, + "learning_rate": 9.50638052331609e-05, + "loss": 0.0511, + "step": 3340 + }, + { + "epoch": 3.45005149330587, + "grad_norm": 0.5184943675994873, + "learning_rate": 9.502393154763478e-05, + "loss": 0.0553, + "step": 3350 + }, + { + "epoch": 3.460350154479918, + "grad_norm": 0.6247850656509399, + "learning_rate": 9.498390589457404e-05, + "loss": 0.0485, + "step": 3360 + }, + { + "epoch": 3.470648815653965, + "grad_norm": 0.4810273349285126, + "learning_rate": 9.494372840907572e-05, + "loss": 0.0646, + "step": 3370 + }, + { + "epoch": 3.480947476828012, + "grad_norm": 0.31024450063705444, + "learning_rate": 9.490339922674934e-05, + "loss": 0.0506, + "step": 3380 + }, + { + "epoch": 3.49124613800206, + "grad_norm": 0.3408045172691345, + "learning_rate": 9.486291848371643e-05, + "loss": 0.0598, + "step": 3390 + }, + { + "epoch": 3.501544799176107, + "grad_norm": 0.3190326988697052, + "learning_rate": 9.482228631661005e-05, + "loss": 0.0569, + "step": 3400 + }, + { + "epoch": 3.5118434603501543, + "grad_norm": 0.3894359767436981, + "learning_rate": 9.478150286257443e-05, + "loss": 0.048, + "step": 3410 + }, + { + "epoch": 3.522142121524202, + "grad_norm": 0.33339062333106995, + "learning_rate": 9.474056825926434e-05, + "loss": 0.0533, + "step": 3420 + }, + { + "epoch": 3.532440782698249, + "grad_norm": 0.4688987731933594, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0495, + "step": 3430 + }, + { + "epoch": 3.5427394438722963, + "grad_norm": 0.24669192731380463, + "learning_rate": 9.465824615799046e-05, + "loss": 0.0487, + "step": 3440 + }, + { + "epoch": 3.553038105046344, + "grad_norm": 0.43672746419906616, + "learning_rate": 9.461685893788526e-05, + "loss": 0.0529, + "step": 3450 + }, + { + "epoch": 3.563336766220391, + "grad_norm": 0.3806833028793335, + "learning_rate": 9.457532112422187e-05, + "loss": 0.0644, + "step": 3460 + }, + { + "epoch": 3.573635427394439, + "grad_norm": 0.43160000443458557, + "learning_rate": 9.453363285720129e-05, + "loss": 0.046, + "step": 3470 + }, + { + "epoch": 3.583934088568486, + "grad_norm": 0.3873897194862366, + "learning_rate": 9.44917942775323e-05, + "loss": 0.0561, + "step": 3480 + }, + { + "epoch": 3.5942327497425337, + "grad_norm": 0.420244425535202, + "learning_rate": 9.444980552643103e-05, + "loss": 0.0544, + "step": 3490 + }, + { + "epoch": 3.604531410916581, + "grad_norm": 0.2572662830352783, + "learning_rate": 9.44076667456205e-05, + "loss": 0.0609, + "step": 3500 + }, + { + "epoch": 3.614830072090628, + "grad_norm": 0.5829557776451111, + "learning_rate": 9.43653780773301e-05, + "loss": 0.0683, + "step": 3510 + }, + { + "epoch": 3.6251287332646758, + "grad_norm": 0.5830304622650146, + "learning_rate": 9.432293966429514e-05, + "loss": 0.067, + "step": 3520 + }, + { + "epoch": 3.635427394438723, + "grad_norm": 0.38021519780158997, + "learning_rate": 9.428035164975636e-05, + "loss": 0.0498, + "step": 3530 + }, + { + "epoch": 3.64572605561277, + "grad_norm": 0.4201594591140747, + "learning_rate": 9.423761417745942e-05, + "loss": 0.0569, + "step": 3540 + }, + { + "epoch": 3.656024716786818, + "grad_norm": 0.5576770305633545, + "learning_rate": 9.419472739165449e-05, + "loss": 0.0667, + "step": 3550 + }, + { + "epoch": 3.666323377960865, + "grad_norm": 0.34150251746177673, + "learning_rate": 9.415169143709565e-05, + "loss": 0.0539, + "step": 3560 + }, + { + "epoch": 3.6766220391349123, + "grad_norm": 0.5191327333450317, + "learning_rate": 9.410850645904049e-05, + "loss": 0.0609, + "step": 3570 + }, + { + "epoch": 3.68692070030896, + "grad_norm": 0.3418954610824585, + "learning_rate": 9.40651726032496e-05, + "loss": 0.0485, + "step": 3580 + }, + { + "epoch": 3.697219361483007, + "grad_norm": 0.44254234433174133, + "learning_rate": 9.402169001598611e-05, + "loss": 0.0552, + "step": 3590 + }, + { + "epoch": 3.7075180226570543, + "grad_norm": 0.549349308013916, + "learning_rate": 9.397805884401504e-05, + "loss": 0.0601, + "step": 3600 + }, + { + "epoch": 3.717816683831102, + "grad_norm": 0.4500453472137451, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0496, + "step": 3610 + }, + { + "epoch": 3.728115345005149, + "grad_norm": 0.5540750622749329, + "learning_rate": 9.389035133551778e-05, + "loss": 0.0563, + "step": 3620 + }, + { + "epoch": 3.738414006179197, + "grad_norm": 0.28786641359329224, + "learning_rate": 9.38462752950273e-05, + "loss": 0.0532, + "step": 3630 + }, + { + "epoch": 3.748712667353244, + "grad_norm": 0.3725302219390869, + "learning_rate": 9.380205126189983e-05, + "loss": 0.0558, + "step": 3640 + }, + { + "epoch": 3.7590113285272917, + "grad_norm": 0.47449609637260437, + "learning_rate": 9.375767938540299e-05, + "loss": 0.0559, + "step": 3650 + }, + { + "epoch": 3.769309989701339, + "grad_norm": 0.5294702649116516, + "learning_rate": 9.371315981530349e-05, + "loss": 0.0534, + "step": 3660 + }, + { + "epoch": 3.779608650875386, + "grad_norm": 0.29216107726097107, + "learning_rate": 9.366849270186649e-05, + "loss": 0.0519, + "step": 3670 + }, + { + "epoch": 3.7899073120494338, + "grad_norm": 0.28166675567626953, + "learning_rate": 9.362367819585518e-05, + "loss": 0.0532, + "step": 3680 + }, + { + "epoch": 3.800205973223481, + "grad_norm": 0.5699660778045654, + "learning_rate": 9.357871644853024e-05, + "loss": 0.0533, + "step": 3690 + }, + { + "epoch": 3.810504634397528, + "grad_norm": 0.44877076148986816, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0569, + "step": 3700 + }, + { + "epoch": 3.820803295571576, + "grad_norm": 0.4341685175895691, + "learning_rate": 9.348835183746649e-05, + "loss": 0.0579, + "step": 3710 + }, + { + "epoch": 3.831101956745623, + "grad_norm": 0.37804657220840454, + "learning_rate": 9.344294927873188e-05, + "loss": 0.0535, + "step": 3720 + }, + { + "epoch": 3.8414006179196702, + "grad_norm": 0.47172001004219055, + "learning_rate": 9.339740008869092e-05, + "loss": 0.049, + "step": 3730 + }, + { + "epoch": 3.851699279093718, + "grad_norm": 0.29430967569351196, + "learning_rate": 9.335170442108408e-05, + "loss": 0.0547, + "step": 3740 + }, + { + "epoch": 3.861997940267765, + "grad_norm": 0.40547069907188416, + "learning_rate": 9.330586243014617e-05, + "loss": 0.0486, + "step": 3750 + }, + { + "epoch": 3.8722966014418123, + "grad_norm": 0.3896206319332123, + "learning_rate": 9.325987427060586e-05, + "loss": 0.0585, + "step": 3760 + }, + { + "epoch": 3.88259526261586, + "grad_norm": 0.29565155506134033, + "learning_rate": 9.321374009768525e-05, + "loss": 0.0508, + "step": 3770 + }, + { + "epoch": 3.892893923789907, + "grad_norm": 0.5239169597625732, + "learning_rate": 9.316746006709919e-05, + "loss": 0.0608, + "step": 3780 + }, + { + "epoch": 3.903192584963955, + "grad_norm": 0.2817414402961731, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0465, + "step": 3790 + }, + { + "epoch": 3.913491246138002, + "grad_norm": 0.4744998514652252, + "learning_rate": 9.307446305825135e-05, + "loss": 0.0616, + "step": 3800 + }, + { + "epoch": 3.9237899073120497, + "grad_norm": 0.4715334475040436, + "learning_rate": 9.302774639387877e-05, + "loss": 0.0557, + "step": 3810 + }, + { + "epoch": 3.934088568486097, + "grad_norm": 0.5753309726715088, + "learning_rate": 9.298088449961813e-05, + "loss": 0.0592, + "step": 3820 + }, + { + "epoch": 3.944387229660144, + "grad_norm": 0.318158358335495, + "learning_rate": 9.293387753364052e-05, + "loss": 0.0604, + "step": 3830 + }, + { + "epoch": 3.9546858908341918, + "grad_norm": 0.4752749800682068, + "learning_rate": 9.288672565460679e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 3.964984552008239, + "grad_norm": 0.284682035446167, + "learning_rate": 9.283942902166681e-05, + "loss": 0.0491, + "step": 3850 + }, + { + "epoch": 3.975283213182286, + "grad_norm": 0.4126709997653961, + "learning_rate": 9.27919877944591e-05, + "loss": 0.0508, + "step": 3860 + }, + { + "epoch": 3.985581874356334, + "grad_norm": 0.34126409888267517, + "learning_rate": 9.27444021331102e-05, + "loss": 0.0545, + "step": 3870 + }, + { + "epoch": 3.995880535530381, + "grad_norm": 0.5670478343963623, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0483, + "step": 3880 + }, + { + "epoch": 4.006179196704428, + "grad_norm": 0.3084736466407776, + "learning_rate": 9.264879815093191e-05, + "loss": 0.0499, + "step": 3890 + }, + { + "epoch": 4.016477857878476, + "grad_norm": 0.4823373258113861, + "learning_rate": 9.260078015279096e-05, + "loss": 0.0558, + "step": 3900 + }, + { + "epoch": 4.0267765190525235, + "grad_norm": 0.2889825105667114, + "learning_rate": 9.255261836588458e-05, + "loss": 0.0561, + "step": 3910 + }, + { + "epoch": 4.03707518022657, + "grad_norm": 0.28834298253059387, + "learning_rate": 9.250431295277137e-05, + "loss": 0.0498, + "step": 3920 + }, + { + "epoch": 4.047373841400618, + "grad_norm": 0.40643489360809326, + "learning_rate": 9.245586407649473e-05, + "loss": 0.0479, + "step": 3930 + }, + { + "epoch": 4.057672502574666, + "grad_norm": 0.3214862644672394, + "learning_rate": 9.240727190058227e-05, + "loss": 0.0498, + "step": 3940 + }, + { + "epoch": 4.067971163748712, + "grad_norm": 0.40402647852897644, + "learning_rate": 9.235853658904529e-05, + "loss": 0.0522, + "step": 3950 + }, + { + "epoch": 4.07826982492276, + "grad_norm": 0.3338010311126709, + "learning_rate": 9.230965830637821e-05, + "loss": 0.0506, + "step": 3960 + }, + { + "epoch": 4.088568486096808, + "grad_norm": 0.42742258310317993, + "learning_rate": 9.226063721755799e-05, + "loss": 0.053, + "step": 3970 + }, + { + "epoch": 4.098867147270854, + "grad_norm": 0.3947793245315552, + "learning_rate": 9.221147348804362e-05, + "loss": 0.0541, + "step": 3980 + }, + { + "epoch": 4.109165808444902, + "grad_norm": 0.4395465552806854, + "learning_rate": 9.216216728377554e-05, + "loss": 0.0509, + "step": 3990 + }, + { + "epoch": 4.11946446961895, + "grad_norm": 0.28796476125717163, + "learning_rate": 9.211271877117507e-05, + "loss": 0.0501, + "step": 4000 + }, + { + "epoch": 4.1297631307929965, + "grad_norm": 0.31560418009757996, + "learning_rate": 9.206312811714386e-05, + "loss": 0.0502, + "step": 4010 + }, + { + "epoch": 4.140061791967044, + "grad_norm": 0.45714765787124634, + "learning_rate": 9.201339548906332e-05, + "loss": 0.0579, + "step": 4020 + }, + { + "epoch": 4.150360453141092, + "grad_norm": 0.3373541831970215, + "learning_rate": 9.196352105479409e-05, + "loss": 0.0504, + "step": 4030 + }, + { + "epoch": 4.1606591143151395, + "grad_norm": 0.5105737447738647, + "learning_rate": 9.19135049826754e-05, + "loss": 0.0619, + "step": 4040 + }, + { + "epoch": 4.170957775489186, + "grad_norm": 0.3023523688316345, + "learning_rate": 9.186334744152458e-05, + "loss": 0.0499, + "step": 4050 + }, + { + "epoch": 4.181256436663234, + "grad_norm": 0.3311084508895874, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0484, + "step": 4060 + }, + { + "epoch": 4.1915550978372815, + "grad_norm": 0.3167574405670166, + "learning_rate": 9.176260862978263e-05, + "loss": 0.0605, + "step": 4070 + }, + { + "epoch": 4.201853759011328, + "grad_norm": 0.3764163553714752, + "learning_rate": 9.171202769921134e-05, + "loss": 0.0521, + "step": 4080 + }, + { + "epoch": 4.212152420185376, + "grad_norm": 0.325210303068161, + "learning_rate": 9.16613059796464e-05, + "loss": 0.0471, + "step": 4090 + }, + { + "epoch": 4.222451081359424, + "grad_norm": 0.3970625102519989, + "learning_rate": 9.161044364228683e-05, + "loss": 0.0545, + "step": 4100 + }, + { + "epoch": 4.23274974253347, + "grad_norm": 0.306384414434433, + "learning_rate": 9.155944085880637e-05, + "loss": 0.0539, + "step": 4110 + }, + { + "epoch": 4.243048403707518, + "grad_norm": 0.4230334162712097, + "learning_rate": 9.150829780135269e-05, + "loss": 0.0456, + "step": 4120 + }, + { + "epoch": 4.253347064881566, + "grad_norm": 0.29097849130630493, + "learning_rate": 9.145701464254698e-05, + "loss": 0.0511, + "step": 4130 + }, + { + "epoch": 4.263645726055612, + "grad_norm": 0.390979140996933, + "learning_rate": 9.140559155548333e-05, + "loss": 0.0461, + "step": 4140 + }, + { + "epoch": 4.27394438722966, + "grad_norm": 0.2566828429698944, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0508, + "step": 4150 + }, + { + "epoch": 4.284243048403708, + "grad_norm": 0.4710136651992798, + "learning_rate": 9.130232629131932e-05, + "loss": 0.0503, + "step": 4160 + }, + { + "epoch": 4.2945417095777545, + "grad_norm": 0.4374995827674866, + "learning_rate": 9.125048446276618e-05, + "loss": 0.0599, + "step": 4170 + }, + { + "epoch": 4.304840370751802, + "grad_norm": 0.43765076994895935, + "learning_rate": 9.119850340304843e-05, + "loss": 0.0531, + "step": 4180 + }, + { + "epoch": 4.31513903192585, + "grad_norm": 0.45118576288223267, + "learning_rate": 9.114638328761571e-05, + "loss": 0.0527, + "step": 4190 + }, + { + "epoch": 4.325437693099897, + "grad_norm": 0.3243924379348755, + "learning_rate": 9.109412429238704e-05, + "loss": 0.0431, + "step": 4200 + }, + { + "epoch": 4.335736354273944, + "grad_norm": 0.33518919348716736, + "learning_rate": 9.104172659375017e-05, + "loss": 0.0491, + "step": 4210 + }, + { + "epoch": 4.346035015447992, + "grad_norm": 0.6875081062316895, + "learning_rate": 9.098919036856102e-05, + "loss": 0.0488, + "step": 4220 + }, + { + "epoch": 4.3563336766220395, + "grad_norm": 0.5093826055526733, + "learning_rate": 9.093651579414311e-05, + "loss": 0.0487, + "step": 4230 + }, + { + "epoch": 4.366632337796086, + "grad_norm": 0.37270835041999817, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0559, + "step": 4240 + }, + { + "epoch": 4.376930998970134, + "grad_norm": 0.4596996307373047, + "learning_rate": 9.083075230924913e-05, + "loss": 0.0578, + "step": 4250 + }, + { + "epoch": 4.387229660144182, + "grad_norm": 0.3775595426559448, + "learning_rate": 9.077766375575246e-05, + "loss": 0.0562, + "step": 4260 + }, + { + "epoch": 4.397528321318228, + "grad_norm": 0.3252449333667755, + "learning_rate": 9.072443756698459e-05, + "loss": 0.0558, + "step": 4270 + }, + { + "epoch": 4.407826982492276, + "grad_norm": 0.42610299587249756, + "learning_rate": 9.067107392259783e-05, + "loss": 0.0455, + "step": 4280 + }, + { + "epoch": 4.418125643666324, + "grad_norm": 0.36227330565452576, + "learning_rate": 9.061757300270845e-05, + "loss": 0.0498, + "step": 4290 + }, + { + "epoch": 4.42842430484037, + "grad_norm": 0.4343869686126709, + "learning_rate": 9.056393498789602e-05, + "loss": 0.0504, + "step": 4300 + }, + { + "epoch": 4.438722966014418, + "grad_norm": 0.4492502808570862, + "learning_rate": 9.051016005920282e-05, + "loss": 0.0526, + "step": 4310 + }, + { + "epoch": 4.449021627188466, + "grad_norm": 0.2649560868740082, + "learning_rate": 9.045624839813334e-05, + "loss": 0.0488, + "step": 4320 + }, + { + "epoch": 4.4593202883625125, + "grad_norm": 0.2290182262659073, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0427, + "step": 4330 + }, + { + "epoch": 4.46961894953656, + "grad_norm": 0.37687376141548157, + "learning_rate": 9.034801560719011e-05, + "loss": 0.0437, + "step": 4340 + }, + { + "epoch": 4.479917610710608, + "grad_norm": 0.21943651139736176, + "learning_rate": 9.029369484263033e-05, + "loss": 0.047, + "step": 4350 + }, + { + "epoch": 4.490216271884655, + "grad_norm": 0.32304951548576355, + "learning_rate": 9.02392380763209e-05, + "loss": 0.0461, + "step": 4360 + }, + { + "epoch": 4.500514933058702, + "grad_norm": 0.21305856108665466, + "learning_rate": 9.018464549206769e-05, + "loss": 0.0461, + "step": 4370 + }, + { + "epoch": 4.51081359423275, + "grad_norm": 0.6847507953643799, + "learning_rate": 9.012991727413487e-05, + "loss": 0.0475, + "step": 4380 + }, + { + "epoch": 4.521112255406797, + "grad_norm": 0.3444644808769226, + "learning_rate": 9.007505360724453e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 4.531410916580844, + "grad_norm": 0.3524458110332489, + "learning_rate": 9.002005467657586e-05, + "loss": 0.058, + "step": 4400 + }, + { + "epoch": 4.541709577754892, + "grad_norm": 0.4131333529949188, + "learning_rate": 8.996492066776464e-05, + "loss": 0.0462, + "step": 4410 + }, + { + "epoch": 4.55200823892894, + "grad_norm": 0.35865673422813416, + "learning_rate": 8.990965176690252e-05, + "loss": 0.0493, + "step": 4420 + }, + { + "epoch": 4.562306900102986, + "grad_norm": 0.3511912524700165, + "learning_rate": 8.985424816053651e-05, + "loss": 0.0561, + "step": 4430 + }, + { + "epoch": 4.572605561277034, + "grad_norm": 0.2704029083251953, + "learning_rate": 8.979871003566826e-05, + "loss": 0.0526, + "step": 4440 + }, + { + "epoch": 4.582904222451082, + "grad_norm": 0.3202318847179413, + "learning_rate": 8.974303757975345e-05, + "loss": 0.0532, + "step": 4450 + }, + { + "epoch": 4.593202883625128, + "grad_norm": 0.31483763456344604, + "learning_rate": 8.968723098070117e-05, + "loss": 0.051, + "step": 4460 + }, + { + "epoch": 4.603501544799176, + "grad_norm": 0.3457460403442383, + "learning_rate": 8.963129042687329e-05, + "loss": 0.0507, + "step": 4470 + }, + { + "epoch": 4.613800205973224, + "grad_norm": 0.31409910321235657, + "learning_rate": 8.957521610708375e-05, + "loss": 0.0503, + "step": 4480 + }, + { + "epoch": 4.6240988671472705, + "grad_norm": 0.2827114164829254, + "learning_rate": 8.951900821059809e-05, + "loss": 0.0494, + "step": 4490 + }, + { + "epoch": 4.634397528321318, + "grad_norm": 0.31604471802711487, + "learning_rate": 8.946266692713261e-05, + "loss": 0.0483, + "step": 4500 + }, + { + "epoch": 4.644696189495366, + "grad_norm": 0.3118681311607361, + "learning_rate": 8.940619244685388e-05, + "loss": 0.0553, + "step": 4510 + }, + { + "epoch": 4.6549948506694125, + "grad_norm": 0.2974856197834015, + "learning_rate": 8.934958496037802e-05, + "loss": 0.051, + "step": 4520 + }, + { + "epoch": 4.66529351184346, + "grad_norm": 0.3584068715572357, + "learning_rate": 8.92928446587701e-05, + "loss": 0.0459, + "step": 4530 + }, + { + "epoch": 4.675592173017508, + "grad_norm": 0.36687174439430237, + "learning_rate": 8.923597173354345e-05, + "loss": 0.0483, + "step": 4540 + }, + { + "epoch": 4.6858908341915555, + "grad_norm": 0.35569944977760315, + "learning_rate": 8.917896637665908e-05, + "loss": 0.05, + "step": 4550 + }, + { + "epoch": 4.696189495365602, + "grad_norm": 0.38467368483543396, + "learning_rate": 8.912182878052495e-05, + "loss": 0.0421, + "step": 4560 + }, + { + "epoch": 4.70648815653965, + "grad_norm": 0.36783739924430847, + "learning_rate": 8.906455913799538e-05, + "loss": 0.0509, + "step": 4570 + }, + { + "epoch": 4.716786817713698, + "grad_norm": 0.2462991178035736, + "learning_rate": 8.900715764237037e-05, + "loss": 0.0469, + "step": 4580 + }, + { + "epoch": 4.727085478887744, + "grad_norm": 0.3449934720993042, + "learning_rate": 8.894962448739499e-05, + "loss": 0.0467, + "step": 4590 + }, + { + "epoch": 4.737384140061792, + "grad_norm": 0.38251376152038574, + "learning_rate": 8.889195986725865e-05, + "loss": 0.049, + "step": 4600 + }, + { + "epoch": 4.74768280123584, + "grad_norm": 0.30399325489997864, + "learning_rate": 8.883416397659452e-05, + "loss": 0.0532, + "step": 4610 + }, + { + "epoch": 4.757981462409886, + "grad_norm": 0.4609906077384949, + "learning_rate": 8.877623701047885e-05, + "loss": 0.0511, + "step": 4620 + }, + { + "epoch": 4.768280123583934, + "grad_norm": 0.40049266815185547, + "learning_rate": 8.871817916443025e-05, + "loss": 0.0567, + "step": 4630 + }, + { + "epoch": 4.778578784757982, + "grad_norm": 0.5834691524505615, + "learning_rate": 8.865999063440916e-05, + "loss": 0.0491, + "step": 4640 + }, + { + "epoch": 4.7888774459320285, + "grad_norm": 0.4367988705635071, + "learning_rate": 8.860167161681707e-05, + "loss": 0.0573, + "step": 4650 + }, + { + "epoch": 4.799176107106076, + "grad_norm": 0.33364230394363403, + "learning_rate": 8.854322230849588e-05, + "loss": 0.0604, + "step": 4660 + }, + { + "epoch": 4.809474768280124, + "grad_norm": 0.42235320806503296, + "learning_rate": 8.848464290672729e-05, + "loss": 0.0518, + "step": 4670 + }, + { + "epoch": 4.819773429454171, + "grad_norm": 0.32555538415908813, + "learning_rate": 8.84259336092321e-05, + "loss": 0.0457, + "step": 4680 + }, + { + "epoch": 4.830072090628218, + "grad_norm": 0.34331732988357544, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0558, + "step": 4690 + }, + { + "epoch": 4.840370751802266, + "grad_norm": 0.6019324064254761, + "learning_rate": 8.830812612013655e-05, + "loss": 0.0573, + "step": 4700 + }, + { + "epoch": 4.850669412976313, + "grad_norm": 0.2844030261039734, + "learning_rate": 8.824902832616723e-05, + "loss": 0.0571, + "step": 4710 + }, + { + "epoch": 4.86096807415036, + "grad_norm": 0.47788453102111816, + "learning_rate": 8.818980143173213e-05, + "loss": 0.0565, + "step": 4720 + }, + { + "epoch": 4.871266735324408, + "grad_norm": 0.24314385652542114, + "learning_rate": 8.81304456367374e-05, + "loss": 0.046, + "step": 4730 + }, + { + "epoch": 4.8815653964984556, + "grad_norm": 0.3316558301448822, + "learning_rate": 8.807096114152442e-05, + "loss": 0.0519, + "step": 4740 + }, + { + "epoch": 4.891864057672502, + "grad_norm": 0.4027853012084961, + "learning_rate": 8.801134814686891e-05, + "loss": 0.0495, + "step": 4750 + }, + { + "epoch": 4.90216271884655, + "grad_norm": 0.3290289342403412, + "learning_rate": 8.795160685398027e-05, + "loss": 0.0449, + "step": 4760 + }, + { + "epoch": 4.912461380020598, + "grad_norm": 0.3217390775680542, + "learning_rate": 8.789173746450101e-05, + "loss": 0.0578, + "step": 4770 + }, + { + "epoch": 4.922760041194644, + "grad_norm": 0.43397730588912964, + "learning_rate": 8.783174018050594e-05, + "loss": 0.0483, + "step": 4780 + }, + { + "epoch": 4.933058702368692, + "grad_norm": 0.38298988342285156, + "learning_rate": 8.777161520450158e-05, + "loss": 0.0479, + "step": 4790 + }, + { + "epoch": 4.94335736354274, + "grad_norm": 0.36208289861679077, + "learning_rate": 8.771136273942544e-05, + "loss": 0.0525, + "step": 4800 + }, + { + "epoch": 4.9536560247167865, + "grad_norm": 0.3291323482990265, + "learning_rate": 8.765098298864533e-05, + "loss": 0.0469, + "step": 4810 + }, + { + "epoch": 4.963954685890834, + "grad_norm": 0.23334382474422455, + "learning_rate": 8.759047615595869e-05, + "loss": 0.0478, + "step": 4820 + }, + { + "epoch": 4.974253347064882, + "grad_norm": 0.3632581830024719, + "learning_rate": 8.752984244559188e-05, + "loss": 0.0558, + "step": 4830 + }, + { + "epoch": 4.9845520082389285, + "grad_norm": 0.3983827531337738, + "learning_rate": 8.746908206219955e-05, + "loss": 0.0584, + "step": 4840 + }, + { + "epoch": 4.994850669412976, + "grad_norm": 0.5021440982818604, + "learning_rate": 8.740819521086383e-05, + "loss": 0.0522, + "step": 4850 + }, + { + "epoch": 5.005149330587024, + "grad_norm": 0.4782863259315491, + "learning_rate": 8.734718209709377e-05, + "loss": 0.0503, + "step": 4860 + }, + { + "epoch": 5.0154479917610715, + "grad_norm": 0.3124346435070038, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0523, + "step": 4870 + }, + { + "epoch": 5.025746652935118, + "grad_norm": 0.46991485357284546, + "learning_rate": 8.722477790641694e-05, + "loss": 0.0507, + "step": 4880 + }, + { + "epoch": 5.036045314109166, + "grad_norm": 0.381569504737854, + "learning_rate": 8.71633872426563e-05, + "loss": 0.0473, + "step": 4890 + }, + { + "epoch": 5.0463439752832135, + "grad_norm": 0.4210774004459381, + "learning_rate": 8.710187114275219e-05, + "loss": 0.0521, + "step": 4900 + }, + { + "epoch": 5.05664263645726, + "grad_norm": 0.3999352753162384, + "learning_rate": 8.70402298143375e-05, + "loss": 0.0548, + "step": 4910 + }, + { + "epoch": 5.066941297631308, + "grad_norm": 0.32023027539253235, + "learning_rate": 8.697846346546787e-05, + "loss": 0.0508, + "step": 4920 + }, + { + "epoch": 5.077239958805356, + "grad_norm": 0.38814589381217957, + "learning_rate": 8.691657230462083e-05, + "loss": 0.0484, + "step": 4930 + }, + { + "epoch": 5.087538619979402, + "grad_norm": 0.3033084571361542, + "learning_rate": 8.685455654069523e-05, + "loss": 0.0432, + "step": 4940 + }, + { + "epoch": 5.09783728115345, + "grad_norm": 0.39010483026504517, + "learning_rate": 8.679241638301049e-05, + "loss": 0.0506, + "step": 4950 + }, + { + "epoch": 5.108135942327498, + "grad_norm": 0.28835776448249817, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0543, + "step": 4960 + }, + { + "epoch": 5.1184346035015444, + "grad_norm": 0.5217164754867554, + "learning_rate": 8.66677637257398e-05, + "loss": 0.0501, + "step": 4970 + }, + { + "epoch": 5.128733264675592, + "grad_norm": 0.4083517789840698, + "learning_rate": 8.660525164688913e-05, + "loss": 0.0572, + "step": 4980 + }, + { + "epoch": 5.13903192584964, + "grad_norm": 0.5034805536270142, + "learning_rate": 8.654261601574849e-05, + "loss": 0.0541, + "step": 4990 + }, + { + "epoch": 5.1493305870236865, + "grad_norm": 0.3255571126937866, + "learning_rate": 8.647985704372948e-05, + "loss": 0.0539, + "step": 5000 + }, + { + "epoch": 5.159629248197734, + "grad_norm": 0.589500367641449, + "learning_rate": 8.641697494266006e-05, + "loss": 0.0497, + "step": 5010 + }, + { + "epoch": 5.169927909371782, + "grad_norm": 0.3600839674472809, + "learning_rate": 8.635396992478371e-05, + "loss": 0.0564, + "step": 5020 + }, + { + "epoch": 5.1802265705458295, + "grad_norm": 0.3535096049308777, + "learning_rate": 8.629084220275887e-05, + "loss": 0.0528, + "step": 5030 + }, + { + "epoch": 5.190525231719876, + "grad_norm": 0.3266212046146393, + "learning_rate": 8.622759198965809e-05, + "loss": 0.0476, + "step": 5040 + }, + { + "epoch": 5.200823892893924, + "grad_norm": 0.4038067162036896, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0517, + "step": 5050 + }, + { + "epoch": 5.2111225540679715, + "grad_norm": 0.3460542857646942, + "learning_rate": 8.610072494458535e-05, + "loss": 0.0474, + "step": 5060 + }, + { + "epoch": 5.221421215242018, + "grad_norm": 0.41362518072128296, + "learning_rate": 8.603710854082286e-05, + "loss": 0.0515, + "step": 5070 + }, + { + "epoch": 5.231719876416066, + "grad_norm": 0.2805697023868561, + "learning_rate": 8.597337050240184e-05, + "loss": 0.0519, + "step": 5080 + }, + { + "epoch": 5.242018537590114, + "grad_norm": 0.4825451374053955, + "learning_rate": 8.590951104445482e-05, + "loss": 0.0504, + "step": 5090 + }, + { + "epoch": 5.25231719876416, + "grad_norm": 0.3441821038722992, + "learning_rate": 8.584553038252414e-05, + "loss": 0.0581, + "step": 5100 + }, + { + "epoch": 5.262615859938208, + "grad_norm": 0.39510828256607056, + "learning_rate": 8.578142873256129e-05, + "loss": 0.0532, + "step": 5110 + }, + { + "epoch": 5.272914521112256, + "grad_norm": 0.3733309805393219, + "learning_rate": 8.571720631092609e-05, + "loss": 0.057, + "step": 5120 + }, + { + "epoch": 5.283213182286302, + "grad_norm": 0.3860830068588257, + "learning_rate": 8.565286333438594e-05, + "loss": 0.049, + "step": 5130 + }, + { + "epoch": 5.29351184346035, + "grad_norm": 0.3507029414176941, + "learning_rate": 8.558840002011528e-05, + "loss": 0.0542, + "step": 5140 + }, + { + "epoch": 5.303810504634398, + "grad_norm": 0.30535757541656494, + "learning_rate": 8.552381658569457e-05, + "loss": 0.0584, + "step": 5150 + }, + { + "epoch": 5.3141091658084445, + "grad_norm": 0.3580070734024048, + "learning_rate": 8.545911324910982e-05, + "loss": 0.0509, + "step": 5160 + }, + { + "epoch": 5.324407826982492, + "grad_norm": 0.21992090344429016, + "learning_rate": 8.539429022875169e-05, + "loss": 0.0412, + "step": 5170 + }, + { + "epoch": 5.33470648815654, + "grad_norm": 0.6406000852584839, + "learning_rate": 8.532934774341483e-05, + "loss": 0.0518, + "step": 5180 + }, + { + "epoch": 5.3450051493305875, + "grad_norm": 0.43300265073776245, + "learning_rate": 8.526428601229706e-05, + "loss": 0.0539, + "step": 5190 + }, + { + "epoch": 5.355303810504634, + "grad_norm": 0.5168215036392212, + "learning_rate": 8.519910525499874e-05, + "loss": 0.0552, + "step": 5200 + }, + { + "epoch": 5.365602471678682, + "grad_norm": 0.2501913905143738, + "learning_rate": 8.513380569152196e-05, + "loss": 0.0506, + "step": 5210 + }, + { + "epoch": 5.3759011328527295, + "grad_norm": 0.2757486402988434, + "learning_rate": 8.506838754226982e-05, + "loss": 0.0565, + "step": 5220 + }, + { + "epoch": 5.386199794026776, + "grad_norm": 0.47264114022254944, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0519, + "step": 5230 + }, + { + "epoch": 5.396498455200824, + "grad_norm": 0.30214348435401917, + "learning_rate": 8.493719637005237e-05, + "loss": 0.0424, + "step": 5240 + }, + { + "epoch": 5.406797116374872, + "grad_norm": 0.4345119893550873, + "learning_rate": 8.487142378989152e-05, + "loss": 0.0412, + "step": 5250 + }, + { + "epoch": 5.417095777548918, + "grad_norm": 0.33627235889434814, + "learning_rate": 8.480553350956282e-05, + "loss": 0.0481, + "step": 5260 + }, + { + "epoch": 5.427394438722966, + "grad_norm": 0.3047385811805725, + "learning_rate": 8.473952575146312e-05, + "loss": 0.0481, + "step": 5270 + }, + { + "epoch": 5.437693099897014, + "grad_norm": 0.4447433352470398, + "learning_rate": 8.46734007383859e-05, + "loss": 0.046, + "step": 5280 + }, + { + "epoch": 5.44799176107106, + "grad_norm": 0.4087453782558441, + "learning_rate": 8.460715869352035e-05, + "loss": 0.0487, + "step": 5290 + }, + { + "epoch": 5.458290422245108, + "grad_norm": 0.3321467339992523, + "learning_rate": 8.454079984045065e-05, + "loss": 0.0413, + "step": 5300 + }, + { + "epoch": 5.468589083419156, + "grad_norm": 0.356514036655426, + "learning_rate": 8.447432440315533e-05, + "loss": 0.049, + "step": 5310 + }, + { + "epoch": 5.4788877445932025, + "grad_norm": 0.37567445635795593, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0461, + "step": 5320 + }, + { + "epoch": 5.48918640576725, + "grad_norm": 0.3040042519569397, + "learning_rate": 8.434102467376832e-05, + "loss": 0.0401, + "step": 5330 + }, + { + "epoch": 5.499485066941298, + "grad_norm": 0.39934873580932617, + "learning_rate": 8.427420083159807e-05, + "loss": 0.0493, + "step": 5340 + }, + { + "epoch": 5.509783728115345, + "grad_norm": 0.4000271260738373, + "learning_rate": 8.420726130504351e-05, + "loss": 0.0541, + "step": 5350 + }, + { + "epoch": 5.520082389289392, + "grad_norm": 0.2750590443611145, + "learning_rate": 8.414020632004299e-05, + "loss": 0.0481, + "step": 5360 + }, + { + "epoch": 5.53038105046344, + "grad_norm": 0.4174776077270508, + "learning_rate": 8.407303610292462e-05, + "loss": 0.0501, + "step": 5370 + }, + { + "epoch": 5.5406797116374875, + "grad_norm": 0.2651192247867584, + "learning_rate": 8.400575088040548e-05, + "loss": 0.0491, + "step": 5380 + }, + { + "epoch": 5.550978372811534, + "grad_norm": 0.49490901827812195, + "learning_rate": 8.393835087959072e-05, + "loss": 0.0488, + "step": 5390 + }, + { + "epoch": 5.561277033985582, + "grad_norm": 0.6012644171714783, + "learning_rate": 8.387083632797299e-05, + "loss": 0.05, + "step": 5400 + }, + { + "epoch": 5.57157569515963, + "grad_norm": 0.4538785219192505, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0479, + "step": 5410 + }, + { + "epoch": 5.581874356333676, + "grad_norm": 0.358992338180542, + "learning_rate": 8.373546448423147e-05, + "loss": 0.05, + "step": 5420 + }, + { + "epoch": 5.592173017507724, + "grad_norm": 0.3814113736152649, + "learning_rate": 8.366760764902304e-05, + "loss": 0.0415, + "step": 5430 + }, + { + "epoch": 5.602471678681772, + "grad_norm": 0.6442550420761108, + "learning_rate": 8.359963717684077e-05, + "loss": 0.0495, + "step": 5440 + }, + { + "epoch": 5.612770339855818, + "grad_norm": 0.34561294317245483, + "learning_rate": 8.353155329710279e-05, + "loss": 0.0507, + "step": 5450 + }, + { + "epoch": 5.623069001029866, + "grad_norm": 0.333892822265625, + "learning_rate": 8.346335623960998e-05, + "loss": 0.0406, + "step": 5460 + }, + { + "epoch": 5.633367662203914, + "grad_norm": 0.21642594039440155, + "learning_rate": 8.339504623454521e-05, + "loss": 0.05, + "step": 5470 + }, + { + "epoch": 5.6436663233779605, + "grad_norm": 0.21974137425422668, + "learning_rate": 8.332662351247262e-05, + "loss": 0.0497, + "step": 5480 + }, + { + "epoch": 5.653964984552008, + "grad_norm": 0.35917189717292786, + "learning_rate": 8.325808830433679e-05, + "loss": 0.041, + "step": 5490 + }, + { + "epoch": 5.664263645726056, + "grad_norm": 0.2640712857246399, + "learning_rate": 8.318944084146192e-05, + "loss": 0.047, + "step": 5500 + }, + { + "epoch": 5.674562306900103, + "grad_norm": 6.280691623687744, + "learning_rate": 8.312068135555115e-05, + "loss": 0.0481, + "step": 5510 + }, + { + "epoch": 5.68486096807415, + "grad_norm": 0.269490122795105, + "learning_rate": 8.305181007868572e-05, + "loss": 0.0416, + "step": 5520 + }, + { + "epoch": 5.695159629248198, + "grad_norm": 0.408123254776001, + "learning_rate": 8.298282724332419e-05, + "loss": 0.049, + "step": 5530 + }, + { + "epoch": 5.705458290422245, + "grad_norm": 0.2983226478099823, + "learning_rate": 8.291373308230165e-05, + "loss": 0.0497, + "step": 5540 + }, + { + "epoch": 5.715756951596292, + "grad_norm": 0.35842761397361755, + "learning_rate": 8.284452782882894e-05, + "loss": 0.0477, + "step": 5550 + }, + { + "epoch": 5.72605561277034, + "grad_norm": 0.2742210328578949, + "learning_rate": 8.277521171649189e-05, + "loss": 0.052, + "step": 5560 + }, + { + "epoch": 5.736354273944388, + "grad_norm": 0.2822439968585968, + "learning_rate": 8.27057849792505e-05, + "loss": 0.0491, + "step": 5570 + }, + { + "epoch": 5.746652935118434, + "grad_norm": 0.3104664385318756, + "learning_rate": 8.263624785143812e-05, + "loss": 0.0493, + "step": 5580 + }, + { + "epoch": 5.756951596292482, + "grad_norm": 0.32532253861427307, + "learning_rate": 8.256660056776076e-05, + "loss": 0.0581, + "step": 5590 + }, + { + "epoch": 5.76725025746653, + "grad_norm": 0.3366002142429352, + "learning_rate": 8.249684336329617e-05, + "loss": 0.043, + "step": 5600 + }, + { + "epoch": 5.777548918640576, + "grad_norm": 0.25842759013175964, + "learning_rate": 8.242697647349317e-05, + "loss": 0.0485, + "step": 5610 + }, + { + "epoch": 5.787847579814624, + "grad_norm": 0.302432656288147, + "learning_rate": 8.235700013417076e-05, + "loss": 0.0521, + "step": 5620 + }, + { + "epoch": 5.798146240988672, + "grad_norm": 0.3358532190322876, + "learning_rate": 8.228691458151738e-05, + "loss": 0.0441, + "step": 5630 + }, + { + "epoch": 5.8084449021627185, + "grad_norm": 0.4343230724334717, + "learning_rate": 8.221672005209008e-05, + "loss": 0.0521, + "step": 5640 + }, + { + "epoch": 5.818743563336766, + "grad_norm": 0.30650976300239563, + "learning_rate": 8.214641678281374e-05, + "loss": 0.0538, + "step": 5650 + }, + { + "epoch": 5.829042224510814, + "grad_norm": 0.3401453197002411, + "learning_rate": 8.207600501098026e-05, + "loss": 0.0428, + "step": 5660 + }, + { + "epoch": 5.8393408856848605, + "grad_norm": 0.45636221766471863, + "learning_rate": 8.200548497424778e-05, + "loss": 0.0582, + "step": 5670 + }, + { + "epoch": 5.849639546858908, + "grad_norm": 0.2774709165096283, + "learning_rate": 8.193485691063985e-05, + "loss": 0.048, + "step": 5680 + }, + { + "epoch": 5.859938208032956, + "grad_norm": 0.29194507002830505, + "learning_rate": 8.186412105854463e-05, + "loss": 0.0534, + "step": 5690 + }, + { + "epoch": 5.8702368692070035, + "grad_norm": 0.36549675464630127, + "learning_rate": 8.17932776567141e-05, + "loss": 0.0571, + "step": 5700 + }, + { + "epoch": 5.88053553038105, + "grad_norm": 0.302418977022171, + "learning_rate": 8.172232694426329e-05, + "loss": 0.0423, + "step": 5710 + }, + { + "epoch": 5.890834191555098, + "grad_norm": 0.27770909667015076, + "learning_rate": 8.165126916066936e-05, + "loss": 0.0487, + "step": 5720 + }, + { + "epoch": 5.901132852729146, + "grad_norm": 0.3784064054489136, + "learning_rate": 8.158010454577093e-05, + "loss": 0.0504, + "step": 5730 + }, + { + "epoch": 5.911431513903192, + "grad_norm": 0.29943570494651794, + "learning_rate": 8.150883333976713e-05, + "loss": 0.0458, + "step": 5740 + }, + { + "epoch": 5.92173017507724, + "grad_norm": 0.26842376589775085, + "learning_rate": 8.143745578321695e-05, + "loss": 0.0523, + "step": 5750 + }, + { + "epoch": 5.932028836251288, + "grad_norm": 0.19866850972175598, + "learning_rate": 8.136597211703827e-05, + "loss": 0.0429, + "step": 5760 + }, + { + "epoch": 5.942327497425334, + "grad_norm": 0.30413612723350525, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0441, + "step": 5770 + }, + { + "epoch": 5.952626158599382, + "grad_norm": 0.2791491746902466, + "learning_rate": 8.122268742125695e-05, + "loss": 0.047, + "step": 5780 + }, + { + "epoch": 5.96292481977343, + "grad_norm": 0.34201282262802124, + "learning_rate": 8.115088687527761e-05, + "loss": 0.0501, + "step": 5790 + }, + { + "epoch": 5.9732234809474765, + "grad_norm": 0.39383724331855774, + "learning_rate": 8.107898118691473e-05, + "loss": 0.0497, + "step": 5800 + }, + { + "epoch": 5.983522142121524, + "grad_norm": 0.3670088052749634, + "learning_rate": 8.100697059886879e-05, + "loss": 0.0428, + "step": 5810 + }, + { + "epoch": 5.993820803295572, + "grad_norm": 0.3595752716064453, + "learning_rate": 8.093485535419434e-05, + "loss": 0.0467, + "step": 5820 + }, + { + "epoch": 6.0041194644696185, + "grad_norm": 0.403352290391922, + "learning_rate": 8.086263569629919e-05, + "loss": 0.0441, + "step": 5830 + }, + { + "epoch": 6.014418125643666, + "grad_norm": 0.18506278097629547, + "learning_rate": 8.079031186894354e-05, + "loss": 0.0508, + "step": 5840 + }, + { + "epoch": 6.024716786817714, + "grad_norm": 0.5713401436805725, + "learning_rate": 8.071788411623922e-05, + "loss": 0.0491, + "step": 5850 + }, + { + "epoch": 6.0350154479917615, + "grad_norm": 0.20415346324443817, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0502, + "step": 5860 + }, + { + "epoch": 6.045314109165808, + "grad_norm": 0.28075137734413147, + "learning_rate": 8.057271781298489e-05, + "loss": 0.0512, + "step": 5870 + }, + { + "epoch": 6.055612770339856, + "grad_norm": 0.3114660680294037, + "learning_rate": 8.049997975240909e-05, + "loss": 0.0508, + "step": 5880 + }, + { + "epoch": 6.0659114315139036, + "grad_norm": 0.3134065866470337, + "learning_rate": 8.042713874643136e-05, + "loss": 0.0531, + "step": 5890 + }, + { + "epoch": 6.07621009268795, + "grad_norm": 0.24600578844547272, + "learning_rate": 8.035419504090915e-05, + "loss": 0.0478, + "step": 5900 + }, + { + "epoch": 6.086508753861998, + "grad_norm": 0.34766799211502075, + "learning_rate": 8.028114888204653e-05, + "loss": 0.0486, + "step": 5910 + }, + { + "epoch": 6.096807415036046, + "grad_norm": 0.3067956268787384, + "learning_rate": 8.020800051639337e-05, + "loss": 0.0452, + "step": 5920 + }, + { + "epoch": 6.107106076210092, + "grad_norm": 0.3019874691963196, + "learning_rate": 8.013475019084453e-05, + "loss": 0.0458, + "step": 5930 + }, + { + "epoch": 6.11740473738414, + "grad_norm": 0.3271634578704834, + "learning_rate": 8.006139815263898e-05, + "loss": 0.0561, + "step": 5940 + }, + { + "epoch": 6.127703398558188, + "grad_norm": 0.2930561304092407, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0407, + "step": 5950 + }, + { + "epoch": 6.1380020597322344, + "grad_norm": 0.37962770462036133, + "learning_rate": 7.991438992892946e-05, + "loss": 0.048, + "step": 5960 + }, + { + "epoch": 6.148300720906282, + "grad_norm": 0.36476749181747437, + "learning_rate": 7.984073423961664e-05, + "loss": 0.0439, + "step": 5970 + }, + { + "epoch": 6.15859938208033, + "grad_norm": 0.31208914518356323, + "learning_rate": 7.97669778300278e-05, + "loss": 0.0431, + "step": 5980 + }, + { + "epoch": 6.1688980432543765, + "grad_norm": 0.758002758026123, + "learning_rate": 7.969312094911007e-05, + "loss": 0.0481, + "step": 5990 + }, + { + "epoch": 6.179196704428424, + "grad_norm": 1.8981136083602905, + "learning_rate": 7.961916384614975e-05, + "loss": 0.0621, + "step": 6000 + }, + { + "epoch": 6.189495365602472, + "grad_norm": 0.277136892080307, + "learning_rate": 7.954510677077138e-05, + "loss": 0.0586, + "step": 6010 + }, + { + "epoch": 6.1997940267765195, + "grad_norm": 0.27095285058021545, + "learning_rate": 7.947094997293695e-05, + "loss": 0.0484, + "step": 6020 + }, + { + "epoch": 6.210092687950566, + "grad_norm": 0.2608092427253723, + "learning_rate": 7.9396693702945e-05, + "loss": 0.0457, + "step": 6030 + }, + { + "epoch": 6.220391349124614, + "grad_norm": 0.5210095643997192, + "learning_rate": 7.932233821142987e-05, + "loss": 0.0473, + "step": 6040 + }, + { + "epoch": 6.2306900102986615, + "grad_norm": 0.254302978515625, + "learning_rate": 7.924788374936078e-05, + "loss": 0.045, + "step": 6050 + }, + { + "epoch": 6.240988671472708, + "grad_norm": 0.343322217464447, + "learning_rate": 7.917333056804097e-05, + "loss": 0.054, + "step": 6060 + }, + { + "epoch": 6.251287332646756, + "grad_norm": 0.4098043143749237, + "learning_rate": 7.909867891910694e-05, + "loss": 0.0435, + "step": 6070 + }, + { + "epoch": 6.261585993820804, + "grad_norm": 0.34776240587234497, + "learning_rate": 7.902392905452749e-05, + "loss": 0.0538, + "step": 6080 + }, + { + "epoch": 6.27188465499485, + "grad_norm": 0.5250643491744995, + "learning_rate": 7.894908122660296e-05, + "loss": 0.0431, + "step": 6090 + }, + { + "epoch": 6.282183316168898, + "grad_norm": 0.37657663226127625, + "learning_rate": 7.887413568796433e-05, + "loss": 0.0532, + "step": 6100 + }, + { + "epoch": 6.292481977342946, + "grad_norm": 0.28036069869995117, + "learning_rate": 7.879909269157236e-05, + "loss": 0.0382, + "step": 6110 + }, + { + "epoch": 6.302780638516992, + "grad_norm": 0.4012965261936188, + "learning_rate": 7.87239524907168e-05, + "loss": 0.0472, + "step": 6120 + }, + { + "epoch": 6.31307929969104, + "grad_norm": 0.4002419412136078, + "learning_rate": 7.864871533901544e-05, + "loss": 0.051, + "step": 6130 + }, + { + "epoch": 6.323377960865088, + "grad_norm": 0.3897566795349121, + "learning_rate": 7.857338149041332e-05, + "loss": 0.0487, + "step": 6140 + }, + { + "epoch": 6.3336766220391345, + "grad_norm": 0.4365810751914978, + "learning_rate": 7.849795119918191e-05, + "loss": 0.0486, + "step": 6150 + }, + { + "epoch": 6.343975283213182, + "grad_norm": 0.38556814193725586, + "learning_rate": 7.842242471991809e-05, + "loss": 0.0509, + "step": 6160 + }, + { + "epoch": 6.35427394438723, + "grad_norm": 0.3570299744606018, + "learning_rate": 7.834680230754353e-05, + "loss": 0.0485, + "step": 6170 + }, + { + "epoch": 6.364572605561277, + "grad_norm": 0.25796523690223694, + "learning_rate": 7.82710842173036e-05, + "loss": 0.0474, + "step": 6180 + }, + { + "epoch": 6.374871266735324, + "grad_norm": 0.4013979732990265, + "learning_rate": 7.819527070476665e-05, + "loss": 0.0453, + "step": 6190 + }, + { + "epoch": 6.385169927909372, + "grad_norm": 0.2755083739757538, + "learning_rate": 7.811936202582306e-05, + "loss": 0.0407, + "step": 6200 + }, + { + "epoch": 6.3954685890834195, + "grad_norm": 0.8050864338874817, + "learning_rate": 7.80433584366845e-05, + "loss": 0.0468, + "step": 6210 + }, + { + "epoch": 6.405767250257466, + "grad_norm": 0.5987268686294556, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 6.416065911431514, + "grad_norm": 0.31688612699508667, + "learning_rate": 7.789106755426985e-05, + "loss": 0.0414, + "step": 6230 + }, + { + "epoch": 6.426364572605562, + "grad_norm": 0.2687252163887024, + "learning_rate": 7.781478077501525e-05, + "loss": 0.0381, + "step": 6240 + }, + { + "epoch": 6.436663233779608, + "grad_norm": 0.31859585642814636, + "learning_rate": 7.773840011360698e-05, + "loss": 0.0486, + "step": 6250 + }, + { + "epoch": 6.446961894953656, + "grad_norm": 0.39176130294799805, + "learning_rate": 7.766192582784974e-05, + "loss": 0.0492, + "step": 6260 + }, + { + "epoch": 6.457260556127704, + "grad_norm": 0.4192884862422943, + "learning_rate": 7.758535817586424e-05, + "loss": 0.0524, + "step": 6270 + }, + { + "epoch": 6.46755921730175, + "grad_norm": 0.41165101528167725, + "learning_rate": 7.750869741608628e-05, + "loss": 0.0459, + "step": 6280 + }, + { + "epoch": 6.477857878475798, + "grad_norm": 0.37704214453697205, + "learning_rate": 7.7431943807266e-05, + "loss": 0.0555, + "step": 6290 + }, + { + "epoch": 6.488156539649846, + "grad_norm": 0.4949089586734772, + "learning_rate": 7.735509760846682e-05, + "loss": 0.0493, + "step": 6300 + }, + { + "epoch": 6.4984552008238925, + "grad_norm": 0.27363213896751404, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0498, + "step": 6310 + }, + { + "epoch": 6.50875386199794, + "grad_norm": 0.32286787033081055, + "learning_rate": 7.720112847874759e-05, + "loss": 0.0445, + "step": 6320 + }, + { + "epoch": 6.519052523171988, + "grad_norm": 0.2211546152830124, + "learning_rate": 7.712400606751356e-05, + "loss": 0.0475, + "step": 6330 + }, + { + "epoch": 6.5293511843460355, + "grad_norm": 0.2400301843881607, + "learning_rate": 7.7046792105671e-05, + "loss": 0.0459, + "step": 6340 + }, + { + "epoch": 6.539649845520082, + "grad_norm": 0.3111647069454193, + "learning_rate": 7.696948685383725e-05, + "loss": 0.0492, + "step": 6350 + }, + { + "epoch": 6.54994850669413, + "grad_norm": 0.3468630313873291, + "learning_rate": 7.68920905729377e-05, + "loss": 0.0422, + "step": 6360 + }, + { + "epoch": 6.5602471678681775, + "grad_norm": 0.4992178678512573, + "learning_rate": 7.6814603524205e-05, + "loss": 0.0489, + "step": 6370 + }, + { + "epoch": 6.570545829042224, + "grad_norm": 0.33954063057899475, + "learning_rate": 7.673702596917824e-05, + "loss": 0.0483, + "step": 6380 + }, + { + "epoch": 6.580844490216272, + "grad_norm": 0.3721350133419037, + "learning_rate": 7.665935816970193e-05, + "loss": 0.0415, + "step": 6390 + }, + { + "epoch": 6.59114315139032, + "grad_norm": 0.30230167508125305, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0431, + "step": 6400 + }, + { + "epoch": 6.601441812564366, + "grad_norm": 0.2966795861721039, + "learning_rate": 7.650375288630083e-05, + "loss": 0.0431, + "step": 6410 + }, + { + "epoch": 6.611740473738414, + "grad_norm": 0.28090888261795044, + "learning_rate": 7.642581592758453e-05, + "loss": 0.0413, + "step": 6420 + }, + { + "epoch": 6.622039134912462, + "grad_norm": 0.3371041715145111, + "learning_rate": 7.634778977483389e-05, + "loss": 0.0469, + "step": 6430 + }, + { + "epoch": 6.632337796086508, + "grad_norm": 0.28260523080825806, + "learning_rate": 7.626967469140754e-05, + "loss": 0.0437, + "step": 6440 + }, + { + "epoch": 6.642636457260556, + "grad_norm": 0.2734527289867401, + "learning_rate": 7.619147094096434e-05, + "loss": 0.043, + "step": 6450 + }, + { + "epoch": 6.652935118434604, + "grad_norm": 0.3294004797935486, + "learning_rate": 7.611317878746238e-05, + "loss": 0.0414, + "step": 6460 + }, + { + "epoch": 6.663233779608651, + "grad_norm": 0.45815443992614746, + "learning_rate": 7.60347984951581e-05, + "loss": 0.0496, + "step": 6470 + }, + { + "epoch": 6.673532440782698, + "grad_norm": 0.24537749588489532, + "learning_rate": 7.59563303286055e-05, + "loss": 0.0425, + "step": 6480 + }, + { + "epoch": 6.683831101956746, + "grad_norm": 0.32262513041496277, + "learning_rate": 7.587777455265515e-05, + "loss": 0.042, + "step": 6490 + }, + { + "epoch": 6.6941297631307926, + "grad_norm": 0.19561485946178436, + "learning_rate": 7.579913143245328e-05, + "loss": 0.0424, + "step": 6500 + }, + { + "epoch": 6.70442842430484, + "grad_norm": 0.29754048585891724, + "learning_rate": 7.572040123344103e-05, + "loss": 0.0466, + "step": 6510 + }, + { + "epoch": 6.714727085478888, + "grad_norm": 0.33084553480148315, + "learning_rate": 7.564158422135337e-05, + "loss": 0.0496, + "step": 6520 + }, + { + "epoch": 6.7250257466529355, + "grad_norm": 0.40858951210975647, + "learning_rate": 7.55626806622183e-05, + "loss": 0.0481, + "step": 6530 + }, + { + "epoch": 6.735324407826982, + "grad_norm": 0.9231746792793274, + "learning_rate": 7.548369082235595e-05, + "loss": 0.0512, + "step": 6540 + }, + { + "epoch": 6.74562306900103, + "grad_norm": 0.4263251721858978, + "learning_rate": 7.54046149683777e-05, + "loss": 0.0429, + "step": 6550 + }, + { + "epoch": 6.755921730175078, + "grad_norm": 0.2868654131889343, + "learning_rate": 7.532545336718521e-05, + "loss": 0.048, + "step": 6560 + }, + { + "epoch": 6.766220391349124, + "grad_norm": 0.250887930393219, + "learning_rate": 7.524620628596954e-05, + "loss": 0.0477, + "step": 6570 + }, + { + "epoch": 6.776519052523172, + "grad_norm": 0.3410227298736572, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0474, + "step": 6580 + }, + { + "epoch": 6.78681771369722, + "grad_norm": 0.42289555072784424, + "learning_rate": 7.508745675367483e-05, + "loss": 0.0445, + "step": 6590 + }, + { + "epoch": 6.797116374871266, + "grad_norm": 0.3723140358924866, + "learning_rate": 7.500795483841692e-05, + "loss": 0.0473, + "step": 6600 + }, + { + "epoch": 6.807415036045314, + "grad_norm": 0.5165073275566101, + "learning_rate": 7.492836851477636e-05, + "loss": 0.0502, + "step": 6610 + }, + { + "epoch": 6.817713697219362, + "grad_norm": 0.3081056773662567, + "learning_rate": 7.484869805137778e-05, + "loss": 0.0478, + "step": 6620 + }, + { + "epoch": 6.8280123583934085, + "grad_norm": 0.39798182249069214, + "learning_rate": 7.476894371712982e-05, + "loss": 0.0516, + "step": 6630 + }, + { + "epoch": 6.838311019567456, + "grad_norm": 0.3031449615955353, + "learning_rate": 7.468910578122418e-05, + "loss": 0.0458, + "step": 6640 + }, + { + "epoch": 6.848609680741504, + "grad_norm": 0.40421777963638306, + "learning_rate": 7.460918451313481e-05, + "loss": 0.0464, + "step": 6650 + }, + { + "epoch": 6.858908341915551, + "grad_norm": 0.3347015976905823, + "learning_rate": 7.452918018261684e-05, + "loss": 0.0427, + "step": 6660 + }, + { + "epoch": 6.869207003089598, + "grad_norm": 0.46592167019844055, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0395, + "step": 6670 + }, + { + "epoch": 6.879505664263646, + "grad_norm": 0.31017211079597473, + "learning_rate": 7.436892341471663e-05, + "loss": 0.052, + "step": 6680 + }, + { + "epoch": 6.889804325437693, + "grad_norm": 0.575901210308075, + "learning_rate": 7.428867151824287e-05, + "loss": 0.0489, + "step": 6690 + }, + { + "epoch": 6.90010298661174, + "grad_norm": 0.372746080160141, + "learning_rate": 7.420833764115561e-05, + "loss": 0.0428, + "step": 6700 + }, + { + "epoch": 6.910401647785788, + "grad_norm": 0.37451857328414917, + "learning_rate": 7.41279220546027e-05, + "loss": 0.0432, + "step": 6710 + }, + { + "epoch": 6.920700308959836, + "grad_norm": 0.3189006447792053, + "learning_rate": 7.404742503000776e-05, + "loss": 0.0519, + "step": 6720 + }, + { + "epoch": 6.930998970133882, + "grad_norm": 0.22485186159610748, + "learning_rate": 7.396684683906928e-05, + "loss": 0.0507, + "step": 6730 + }, + { + "epoch": 6.94129763130793, + "grad_norm": 0.3649514615535736, + "learning_rate": 7.38861877537597e-05, + "loss": 0.0485, + "step": 6740 + }, + { + "epoch": 6.951596292481978, + "grad_norm": 0.37899455428123474, + "learning_rate": 7.380544804632453e-05, + "loss": 0.0454, + "step": 6750 + }, + { + "epoch": 6.961894953656024, + "grad_norm": 0.4623110294342041, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0446, + "step": 6760 + }, + { + "epoch": 6.972193614830072, + "grad_norm": 0.41896483302116394, + "learning_rate": 7.364372785541902e-05, + "loss": 0.0432, + "step": 6770 + }, + { + "epoch": 6.98249227600412, + "grad_norm": 0.28001904487609863, + "learning_rate": 7.356274791779661e-05, + "loss": 0.0447, + "step": 6780 + }, + { + "epoch": 6.9927909371781665, + "grad_norm": 0.35105225443840027, + "learning_rate": 7.348168844974254e-05, + "loss": 0.0445, + "step": 6790 + }, + { + "epoch": 7.003089598352214, + "grad_norm": 0.41556599736213684, + "learning_rate": 7.340054972485371e-05, + "loss": 0.0512, + "step": 6800 + }, + { + "epoch": 7.013388259526262, + "grad_norm": 0.4035722017288208, + "learning_rate": 7.331933201699457e-05, + "loss": 0.0423, + "step": 6810 + }, + { + "epoch": 7.0236869207003085, + "grad_norm": 0.4090428352355957, + "learning_rate": 7.323803560029605e-05, + "loss": 0.0514, + "step": 6820 + }, + { + "epoch": 7.033985581874356, + "grad_norm": 0.3787795901298523, + "learning_rate": 7.315666074915481e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 7.044284243048404, + "grad_norm": 0.32284408807754517, + "learning_rate": 7.307520773823227e-05, + "loss": 0.0466, + "step": 6840 + }, + { + "epoch": 7.0545829042224515, + "grad_norm": 0.35008612275123596, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0451, + "step": 6850 + }, + { + "epoch": 7.064881565396498, + "grad_norm": 0.38151565194129944, + "learning_rate": 7.29120683370069e-05, + "loss": 0.0364, + "step": 6860 + }, + { + "epoch": 7.075180226570546, + "grad_norm": 0.21700677275657654, + "learning_rate": 7.283038249734217e-05, + "loss": 0.0504, + "step": 6870 + }, + { + "epoch": 7.085478887744594, + "grad_norm": 0.3018152415752411, + "learning_rate": 7.27486195991705e-05, + "loss": 0.0519, + "step": 6880 + }, + { + "epoch": 7.09577754891864, + "grad_norm": 0.2052696943283081, + "learning_rate": 7.266677991846301e-05, + "loss": 0.042, + "step": 6890 + }, + { + "epoch": 7.106076210092688, + "grad_norm": 0.39970454573631287, + "learning_rate": 7.258486373144999e-05, + "loss": 0.0409, + "step": 6900 + }, + { + "epoch": 7.116374871266736, + "grad_norm": 0.22980281710624695, + "learning_rate": 7.250287131462004e-05, + "loss": 0.0445, + "step": 6910 + }, + { + "epoch": 7.126673532440782, + "grad_norm": 0.3403468430042267, + "learning_rate": 7.242080294471895e-05, + "loss": 0.0565, + "step": 6920 + }, + { + "epoch": 7.13697219361483, + "grad_norm": 0.25713488459587097, + "learning_rate": 7.233865889874891e-05, + "loss": 0.0456, + "step": 6930 + }, + { + "epoch": 7.147270854788878, + "grad_norm": 0.3376232981681824, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0378, + "step": 6940 + }, + { + "epoch": 7.1575695159629245, + "grad_norm": 0.255604088306427, + "learning_rate": 7.217414488788702e-05, + "loss": 0.041, + "step": 6950 + }, + { + "epoch": 7.167868177136972, + "grad_norm": 0.2713391184806824, + "learning_rate": 7.209177547827294e-05, + "loss": 0.0527, + "step": 6960 + }, + { + "epoch": 7.17816683831102, + "grad_norm": 0.2645740509033203, + "learning_rate": 7.20093315031436e-05, + "loss": 0.0432, + "step": 6970 + }, + { + "epoch": 7.1884654994850665, + "grad_norm": 0.3499581515789032, + "learning_rate": 7.192681324076896e-05, + "loss": 0.0516, + "step": 6980 + }, + { + "epoch": 7.198764160659114, + "grad_norm": 0.24416272342205048, + "learning_rate": 7.184422096966971e-05, + "loss": 0.0435, + "step": 6990 + }, + { + "epoch": 7.209062821833162, + "grad_norm": 0.3371264338493347, + "learning_rate": 7.176155496861638e-05, + "loss": 0.0463, + "step": 7000 + }, + { + "epoch": 7.2193614830072095, + "grad_norm": 0.3851630687713623, + "learning_rate": 7.167881551662831e-05, + "loss": 0.0407, + "step": 7010 + }, + { + "epoch": 7.229660144181256, + "grad_norm": 0.2070106714963913, + "learning_rate": 7.159600289297276e-05, + "loss": 0.0386, + "step": 7020 + }, + { + "epoch": 7.239958805355304, + "grad_norm": 0.3137363791465759, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0411, + "step": 7030 + }, + { + "epoch": 7.2502574665293515, + "grad_norm": 0.3703240752220154, + "learning_rate": 7.143015924896226e-05, + "loss": 0.0426, + "step": 7040 + }, + { + "epoch": 7.260556127703398, + "grad_norm": 0.3365670144557953, + "learning_rate": 7.134712878837294e-05, + "loss": 0.0506, + "step": 7050 + }, + { + "epoch": 7.270854788877446, + "grad_norm": 0.2538038194179535, + "learning_rate": 7.126402627564555e-05, + "loss": 0.0466, + "step": 7060 + }, + { + "epoch": 7.281153450051494, + "grad_norm": 0.43290919065475464, + "learning_rate": 7.118085199127276e-05, + "loss": 0.0463, + "step": 7070 + }, + { + "epoch": 7.29145211122554, + "grad_norm": 0.2167598456144333, + "learning_rate": 7.109760621598952e-05, + "loss": 0.0421, + "step": 7080 + }, + { + "epoch": 7.301750772399588, + "grad_norm": 0.24321898818016052, + "learning_rate": 7.101428923077209e-05, + "loss": 0.0382, + "step": 7090 + }, + { + "epoch": 7.312049433573636, + "grad_norm": 0.31298938393592834, + "learning_rate": 7.093090131683704e-05, + "loss": 0.0401, + "step": 7100 + }, + { + "epoch": 7.3223480947476824, + "grad_norm": 0.38020390272140503, + "learning_rate": 7.08474427556404e-05, + "loss": 0.0454, + "step": 7110 + }, + { + "epoch": 7.33264675592173, + "grad_norm": 0.37544867396354675, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0408, + "step": 7120 + }, + { + "epoch": 7.342945417095778, + "grad_norm": 0.2992228865623474, + "learning_rate": 7.068031481847762e-05, + "loss": 0.0454, + "step": 7130 + }, + { + "epoch": 7.3532440782698245, + "grad_norm": 0.48509418964385986, + "learning_rate": 7.059664600661196e-05, + "loss": 0.044, + "step": 7140 + }, + { + "epoch": 7.363542739443872, + "grad_norm": 0.4964796304702759, + "learning_rate": 7.051290767568371e-05, + "loss": 0.0526, + "step": 7150 + }, + { + "epoch": 7.37384140061792, + "grad_norm": 0.22935813665390015, + "learning_rate": 7.042910010833163e-05, + "loss": 0.0416, + "step": 7160 + }, + { + "epoch": 7.3841400617919675, + "grad_norm": 0.2570447325706482, + "learning_rate": 7.034522358742816e-05, + "loss": 0.0488, + "step": 7170 + }, + { + "epoch": 7.394438722966014, + "grad_norm": 0.23174193501472473, + "learning_rate": 7.026127839607847e-05, + "loss": 0.0423, + "step": 7180 + }, + { + "epoch": 7.404737384140062, + "grad_norm": 0.33260369300842285, + "learning_rate": 7.017726481761951e-05, + "loss": 0.0464, + "step": 7190 + }, + { + "epoch": 7.4150360453141095, + "grad_norm": 0.4475546181201935, + "learning_rate": 7.009318313561908e-05, + "loss": 0.0475, + "step": 7200 + }, + { + "epoch": 7.425334706488156, + "grad_norm": 0.2761160731315613, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0448, + "step": 7210 + }, + { + "epoch": 7.435633367662204, + "grad_norm": 0.39867162704467773, + "learning_rate": 6.99248165964133e-05, + "loss": 0.0455, + "step": 7220 + }, + { + "epoch": 7.445932028836252, + "grad_norm": 0.3500315546989441, + "learning_rate": 6.9840532307489e-05, + "loss": 0.0452, + "step": 7230 + }, + { + "epoch": 7.456230690010298, + "grad_norm": 0.30247119069099426, + "learning_rate": 6.975618105158346e-05, + "loss": 0.0458, + "step": 7240 + }, + { + "epoch": 7.466529351184346, + "grad_norm": 0.357147753238678, + "learning_rate": 6.967176311340418e-05, + "loss": 0.0401, + "step": 7250 + }, + { + "epoch": 7.476828012358394, + "grad_norm": 0.36390820145606995, + "learning_rate": 6.958727877788378e-05, + "loss": 0.0432, + "step": 7260 + }, + { + "epoch": 7.48712667353244, + "grad_norm": 0.3110693395137787, + "learning_rate": 6.950272833017896e-05, + "loss": 0.0413, + "step": 7270 + }, + { + "epoch": 7.497425334706488, + "grad_norm": 0.26132798194885254, + "learning_rate": 6.941811205566957e-05, + "loss": 0.0448, + "step": 7280 + }, + { + "epoch": 7.507723995880536, + "grad_norm": 0.2721041142940521, + "learning_rate": 6.933343023995767e-05, + "loss": 0.0358, + "step": 7290 + }, + { + "epoch": 7.518022657054583, + "grad_norm": 0.26367267966270447, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0515, + "step": 7300 + }, + { + "epoch": 7.52832131822863, + "grad_norm": 0.4417518377304077, + "learning_rate": 6.916387112843957e-05, + "loss": 0.054, + "step": 7310 + }, + { + "epoch": 7.538619979402678, + "grad_norm": 0.3166719079017639, + "learning_rate": 6.907899440493968e-05, + "loss": 0.0485, + "step": 7320 + }, + { + "epoch": 7.548918640576725, + "grad_norm": 0.330705463886261, + "learning_rate": 6.899405328484794e-05, + "loss": 0.0444, + "step": 7330 + }, + { + "epoch": 7.559217301750772, + "grad_norm": 0.22663088142871857, + "learning_rate": 6.890904805486286e-05, + "loss": 0.0424, + "step": 7340 + }, + { + "epoch": 7.56951596292482, + "grad_norm": 0.3720453083515167, + "learning_rate": 6.88239790018993e-05, + "loss": 0.043, + "step": 7350 + }, + { + "epoch": 7.5798146240988675, + "grad_norm": 0.2161106914281845, + "learning_rate": 6.873884641308752e-05, + "loss": 0.042, + "step": 7360 + }, + { + "epoch": 7.590113285272914, + "grad_norm": 0.3371187448501587, + "learning_rate": 6.865365057577227e-05, + "loss": 0.0463, + "step": 7370 + }, + { + "epoch": 7.600411946446962, + "grad_norm": 0.3055129945278168, + "learning_rate": 6.856839177751176e-05, + "loss": 0.0474, + "step": 7380 + }, + { + "epoch": 7.61071060762101, + "grad_norm": 0.3375736474990845, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0439, + "step": 7390 + }, + { + "epoch": 7.621009268795056, + "grad_norm": 0.3460111916065216, + "learning_rate": 6.839768644944937e-05, + "loss": 0.0464, + "step": 7400 + }, + { + "epoch": 7.631307929969104, + "grad_norm": 0.3610309660434723, + "learning_rate": 6.83122404958226e-05, + "loss": 0.0441, + "step": 7410 + }, + { + "epoch": 7.641606591143152, + "grad_norm": 0.32009249925613403, + "learning_rate": 6.82267327335988e-05, + "loss": 0.0405, + "step": 7420 + }, + { + "epoch": 7.651905252317198, + "grad_norm": 0.532019853591919, + "learning_rate": 6.814116345138902e-05, + "loss": 0.0401, + "step": 7430 + }, + { + "epoch": 7.662203913491246, + "grad_norm": 0.25246256589889526, + "learning_rate": 6.805553293801196e-05, + "loss": 0.0476, + "step": 7440 + }, + { + "epoch": 7.672502574665294, + "grad_norm": 0.2576782703399658, + "learning_rate": 6.796984148249295e-05, + "loss": 0.0456, + "step": 7450 + }, + { + "epoch": 7.6828012358393405, + "grad_norm": 0.4437432885169983, + "learning_rate": 6.788408937406307e-05, + "loss": 0.0434, + "step": 7460 + }, + { + "epoch": 7.693099897013388, + "grad_norm": 0.3884623050689697, + "learning_rate": 6.77982769021581e-05, + "loss": 0.0433, + "step": 7470 + }, + { + "epoch": 7.703398558187436, + "grad_norm": 0.30564385652542114, + "learning_rate": 6.771240435641754e-05, + "loss": 0.0419, + "step": 7480 + }, + { + "epoch": 7.7136972193614834, + "grad_norm": 0.29946035146713257, + "learning_rate": 6.762647202668366e-05, + "loss": 0.0481, + "step": 7490 + }, + { + "epoch": 7.72399588053553, + "grad_norm": 0.270355761051178, + "learning_rate": 6.754048020300054e-05, + "loss": 0.0432, + "step": 7500 + }, + { + "epoch": 7.734294541709578, + "grad_norm": 0.3664805293083191, + "learning_rate": 6.745442917561309e-05, + "loss": 0.0379, + "step": 7510 + }, + { + "epoch": 7.7445932028836255, + "grad_norm": 0.788110077381134, + "learning_rate": 6.736831923496596e-05, + "loss": 0.0521, + "step": 7520 + }, + { + "epoch": 7.754891864057672, + "grad_norm": 0.46117472648620605, + "learning_rate": 6.728215067170273e-05, + "loss": 0.0487, + "step": 7530 + }, + { + "epoch": 7.76519052523172, + "grad_norm": 0.18957702815532684, + "learning_rate": 6.719592377666483e-05, + "loss": 0.0479, + "step": 7540 + }, + { + "epoch": 7.775489186405768, + "grad_norm": 0.4086840748786926, + "learning_rate": 6.710963884089054e-05, + "loss": 0.0426, + "step": 7550 + }, + { + "epoch": 7.785787847579814, + "grad_norm": 0.21845366060733795, + "learning_rate": 6.70232961556141e-05, + "loss": 0.0402, + "step": 7560 + }, + { + "epoch": 7.796086508753862, + "grad_norm": 0.18775074183940887, + "learning_rate": 6.693689601226458e-05, + "loss": 0.04, + "step": 7570 + }, + { + "epoch": 7.80638516992791, + "grad_norm": 0.30147698521614075, + "learning_rate": 6.685043870246507e-05, + "loss": 0.0434, + "step": 7580 + }, + { + "epoch": 7.816683831101956, + "grad_norm": 0.366470068693161, + "learning_rate": 6.676392451803161e-05, + "loss": 0.0463, + "step": 7590 + }, + { + "epoch": 7.826982492276004, + "grad_norm": 0.3885975778102875, + "learning_rate": 6.667735375097214e-05, + "loss": 0.0453, + "step": 7600 + }, + { + "epoch": 7.837281153450052, + "grad_norm": 0.29683852195739746, + "learning_rate": 6.659072669348564e-05, + "loss": 0.0419, + "step": 7610 + }, + { + "epoch": 7.8475798146240985, + "grad_norm": 0.29188981652259827, + "learning_rate": 6.650404363796108e-05, + "loss": 0.0371, + "step": 7620 + }, + { + "epoch": 7.857878475798146, + "grad_norm": 0.40961870551109314, + "learning_rate": 6.641730487697639e-05, + "loss": 0.0435, + "step": 7630 + }, + { + "epoch": 7.868177136972194, + "grad_norm": 0.33139774203300476, + "learning_rate": 6.633051070329759e-05, + "loss": 0.0413, + "step": 7640 + }, + { + "epoch": 7.8784757981462405, + "grad_norm": 0.28173500299453735, + "learning_rate": 6.624366140987768e-05, + "loss": 0.0452, + "step": 7650 + }, + { + "epoch": 7.888774459320288, + "grad_norm": 0.2889021039009094, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0423, + "step": 7660 + }, + { + "epoch": 7.899073120494336, + "grad_norm": 0.6384182572364807, + "learning_rate": 6.606979863655583e-05, + "loss": 0.0379, + "step": 7670 + }, + { + "epoch": 7.9093717816683835, + "grad_norm": 0.4132192134857178, + "learning_rate": 6.598278574348619e-05, + "loss": 0.0391, + "step": 7680 + }, + { + "epoch": 7.91967044284243, + "grad_norm": 0.3432478606700897, + "learning_rate": 6.589571890433803e-05, + "loss": 0.0473, + "step": 7690 + }, + { + "epoch": 7.929969104016478, + "grad_norm": 0.3030139207839966, + "learning_rate": 6.580859841298471e-05, + "loss": 0.0374, + "step": 7700 + }, + { + "epoch": 7.940267765190526, + "grad_norm": 0.27307939529418945, + "learning_rate": 6.572142456348065e-05, + "loss": 0.0402, + "step": 7710 + }, + { + "epoch": 7.950566426364572, + "grad_norm": 0.2667880952358246, + "learning_rate": 6.563419765006038e-05, + "loss": 0.0463, + "step": 7720 + }, + { + "epoch": 7.96086508753862, + "grad_norm": 0.37028032541275024, + "learning_rate": 6.55469179671375e-05, + "loss": 0.038, + "step": 7730 + }, + { + "epoch": 7.971163748712668, + "grad_norm": 0.3381376266479492, + "learning_rate": 6.545958580930377e-05, + "loss": 0.0455, + "step": 7740 + }, + { + "epoch": 7.981462409886714, + "grad_norm": 0.28161460161209106, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0396, + "step": 7750 + }, + { + "epoch": 7.991761071060762, + "grad_norm": 0.26298263669013977, + "learning_rate": 6.528476524815528e-05, + "loss": 0.0424, + "step": 7760 + }, + { + "epoch": 8.002059732234809, + "grad_norm": 0.2671511769294739, + "learning_rate": 6.519727743490561e-05, + "loss": 0.0384, + "step": 7770 + }, + { + "epoch": 8.012358393408856, + "grad_norm": 0.3101862967014313, + "learning_rate": 6.510973832687323e-05, + "loss": 0.0465, + "step": 7780 + }, + { + "epoch": 8.022657054582904, + "grad_norm": 0.3037969768047333, + "learning_rate": 6.502214821952555e-05, + "loss": 0.0473, + "step": 7790 + }, + { + "epoch": 8.032955715756952, + "grad_norm": 0.45323264598846436, + "learning_rate": 6.493450740850203e-05, + "loss": 0.0432, + "step": 7800 + }, + { + "epoch": 8.043254376931, + "grad_norm": 0.41797924041748047, + "learning_rate": 6.484681618961331e-05, + "loss": 0.048, + "step": 7810 + }, + { + "epoch": 8.053553038105047, + "grad_norm": 0.4865727424621582, + "learning_rate": 6.47590748588402e-05, + "loss": 0.0512, + "step": 7820 + }, + { + "epoch": 8.063851699279093, + "grad_norm": 0.3105076849460602, + "learning_rate": 6.46712837123326e-05, + "loss": 0.0448, + "step": 7830 + }, + { + "epoch": 8.07415036045314, + "grad_norm": 0.25625815987586975, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0416, + "step": 7840 + }, + { + "epoch": 8.084449021627188, + "grad_norm": 0.31119033694267273, + "learning_rate": 6.449555315755333e-05, + "loss": 0.041, + "step": 7850 + }, + { + "epoch": 8.094747682801236, + "grad_norm": 0.39366838335990906, + "learning_rate": 6.440761434241821e-05, + "loss": 0.0404, + "step": 7860 + }, + { + "epoch": 8.105046343975284, + "grad_norm": 0.31691083312034607, + "learning_rate": 6.431962689781969e-05, + "loss": 0.0392, + "step": 7870 + }, + { + "epoch": 8.115345005149331, + "grad_norm": 0.23836584389209747, + "learning_rate": 6.423159112073838e-05, + "loss": 0.0455, + "step": 7880 + }, + { + "epoch": 8.125643666323377, + "grad_norm": 0.2766348719596863, + "learning_rate": 6.414350730831805e-05, + "loss": 0.0405, + "step": 7890 + }, + { + "epoch": 8.135942327497425, + "grad_norm": 0.3610820174217224, + "learning_rate": 6.405537575786456e-05, + "loss": 0.0459, + "step": 7900 + }, + { + "epoch": 8.146240988671472, + "grad_norm": 0.4069831669330597, + "learning_rate": 6.396719676684494e-05, + "loss": 0.0449, + "step": 7910 + }, + { + "epoch": 8.15653964984552, + "grad_norm": 0.38294172286987305, + "learning_rate": 6.387897063288635e-05, + "loss": 0.0495, + "step": 7920 + }, + { + "epoch": 8.166838311019568, + "grad_norm": 0.3302978575229645, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0453, + "step": 7930 + }, + { + "epoch": 8.177136972193615, + "grad_norm": 0.26982101798057556, + "learning_rate": 6.37023781274553e-05, + "loss": 0.0463, + "step": 7940 + }, + { + "epoch": 8.187435633367663, + "grad_norm": 0.23370954394340515, + "learning_rate": 6.361401235202872e-05, + "loss": 0.0465, + "step": 7950 + }, + { + "epoch": 8.197734294541709, + "grad_norm": 0.3092534840106964, + "learning_rate": 6.352560062575284e-05, + "loss": 0.055, + "step": 7960 + }, + { + "epoch": 8.208032955715757, + "grad_norm": 0.36051103472709656, + "learning_rate": 6.343714324704034e-05, + "loss": 0.0551, + "step": 7970 + }, + { + "epoch": 8.218331616889804, + "grad_norm": 0.33508798480033875, + "learning_rate": 6.3348640514458e-05, + "loss": 0.0462, + "step": 7980 + }, + { + "epoch": 8.228630278063852, + "grad_norm": 0.9673136472702026, + "learning_rate": 6.326009272672564e-05, + "loss": 0.0442, + "step": 7990 + }, + { + "epoch": 8.2389289392379, + "grad_norm": 1.469125509262085, + "learning_rate": 6.317150018271522e-05, + "loss": 0.0465, + "step": 8000 + }, + { + "epoch": 8.249227600411947, + "grad_norm": 0.3022879660129547, + "learning_rate": 6.308286318144971e-05, + "loss": 0.052, + "step": 8010 + }, + { + "epoch": 8.259526261585993, + "grad_norm": 0.240738183259964, + "learning_rate": 6.299418202210214e-05, + "loss": 0.044, + "step": 8020 + }, + { + "epoch": 8.26982492276004, + "grad_norm": 0.3125, + "learning_rate": 6.290545700399462e-05, + "loss": 0.0413, + "step": 8030 + }, + { + "epoch": 8.280123583934088, + "grad_norm": 0.3256394565105438, + "learning_rate": 6.281668842659725e-05, + "loss": 0.0381, + "step": 8040 + }, + { + "epoch": 8.290422245108136, + "grad_norm": 0.3764393925666809, + "learning_rate": 6.27278765895272e-05, + "loss": 0.0412, + "step": 8050 + }, + { + "epoch": 8.300720906282184, + "grad_norm": 0.28021517395973206, + "learning_rate": 6.263902179254762e-05, + "loss": 0.0392, + "step": 8060 + }, + { + "epoch": 8.311019567456231, + "grad_norm": 0.3545322120189667, + "learning_rate": 6.255012433556665e-05, + "loss": 0.039, + "step": 8070 + }, + { + "epoch": 8.321318228630279, + "grad_norm": 0.33872804045677185, + "learning_rate": 6.246118451863646e-05, + "loss": 0.0417, + "step": 8080 + }, + { + "epoch": 8.331616889804325, + "grad_norm": 0.9136466383934021, + "learning_rate": 6.237220264195216e-05, + "loss": 0.0429, + "step": 8090 + }, + { + "epoch": 8.341915550978372, + "grad_norm": 0.31747815012931824, + "learning_rate": 6.228317900585083e-05, + "loss": 0.0425, + "step": 8100 + }, + { + "epoch": 8.35221421215242, + "grad_norm": 0.3648073375225067, + "learning_rate": 6.219411391081055e-05, + "loss": 0.0384, + "step": 8110 + }, + { + "epoch": 8.362512873326468, + "grad_norm": 0.26562437415122986, + "learning_rate": 6.210500765744925e-05, + "loss": 0.036, + "step": 8120 + }, + { + "epoch": 8.372811534500515, + "grad_norm": 0.2761411666870117, + "learning_rate": 6.201586054652379e-05, + "loss": 0.0466, + "step": 8130 + }, + { + "epoch": 8.383110195674563, + "grad_norm": 0.46033117175102234, + "learning_rate": 6.192667287892905e-05, + "loss": 0.0432, + "step": 8140 + }, + { + "epoch": 8.393408856848609, + "grad_norm": 0.3292730450630188, + "learning_rate": 6.183744495569666e-05, + "loss": 0.0426, + "step": 8150 + }, + { + "epoch": 8.403707518022657, + "grad_norm": 0.2943620979785919, + "learning_rate": 6.174817707799417e-05, + "loss": 0.0483, + "step": 8160 + }, + { + "epoch": 8.414006179196704, + "grad_norm": 0.3903990685939789, + "learning_rate": 6.165886954712401e-05, + "loss": 0.043, + "step": 8170 + }, + { + "epoch": 8.424304840370752, + "grad_norm": 0.41772767901420593, + "learning_rate": 6.156952266452247e-05, + "loss": 0.0407, + "step": 8180 + }, + { + "epoch": 8.4346035015448, + "grad_norm": 0.5899285078048706, + "learning_rate": 6.148013673175857e-05, + "loss": 0.0434, + "step": 8190 + }, + { + "epoch": 8.444902162718847, + "grad_norm": 0.22386884689331055, + "learning_rate": 6.13907120505332e-05, + "loss": 0.042, + "step": 8200 + }, + { + "epoch": 8.455200823892893, + "grad_norm": 0.3034772276878357, + "learning_rate": 6.130124892267806e-05, + "loss": 0.0365, + "step": 8210 + }, + { + "epoch": 8.46549948506694, + "grad_norm": 0.37777379155158997, + "learning_rate": 6.121174765015455e-05, + "loss": 0.0419, + "step": 8220 + }, + { + "epoch": 8.475798146240988, + "grad_norm": 0.30282172560691833, + "learning_rate": 6.112220853505288e-05, + "loss": 0.0418, + "step": 8230 + }, + { + "epoch": 8.486096807415036, + "grad_norm": 0.5801701545715332, + "learning_rate": 6.103263187959095e-05, + "loss": 0.049, + "step": 8240 + }, + { + "epoch": 8.496395468589084, + "grad_norm": 0.32179057598114014, + "learning_rate": 6.094301798611338e-05, + "loss": 0.0396, + "step": 8250 + }, + { + "epoch": 8.506694129763131, + "grad_norm": 0.2766133248806, + "learning_rate": 6.085336715709049e-05, + "loss": 0.0484, + "step": 8260 + }, + { + "epoch": 8.516992790937179, + "grad_norm": 0.2891679108142853, + "learning_rate": 6.076367969511725e-05, + "loss": 0.0483, + "step": 8270 + }, + { + "epoch": 8.527291452111225, + "grad_norm": 0.35707661509513855, + "learning_rate": 6.067395590291226e-05, + "loss": 0.0468, + "step": 8280 + }, + { + "epoch": 8.537590113285273, + "grad_norm": 0.29469162225723267, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0441, + "step": 8290 + }, + { + "epoch": 8.54788877445932, + "grad_norm": 0.29220518469810486, + "learning_rate": 6.0494400539293675e-05, + "loss": 0.0389, + "step": 8300 + }, + { + "epoch": 8.558187435633368, + "grad_norm": 0.3941989243030548, + "learning_rate": 6.040456957392635e-05, + "loss": 0.0389, + "step": 8310 + }, + { + "epoch": 8.568486096807415, + "grad_norm": 0.2707824409008026, + "learning_rate": 6.03147034904178e-05, + "loss": 0.0471, + "step": 8320 + }, + { + "epoch": 8.578784757981463, + "grad_norm": 0.35828855633735657, + "learning_rate": 6.0224802592089513e-05, + "loss": 0.0453, + "step": 8330 + }, + { + "epoch": 8.589083419155509, + "grad_norm": 0.2687852382659912, + "learning_rate": 6.013486718238055e-05, + "loss": 0.041, + "step": 8340 + }, + { + "epoch": 8.599382080329557, + "grad_norm": 0.25436437129974365, + "learning_rate": 6.004489756484641e-05, + "loss": 0.0411, + "step": 8350 + }, + { + "epoch": 8.609680741503604, + "grad_norm": 0.22475087642669678, + "learning_rate": 5.995489404315806e-05, + "loss": 0.0409, + "step": 8360 + }, + { + "epoch": 8.619979402677652, + "grad_norm": 0.32723718881607056, + "learning_rate": 5.98648569211009e-05, + "loss": 0.0477, + "step": 8370 + }, + { + "epoch": 8.6302780638517, + "grad_norm": 0.2676869034767151, + "learning_rate": 5.977478650257374e-05, + "loss": 0.0363, + "step": 8380 + }, + { + "epoch": 8.640576725025747, + "grad_norm": 0.6640805006027222, + "learning_rate": 5.9684683091587804e-05, + "loss": 0.0396, + "step": 8390 + }, + { + "epoch": 8.650875386199793, + "grad_norm": 0.29109275341033936, + "learning_rate": 5.959454699226562e-05, + "loss": 0.0452, + "step": 8400 + }, + { + "epoch": 8.66117404737384, + "grad_norm": 0.39319050312042236, + "learning_rate": 5.95043785088401e-05, + "loss": 0.0359, + "step": 8410 + }, + { + "epoch": 8.671472708547888, + "grad_norm": 0.2134009450674057, + "learning_rate": 5.941417794565343e-05, + "loss": 0.0387, + "step": 8420 + }, + { + "epoch": 8.681771369721936, + "grad_norm": 0.21827584505081177, + "learning_rate": 5.9323945607156076e-05, + "loss": 0.0382, + "step": 8430 + }, + { + "epoch": 8.692070030895984, + "grad_norm": 0.41963616013526917, + "learning_rate": 5.9233681797905785e-05, + "loss": 0.0404, + "step": 8440 + }, + { + "epoch": 8.702368692070031, + "grad_norm": 0.21744829416275024, + "learning_rate": 5.914338682256647e-05, + "loss": 0.0437, + "step": 8450 + }, + { + "epoch": 8.712667353244079, + "grad_norm": 0.27720943093299866, + "learning_rate": 5.905306098590728e-05, + "loss": 0.0403, + "step": 8460 + }, + { + "epoch": 8.722966014418125, + "grad_norm": 0.30195143818855286, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0374, + "step": 8470 + }, + { + "epoch": 8.733264675592173, + "grad_norm": 0.32989758253097534, + "learning_rate": 5.8872317948225644e-05, + "loss": 0.0368, + "step": 8480 + }, + { + "epoch": 8.74356333676622, + "grad_norm": 0.22078627347946167, + "learning_rate": 5.8781901357258165e-05, + "loss": 0.0467, + "step": 8490 + }, + { + "epoch": 8.753861997940268, + "grad_norm": 0.5876451134681702, + "learning_rate": 5.869145512507872e-05, + "loss": 0.0407, + "step": 8500 + }, + { + "epoch": 8.764160659114316, + "grad_norm": 0.44796323776245117, + "learning_rate": 5.860097955696698e-05, + "loss": 0.0382, + "step": 8510 + }, + { + "epoch": 8.774459320288363, + "grad_norm": 0.35779476165771484, + "learning_rate": 5.851047495830163e-05, + "loss": 0.0438, + "step": 8520 + }, + { + "epoch": 8.784757981462409, + "grad_norm": 0.28585049510002136, + "learning_rate": 5.841994163455934e-05, + "loss": 0.0376, + "step": 8530 + }, + { + "epoch": 8.795056642636457, + "grad_norm": 0.26791223883628845, + "learning_rate": 5.832937989131374e-05, + "loss": 0.0387, + "step": 8540 + }, + { + "epoch": 8.805355303810504, + "grad_norm": 0.5671482086181641, + "learning_rate": 5.823879003423438e-05, + "loss": 0.0366, + "step": 8550 + }, + { + "epoch": 8.815653964984552, + "grad_norm": 0.1565544456243515, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0369, + "step": 8560 + }, + { + "epoch": 8.8259526261586, + "grad_norm": 0.46639129519462585, + "learning_rate": 5.8057527201725984e-05, + "loss": 0.0398, + "step": 8570 + }, + { + "epoch": 8.836251287332647, + "grad_norm": 0.8469918370246887, + "learning_rate": 5.796685483810637e-05, + "loss": 0.047, + "step": 8580 + }, + { + "epoch": 8.846549948506695, + "grad_norm": 0.1878482550382614, + "learning_rate": 5.7876155584269785e-05, + "loss": 0.0386, + "step": 8590 + }, + { + "epoch": 8.85684860968074, + "grad_norm": 0.26714402437210083, + "learning_rate": 5.7785429746349905e-05, + "loss": 0.049, + "step": 8600 + }, + { + "epoch": 8.867147270854788, + "grad_norm": 0.35005736351013184, + "learning_rate": 5.7694677630570146e-05, + "loss": 0.0435, + "step": 8610 + }, + { + "epoch": 8.877445932028836, + "grad_norm": 0.48994550108909607, + "learning_rate": 5.760389954324261e-05, + "loss": 0.049, + "step": 8620 + }, + { + "epoch": 8.887744593202884, + "grad_norm": 0.24901621043682098, + "learning_rate": 5.7513095790767066e-05, + "loss": 0.0445, + "step": 8630 + }, + { + "epoch": 8.898043254376931, + "grad_norm": 0.32309484481811523, + "learning_rate": 5.742226667962991e-05, + "loss": 0.0471, + "step": 8640 + }, + { + "epoch": 8.908341915550979, + "grad_norm": 0.30904820561408997, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0377, + "step": 8650 + }, + { + "epoch": 8.918640576725025, + "grad_norm": 0.30617690086364746, + "learning_rate": 5.724053360774327e-05, + "loss": 0.0378, + "step": 8660 + }, + { + "epoch": 8.928939237899073, + "grad_norm": 0.19513899087905884, + "learning_rate": 5.7149630260390384e-05, + "loss": 0.0315, + "step": 8670 + }, + { + "epoch": 8.93923789907312, + "grad_norm": 0.5502423048019409, + "learning_rate": 5.705870278116703e-05, + "loss": 0.0422, + "step": 8680 + }, + { + "epoch": 8.949536560247168, + "grad_norm": 0.3435225486755371, + "learning_rate": 5.6967751476977215e-05, + "loss": 0.0406, + "step": 8690 + }, + { + "epoch": 8.959835221421216, + "grad_norm": 0.28045403957366943, + "learning_rate": 5.687677665480533e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 8.970133882595263, + "grad_norm": 0.2749752700328827, + "learning_rate": 5.6785778621715225e-05, + "loss": 0.0378, + "step": 8710 + }, + { + "epoch": 8.98043254376931, + "grad_norm": 0.39981475472450256, + "learning_rate": 5.669475768484901e-05, + "loss": 0.0406, + "step": 8720 + }, + { + "epoch": 8.990731204943357, + "grad_norm": 0.28953787684440613, + "learning_rate": 5.660371415142611e-05, + "loss": 0.0379, + "step": 8730 + }, + { + "epoch": 9.001029866117404, + "grad_norm": 0.17452044785022736, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0412, + "step": 8740 + }, + { + "epoch": 9.011328527291452, + "grad_norm": 0.3600793182849884, + "learning_rate": 5.642156052416849e-05, + "loss": 0.041, + "step": 8750 + }, + { + "epoch": 9.0216271884655, + "grad_norm": 0.2760295569896698, + "learning_rate": 5.633045104514982e-05, + "loss": 0.0435, + "step": 8760 + }, + { + "epoch": 9.031925849639547, + "grad_norm": 0.3825409710407257, + "learning_rate": 5.6239320199204616e-05, + "loss": 0.0408, + "step": 8770 + }, + { + "epoch": 9.042224510813595, + "grad_norm": 0.374891072511673, + "learning_rate": 5.614816829392328e-05, + "loss": 0.0383, + "step": 8780 + }, + { + "epoch": 9.052523171987641, + "grad_norm": 0.27747559547424316, + "learning_rate": 5.60569956369673e-05, + "loss": 0.0464, + "step": 8790 + }, + { + "epoch": 9.062821833161689, + "grad_norm": 0.28678062558174133, + "learning_rate": 5.596580253606824e-05, + "loss": 0.0487, + "step": 8800 + }, + { + "epoch": 9.073120494335736, + "grad_norm": 0.4970363676548004, + "learning_rate": 5.587458929902664e-05, + "loss": 0.051, + "step": 8810 + }, + { + "epoch": 9.083419155509784, + "grad_norm": 0.30037108063697815, + "learning_rate": 5.5783356233711005e-05, + "loss": 0.0383, + "step": 8820 + }, + { + "epoch": 9.093717816683832, + "grad_norm": 0.2640860676765442, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0462, + "step": 8830 + }, + { + "epoch": 9.10401647785788, + "grad_norm": 0.30006083846092224, + "learning_rate": 5.5600831850065274e-05, + "loss": 0.0362, + "step": 8840 + }, + { + "epoch": 9.114315139031925, + "grad_norm": 0.3721349537372589, + "learning_rate": 5.550954114780269e-05, + "loss": 0.0399, + "step": 8850 + }, + { + "epoch": 9.124613800205973, + "grad_norm": 0.336732417345047, + "learning_rate": 5.541823184939896e-05, + "loss": 0.0421, + "step": 8860 + }, + { + "epoch": 9.13491246138002, + "grad_norm": 0.26279309391975403, + "learning_rate": 5.532690426304685e-05, + "loss": 0.0433, + "step": 8870 + }, + { + "epoch": 9.145211122554068, + "grad_norm": 0.2945043742656708, + "learning_rate": 5.5235558697000836e-05, + "loss": 0.0439, + "step": 8880 + }, + { + "epoch": 9.155509783728116, + "grad_norm": 0.47877517342567444, + "learning_rate": 5.514419545957606e-05, + "loss": 0.0431, + "step": 8890 + }, + { + "epoch": 9.165808444902163, + "grad_norm": 0.3854601979255676, + "learning_rate": 5.5052814859147315e-05, + "loss": 0.0365, + "step": 8900 + }, + { + "epoch": 9.176107106076211, + "grad_norm": 0.3006962835788727, + "learning_rate": 5.496141720414804e-05, + "loss": 0.0427, + "step": 8910 + }, + { + "epoch": 9.186405767250257, + "grad_norm": 0.5065596699714661, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0395, + "step": 8920 + }, + { + "epoch": 9.196704428424304, + "grad_norm": 0.4032178521156311, + "learning_rate": 5.4778571964458214e-05, + "loss": 0.0341, + "step": 8930 + }, + { + "epoch": 9.207003089598352, + "grad_norm": 0.357695609331131, + "learning_rate": 5.468712499691816e-05, + "loss": 0.0427, + "step": 8940 + }, + { + "epoch": 9.2173017507724, + "grad_norm": 0.6212796568870544, + "learning_rate": 5.45956622091064e-05, + "loss": 0.0444, + "step": 8950 + }, + { + "epoch": 9.227600411946447, + "grad_norm": 0.29458391666412354, + "learning_rate": 5.4504183909733734e-05, + "loss": 0.0402, + "step": 8960 + }, + { + "epoch": 9.237899073120495, + "grad_norm": 0.309467613697052, + "learning_rate": 5.441269040756334e-05, + "loss": 0.0412, + "step": 8970 + }, + { + "epoch": 9.248197734294541, + "grad_norm": 0.17707674205303192, + "learning_rate": 5.43211820114097e-05, + "loss": 0.0423, + "step": 8980 + }, + { + "epoch": 9.258496395468589, + "grad_norm": 0.4098307490348816, + "learning_rate": 5.422965903013757e-05, + "loss": 0.0421, + "step": 8990 + }, + { + "epoch": 9.268795056642636, + "grad_norm": 0.31290164589881897, + "learning_rate": 5.41381217726609e-05, + "loss": 0.0402, + "step": 9000 + }, + { + "epoch": 9.279093717816684, + "grad_norm": 0.20957662165164948, + "learning_rate": 5.404657054794189e-05, + "loss": 0.0426, + "step": 9010 + }, + { + "epoch": 9.289392378990732, + "grad_norm": 0.2308698147535324, + "learning_rate": 5.3955005664989834e-05, + "loss": 0.0389, + "step": 9020 + }, + { + "epoch": 9.29969104016478, + "grad_norm": 0.2409774512052536, + "learning_rate": 5.3863427432860125e-05, + "loss": 0.0352, + "step": 9030 + }, + { + "epoch": 9.309989701338825, + "grad_norm": 0.24483443796634674, + "learning_rate": 5.3771836160653254e-05, + "loss": 0.0406, + "step": 9040 + }, + { + "epoch": 9.320288362512873, + "grad_norm": 0.2869531810283661, + "learning_rate": 5.368023215751369e-05, + "loss": 0.0379, + "step": 9050 + }, + { + "epoch": 9.33058702368692, + "grad_norm": 0.27807915210723877, + "learning_rate": 5.3588615732628854e-05, + "loss": 0.0451, + "step": 9060 + }, + { + "epoch": 9.340885684860968, + "grad_norm": 0.33199331164360046, + "learning_rate": 5.3496987195228156e-05, + "loss": 0.034, + "step": 9070 + }, + { + "epoch": 9.351184346035016, + "grad_norm": 0.2562348246574402, + "learning_rate": 5.340534685458185e-05, + "loss": 0.0413, + "step": 9080 + }, + { + "epoch": 9.361483007209063, + "grad_norm": 0.3097791075706482, + "learning_rate": 5.3313695020000024e-05, + "loss": 0.039, + "step": 9090 + }, + { + "epoch": 9.371781668383111, + "grad_norm": 0.3079645037651062, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0349, + "step": 9100 + }, + { + "epoch": 9.382080329557157, + "grad_norm": 0.4117037057876587, + "learning_rate": 5.3130358106463104e-05, + "loss": 0.0407, + "step": 9110 + }, + { + "epoch": 9.392378990731205, + "grad_norm": 0.4133201539516449, + "learning_rate": 5.303867364631804e-05, + "loss": 0.045, + "step": 9120 + }, + { + "epoch": 9.402677651905252, + "grad_norm": 0.2096584141254425, + "learning_rate": 5.294697892985534e-05, + "loss": 0.0335, + "step": 9130 + }, + { + "epoch": 9.4129763130793, + "grad_norm": 0.28559908270835876, + "learning_rate": 5.285527426656865e-05, + "loss": 0.0398, + "step": 9140 + }, + { + "epoch": 9.423274974253347, + "grad_norm": 0.3598606288433075, + "learning_rate": 5.2763559965985184e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 9.433573635427395, + "grad_norm": 0.35209372639656067, + "learning_rate": 5.2671836337664634e-05, + "loss": 0.0405, + "step": 9160 + }, + { + "epoch": 9.443872296601441, + "grad_norm": 0.23415158689022064, + "learning_rate": 5.2580103691198255e-05, + "loss": 0.0366, + "step": 9170 + }, + { + "epoch": 9.454170957775489, + "grad_norm": 0.2906668484210968, + "learning_rate": 5.24883623362077e-05, + "loss": 0.0493, + "step": 9180 + }, + { + "epoch": 9.464469618949536, + "grad_norm": 0.21137650310993195, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0423, + "step": 9190 + }, + { + "epoch": 9.474768280123584, + "grad_norm": 0.23499812185764313, + "learning_rate": 5.230485473928651e-05, + "loss": 0.0416, + "step": 9200 + }, + { + "epoch": 9.485066941297632, + "grad_norm": 0.372158020734787, + "learning_rate": 5.221308911674201e-05, + "loss": 0.0407, + "step": 9210 + }, + { + "epoch": 9.49536560247168, + "grad_norm": 0.2552221119403839, + "learning_rate": 5.2121316024443415e-05, + "loss": 0.0408, + "step": 9220 + }, + { + "epoch": 9.505664263645727, + "grad_norm": 0.27116450667381287, + "learning_rate": 5.202953577214889e-05, + "loss": 0.0375, + "step": 9230 + }, + { + "epoch": 9.515962924819773, + "grad_norm": 1.0216639041900635, + "learning_rate": 5.1937748669640776e-05, + "loss": 0.0412, + "step": 9240 + }, + { + "epoch": 9.52626158599382, + "grad_norm": 0.39132076501846313, + "learning_rate": 5.1845955026724535e-05, + "loss": 0.0408, + "step": 9250 + }, + { + "epoch": 9.536560247167868, + "grad_norm": 0.3046022653579712, + "learning_rate": 5.175415515322768e-05, + "loss": 0.0349, + "step": 9260 + }, + { + "epoch": 9.546858908341916, + "grad_norm": 0.5317039489746094, + "learning_rate": 5.1662349358998796e-05, + "loss": 0.0377, + "step": 9270 + }, + { + "epoch": 9.557157569515963, + "grad_norm": 0.308902382850647, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0416, + "step": 9280 + }, + { + "epoch": 9.567456230690011, + "grad_norm": 0.1709175854921341, + "learning_rate": 5.147872124783805e-05, + "loss": 0.0367, + "step": 9290 + }, + { + "epoch": 9.577754891864057, + "grad_norm": 0.35447025299072266, + "learning_rate": 5.138689955069902e-05, + "loss": 0.0339, + "step": 9300 + }, + { + "epoch": 9.588053553038105, + "grad_norm": 0.20557384192943573, + "learning_rate": 5.12950731724116e-05, + "loss": 0.0435, + "step": 9310 + }, + { + "epoch": 9.598352214212152, + "grad_norm": 0.27278539538383484, + "learning_rate": 5.12032424229138e-05, + "loss": 0.0399, + "step": 9320 + }, + { + "epoch": 9.6086508753862, + "grad_norm": 0.3033859133720398, + "learning_rate": 5.111140761215839e-05, + "loss": 0.0376, + "step": 9330 + }, + { + "epoch": 9.618949536560248, + "grad_norm": 0.3543021082878113, + "learning_rate": 5.101956905011185e-05, + "loss": 0.0427, + "step": 9340 + }, + { + "epoch": 9.629248197734295, + "grad_norm": 0.2944181561470032, + "learning_rate": 5.0927727046753336e-05, + "loss": 0.0371, + "step": 9350 + }, + { + "epoch": 9.639546858908343, + "grad_norm": 0.3597414493560791, + "learning_rate": 5.08358819120736e-05, + "loss": 0.0373, + "step": 9360 + }, + { + "epoch": 9.649845520082389, + "grad_norm": 0.33194977045059204, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0424, + "step": 9370 + }, + { + "epoch": 9.660144181256436, + "grad_norm": 0.21433711051940918, + "learning_rate": 5.0652183488765335e-05, + "loss": 0.0407, + "step": 9380 + }, + { + "epoch": 9.670442842430484, + "grad_norm": 0.3961849808692932, + "learning_rate": 5.056033082016699e-05, + "loss": 0.0419, + "step": 9390 + }, + { + "epoch": 9.680741503604532, + "grad_norm": 0.9774559140205383, + "learning_rate": 5.046847626030569e-05, + "loss": 0.041, + "step": 9400 + }, + { + "epoch": 9.69104016477858, + "grad_norm": 0.36883220076560974, + "learning_rate": 5.037662011921459e-05, + "loss": 0.0377, + "step": 9410 + }, + { + "epoch": 9.701338825952627, + "grad_norm": 0.37542909383773804, + "learning_rate": 5.028476270693217e-05, + "loss": 0.0408, + "step": 9420 + }, + { + "epoch": 9.711637487126673, + "grad_norm": 0.45353376865386963, + "learning_rate": 5.0192904333501214e-05, + "loss": 0.0419, + "step": 9430 + }, + { + "epoch": 9.72193614830072, + "grad_norm": 0.27116161584854126, + "learning_rate": 5.010104530896771e-05, + "loss": 0.0447, + "step": 9440 + }, + { + "epoch": 9.732234809474768, + "grad_norm": 0.26916906237602234, + "learning_rate": 5.000918594337989e-05, + "loss": 0.0461, + "step": 9450 + }, + { + "epoch": 9.742533470648816, + "grad_norm": 0.3069358766078949, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0458, + "step": 9460 + }, + { + "epoch": 9.752832131822863, + "grad_norm": 0.42274564504623413, + "learning_rate": 4.9825467429238834e-05, + "loss": 0.0401, + "step": 9470 + }, + { + "epoch": 9.763130792996911, + "grad_norm": 0.17982327938079834, + "learning_rate": 4.973360890078358e-05, + "loss": 0.0427, + "step": 9480 + }, + { + "epoch": 9.773429454170957, + "grad_norm": 0.23251447081565857, + "learning_rate": 4.96417512714679e-05, + "loss": 0.0326, + "step": 9490 + }, + { + "epoch": 9.783728115345005, + "grad_norm": 0.2869229018688202, + "learning_rate": 4.954989485133533e-05, + "loss": 0.0507, + "step": 9500 + }, + { + "epoch": 9.794026776519052, + "grad_norm": 1.0959696769714355, + "learning_rate": 4.9458039950425224e-05, + "loss": 0.0518, + "step": 9510 + }, + { + "epoch": 9.8043254376931, + "grad_norm": 0.3641543686389923, + "learning_rate": 4.9366186878771926e-05, + "loss": 0.0434, + "step": 9520 + }, + { + "epoch": 9.814624098867148, + "grad_norm": 0.5896167159080505, + "learning_rate": 4.927433594640354e-05, + "loss": 0.0409, + "step": 9530 + }, + { + "epoch": 9.824922760041195, + "grad_norm": 0.24302540719509125, + "learning_rate": 4.918248746334096e-05, + "loss": 0.0451, + "step": 9540 + }, + { + "epoch": 9.835221421215241, + "grad_norm": 0.2889201045036316, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0384, + "step": 9550 + }, + { + "epoch": 9.845520082389289, + "grad_norm": 0.37873101234436035, + "learning_rate": 4.8998799085174455e-05, + "loss": 0.0404, + "step": 9560 + }, + { + "epoch": 9.855818743563336, + "grad_norm": 0.4369457960128784, + "learning_rate": 4.89069598100668e-05, + "loss": 0.0431, + "step": 9570 + }, + { + "epoch": 9.866117404737384, + "grad_norm": 0.37580832839012146, + "learning_rate": 4.881512422425541e-05, + "loss": 0.044, + "step": 9580 + }, + { + "epoch": 9.876416065911432, + "grad_norm": 0.46920913457870483, + "learning_rate": 4.872329263770942e-05, + "loss": 0.0469, + "step": 9590 + }, + { + "epoch": 9.88671472708548, + "grad_norm": 0.24571798741817474, + "learning_rate": 4.8631465360384385e-05, + "loss": 0.0398, + "step": 9600 + }, + { + "epoch": 9.897013388259527, + "grad_norm": 0.3728749454021454, + "learning_rate": 4.85396427022214e-05, + "loss": 0.0352, + "step": 9610 + }, + { + "epoch": 9.907312049433573, + "grad_norm": 0.301878958940506, + "learning_rate": 4.844782497314591e-05, + "loss": 0.0432, + "step": 9620 + }, + { + "epoch": 9.91761071060762, + "grad_norm": 0.26632949709892273, + "learning_rate": 4.835601248306675e-05, + "loss": 0.0439, + "step": 9630 + }, + { + "epoch": 9.927909371781668, + "grad_norm": 0.31497064232826233, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0399, + "step": 9640 + }, + { + "epoch": 9.938208032955716, + "grad_norm": 0.26114657521247864, + "learning_rate": 4.817240445944327e-05, + "loss": 0.0408, + "step": 9650 + }, + { + "epoch": 9.948506694129764, + "grad_norm": 0.2729547619819641, + "learning_rate": 4.8080609545624004e-05, + "loss": 0.0392, + "step": 9660 + }, + { + "epoch": 9.958805355303811, + "grad_norm": 0.22712601721286774, + "learning_rate": 4.798882111024912e-05, + "loss": 0.0363, + "step": 9670 + }, + { + "epoch": 9.969104016477857, + "grad_norm": 0.47241315245628357, + "learning_rate": 4.7897039463128524e-05, + "loss": 0.0369, + "step": 9680 + }, + { + "epoch": 9.979402677651905, + "grad_norm": 0.3929249048233032, + "learning_rate": 4.780526491404929e-05, + "loss": 0.0436, + "step": 9690 + }, + { + "epoch": 9.989701338825952, + "grad_norm": 0.32324254512786865, + "learning_rate": 4.771349777277452e-05, + "loss": 0.0418, + "step": 9700 + }, + { + "epoch": 10.0, + "grad_norm": 0.4991161525249481, + "learning_rate": 4.762173834904225e-05, + "loss": 0.0352, + "step": 9710 + }, + { + "epoch": 10.010298661174048, + "grad_norm": 0.2615014612674713, + "learning_rate": 4.752998695256455e-05, + "loss": 0.0412, + "step": 9720 + }, + { + "epoch": 10.020597322348095, + "grad_norm": 0.29027608036994934, + "learning_rate": 4.743824389302635e-05, + "loss": 0.035, + "step": 9730 + }, + { + "epoch": 10.030895983522143, + "grad_norm": 0.3496328294277191, + "learning_rate": 4.734650948008445e-05, + "loss": 0.038, + "step": 9740 + }, + { + "epoch": 10.041194644696189, + "grad_norm": 0.25003111362457275, + "learning_rate": 4.7254784023366444e-05, + "loss": 0.0408, + "step": 9750 + }, + { + "epoch": 10.051493305870236, + "grad_norm": 0.28183093667030334, + "learning_rate": 4.716306783246977e-05, + "loss": 0.0415, + "step": 9760 + }, + { + "epoch": 10.061791967044284, + "grad_norm": 0.3574424386024475, + "learning_rate": 4.707136121696048e-05, + "loss": 0.0394, + "step": 9770 + }, + { + "epoch": 10.072090628218332, + "grad_norm": 0.2761897146701813, + "learning_rate": 4.69796644863724e-05, + "loss": 0.034, + "step": 9780 + }, + { + "epoch": 10.08238928939238, + "grad_norm": 0.2602722644805908, + "learning_rate": 4.688797795020597e-05, + "loss": 0.0354, + "step": 9790 + }, + { + "epoch": 10.092687950566427, + "grad_norm": 0.2515560984611511, + "learning_rate": 4.6796301917927166e-05, + "loss": 0.0402, + "step": 9800 + }, + { + "epoch": 10.102986611740473, + "grad_norm": 0.24942000210285187, + "learning_rate": 4.670463669896659e-05, + "loss": 0.0406, + "step": 9810 + }, + { + "epoch": 10.11328527291452, + "grad_norm": 0.29609471559524536, + "learning_rate": 4.66129826027183e-05, + "loss": 0.0397, + "step": 9820 + }, + { + "epoch": 10.123583934088568, + "grad_norm": 0.3640936613082886, + "learning_rate": 4.652133993853883e-05, + "loss": 0.0456, + "step": 9830 + }, + { + "epoch": 10.133882595262616, + "grad_norm": 0.2724517285823822, + "learning_rate": 4.64297090157461e-05, + "loss": 0.0371, + "step": 9840 + }, + { + "epoch": 10.144181256436664, + "grad_norm": 0.33307430148124695, + "learning_rate": 4.633809014361843e-05, + "loss": 0.0438, + "step": 9850 + }, + { + "epoch": 10.154479917610711, + "grad_norm": 0.45976462960243225, + "learning_rate": 4.624648363139344e-05, + "loss": 0.0479, + "step": 9860 + }, + { + "epoch": 10.164778578784759, + "grad_norm": 0.24571570754051208, + "learning_rate": 4.615488978826709e-05, + "loss": 0.0375, + "step": 9870 + }, + { + "epoch": 10.175077239958805, + "grad_norm": 0.4202505052089691, + "learning_rate": 4.6063308923392485e-05, + "loss": 0.0446, + "step": 9880 + }, + { + "epoch": 10.185375901132852, + "grad_norm": 0.30180397629737854, + "learning_rate": 4.5971741345879e-05, + "loss": 0.0372, + "step": 9890 + }, + { + "epoch": 10.1956745623069, + "grad_norm": 0.39542245864868164, + "learning_rate": 4.588018736479115e-05, + "loss": 0.0407, + "step": 9900 + }, + { + "epoch": 10.205973223480948, + "grad_norm": 0.5576333403587341, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.0372, + "step": 9910 + }, + { + "epoch": 10.216271884654995, + "grad_norm": 0.2639693319797516, + "learning_rate": 4.56971214279198e-05, + "loss": 0.0463, + "step": 9920 + }, + { + "epoch": 10.226570545829043, + "grad_norm": 0.26938265562057495, + "learning_rate": 4.56056100900317e-05, + "loss": 0.0367, + "step": 9930 + }, + { + "epoch": 10.236869207003089, + "grad_norm": 0.27783456444740295, + "learning_rate": 4.5514113584357873e-05, + "loss": 0.0369, + "step": 9940 + }, + { + "epoch": 10.247167868177137, + "grad_norm": 0.27680081129074097, + "learning_rate": 4.542263221972295e-05, + "loss": 0.0393, + "step": 9950 + }, + { + "epoch": 10.257466529351184, + "grad_norm": 0.2161240130662918, + "learning_rate": 4.5331166304900464e-05, + "loss": 0.042, + "step": 9960 + }, + { + "epoch": 10.267765190525232, + "grad_norm": 0.27455902099609375, + "learning_rate": 4.5239716148611724e-05, + "loss": 0.0434, + "step": 9970 + }, + { + "epoch": 10.27806385169928, + "grad_norm": 0.3013168275356293, + "learning_rate": 4.514828205952495e-05, + "loss": 0.0395, + "step": 9980 + }, + { + "epoch": 10.288362512873327, + "grad_norm": 0.2296813279390335, + "learning_rate": 4.505686434625409e-05, + "loss": 0.0368, + "step": 9990 + }, + { + "epoch": 10.298661174047373, + "grad_norm": 0.19806218147277832, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0391, + "step": 10000 + }, + { + "epoch": 10.30895983522142, + "grad_norm": 0.24850870668888092, + "learning_rate": 4.4874079281338416e-05, + "loss": 0.0407, + "step": 10010 + }, + { + "epoch": 10.319258496395468, + "grad_norm": 0.16531158983707428, + "learning_rate": 4.478271254664097e-05, + "loss": 0.0359, + "step": 10020 + }, + { + "epoch": 10.329557157569516, + "grad_norm": 0.5394207835197449, + "learning_rate": 4.469136342165207e-05, + "loss": 0.0375, + "step": 10030 + }, + { + "epoch": 10.339855818743564, + "grad_norm": 0.4204263687133789, + "learning_rate": 4.460003221469886e-05, + "loss": 0.042, + "step": 10040 + }, + { + "epoch": 10.350154479917611, + "grad_norm": 2.313096284866333, + "learning_rate": 4.450871923404806e-05, + "loss": 0.0465, + "step": 10050 + }, + { + "epoch": 10.360453141091659, + "grad_norm": 0.6360970735549927, + "learning_rate": 4.441742478790481e-05, + "loss": 0.0421, + "step": 10060 + }, + { + "epoch": 10.370751802265705, + "grad_norm": 0.23286186158657074, + "learning_rate": 4.432614918441175e-05, + "loss": 0.0352, + "step": 10070 + }, + { + "epoch": 10.381050463439752, + "grad_norm": 0.3724748194217682, + "learning_rate": 4.4234892731647866e-05, + "loss": 0.0434, + "step": 10080 + }, + { + "epoch": 10.3913491246138, + "grad_norm": 0.212792307138443, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0357, + "step": 10090 + }, + { + "epoch": 10.401647785787848, + "grad_norm": 0.22442536056041718, + "learning_rate": 4.4052438510299515e-05, + "loss": 0.0398, + "step": 10100 + }, + { + "epoch": 10.411946446961895, + "grad_norm": 0.3250674307346344, + "learning_rate": 4.3961241357545706e-05, + "loss": 0.0377, + "step": 10110 + }, + { + "epoch": 10.422245108135943, + "grad_norm": 0.2997426986694336, + "learning_rate": 4.387006458718037e-05, + "loss": 0.0385, + "step": 10120 + }, + { + "epoch": 10.432543769309989, + "grad_norm": 0.26953554153442383, + "learning_rate": 4.377890850694893e-05, + "loss": 0.0352, + "step": 10130 + }, + { + "epoch": 10.442842430484037, + "grad_norm": 0.3824928402900696, + "learning_rate": 4.368777342452697e-05, + "loss": 0.038, + "step": 10140 + }, + { + "epoch": 10.453141091658084, + "grad_norm": 0.33039042353630066, + "learning_rate": 4.35966596475192e-05, + "loss": 0.0354, + "step": 10150 + }, + { + "epoch": 10.463439752832132, + "grad_norm": 0.665787935256958, + "learning_rate": 4.3505567483458456e-05, + "loss": 0.0393, + "step": 10160 + }, + { + "epoch": 10.47373841400618, + "grad_norm": 0.25892671942710876, + "learning_rate": 4.341449723980457e-05, + "loss": 0.0403, + "step": 10170 + }, + { + "epoch": 10.484037075180227, + "grad_norm": 0.8381480574607849, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0403, + "step": 10180 + }, + { + "epoch": 10.494335736354273, + "grad_norm": 0.2520352303981781, + "learning_rate": 4.323242374318586e-05, + "loss": 0.0376, + "step": 10190 + }, + { + "epoch": 10.50463439752832, + "grad_norm": 0.30395472049713135, + "learning_rate": 4.314142110476666e-05, + "loss": 0.039, + "step": 10200 + }, + { + "epoch": 10.514933058702368, + "grad_norm": 0.2134946584701538, + "learning_rate": 4.305044161584352e-05, + "loss": 0.0356, + "step": 10210 + }, + { + "epoch": 10.525231719876416, + "grad_norm": 0.30410531163215637, + "learning_rate": 4.295948558349598e-05, + "loss": 0.0399, + "step": 10220 + }, + { + "epoch": 10.535530381050464, + "grad_norm": 0.3639879524707794, + "learning_rate": 4.2868553314724425e-05, + "loss": 0.0377, + "step": 10230 + }, + { + "epoch": 10.545829042224511, + "grad_norm": 0.7833529114723206, + "learning_rate": 4.2777645116449004e-05, + "loss": 0.042, + "step": 10240 + }, + { + "epoch": 10.556127703398559, + "grad_norm": 0.3496880829334259, + "learning_rate": 4.268676129550869e-05, + "loss": 0.043, + "step": 10250 + }, + { + "epoch": 10.566426364572605, + "grad_norm": 0.24933426082134247, + "learning_rate": 4.2595902158660074e-05, + "loss": 0.0392, + "step": 10260 + }, + { + "epoch": 10.576725025746653, + "grad_norm": 0.35013383626937866, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0403, + "step": 10270 + }, + { + "epoch": 10.5870236869207, + "grad_norm": 0.5155181884765625, + "learning_rate": 4.241425916384699e-05, + "loss": 0.0383, + "step": 10280 + }, + { + "epoch": 10.597322348094748, + "grad_norm": 0.5019784569740295, + "learning_rate": 4.2323475918975075e-05, + "loss": 0.0412, + "step": 10290 + }, + { + "epoch": 10.607621009268795, + "grad_norm": 0.38487544655799866, + "learning_rate": 4.223271858437799e-05, + "loss": 0.0377, + "step": 10300 + }, + { + "epoch": 10.617919670442843, + "grad_norm": 0.2794114947319031, + "learning_rate": 4.21419874663854e-05, + "loss": 0.0398, + "step": 10310 + }, + { + "epoch": 10.628218331616889, + "grad_norm": 0.1784840226173401, + "learning_rate": 4.205128287123858e-05, + "loss": 0.0375, + "step": 10320 + }, + { + "epoch": 10.638516992790937, + "grad_norm": 0.19784130156040192, + "learning_rate": 4.196060510508922e-05, + "loss": 0.0329, + "step": 10330 + }, + { + "epoch": 10.648815653964984, + "grad_norm": 0.25078096985816956, + "learning_rate": 4.186995447399849e-05, + "loss": 0.0305, + "step": 10340 + }, + { + "epoch": 10.659114315139032, + "grad_norm": 0.2800082862377167, + "learning_rate": 4.177933128393594e-05, + "loss": 0.0386, + "step": 10350 + }, + { + "epoch": 10.66941297631308, + "grad_norm": 0.2689889073371887, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0355, + "step": 10360 + }, + { + "epoch": 10.679711637487127, + "grad_norm": 0.26448753476142883, + "learning_rate": 4.159816845030957e-05, + "loss": 0.0357, + "step": 10370 + }, + { + "epoch": 10.690010298661175, + "grad_norm": 0.2718246579170227, + "learning_rate": 4.1507629418217634e-05, + "loss": 0.0339, + "step": 10380 + }, + { + "epoch": 10.70030895983522, + "grad_norm": 0.2607558071613312, + "learning_rate": 4.141711905009566e-05, + "loss": 0.0397, + "step": 10390 + }, + { + "epoch": 10.710607621009268, + "grad_norm": 0.324266254901886, + "learning_rate": 4.132663765143975e-05, + "loss": 0.0355, + "step": 10400 + }, + { + "epoch": 10.720906282183316, + "grad_norm": 0.31110501289367676, + "learning_rate": 4.1236185527648294e-05, + "loss": 0.0389, + "step": 10410 + }, + { + "epoch": 10.731204943357364, + "grad_norm": 0.3010208010673523, + "learning_rate": 4.114576298402084e-05, + "loss": 0.0384, + "step": 10420 + }, + { + "epoch": 10.741503604531411, + "grad_norm": 0.42494192719459534, + "learning_rate": 4.1055370325757106e-05, + "loss": 0.0407, + "step": 10430 + }, + { + "epoch": 10.751802265705459, + "grad_norm": 0.26597830653190613, + "learning_rate": 4.096500785795591e-05, + "loss": 0.0351, + "step": 10440 + }, + { + "epoch": 10.762100926879505, + "grad_norm": 0.3270758092403412, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0351, + "step": 10450 + }, + { + "epoch": 10.772399588053553, + "grad_norm": 0.35372480750083923, + "learning_rate": 4.0784374713626076e-05, + "loss": 0.0431, + "step": 10460 + }, + { + "epoch": 10.7826982492276, + "grad_norm": 0.3251330256462097, + "learning_rate": 4.069410464678148e-05, + "loss": 0.0352, + "step": 10470 + }, + { + "epoch": 10.792996910401648, + "grad_norm": 0.26621249318122864, + "learning_rate": 4.0603865989765504e-05, + "loss": 0.0432, + "step": 10480 + }, + { + "epoch": 10.803295571575696, + "grad_norm": 0.3128867745399475, + "learning_rate": 4.05136590471572e-05, + "loss": 0.0412, + "step": 10490 + }, + { + "epoch": 10.813594232749743, + "grad_norm": 0.20734545588493347, + "learning_rate": 4.042348412342861e-05, + "loss": 0.0352, + "step": 10500 + }, + { + "epoch": 10.82389289392379, + "grad_norm": 0.3195039629936218, + "learning_rate": 4.0333341522943614e-05, + "loss": 0.0374, + "step": 10510 + }, + { + "epoch": 10.834191555097837, + "grad_norm": 0.27724260091781616, + "learning_rate": 4.024323154995708e-05, + "loss": 0.0405, + "step": 10520 + }, + { + "epoch": 10.844490216271884, + "grad_norm": 0.2909531593322754, + "learning_rate": 4.015315450861371e-05, + "loss": 0.0364, + "step": 10530 + }, + { + "epoch": 10.854788877445932, + "grad_norm": 0.28578925132751465, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0354, + "step": 10540 + }, + { + "epoch": 10.86508753861998, + "grad_norm": 0.2503175437450409, + "learning_rate": 3.997310043687842e-05, + "loss": 0.0348, + "step": 10550 + }, + { + "epoch": 10.875386199794027, + "grad_norm": 0.36039701104164124, + "learning_rate": 3.988312401421609e-05, + "loss": 0.0414, + "step": 10560 + }, + { + "epoch": 10.885684860968075, + "grad_norm": 0.45128464698791504, + "learning_rate": 3.979318173865393e-05, + "loss": 0.04, + "step": 10570 + }, + { + "epoch": 10.89598352214212, + "grad_norm": 0.35974377393722534, + "learning_rate": 3.970327391377064e-05, + "loss": 0.0392, + "step": 10580 + }, + { + "epoch": 10.906282183316168, + "grad_norm": 0.22907008230686188, + "learning_rate": 3.9613400843028666e-05, + "loss": 0.0342, + "step": 10590 + }, + { + "epoch": 10.916580844490216, + "grad_norm": 0.3276582956314087, + "learning_rate": 3.9523562829773036e-05, + "loss": 0.043, + "step": 10600 + }, + { + "epoch": 10.926879505664264, + "grad_norm": 0.27974191308021545, + "learning_rate": 3.943376017723057e-05, + "loss": 0.0357, + "step": 10610 + }, + { + "epoch": 10.937178166838311, + "grad_norm": 0.3858673572540283, + "learning_rate": 3.934399318850868e-05, + "loss": 0.0369, + "step": 10620 + }, + { + "epoch": 10.947476828012359, + "grad_norm": 0.29965823888778687, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0369, + "step": 10630 + }, + { + "epoch": 10.957775489186405, + "grad_norm": 0.3583829998970032, + "learning_rate": 3.916456741435336e-05, + "loss": 0.0425, + "step": 10640 + }, + { + "epoch": 10.968074150360453, + "grad_norm": 0.27793335914611816, + "learning_rate": 3.9074909234528826e-05, + "loss": 0.0399, + "step": 10650 + }, + { + "epoch": 10.9783728115345, + "grad_norm": 0.24120087921619415, + "learning_rate": 3.898528792974056e-05, + "loss": 0.0403, + "step": 10660 + }, + { + "epoch": 10.988671472708548, + "grad_norm": 0.22013327479362488, + "learning_rate": 3.8895703802483916e-05, + "loss": 0.034, + "step": 10670 + }, + { + "epoch": 10.998970133882596, + "grad_norm": 0.2588166296482086, + "learning_rate": 3.880615715512868e-05, + "loss": 0.0316, + "step": 10680 + }, + { + "epoch": 11.009268795056643, + "grad_norm": 0.2514420449733734, + "learning_rate": 3.871664828991822e-05, + "loss": 0.0383, + "step": 10690 + }, + { + "epoch": 11.019567456230691, + "grad_norm": 0.3404804468154907, + "learning_rate": 3.862717750896837e-05, + "loss": 0.0352, + "step": 10700 + }, + { + "epoch": 11.029866117404737, + "grad_norm": 0.9497872591018677, + "learning_rate": 3.853774511426634e-05, + "loss": 0.0366, + "step": 10710 + }, + { + "epoch": 11.040164778578784, + "grad_norm": 0.28247174620628357, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0473, + "step": 10720 + }, + { + "epoch": 11.050463439752832, + "grad_norm": 0.28879600763320923, + "learning_rate": 3.83589966909061e-05, + "loss": 0.0344, + "step": 10730 + }, + { + "epoch": 11.06076210092688, + "grad_norm": 0.23894581198692322, + "learning_rate": 3.82696812655705e-05, + "loss": 0.0349, + "step": 10740 + }, + { + "epoch": 11.071060762100927, + "grad_norm": 0.26289770007133484, + "learning_rate": 3.818040543312598e-05, + "loss": 0.0384, + "step": 10750 + }, + { + "epoch": 11.081359423274975, + "grad_norm": 0.33045023679733276, + "learning_rate": 3.809116949490184e-05, + "loss": 0.0331, + "step": 10760 + }, + { + "epoch": 11.091658084449021, + "grad_norm": 0.46705836057662964, + "learning_rate": 3.8001973752092655e-05, + "loss": 0.0386, + "step": 10770 + }, + { + "epoch": 11.101956745623069, + "grad_norm": 0.5863741040229797, + "learning_rate": 3.791281850575737e-05, + "loss": 0.0415, + "step": 10780 + }, + { + "epoch": 11.112255406797116, + "grad_norm": 0.24471549689769745, + "learning_rate": 3.782370405681828e-05, + "loss": 0.0372, + "step": 10790 + }, + { + "epoch": 11.122554067971164, + "grad_norm": 0.3259426951408386, + "learning_rate": 3.773463070605987e-05, + "loss": 0.043, + "step": 10800 + }, + { + "epoch": 11.132852729145212, + "grad_norm": 0.2583596408367157, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0354, + "step": 10810 + }, + { + "epoch": 11.14315139031926, + "grad_norm": 0.46032634377479553, + "learning_rate": 3.7556608501528846e-05, + "loss": 0.0393, + "step": 10820 + }, + { + "epoch": 11.153450051493305, + "grad_norm": 0.38069912791252136, + "learning_rate": 3.7467660248627654e-05, + "loss": 0.0398, + "step": 10830 + }, + { + "epoch": 11.163748712667353, + "grad_norm": 0.28435567021369934, + "learning_rate": 3.737875429564807e-05, + "loss": 0.0388, + "step": 10840 + }, + { + "epoch": 11.1740473738414, + "grad_norm": 0.34043052792549133, + "learning_rate": 3.7289890942670946e-05, + "loss": 0.0296, + "step": 10850 + }, + { + "epoch": 11.184346035015448, + "grad_norm": 0.3213551938533783, + "learning_rate": 3.720107048963327e-05, + "loss": 0.0296, + "step": 10860 + }, + { + "epoch": 11.194644696189496, + "grad_norm": 0.45642250776290894, + "learning_rate": 3.711229323632732e-05, + "loss": 0.0347, + "step": 10870 + }, + { + "epoch": 11.204943357363543, + "grad_norm": 0.29973405599594116, + "learning_rate": 3.70235594823995e-05, + "loss": 0.036, + "step": 10880 + }, + { + "epoch": 11.215242018537591, + "grad_norm": 0.2634925842285156, + "learning_rate": 3.693486952734941e-05, + "loss": 0.0337, + "step": 10890 + }, + { + "epoch": 11.225540679711637, + "grad_norm": 0.25237777829170227, + "learning_rate": 3.684622367052887e-05, + "loss": 0.0347, + "step": 10900 + }, + { + "epoch": 11.235839340885684, + "grad_norm": 0.20709861814975739, + "learning_rate": 3.675762221114077e-05, + "loss": 0.0305, + "step": 10910 + }, + { + "epoch": 11.246138002059732, + "grad_norm": 0.14299030601978302, + "learning_rate": 3.66690654482382e-05, + "loss": 0.0334, + "step": 10920 + }, + { + "epoch": 11.25643666323378, + "grad_norm": 0.2454812377691269, + "learning_rate": 3.658055368072339e-05, + "loss": 0.0375, + "step": 10930 + }, + { + "epoch": 11.266735324407827, + "grad_norm": 0.2894679307937622, + "learning_rate": 3.6492087207346666e-05, + "loss": 0.0416, + "step": 10940 + }, + { + "epoch": 11.277033985581875, + "grad_norm": 0.2871219217777252, + "learning_rate": 3.640366632670549e-05, + "loss": 0.034, + "step": 10950 + }, + { + "epoch": 11.287332646755921, + "grad_norm": 0.30559393763542175, + "learning_rate": 3.631529133724348e-05, + "loss": 0.0369, + "step": 10960 + }, + { + "epoch": 11.297631307929969, + "grad_norm": 0.35164326429367065, + "learning_rate": 3.622696253724927e-05, + "loss": 0.035, + "step": 10970 + }, + { + "epoch": 11.307929969104016, + "grad_norm": 0.27396318316459656, + "learning_rate": 3.613868022485566e-05, + "loss": 0.0389, + "step": 10980 + }, + { + "epoch": 11.318228630278064, + "grad_norm": 0.27721869945526123, + "learning_rate": 3.605044469803854e-05, + "loss": 0.0365, + "step": 10990 + }, + { + "epoch": 11.328527291452112, + "grad_norm": 0.2726707458496094, + "learning_rate": 3.5962256254615853e-05, + "loss": 0.0382, + "step": 11000 + }, + { + "epoch": 11.33882595262616, + "grad_norm": 0.3522757589817047, + "learning_rate": 3.587411519224665e-05, + "loss": 0.0432, + "step": 11010 + }, + { + "epoch": 11.349124613800207, + "grad_norm": 0.2744219899177551, + "learning_rate": 3.5786021808430054e-05, + "loss": 0.0328, + "step": 11020 + }, + { + "epoch": 11.359423274974253, + "grad_norm": 0.36627647280693054, + "learning_rate": 3.569797640050423e-05, + "loss": 0.0407, + "step": 11030 + }, + { + "epoch": 11.3697219361483, + "grad_norm": 0.20793434977531433, + "learning_rate": 3.560997926564545e-05, + "loss": 0.0284, + "step": 11040 + }, + { + "epoch": 11.380020597322348, + "grad_norm": 0.23446743190288544, + "learning_rate": 3.552203070086707e-05, + "loss": 0.0355, + "step": 11050 + }, + { + "epoch": 11.390319258496396, + "grad_norm": 0.48527511954307556, + "learning_rate": 3.543413100301843e-05, + "loss": 0.0378, + "step": 11060 + }, + { + "epoch": 11.400617919670443, + "grad_norm": 0.39768174290657043, + "learning_rate": 3.534628046878403e-05, + "loss": 0.0329, + "step": 11070 + }, + { + "epoch": 11.410916580844491, + "grad_norm": 0.19781740009784698, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0371, + "step": 11080 + }, + { + "epoch": 11.421215242018537, + "grad_norm": 0.2503238022327423, + "learning_rate": 3.517072807706492e-05, + "loss": 0.0363, + "step": 11090 + }, + { + "epoch": 11.431513903192585, + "grad_norm": 0.3444472849369049, + "learning_rate": 3.508302681211546e-05, + "loss": 0.0343, + "step": 11100 + }, + { + "epoch": 11.441812564366632, + "grad_norm": 0.3007254898548126, + "learning_rate": 3.499537589584859e-05, + "loss": 0.0441, + "step": 11110 + }, + { + "epoch": 11.45211122554068, + "grad_norm": 0.38914212584495544, + "learning_rate": 3.490777562410907e-05, + "loss": 0.0331, + "step": 11120 + }, + { + "epoch": 11.462409886714727, + "grad_norm": 0.3051401674747467, + "learning_rate": 3.482022629257074e-05, + "loss": 0.0328, + "step": 11130 + }, + { + "epoch": 11.472708547888775, + "grad_norm": 0.306740403175354, + "learning_rate": 3.473272819673542e-05, + "loss": 0.039, + "step": 11140 + }, + { + "epoch": 11.483007209062821, + "grad_norm": 0.42291760444641113, + "learning_rate": 3.4645281631932074e-05, + "loss": 0.0526, + "step": 11150 + }, + { + "epoch": 11.493305870236869, + "grad_norm": 0.2984221577644348, + "learning_rate": 3.455788689331574e-05, + "loss": 0.0345, + "step": 11160 + }, + { + "epoch": 11.503604531410916, + "grad_norm": 0.19411993026733398, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0384, + "step": 11170 + }, + { + "epoch": 11.513903192584964, + "grad_norm": 0.3595150113105774, + "learning_rate": 3.438325407438837e-05, + "loss": 0.0358, + "step": 11180 + }, + { + "epoch": 11.524201853759012, + "grad_norm": 0.289594829082489, + "learning_rate": 3.4296016583508775e-05, + "loss": 0.0314, + "step": 11190 + }, + { + "epoch": 11.53450051493306, + "grad_norm": 0.3801267743110657, + "learning_rate": 3.420883209767697e-05, + "loss": 0.0453, + "step": 11200 + }, + { + "epoch": 11.544799176107105, + "grad_norm": 0.45930567383766174, + "learning_rate": 3.4121700911163366e-05, + "loss": 0.0418, + "step": 11210 + }, + { + "epoch": 11.555097837281153, + "grad_norm": 0.2295006662607193, + "learning_rate": 3.403462331805852e-05, + "loss": 0.0378, + "step": 11220 + }, + { + "epoch": 11.5653964984552, + "grad_norm": 0.38683414459228516, + "learning_rate": 3.394759961227202e-05, + "loss": 0.038, + "step": 11230 + }, + { + "epoch": 11.575695159629248, + "grad_norm": 0.32741764187812805, + "learning_rate": 3.386063008753164e-05, + "loss": 0.0403, + "step": 11240 + }, + { + "epoch": 11.585993820803296, + "grad_norm": 0.3826991319656372, + "learning_rate": 3.377371503738227e-05, + "loss": 0.0408, + "step": 11250 + }, + { + "epoch": 11.596292481977343, + "grad_norm": 0.5855404138565063, + "learning_rate": 3.368685475518488e-05, + "loss": 0.0343, + "step": 11260 + }, + { + "epoch": 11.606591143151391, + "grad_norm": 0.30145469307899475, + "learning_rate": 3.360004953411566e-05, + "loss": 0.0292, + "step": 11270 + }, + { + "epoch": 11.616889804325437, + "grad_norm": 1.2090197801589966, + "learning_rate": 3.3513299667164864e-05, + "loss": 0.0298, + "step": 11280 + }, + { + "epoch": 11.627188465499485, + "grad_norm": 0.7051903009414673, + "learning_rate": 3.3426605447136004e-05, + "loss": 0.0366, + "step": 11290 + }, + { + "epoch": 11.637487126673532, + "grad_norm": 0.3094668984413147, + "learning_rate": 3.3339967166644726e-05, + "loss": 0.0378, + "step": 11300 + }, + { + "epoch": 11.64778578784758, + "grad_norm": 0.3277672231197357, + "learning_rate": 3.325338511811784e-05, + "loss": 0.0407, + "step": 11310 + }, + { + "epoch": 11.658084449021628, + "grad_norm": 0.27167952060699463, + "learning_rate": 3.316685959379241e-05, + "loss": 0.0377, + "step": 11320 + }, + { + "epoch": 11.668383110195675, + "grad_norm": 0.5050401091575623, + "learning_rate": 3.308039088571469e-05, + "loss": 0.039, + "step": 11330 + }, + { + "epoch": 11.678681771369721, + "grad_norm": 0.23651434481143951, + "learning_rate": 3.2993979285739143e-05, + "loss": 0.0339, + "step": 11340 + }, + { + "epoch": 11.688980432543769, + "grad_norm": 0.3040764331817627, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0351, + "step": 11350 + }, + { + "epoch": 11.699279093717816, + "grad_norm": 0.23311540484428406, + "learning_rate": 3.28213285765478e-05, + "loss": 0.0347, + "step": 11360 + }, + { + "epoch": 11.709577754891864, + "grad_norm": 0.21837526559829712, + "learning_rate": 3.273509005007327e-05, + "loss": 0.0397, + "step": 11370 + }, + { + "epoch": 11.719876416065912, + "grad_norm": 0.24095067381858826, + "learning_rate": 3.264890979718147e-05, + "loss": 0.0335, + "step": 11380 + }, + { + "epoch": 11.73017507723996, + "grad_norm": 0.4714142680168152, + "learning_rate": 3.256278810875332e-05, + "loss": 0.0355, + "step": 11390 + }, + { + "epoch": 11.740473738414007, + "grad_norm": 0.3001396059989929, + "learning_rate": 3.247672527547197e-05, + "loss": 0.0311, + "step": 11400 + }, + { + "epoch": 11.750772399588053, + "grad_norm": 0.2514890730381012, + "learning_rate": 3.239072158782198e-05, + "loss": 0.0374, + "step": 11410 + }, + { + "epoch": 11.7610710607621, + "grad_norm": 0.22603774070739746, + "learning_rate": 3.230477733608831e-05, + "loss": 0.0368, + "step": 11420 + }, + { + "epoch": 11.771369721936148, + "grad_norm": 0.22810235619544983, + "learning_rate": 3.221889281035522e-05, + "loss": 0.0331, + "step": 11430 + }, + { + "epoch": 11.781668383110196, + "grad_norm": 0.18763025104999542, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0328, + "step": 11440 + }, + { + "epoch": 11.791967044284243, + "grad_norm": 0.32261693477630615, + "learning_rate": 3.204730409621917e-05, + "loss": 0.0408, + "step": 11450 + }, + { + "epoch": 11.802265705458291, + "grad_norm": 0.27985504269599915, + "learning_rate": 3.196160048697293e-05, + "loss": 0.0415, + "step": 11460 + }, + { + "epoch": 11.812564366632337, + "grad_norm": 0.28317996859550476, + "learning_rate": 3.187595776203886e-05, + "loss": 0.0413, + "step": 11470 + }, + { + "epoch": 11.822863027806385, + "grad_norm": 0.2768697440624237, + "learning_rate": 3.1790376210483494e-05, + "loss": 0.0433, + "step": 11480 + }, + { + "epoch": 11.833161688980432, + "grad_norm": 0.27718645334243774, + "learning_rate": 3.170485612116697e-05, + "loss": 0.028, + "step": 11490 + }, + { + "epoch": 11.84346035015448, + "grad_norm": 0.27956560254096985, + "learning_rate": 3.161939778274191e-05, + "loss": 0.0318, + "step": 11500 + }, + { + "epoch": 11.853759011328528, + "grad_norm": 0.25807636976242065, + "learning_rate": 3.1534001483652556e-05, + "loss": 0.0439, + "step": 11510 + }, + { + "epoch": 11.864057672502575, + "grad_norm": 0.6703087687492371, + "learning_rate": 3.14486675121337e-05, + "loss": 0.0298, + "step": 11520 + }, + { + "epoch": 11.874356333676623, + "grad_norm": 0.46335524320602417, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0481, + "step": 11530 + }, + { + "epoch": 11.884654994850669, + "grad_norm": 0.250967800617218, + "learning_rate": 3.127818770369406e-05, + "loss": 0.0337, + "step": 11540 + }, + { + "epoch": 11.894953656024716, + "grad_norm": 0.2240300476551056, + "learning_rate": 3.119304244218715e-05, + "loss": 0.0327, + "step": 11550 + }, + { + "epoch": 11.905252317198764, + "grad_norm": 0.2884691655635834, + "learning_rate": 3.110796065907665e-05, + "loss": 0.0363, + "step": 11560 + }, + { + "epoch": 11.915550978372812, + "grad_norm": 0.28418871760368347, + "learning_rate": 3.102294264153577e-05, + "loss": 0.0325, + "step": 11570 + }, + { + "epoch": 11.92584963954686, + "grad_norm": 0.2494005262851715, + "learning_rate": 3.093798867652257e-05, + "loss": 0.0358, + "step": 11580 + }, + { + "epoch": 11.936148300720907, + "grad_norm": 0.43249595165252686, + "learning_rate": 3.0853099050778854e-05, + "loss": 0.0361, + "step": 11590 + }, + { + "epoch": 11.946446961894953, + "grad_norm": 0.32216548919677734, + "learning_rate": 3.0768274050829306e-05, + "loss": 0.0359, + "step": 11600 + }, + { + "epoch": 11.956745623069, + "grad_norm": 0.3839482069015503, + "learning_rate": 3.0683513962980456e-05, + "loss": 0.0338, + "step": 11610 + }, + { + "epoch": 11.967044284243048, + "grad_norm": 0.25899192690849304, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0326, + "step": 11620 + }, + { + "epoch": 11.977342945417096, + "grad_norm": 0.2512173652648926, + "learning_rate": 3.0514189667714632e-05, + "loss": 0.0352, + "step": 11630 + }, + { + "epoch": 11.987641606591144, + "grad_norm": 0.43213722109794617, + "learning_rate": 3.042962603181138e-05, + "loss": 0.0395, + "step": 11640 + }, + { + "epoch": 11.997940267765191, + "grad_norm": 0.25386422872543335, + "learning_rate": 3.034512845103441e-05, + "loss": 0.0314, + "step": 11650 + }, + { + "epoch": 12.008238928939237, + "grad_norm": 0.35718950629234314, + "learning_rate": 3.0260697210585108e-05, + "loss": 0.0371, + "step": 11660 + }, + { + "epoch": 12.018537590113285, + "grad_norm": 0.29993295669555664, + "learning_rate": 3.017633259544101e-05, + "loss": 0.035, + "step": 11670 + }, + { + "epoch": 12.028836251287332, + "grad_norm": 0.3331249952316284, + "learning_rate": 3.0092034890354694e-05, + "loss": 0.0406, + "step": 11680 + }, + { + "epoch": 12.03913491246138, + "grad_norm": 0.22086752951145172, + "learning_rate": 3.0007804379852977e-05, + "loss": 0.0252, + "step": 11690 + }, + { + "epoch": 12.049433573635428, + "grad_norm": 0.22861167788505554, + "learning_rate": 2.9923641348235843e-05, + "loss": 0.0426, + "step": 11700 + }, + { + "epoch": 12.059732234809475, + "grad_norm": 0.26923444867134094, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0454, + "step": 11710 + }, + { + "epoch": 12.070030895983523, + "grad_norm": 0.23918205499649048, + "learning_rate": 2.9755518857715448e-05, + "loss": 0.0402, + "step": 11720 + }, + { + "epoch": 12.080329557157569, + "grad_norm": 0.23139654099941254, + "learning_rate": 2.967155996626956e-05, + "loss": 0.0303, + "step": 11730 + }, + { + "epoch": 12.090628218331616, + "grad_norm": 0.38359567523002625, + "learning_rate": 2.9587669688620988e-05, + "loss": 0.0398, + "step": 11740 + }, + { + "epoch": 12.100926879505664, + "grad_norm": 0.23274274170398712, + "learning_rate": 2.950384830792136e-05, + "loss": 0.0283, + "step": 11750 + }, + { + "epoch": 12.111225540679712, + "grad_norm": 0.29843324422836304, + "learning_rate": 2.942009610708976e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 12.12152420185376, + "grad_norm": 0.2866639494895935, + "learning_rate": 2.9336413368811723e-05, + "loss": 0.0325, + "step": 11770 + }, + { + "epoch": 12.131822863027807, + "grad_norm": 0.3042534589767456, + "learning_rate": 2.9252800375538368e-05, + "loss": 0.0355, + "step": 11780 + }, + { + "epoch": 12.142121524201853, + "grad_norm": 0.2678833305835724, + "learning_rate": 2.9169257409485418e-05, + "loss": 0.0329, + "step": 11790 + }, + { + "epoch": 12.1524201853759, + "grad_norm": 0.19894133508205414, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.0383, + "step": 11800 + }, + { + "epoch": 12.162718846549948, + "grad_norm": 0.19369176030158997, + "learning_rate": 2.9002382686720676e-05, + "loss": 0.0303, + "step": 11810 + }, + { + "epoch": 12.173017507723996, + "grad_norm": 0.23142315447330475, + "learning_rate": 2.8919051493254724e-05, + "loss": 0.0404, + "step": 11820 + }, + { + "epoch": 12.183316168898044, + "grad_norm": 0.2168169468641281, + "learning_rate": 2.883579145349884e-05, + "loss": 0.0352, + "step": 11830 + }, + { + "epoch": 12.193614830072091, + "grad_norm": 0.27123361825942993, + "learning_rate": 2.8752602848477432e-05, + "loss": 0.0358, + "step": 11840 + }, + { + "epoch": 12.203913491246137, + "grad_norm": 1.34294593334198, + "learning_rate": 2.8669485958973775e-05, + "loss": 0.0336, + "step": 11850 + }, + { + "epoch": 12.214212152420185, + "grad_norm": 0.35292431712150574, + "learning_rate": 2.858644106552909e-05, + "loss": 0.0356, + "step": 11860 + }, + { + "epoch": 12.224510813594232, + "grad_norm": 0.5437068939208984, + "learning_rate": 2.850346844844157e-05, + "loss": 0.04, + "step": 11870 + }, + { + "epoch": 12.23480947476828, + "grad_norm": 0.7077152729034424, + "learning_rate": 2.8420568387765557e-05, + "loss": 0.0381, + "step": 11880 + }, + { + "epoch": 12.245108135942328, + "grad_norm": 1.2102924585342407, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0316, + "step": 11890 + }, + { + "epoch": 12.255406797116375, + "grad_norm": 0.22898398339748383, + "learning_rate": 2.825498705463947e-05, + "loss": 0.0355, + "step": 11900 + }, + { + "epoch": 12.265705458290423, + "grad_norm": 0.16343450546264648, + "learning_rate": 2.8172306341069672e-05, + "loss": 0.0333, + "step": 11910 + }, + { + "epoch": 12.276004119464469, + "grad_norm": 0.2778915762901306, + "learning_rate": 2.8089699301670002e-05, + "loss": 0.034, + "step": 11920 + }, + { + "epoch": 12.286302780638517, + "grad_norm": 0.2954021096229553, + "learning_rate": 2.800716621526078e-05, + "loss": 0.03, + "step": 11930 + }, + { + "epoch": 12.296601441812564, + "grad_norm": 0.18878135085105896, + "learning_rate": 2.7924707360412746e-05, + "loss": 0.0322, + "step": 11940 + }, + { + "epoch": 12.306900102986612, + "grad_norm": 0.25053462386131287, + "learning_rate": 2.7842323015446082e-05, + "loss": 0.0376, + "step": 11950 + }, + { + "epoch": 12.31719876416066, + "grad_norm": 0.21085461974143982, + "learning_rate": 2.7760013458429475e-05, + "loss": 0.0333, + "step": 11960 + }, + { + "epoch": 12.327497425334707, + "grad_norm": 0.27033373713493347, + "learning_rate": 2.767777896717919e-05, + "loss": 0.0387, + "step": 11970 + }, + { + "epoch": 12.337796086508753, + "grad_norm": 0.2603791356086731, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0336, + "step": 11980 + }, + { + "epoch": 12.3480947476828, + "grad_norm": 0.2735675573348999, + "learning_rate": 2.7513536291974895e-05, + "loss": 0.0367, + "step": 11990 + }, + { + "epoch": 12.358393408856848, + "grad_norm": 0.2710510790348053, + "learning_rate": 2.743152866238281e-05, + "loss": 0.0359, + "step": 12000 + }, + { + "epoch": 12.368692070030896, + "grad_norm": 0.3120410144329071, + "learning_rate": 2.7349597207279088e-05, + "loss": 0.0353, + "step": 12010 + }, + { + "epoch": 12.378990731204944, + "grad_norm": 1.238741159439087, + "learning_rate": 2.7267742203203795e-05, + "loss": 0.0328, + "step": 12020 + }, + { + "epoch": 12.389289392378991, + "grad_norm": 0.24720178544521332, + "learning_rate": 2.718596392643895e-05, + "loss": 0.035, + "step": 12030 + }, + { + "epoch": 12.399588053553039, + "grad_norm": 0.5230728387832642, + "learning_rate": 2.7104262653007616e-05, + "loss": 0.0385, + "step": 12040 + }, + { + "epoch": 12.409886714727085, + "grad_norm": 0.30197054147720337, + "learning_rate": 2.7022638658672933e-05, + "loss": 0.0378, + "step": 12050 + }, + { + "epoch": 12.420185375901132, + "grad_norm": 0.35036417841911316, + "learning_rate": 2.6941092218937214e-05, + "loss": 0.0316, + "step": 12060 + }, + { + "epoch": 12.43048403707518, + "grad_norm": 0.1900859922170639, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0416, + "step": 12070 + }, + { + "epoch": 12.440782698249228, + "grad_norm": 0.3137092888355255, + "learning_rate": 2.6778233103962158e-05, + "loss": 0.0347, + "step": 12080 + }, + { + "epoch": 12.451081359423275, + "grad_norm": 0.2586371600627899, + "learning_rate": 2.6696920978414862e-05, + "loss": 0.0313, + "step": 12090 + }, + { + "epoch": 12.461380020597323, + "grad_norm": 0.22871264815330505, + "learning_rate": 2.6615687506848864e-05, + "loss": 0.0384, + "step": 12100 + }, + { + "epoch": 12.471678681771369, + "grad_norm": 0.500694751739502, + "learning_rate": 2.6534532963448274e-05, + "loss": 0.0365, + "step": 12110 + }, + { + "epoch": 12.481977342945417, + "grad_norm": 0.23115640878677368, + "learning_rate": 2.645345762213094e-05, + "loss": 0.0359, + "step": 12120 + }, + { + "epoch": 12.492276004119464, + "grad_norm": 0.27199363708496094, + "learning_rate": 2.6372461756547306e-05, + "loss": 0.0367, + "step": 12130 + }, + { + "epoch": 12.502574665293512, + "grad_norm": 0.4970080256462097, + "learning_rate": 2.6291545640079583e-05, + "loss": 0.038, + "step": 12140 + }, + { + "epoch": 12.51287332646756, + "grad_norm": 0.31872427463531494, + "learning_rate": 2.6210709545840816e-05, + "loss": 0.0349, + "step": 12150 + }, + { + "epoch": 12.523171987641607, + "grad_norm": 0.543602705001831, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0456, + "step": 12160 + }, + { + "epoch": 12.533470648815655, + "grad_norm": 0.24425791203975677, + "learning_rate": 2.6049278515150888e-05, + "loss": 0.0343, + "step": 12170 + }, + { + "epoch": 12.5437693099897, + "grad_norm": 0.32970938086509705, + "learning_rate": 2.5968684123571625e-05, + "loss": 0.0358, + "step": 12180 + }, + { + "epoch": 12.554067971163748, + "grad_norm": 0.24140028655529022, + "learning_rate": 2.5888170843963332e-05, + "loss": 0.0415, + "step": 12190 + }, + { + "epoch": 12.564366632337796, + "grad_norm": 0.1907021552324295, + "learning_rate": 2.5807738948079307e-05, + "loss": 0.0332, + "step": 12200 + }, + { + "epoch": 12.574665293511844, + "grad_norm": 0.2994469404220581, + "learning_rate": 2.572738870739827e-05, + "loss": 0.0332, + "step": 12210 + }, + { + "epoch": 12.584963954685891, + "grad_norm": 0.3281172811985016, + "learning_rate": 2.5647120393123246e-05, + "loss": 0.0355, + "step": 12220 + }, + { + "epoch": 12.595262615859939, + "grad_norm": 0.222566619515419, + "learning_rate": 2.5566934276180792e-05, + "loss": 0.0299, + "step": 12230 + }, + { + "epoch": 12.605561277033985, + "grad_norm": 0.38741955161094666, + "learning_rate": 2.5486830627219993e-05, + "loss": 0.0369, + "step": 12240 + }, + { + "epoch": 12.615859938208033, + "grad_norm": 0.24740222096443176, + "learning_rate": 2.540680971661161e-05, + "loss": 0.034, + "step": 12250 + }, + { + "epoch": 12.62615859938208, + "grad_norm": 0.2917155623435974, + "learning_rate": 2.5326871814447116e-05, + "loss": 0.0325, + "step": 12260 + }, + { + "epoch": 12.636457260556128, + "grad_norm": 0.3306695818901062, + "learning_rate": 2.5247017190537802e-05, + "loss": 0.0314, + "step": 12270 + }, + { + "epoch": 12.646755921730175, + "grad_norm": 0.3189143240451813, + "learning_rate": 2.5167246114413956e-05, + "loss": 0.0406, + "step": 12280 + }, + { + "epoch": 12.657054582904223, + "grad_norm": 0.27937018871307373, + "learning_rate": 2.5087558855323718e-05, + "loss": 0.037, + "step": 12290 + }, + { + "epoch": 12.667353244078269, + "grad_norm": 0.23929426074028015, + "learning_rate": 2.5007955682232498e-05, + "loss": 0.0366, + "step": 12300 + }, + { + "epoch": 12.677651905252317, + "grad_norm": 0.38764917850494385, + "learning_rate": 2.4928436863821725e-05, + "loss": 0.0357, + "step": 12310 + }, + { + "epoch": 12.687950566426364, + "grad_norm": 0.22392131388187408, + "learning_rate": 2.4849002668488245e-05, + "loss": 0.031, + "step": 12320 + }, + { + "epoch": 12.698249227600412, + "grad_norm": 0.35927116870880127, + "learning_rate": 2.4769653364343222e-05, + "loss": 0.0355, + "step": 12330 + }, + { + "epoch": 12.70854788877446, + "grad_norm": 0.3391915261745453, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0346, + "step": 12340 + }, + { + "epoch": 12.718846549948507, + "grad_norm": 0.21950756013393402, + "learning_rate": 2.4611210500629618e-05, + "loss": 0.0339, + "step": 12350 + }, + { + "epoch": 12.729145211122553, + "grad_norm": 0.22874067723751068, + "learning_rate": 2.453211747584711e-05, + "loss": 0.0347, + "step": 12360 + }, + { + "epoch": 12.7394438722966, + "grad_norm": 0.5297624468803406, + "learning_rate": 2.4453110411823382e-05, + "loss": 0.0308, + "step": 12370 + }, + { + "epoch": 12.749742533470648, + "grad_norm": 0.31514862179756165, + "learning_rate": 2.4374189575227902e-05, + "loss": 0.032, + "step": 12380 + }, + { + "epoch": 12.760041194644696, + "grad_norm": 0.26266971230506897, + "learning_rate": 2.429535523243917e-05, + "loss": 0.0357, + "step": 12390 + }, + { + "epoch": 12.770339855818744, + "grad_norm": 0.18397288024425507, + "learning_rate": 2.4216607649543628e-05, + "loss": 0.0307, + "step": 12400 + }, + { + "epoch": 12.780638516992791, + "grad_norm": 0.26537027955055237, + "learning_rate": 2.4137947092334994e-05, + "loss": 0.0363, + "step": 12410 + }, + { + "epoch": 12.790937178166839, + "grad_norm": 0.28661102056503296, + "learning_rate": 2.4059373826313185e-05, + "loss": 0.0306, + "step": 12420 + }, + { + "epoch": 12.801235839340885, + "grad_norm": 0.26964297890663147, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0324, + "step": 12430 + }, + { + "epoch": 12.811534500514933, + "grad_norm": 0.2776640057563782, + "learning_rate": 2.3902490228355756e-05, + "loss": 0.0329, + "step": 12440 + }, + { + "epoch": 12.82183316168898, + "grad_norm": 0.4814803898334503, + "learning_rate": 2.3824180425943277e-05, + "loss": 0.0303, + "step": 12450 + }, + { + "epoch": 12.832131822863028, + "grad_norm": 0.22867955267429352, + "learning_rate": 2.374595897376211e-05, + "loss": 0.0288, + "step": 12460 + }, + { + "epoch": 12.842430484037076, + "grad_norm": 0.21567359566688538, + "learning_rate": 2.366782613583009e-05, + "loss": 0.0325, + "step": 12470 + }, + { + "epoch": 12.852729145211123, + "grad_norm": 0.290703684091568, + "learning_rate": 2.3589782175866015e-05, + "loss": 0.0298, + "step": 12480 + }, + { + "epoch": 12.863027806385169, + "grad_norm": 0.3255325257778168, + "learning_rate": 2.3511827357288575e-05, + "loss": 0.0363, + "step": 12490 + }, + { + "epoch": 12.873326467559217, + "grad_norm": 0.44946736097335815, + "learning_rate": 2.343396194321572e-05, + "loss": 0.0332, + "step": 12500 + }, + { + "epoch": 12.883625128733264, + "grad_norm": 0.25294211506843567, + "learning_rate": 2.33561861964635e-05, + "loss": 0.0348, + "step": 12510 + }, + { + "epoch": 12.893923789907312, + "grad_norm": 0.18743322789669037, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.0336, + "step": 12520 + }, + { + "epoch": 12.90422245108136, + "grad_norm": 0.16629280149936676, + "learning_rate": 2.3200904754671453e-05, + "loss": 0.0381, + "step": 12530 + }, + { + "epoch": 12.914521112255407, + "grad_norm": 0.1841958910226822, + "learning_rate": 2.312339958374705e-05, + "loss": 0.0273, + "step": 12540 + }, + { + "epoch": 12.924819773429455, + "grad_norm": 0.3820919096469879, + "learning_rate": 2.3045985128372442e-05, + "loss": 0.0354, + "step": 12550 + }, + { + "epoch": 12.9351184346035, + "grad_norm": 0.22891731560230255, + "learning_rate": 2.2968661649841643e-05, + "loss": 0.0393, + "step": 12560 + }, + { + "epoch": 12.945417095777549, + "grad_norm": 0.21805356442928314, + "learning_rate": 2.2891429409141594e-05, + "loss": 0.0312, + "step": 12570 + }, + { + "epoch": 12.955715756951596, + "grad_norm": 0.29530712962150574, + "learning_rate": 2.281428866695128e-05, + "loss": 0.034, + "step": 12580 + }, + { + "epoch": 12.966014418125644, + "grad_norm": 0.3417767286300659, + "learning_rate": 2.2737239683640908e-05, + "loss": 0.0291, + "step": 12590 + }, + { + "epoch": 12.976313079299691, + "grad_norm": 0.36338862776756287, + "learning_rate": 2.266028271927087e-05, + "loss": 0.0288, + "step": 12600 + }, + { + "epoch": 12.98661174047374, + "grad_norm": 0.18803521990776062, + "learning_rate": 2.258341803359108e-05, + "loss": 0.035, + "step": 12610 + }, + { + "epoch": 12.996910401647785, + "grad_norm": 0.2204011231660843, + "learning_rate": 2.2506645886039918e-05, + "loss": 0.0331, + "step": 12620 + }, + { + "epoch": 13.007209062821833, + "grad_norm": 0.23867210745811462, + "learning_rate": 2.242996653574345e-05, + "loss": 0.0327, + "step": 12630 + }, + { + "epoch": 13.01750772399588, + "grad_norm": 0.22372329235076904, + "learning_rate": 2.2353380241514515e-05, + "loss": 0.0313, + "step": 12640 + }, + { + "epoch": 13.027806385169928, + "grad_norm": 0.2398245483636856, + "learning_rate": 2.2276887261851875e-05, + "loss": 0.0405, + "step": 12650 + }, + { + "epoch": 13.038105046343976, + "grad_norm": 0.20746667683124542, + "learning_rate": 2.2200487854939322e-05, + "loss": 0.0332, + "step": 12660 + }, + { + "epoch": 13.048403707518023, + "grad_norm": 0.23980452120304108, + "learning_rate": 2.21241822786448e-05, + "loss": 0.0331, + "step": 12670 + }, + { + "epoch": 13.058702368692071, + "grad_norm": 0.2431352734565735, + "learning_rate": 2.204797079051962e-05, + "loss": 0.0337, + "step": 12680 + }, + { + "epoch": 13.069001029866117, + "grad_norm": 0.21622303128242493, + "learning_rate": 2.1971853647797415e-05, + "loss": 0.0369, + "step": 12690 + }, + { + "epoch": 13.079299691040164, + "grad_norm": 0.17636331915855408, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0385, + "step": 12700 + }, + { + "epoch": 13.089598352214212, + "grad_norm": 0.3212912976741791, + "learning_rate": 2.181990342590371e-05, + "loss": 0.0388, + "step": 12710 + }, + { + "epoch": 13.09989701338826, + "grad_norm": 0.4048994183540344, + "learning_rate": 2.1744070859603897e-05, + "loss": 0.0314, + "step": 12720 + }, + { + "epoch": 13.110195674562307, + "grad_norm": 0.2608017921447754, + "learning_rate": 2.1668333664448776e-05, + "loss": 0.0348, + "step": 12730 + }, + { + "epoch": 13.120494335736355, + "grad_norm": 0.22120167315006256, + "learning_rate": 2.1592692096071153e-05, + "loss": 0.0282, + "step": 12740 + }, + { + "epoch": 13.130792996910401, + "grad_norm": 0.22117048501968384, + "learning_rate": 2.1517146409781103e-05, + "loss": 0.0346, + "step": 12750 + }, + { + "epoch": 13.141091658084449, + "grad_norm": 0.2921169102191925, + "learning_rate": 2.1441696860565048e-05, + "loss": 0.0342, + "step": 12760 + }, + { + "epoch": 13.151390319258496, + "grad_norm": 0.22612257301807404, + "learning_rate": 2.1366343703084936e-05, + "loss": 0.0312, + "step": 12770 + }, + { + "epoch": 13.161688980432544, + "grad_norm": 0.27955397963523865, + "learning_rate": 2.1291087191677343e-05, + "loss": 0.0332, + "step": 12780 + }, + { + "epoch": 13.171987641606592, + "grad_norm": 0.2641075849533081, + "learning_rate": 2.121592758035273e-05, + "loss": 0.0368, + "step": 12790 + }, + { + "epoch": 13.18228630278064, + "grad_norm": 0.26150405406951904, + "learning_rate": 2.114086512279434e-05, + "loss": 0.0355, + "step": 12800 + }, + { + "epoch": 13.192584963954685, + "grad_norm": 0.2792717218399048, + "learning_rate": 2.1065900072357635e-05, + "loss": 0.029, + "step": 12810 + }, + { + "epoch": 13.202883625128733, + "grad_norm": 0.21909286081790924, + "learning_rate": 2.0991032682069246e-05, + "loss": 0.0379, + "step": 12820 + }, + { + "epoch": 13.21318228630278, + "grad_norm": 0.2866324782371521, + "learning_rate": 2.0916263204626162e-05, + "loss": 0.0282, + "step": 12830 + }, + { + "epoch": 13.223480947476828, + "grad_norm": 0.28694427013397217, + "learning_rate": 2.0841591892394925e-05, + "loss": 0.0399, + "step": 12840 + }, + { + "epoch": 13.233779608650876, + "grad_norm": 0.31920716166496277, + "learning_rate": 2.0767018997410713e-05, + "loss": 0.0365, + "step": 12850 + }, + { + "epoch": 13.244078269824923, + "grad_norm": 0.35022082924842834, + "learning_rate": 2.0692544771376543e-05, + "loss": 0.0264, + "step": 12860 + }, + { + "epoch": 13.254376930998971, + "grad_norm": 0.25149139761924744, + "learning_rate": 2.0618169465662364e-05, + "loss": 0.0302, + "step": 12870 + }, + { + "epoch": 13.264675592173017, + "grad_norm": 0.2645907402038574, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0328, + "step": 12880 + }, + { + "epoch": 13.274974253347064, + "grad_norm": 0.17596539855003357, + "learning_rate": 2.0469716619003725e-05, + "loss": 0.0328, + "step": 12890 + }, + { + "epoch": 13.285272914521112, + "grad_norm": 0.2291368991136551, + "learning_rate": 2.039563957912642e-05, + "loss": 0.0318, + "step": 12900 + }, + { + "epoch": 13.29557157569516, + "grad_norm": 0.21256229281425476, + "learning_rate": 2.0321662461701696e-05, + "loss": 0.0334, + "step": 12910 + }, + { + "epoch": 13.305870236869207, + "grad_norm": 0.30739450454711914, + "learning_rate": 2.024778551642172e-05, + "loss": 0.0321, + "step": 12920 + }, + { + "epoch": 13.316168898043255, + "grad_norm": 0.2791813015937805, + "learning_rate": 2.017400899264047e-05, + "loss": 0.0302, + "step": 12930 + }, + { + "epoch": 13.326467559217301, + "grad_norm": 0.3258625864982605, + "learning_rate": 2.0100333139372985e-05, + "loss": 0.0361, + "step": 12940 + }, + { + "epoch": 13.336766220391349, + "grad_norm": 0.2523643672466278, + "learning_rate": 2.0026758205294533e-05, + "loss": 0.0322, + "step": 12950 + }, + { + "epoch": 13.347064881565396, + "grad_norm": 0.2704935073852539, + "learning_rate": 1.9953284438739733e-05, + "loss": 0.0321, + "step": 12960 + }, + { + "epoch": 13.357363542739444, + "grad_norm": 0.45123302936553955, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0331, + "step": 12970 + }, + { + "epoch": 13.367662203913492, + "grad_norm": 1.1362191438674927, + "learning_rate": 1.9806641399831433e-05, + "loss": 0.0352, + "step": 12980 + }, + { + "epoch": 13.37796086508754, + "grad_norm": 0.3239549398422241, + "learning_rate": 1.9733472622436544e-05, + "loss": 0.0317, + "step": 12990 + }, + { + "epoch": 13.388259526261585, + "grad_norm": 0.20692795515060425, + "learning_rate": 1.9660406002480765e-05, + "loss": 0.0328, + "step": 13000 + }, + { + "epoch": 13.398558187435633, + "grad_norm": 0.24428331851959229, + "learning_rate": 1.9587441786583076e-05, + "loss": 0.0344, + "step": 13010 + }, + { + "epoch": 13.40885684860968, + "grad_norm": 0.17566567659378052, + "learning_rate": 1.951458022101676e-05, + "loss": 0.0346, + "step": 13020 + }, + { + "epoch": 13.419155509783728, + "grad_norm": 0.2601017951965332, + "learning_rate": 1.944182155170864e-05, + "loss": 0.0413, + "step": 13030 + }, + { + "epoch": 13.429454170957776, + "grad_norm": 0.22690336406230927, + "learning_rate": 1.9369166024238232e-05, + "loss": 0.039, + "step": 13040 + }, + { + "epoch": 13.439752832131823, + "grad_norm": 0.34189629554748535, + "learning_rate": 1.9296613883836945e-05, + "loss": 0.0297, + "step": 13050 + }, + { + "epoch": 13.450051493305871, + "grad_norm": 0.39015287160873413, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0352, + "step": 13060 + }, + { + "epoch": 13.460350154479917, + "grad_norm": 0.16422075033187866, + "learning_rate": 1.9151820743421617e-05, + "loss": 0.0298, + "step": 13070 + }, + { + "epoch": 13.470648815653965, + "grad_norm": 0.20099236071109772, + "learning_rate": 1.9079580232122303e-05, + "loss": 0.0271, + "step": 13080 + }, + { + "epoch": 13.480947476828012, + "grad_norm": 0.37444478273391724, + "learning_rate": 1.9007444085319786e-05, + "loss": 0.0382, + "step": 13090 + }, + { + "epoch": 13.49124613800206, + "grad_norm": 0.24139359593391418, + "learning_rate": 1.8935412546492486e-05, + "loss": 0.0334, + "step": 13100 + }, + { + "epoch": 13.501544799176108, + "grad_norm": 0.3007052540779114, + "learning_rate": 1.88634858587656e-05, + "loss": 0.0341, + "step": 13110 + }, + { + "epoch": 13.511843460350155, + "grad_norm": 0.30898720026016235, + "learning_rate": 1.8791664264910537e-05, + "loss": 0.0324, + "step": 13120 + }, + { + "epoch": 13.522142121524201, + "grad_norm": 0.3256855905056, + "learning_rate": 1.8719948007343936e-05, + "loss": 0.0376, + "step": 13130 + }, + { + "epoch": 13.532440782698249, + "grad_norm": 0.2092374563217163, + "learning_rate": 1.8648337328126906e-05, + "loss": 0.0298, + "step": 13140 + }, + { + "epoch": 13.542739443872296, + "grad_norm": 0.34433215856552124, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0371, + "step": 13150 + }, + { + "epoch": 13.553038105046344, + "grad_norm": 0.47145530581474304, + "learning_rate": 1.850543367120341e-05, + "loss": 0.0389, + "step": 13160 + }, + { + "epoch": 13.563336766220392, + "grad_norm": 1.9276230335235596, + "learning_rate": 1.8434141175834125e-05, + "loss": 0.0356, + "step": 13170 + }, + { + "epoch": 13.57363542739444, + "grad_norm": 0.1196000725030899, + "learning_rate": 1.8362955223487143e-05, + "loss": 0.0292, + "step": 13180 + }, + { + "epoch": 13.583934088568487, + "grad_norm": 0.21239057183265686, + "learning_rate": 1.8291876054433693e-05, + "loss": 0.0314, + "step": 13190 + }, + { + "epoch": 13.594232749742533, + "grad_norm": 0.27161744236946106, + "learning_rate": 1.8220903908584492e-05, + "loss": 0.0323, + "step": 13200 + }, + { + "epoch": 13.60453141091658, + "grad_norm": 0.23213060200214386, + "learning_rate": 1.8150039025489113e-05, + "loss": 0.0335, + "step": 13210 + }, + { + "epoch": 13.614830072090628, + "grad_norm": 0.26432856917381287, + "learning_rate": 1.8079281644335055e-05, + "loss": 0.0348, + "step": 13220 + }, + { + "epoch": 13.625128733264676, + "grad_norm": 0.24627777934074402, + "learning_rate": 1.8008632003946957e-05, + "loss": 0.0308, + "step": 13230 + }, + { + "epoch": 13.635427394438723, + "grad_norm": 0.3506312966346741, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0379, + "step": 13240 + }, + { + "epoch": 13.645726055612771, + "grad_norm": 0.20565661787986755, + "learning_rate": 1.7867656898948187e-05, + "loss": 0.0338, + "step": 13250 + }, + { + "epoch": 13.656024716786817, + "grad_norm": 0.2677291929721832, + "learning_rate": 1.7797331910165336e-05, + "loss": 0.0325, + "step": 13260 + }, + { + "epoch": 13.666323377960865, + "grad_norm": 0.30942559242248535, + "learning_rate": 1.7727115613802465e-05, + "loss": 0.0365, + "step": 13270 + }, + { + "epoch": 13.676622039134912, + "grad_norm": 0.23922519385814667, + "learning_rate": 1.765700824685797e-05, + "loss": 0.0366, + "step": 13280 + }, + { + "epoch": 13.68692070030896, + "grad_norm": 0.18366648256778717, + "learning_rate": 1.758701004596247e-05, + "loss": 0.0305, + "step": 13290 + }, + { + "epoch": 13.697219361483008, + "grad_norm": 0.2875716984272003, + "learning_rate": 1.751712124737826e-05, + "loss": 0.0363, + "step": 13300 + }, + { + "epoch": 13.707518022657055, + "grad_norm": 0.3050890564918518, + "learning_rate": 1.744734208699822e-05, + "loss": 0.037, + "step": 13310 + }, + { + "epoch": 13.717816683831103, + "grad_norm": 0.24879583716392517, + "learning_rate": 1.7377672800345302e-05, + "loss": 0.0285, + "step": 13320 + }, + { + "epoch": 13.728115345005149, + "grad_norm": 0.22065865993499756, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0299, + "step": 13330 + }, + { + "epoch": 13.738414006179196, + "grad_norm": 0.1869887113571167, + "learning_rate": 1.7238664788457342e-05, + "loss": 0.0344, + "step": 13340 + }, + { + "epoch": 13.748712667353244, + "grad_norm": 0.21137484908103943, + "learning_rate": 1.7169326532410663e-05, + "loss": 0.0332, + "step": 13350 + }, + { + "epoch": 13.759011328527292, + "grad_norm": 0.3234722912311554, + "learning_rate": 1.7100099088466242e-05, + "loss": 0.0345, + "step": 13360 + }, + { + "epoch": 13.76930998970134, + "grad_norm": 0.2264581024646759, + "learning_rate": 1.7030982690284792e-05, + "loss": 0.0291, + "step": 13370 + }, + { + "epoch": 13.779608650875387, + "grad_norm": 0.29631558060646057, + "learning_rate": 1.69619775711522e-05, + "loss": 0.0361, + "step": 13380 + }, + { + "epoch": 13.789907312049433, + "grad_norm": 0.292219340801239, + "learning_rate": 1.689308396397882e-05, + "loss": 0.0256, + "step": 13390 + }, + { + "epoch": 13.80020597322348, + "grad_norm": 0.17191918194293976, + "learning_rate": 1.6824302101298526e-05, + "loss": 0.0349, + "step": 13400 + }, + { + "epoch": 13.810504634397528, + "grad_norm": 0.22219271957874298, + "learning_rate": 1.6755632215268118e-05, + "loss": 0.0316, + "step": 13410 + }, + { + "epoch": 13.820803295571576, + "grad_norm": 0.18818335235118866, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0325, + "step": 13420 + }, + { + "epoch": 13.831101956745623, + "grad_norm": 0.2848359942436218, + "learning_rate": 1.6618629299893434e-05, + "loss": 0.0327, + "step": 13430 + }, + { + "epoch": 13.841400617919671, + "grad_norm": 0.26240599155426025, + "learning_rate": 1.6550296732969795e-05, + "loss": 0.0321, + "step": 13440 + }, + { + "epoch": 13.851699279093717, + "grad_norm": 0.166743665933609, + "learning_rate": 1.648207706753575e-05, + "loss": 0.0361, + "step": 13450 + }, + { + "epoch": 13.861997940267765, + "grad_norm": 0.2783146798610687, + "learning_rate": 1.6413970533850498e-05, + "loss": 0.0395, + "step": 13460 + }, + { + "epoch": 13.872296601441812, + "grad_norm": 0.2442004680633545, + "learning_rate": 1.6345977361791366e-05, + "loss": 0.0385, + "step": 13470 + }, + { + "epoch": 13.88259526261586, + "grad_norm": 0.16581279039382935, + "learning_rate": 1.6278097780853136e-05, + "loss": 0.0356, + "step": 13480 + }, + { + "epoch": 13.892893923789908, + "grad_norm": 0.37210017442703247, + "learning_rate": 1.6210332020147055e-05, + "loss": 0.0363, + "step": 13490 + }, + { + "epoch": 13.903192584963955, + "grad_norm": 0.18403227627277374, + "learning_rate": 1.6142680308400338e-05, + "loss": 0.0389, + "step": 13500 + }, + { + "epoch": 13.913491246138001, + "grad_norm": 0.283448189496994, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0318, + "step": 13510 + }, + { + "epoch": 13.923789907312049, + "grad_norm": 0.24017812311649323, + "learning_rate": 1.6007719944768025e-05, + "loss": 0.035, + "step": 13520 + }, + { + "epoch": 13.934088568486096, + "grad_norm": 0.14648008346557617, + "learning_rate": 1.594041174840894e-05, + "loss": 0.0276, + "step": 13530 + }, + { + "epoch": 13.944387229660144, + "grad_norm": 0.31949880719184875, + "learning_rate": 1.587321851206061e-05, + "loss": 0.0312, + "step": 13540 + }, + { + "epoch": 13.954685890834192, + "grad_norm": 0.27566295862197876, + "learning_rate": 1.5806140462517828e-05, + "loss": 0.0308, + "step": 13550 + }, + { + "epoch": 13.96498455200824, + "grad_norm": 0.221617192029953, + "learning_rate": 1.573917782618651e-05, + "loss": 0.033, + "step": 13560 + }, + { + "epoch": 13.975283213182287, + "grad_norm": 0.15257342159748077, + "learning_rate": 1.567233082908306e-05, + "loss": 0.0272, + "step": 13570 + }, + { + "epoch": 13.985581874356333, + "grad_norm": 0.31881460547447205, + "learning_rate": 1.5605599696833544e-05, + "loss": 0.036, + "step": 13580 + }, + { + "epoch": 13.99588053553038, + "grad_norm": 0.21161913871765137, + "learning_rate": 1.5538984654673016e-05, + "loss": 0.0272, + "step": 13590 + }, + { + "epoch": 14.006179196704428, + "grad_norm": 0.22538325190544128, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.023, + "step": 13600 + }, + { + "epoch": 14.016477857878476, + "grad_norm": 0.2999170422554016, + "learning_rate": 1.5406103739598903e-05, + "loss": 0.032, + "step": 13610 + }, + { + "epoch": 14.026776519052524, + "grad_norm": 0.26565343141555786, + "learning_rate": 1.5339838315193156e-05, + "loss": 0.031, + "step": 13620 + }, + { + "epoch": 14.037075180226571, + "grad_norm": 0.3137536942958832, + "learning_rate": 1.5273689877890485e-05, + "loss": 0.0302, + "step": 13630 + }, + { + "epoch": 14.047373841400617, + "grad_norm": 0.1854087859392166, + "learning_rate": 1.5207658650959138e-05, + "loss": 0.0345, + "step": 13640 + }, + { + "epoch": 14.057672502574665, + "grad_norm": 0.2928926348686218, + "learning_rate": 1.5141744857271778e-05, + "loss": 0.0334, + "step": 13650 + }, + { + "epoch": 14.067971163748712, + "grad_norm": 0.42930635809898376, + "learning_rate": 1.5075948719304672e-05, + "loss": 0.0272, + "step": 13660 + }, + { + "epoch": 14.07826982492276, + "grad_norm": 0.20846472680568695, + "learning_rate": 1.5010270459136966e-05, + "loss": 0.0331, + "step": 13670 + }, + { + "epoch": 14.088568486096808, + "grad_norm": 0.2335253208875656, + "learning_rate": 1.4944710298449999e-05, + "loss": 0.0312, + "step": 13680 + }, + { + "epoch": 14.098867147270855, + "grad_norm": 0.18406903743743896, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.033, + "step": 13690 + }, + { + "epoch": 14.109165808444903, + "grad_norm": 0.26444944739341736, + "learning_rate": 1.481394516024947e-05, + "loss": 0.0282, + "step": 13700 + }, + { + "epoch": 14.119464469618949, + "grad_norm": 0.19681231677532196, + "learning_rate": 1.4748740624102459e-05, + "loss": 0.0354, + "step": 13710 + }, + { + "epoch": 14.129763130792997, + "grad_norm": 0.22566291689872742, + "learning_rate": 1.468365507016769e-05, + "loss": 0.0327, + "step": 13720 + }, + { + "epoch": 14.140061791967044, + "grad_norm": 0.24647872149944305, + "learning_rate": 1.4618688718125929e-05, + "loss": 0.0301, + "step": 13730 + }, + { + "epoch": 14.150360453141092, + "grad_norm": 0.2727005183696747, + "learning_rate": 1.455384178725555e-05, + "loss": 0.0261, + "step": 13740 + }, + { + "epoch": 14.16065911431514, + "grad_norm": 0.2636515200138092, + "learning_rate": 1.4489114496431938e-05, + "loss": 0.0362, + "step": 13750 + }, + { + "epoch": 14.170957775489187, + "grad_norm": 0.24423463642597198, + "learning_rate": 1.4424507064126597e-05, + "loss": 0.0308, + "step": 13760 + }, + { + "epoch": 14.181256436663233, + "grad_norm": 0.2822682559490204, + "learning_rate": 1.4360019708406487e-05, + "loss": 0.038, + "step": 13770 + }, + { + "epoch": 14.19155509783728, + "grad_norm": 0.19930243492126465, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0291, + "step": 13780 + }, + { + "epoch": 14.201853759011328, + "grad_norm": 0.1978948414325714, + "learning_rate": 1.4231406096962669e-05, + "loss": 0.0302, + "step": 13790 + }, + { + "epoch": 14.212152420185376, + "grad_norm": 0.17142613232135773, + "learning_rate": 1.4167280275343492e-05, + "loss": 0.0257, + "step": 13800 + }, + { + "epoch": 14.222451081359424, + "grad_norm": 0.2695595622062683, + "learning_rate": 1.4103275398517197e-05, + "loss": 0.0349, + "step": 13810 + }, + { + "epoch": 14.232749742533471, + "grad_norm": 0.23960620164871216, + "learning_rate": 1.4039391682516972e-05, + "loss": 0.0307, + "step": 13820 + }, + { + "epoch": 14.243048403707519, + "grad_norm": 0.279876172542572, + "learning_rate": 1.3975629342967001e-05, + "loss": 0.0334, + "step": 13830 + }, + { + "epoch": 14.253347064881565, + "grad_norm": 0.260696142911911, + "learning_rate": 1.3911988595081893e-05, + "loss": 0.0316, + "step": 13840 + }, + { + "epoch": 14.263645726055612, + "grad_norm": 0.24109739065170288, + "learning_rate": 1.3848469653665786e-05, + "loss": 0.0306, + "step": 13850 + }, + { + "epoch": 14.27394438722966, + "grad_norm": 0.3289351165294647, + "learning_rate": 1.378507273311171e-05, + "loss": 0.0362, + "step": 13860 + }, + { + "epoch": 14.284243048403708, + "grad_norm": 0.33488863706588745, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0408, + "step": 13870 + }, + { + "epoch": 14.294541709577755, + "grad_norm": 3.9080820083618164, + "learning_rate": 1.3658645810101755e-05, + "loss": 0.0278, + "step": 13880 + }, + { + "epoch": 14.304840370751803, + "grad_norm": 0.2996270954608917, + "learning_rate": 1.3595616234369762e-05, + "loss": 0.0277, + "step": 13890 + }, + { + "epoch": 14.315139031925849, + "grad_norm": 0.2796926498413086, + "learning_rate": 1.3532709532946186e-05, + "loss": 0.0328, + "step": 13900 + }, + { + "epoch": 14.325437693099897, + "grad_norm": 0.24468347430229187, + "learning_rate": 1.3469925918157567e-05, + "loss": 0.0327, + "step": 13910 + }, + { + "epoch": 14.335736354273944, + "grad_norm": 0.23212593793869019, + "learning_rate": 1.3407265601914976e-05, + "loss": 0.0317, + "step": 13920 + }, + { + "epoch": 14.346035015447992, + "grad_norm": 0.23879218101501465, + "learning_rate": 1.3344728795713413e-05, + "loss": 0.0365, + "step": 13930 + }, + { + "epoch": 14.35633367662204, + "grad_norm": 0.2575908303260803, + "learning_rate": 1.3282315710630882e-05, + "loss": 0.0385, + "step": 13940 + }, + { + "epoch": 14.366632337796087, + "grad_norm": 0.3186909556388855, + "learning_rate": 1.3220026557327898e-05, + "loss": 0.0403, + "step": 13950 + }, + { + "epoch": 14.376930998970133, + "grad_norm": 0.2613557279109955, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0328, + "step": 13960 + }, + { + "epoch": 14.38722966014418, + "grad_norm": 0.3558288514614105, + "learning_rate": 1.3095820886610188e-05, + "loss": 0.0293, + "step": 13970 + }, + { + "epoch": 14.397528321318228, + "grad_norm": 0.2622450292110443, + "learning_rate": 1.3033904788422047e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 14.407826982492276, + "grad_norm": 0.23433591425418854, + "learning_rate": 1.2972113460465246e-05, + "loss": 0.0286, + "step": 13990 + }, + { + "epoch": 14.418125643666324, + "grad_norm": 0.2427792251110077, + "learning_rate": 1.2910447111301604e-05, + "loss": 0.0316, + "step": 14000 + }, + { + "epoch": 14.428424304840371, + "grad_norm": 0.3044346570968628, + "learning_rate": 1.284890594907121e-05, + "loss": 0.0273, + "step": 14010 + }, + { + "epoch": 14.438722966014419, + "grad_norm": 0.16404663026332855, + "learning_rate": 1.2787490181491568e-05, + "loss": 0.0257, + "step": 14020 + }, + { + "epoch": 14.449021627188465, + "grad_norm": 0.26250144839286804, + "learning_rate": 1.2726200015856892e-05, + "loss": 0.0328, + "step": 14030 + }, + { + "epoch": 14.459320288362512, + "grad_norm": 0.7278460264205933, + "learning_rate": 1.2665035659037561e-05, + "loss": 0.0297, + "step": 14040 + }, + { + "epoch": 14.46961894953656, + "grad_norm": 0.34996357560157776, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0324, + "step": 14050 + }, + { + "epoch": 14.479917610710608, + "grad_norm": 0.44799286127090454, + "learning_rate": 1.2543085197202287e-05, + "loss": 0.036, + "step": 14060 + }, + { + "epoch": 14.490216271884655, + "grad_norm": 0.24697241187095642, + "learning_rate": 1.2482299503801016e-05, + "loss": 0.0315, + "step": 14070 + }, + { + "epoch": 14.500514933058703, + "grad_norm": 0.3266669511795044, + "learning_rate": 1.2421640442443055e-05, + "loss": 0.0351, + "step": 14080 + }, + { + "epoch": 14.510813594232749, + "grad_norm": 0.42595696449279785, + "learning_rate": 1.2361108217868544e-05, + "loss": 0.029, + "step": 14090 + }, + { + "epoch": 14.521112255406797, + "grad_norm": 0.28600630164146423, + "learning_rate": 1.23007030343896e-05, + "loss": 0.0288, + "step": 14100 + }, + { + "epoch": 14.531410916580844, + "grad_norm": 0.32830336689949036, + "learning_rate": 1.2240425095889495e-05, + "loss": 0.0323, + "step": 14110 + }, + { + "epoch": 14.541709577754892, + "grad_norm": 0.23947954177856445, + "learning_rate": 1.2180274605821989e-05, + "loss": 0.0301, + "step": 14120 + }, + { + "epoch": 14.55200823892894, + "grad_norm": 0.14854808151721954, + "learning_rate": 1.2120251767210755e-05, + "loss": 0.0305, + "step": 14130 + }, + { + "epoch": 14.562306900102987, + "grad_norm": 0.4753403961658478, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0333, + "step": 14140 + }, + { + "epoch": 14.572605561277033, + "grad_norm": 0.15201760828495026, + "learning_rate": 1.2000589854296507e-05, + "loss": 0.0348, + "step": 14150 + }, + { + "epoch": 14.58290422245108, + "grad_norm": 0.36805441975593567, + "learning_rate": 1.1940951183883742e-05, + "loss": 0.0315, + "step": 14160 + }, + { + "epoch": 14.593202883625128, + "grad_norm": 0.22207669913768768, + "learning_rate": 1.1881440972706315e-05, + "loss": 0.0299, + "step": 14170 + }, + { + "epoch": 14.603501544799176, + "grad_norm": 0.27251651883125305, + "learning_rate": 1.1822059421626724e-05, + "loss": 0.0364, + "step": 14180 + }, + { + "epoch": 14.613800205973224, + "grad_norm": 0.2771929204463959, + "learning_rate": 1.1762806731073261e-05, + "loss": 0.0272, + "step": 14190 + }, + { + "epoch": 14.624098867147271, + "grad_norm": 0.2667066156864166, + "learning_rate": 1.1703683101039197e-05, + "loss": 0.0271, + "step": 14200 + }, + { + "epoch": 14.634397528321319, + "grad_norm": 0.2355891466140747, + "learning_rate": 1.1644688731082242e-05, + "loss": 0.0299, + "step": 14210 + }, + { + "epoch": 14.644696189495365, + "grad_norm": 0.39315053820610046, + "learning_rate": 1.1585823820323843e-05, + "loss": 0.0334, + "step": 14220 + }, + { + "epoch": 14.654994850669413, + "grad_norm": 0.298880010843277, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0309, + "step": 14230 + }, + { + "epoch": 14.66529351184346, + "grad_norm": 0.21369227766990662, + "learning_rate": 1.1468483170702805e-05, + "loss": 0.0271, + "step": 14240 + }, + { + "epoch": 14.675592173017508, + "grad_norm": 0.21962594985961914, + "learning_rate": 1.141000782789554e-05, + "loss": 0.0296, + "step": 14250 + }, + { + "epoch": 14.685890834191555, + "grad_norm": 0.3962979316711426, + "learning_rate": 1.135166273639619e-05, + "loss": 0.0361, + "step": 14260 + }, + { + "epoch": 14.696189495365603, + "grad_norm": 0.2696010172367096, + "learning_rate": 1.1293448093134656e-05, + "loss": 0.0317, + "step": 14270 + }, + { + "epoch": 14.706488156539649, + "grad_norm": 0.16473254561424255, + "learning_rate": 1.1235364094600632e-05, + "loss": 0.0259, + "step": 14280 + }, + { + "epoch": 14.716786817713697, + "grad_norm": 0.18638800084590912, + "learning_rate": 1.1177410936842719e-05, + "loss": 0.0236, + "step": 14290 + }, + { + "epoch": 14.727085478887744, + "grad_norm": 0.35101962089538574, + "learning_rate": 1.1119588815468012e-05, + "loss": 0.0266, + "step": 14300 + }, + { + "epoch": 14.737384140061792, + "grad_norm": 0.2792340815067291, + "learning_rate": 1.1061897925641296e-05, + "loss": 0.0318, + "step": 14310 + }, + { + "epoch": 14.74768280123584, + "grad_norm": 0.19751253724098206, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0294, + "step": 14320 + }, + { + "epoch": 14.757981462409887, + "grad_norm": 0.2783863842487335, + "learning_rate": 1.094691061907544e-05, + "loss": 0.0359, + "step": 14330 + }, + { + "epoch": 14.768280123583935, + "grad_norm": 0.2864331305027008, + "learning_rate": 1.088961459044852e-05, + "loss": 0.0289, + "step": 14340 + }, + { + "epoch": 14.77857878475798, + "grad_norm": 0.19958889484405518, + "learning_rate": 1.0832450569592684e-05, + "loss": 0.0296, + "step": 14350 + }, + { + "epoch": 14.788877445932028, + "grad_norm": 0.2572004199028015, + "learning_rate": 1.0775418749451427e-05, + "loss": 0.0299, + "step": 14360 + }, + { + "epoch": 14.799176107106076, + "grad_norm": 0.24685412645339966, + "learning_rate": 1.0718519322522053e-05, + "loss": 0.0346, + "step": 14370 + }, + { + "epoch": 14.809474768280124, + "grad_norm": 0.2643430829048157, + "learning_rate": 1.0661752480854975e-05, + "loss": 0.0253, + "step": 14380 + }, + { + "epoch": 14.819773429454171, + "grad_norm": 0.2792705297470093, + "learning_rate": 1.0605118416053162e-05, + "loss": 0.0295, + "step": 14390 + }, + { + "epoch": 14.830072090628219, + "grad_norm": 0.4018799662590027, + "learning_rate": 1.0548617319271342e-05, + "loss": 0.034, + "step": 14400 + }, + { + "epoch": 14.840370751802265, + "grad_norm": 0.20562392473220825, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0386, + "step": 14410 + }, + { + "epoch": 14.850669412976313, + "grad_norm": 0.2107439637184143, + "learning_rate": 1.043601479214214e-05, + "loss": 0.038, + "step": 14420 + }, + { + "epoch": 14.86096807415036, + "grad_norm": 0.2785644829273224, + "learning_rate": 1.0379913741857699e-05, + "loss": 0.0308, + "step": 14430 + }, + { + "epoch": 14.871266735324408, + "grad_norm": 0.23650747537612915, + "learning_rate": 1.03239464197179e-05, + "loss": 0.0312, + "step": 14440 + }, + { + "epoch": 14.881565396498456, + "grad_norm": 0.2766387462615967, + "learning_rate": 1.0268113014627073e-05, + "loss": 0.0265, + "step": 14450 + }, + { + "epoch": 14.891864057672503, + "grad_norm": 0.2568782567977905, + "learning_rate": 1.021241371503755e-05, + "loss": 0.037, + "step": 14460 + }, + { + "epoch": 14.90216271884655, + "grad_norm": 0.18696804344654083, + "learning_rate": 1.0156848708949006e-05, + "loss": 0.0266, + "step": 14470 + }, + { + "epoch": 14.912461380020597, + "grad_norm": 0.23785705864429474, + "learning_rate": 1.0101418183907896e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 14.922760041194644, + "grad_norm": 0.2720486521720886, + "learning_rate": 1.004612232700669e-05, + "loss": 0.0359, + "step": 14490 + }, + { + "epoch": 14.933058702368692, + "grad_norm": 0.21330799162387848, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0288, + "step": 14500 + }, + { + "epoch": 14.94335736354274, + "grad_norm": 0.24091622233390808, + "learning_rate": 9.935935363720728e-06, + "loss": 0.0275, + "step": 14510 + }, + { + "epoch": 14.953656024716787, + "grad_norm": 0.34269654750823975, + "learning_rate": 9.88104462924575e-06, + "loss": 0.0323, + "step": 14520 + }, + { + "epoch": 14.963954685890835, + "grad_norm": 0.23459886014461517, + "learning_rate": 9.826289306729052e-06, + "loss": 0.0293, + "step": 14530 + }, + { + "epoch": 14.97425334706488, + "grad_norm": 0.27133437991142273, + "learning_rate": 9.7716695809841e-06, + "loss": 0.0329, + "step": 14540 + }, + { + "epoch": 14.984552008238929, + "grad_norm": 0.24615567922592163, + "learning_rate": 9.717185636366783e-06, + "loss": 0.0317, + "step": 14550 + }, + { + "epoch": 14.994850669412976, + "grad_norm": 0.26164570450782776, + "learning_rate": 9.662837656774632e-06, + "loss": 0.031, + "step": 14560 + }, + { + "epoch": 15.005149330587024, + "grad_norm": 0.18910399079322815, + "learning_rate": 9.608625825646288e-06, + "loss": 0.0349, + "step": 14570 + }, + { + "epoch": 15.015447991761071, + "grad_norm": 0.3117832541465759, + "learning_rate": 9.554550325960853e-06, + "loss": 0.032, + "step": 14580 + }, + { + "epoch": 15.02574665293512, + "grad_norm": 0.22034838795661926, + "learning_rate": 9.500611340237258e-06, + "loss": 0.0301, + "step": 14590 + }, + { + "epoch": 15.036045314109165, + "grad_norm": 0.2756035029888153, + "learning_rate": 9.446809050533678e-06, + "loss": 0.0272, + "step": 14600 + }, + { + "epoch": 15.046343975283213, + "grad_norm": 0.3038906157016754, + "learning_rate": 9.393143638446889e-06, + "loss": 0.0327, + "step": 14610 + }, + { + "epoch": 15.05664263645726, + "grad_norm": 0.22907866537570953, + "learning_rate": 9.33961528511172e-06, + "loss": 0.0307, + "step": 14620 + }, + { + "epoch": 15.066941297631308, + "grad_norm": 0.4842381775379181, + "learning_rate": 9.286224171200297e-06, + "loss": 0.0284, + "step": 14630 + }, + { + "epoch": 15.077239958805356, + "grad_norm": 0.8235160112380981, + "learning_rate": 9.232970476921626e-06, + "loss": 0.0336, + "step": 14640 + }, + { + "epoch": 15.087538619979403, + "grad_norm": 0.4762952923774719, + "learning_rate": 9.17985438202082e-06, + "loss": 0.0315, + "step": 14650 + }, + { + "epoch": 15.097837281153451, + "grad_norm": 0.20582009851932526, + "learning_rate": 9.12687606577859e-06, + "loss": 0.0283, + "step": 14660 + }, + { + "epoch": 15.108135942327497, + "grad_norm": 0.20658078789710999, + "learning_rate": 9.074035707010575e-06, + "loss": 0.0277, + "step": 14670 + }, + { + "epoch": 15.118434603501544, + "grad_norm": 0.2650274336338043, + "learning_rate": 9.02133348406684e-06, + "loss": 0.031, + "step": 14680 + }, + { + "epoch": 15.128733264675592, + "grad_norm": 0.26044949889183044, + "learning_rate": 8.968769574831115e-06, + "loss": 0.0287, + "step": 14690 + }, + { + "epoch": 15.13903192584964, + "grad_norm": 0.25187498331069946, + "learning_rate": 8.916344156720335e-06, + "loss": 0.0301, + "step": 14700 + }, + { + "epoch": 15.149330587023687, + "grad_norm": 0.4505482017993927, + "learning_rate": 8.864057406684023e-06, + "loss": 0.0264, + "step": 14710 + }, + { + "epoch": 15.159629248197735, + "grad_norm": 0.2146962434053421, + "learning_rate": 8.81190950120357e-06, + "loss": 0.0386, + "step": 14720 + }, + { + "epoch": 15.169927909371781, + "grad_norm": 0.17643073201179504, + "learning_rate": 8.759900616291834e-06, + "loss": 0.0271, + "step": 14730 + }, + { + "epoch": 15.180226570545829, + "grad_norm": 0.3004768192768097, + "learning_rate": 8.708030927492345e-06, + "loss": 0.034, + "step": 14740 + }, + { + "epoch": 15.190525231719876, + "grad_norm": 0.33159592747688293, + "learning_rate": 8.656300609878898e-06, + "loss": 0.033, + "step": 14750 + }, + { + "epoch": 15.200823892893924, + "grad_norm": 0.2567281126976013, + "learning_rate": 8.604709838054813e-06, + "loss": 0.0325, + "step": 14760 + }, + { + "epoch": 15.211122554067972, + "grad_norm": 0.20799218118190765, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0317, + "step": 14770 + }, + { + "epoch": 15.22142121524202, + "grad_norm": 0.2914055585861206, + "learning_rate": 8.501947627832507e-06, + "loss": 0.0308, + "step": 14780 + }, + { + "epoch": 15.231719876416065, + "grad_norm": 0.24458810687065125, + "learning_rate": 8.450776536283594e-06, + "loss": 0.0359, + "step": 14790 + }, + { + "epoch": 15.242018537590113, + "grad_norm": 0.30409494042396545, + "learning_rate": 8.399745684221499e-06, + "loss": 0.0357, + "step": 14800 + }, + { + "epoch": 15.25231719876416, + "grad_norm": 0.2720089852809906, + "learning_rate": 8.348855243888681e-06, + "loss": 0.0344, + "step": 14810 + }, + { + "epoch": 15.262615859938208, + "grad_norm": 0.25461846590042114, + "learning_rate": 8.2981053870537e-06, + "loss": 0.0325, + "step": 14820 + }, + { + "epoch": 15.272914521112256, + "grad_norm": 0.2355855405330658, + "learning_rate": 8.247496285010548e-06, + "loss": 0.0276, + "step": 14830 + }, + { + "epoch": 15.283213182286303, + "grad_norm": 0.1807708442211151, + "learning_rate": 8.197028108578197e-06, + "loss": 0.03, + "step": 14840 + }, + { + "epoch": 15.293511843460351, + "grad_norm": 0.21903660893440247, + "learning_rate": 8.146701028099917e-06, + "loss": 0.0254, + "step": 14850 + }, + { + "epoch": 15.303810504634397, + "grad_norm": 0.5081159472465515, + "learning_rate": 8.096515213442762e-06, + "loss": 0.0276, + "step": 14860 + }, + { + "epoch": 15.314109165808445, + "grad_norm": 0.22669517993927002, + "learning_rate": 8.046470833996973e-06, + "loss": 0.0272, + "step": 14870 + }, + { + "epoch": 15.324407826982492, + "grad_norm": 0.2578093409538269, + "learning_rate": 7.996568058675402e-06, + "loss": 0.0304, + "step": 14880 + }, + { + "epoch": 15.33470648815654, + "grad_norm": 0.20256255567073822, + "learning_rate": 7.946807055912959e-06, + "loss": 0.0292, + "step": 14890 + }, + { + "epoch": 15.345005149330587, + "grad_norm": 0.2500031888484955, + "learning_rate": 7.897187993666022e-06, + "loss": 0.0315, + "step": 14900 + }, + { + "epoch": 15.355303810504635, + "grad_norm": 0.2907675802707672, + "learning_rate": 7.84771103941192e-06, + "loss": 0.0341, + "step": 14910 + }, + { + "epoch": 15.365602471678681, + "grad_norm": 0.1547321081161499, + "learning_rate": 7.79837636014827e-06, + "loss": 0.0249, + "step": 14920 + }, + { + "epoch": 15.375901132852729, + "grad_norm": 0.2814120054244995, + "learning_rate": 7.749184122392539e-06, + "loss": 0.0365, + "step": 14930 + }, + { + "epoch": 15.386199794026776, + "grad_norm": 0.37319841980934143, + "learning_rate": 7.700134492181344e-06, + "loss": 0.0274, + "step": 14940 + }, + { + "epoch": 15.396498455200824, + "grad_norm": 0.24200180172920227, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0306, + "step": 14950 + }, + { + "epoch": 15.406797116374872, + "grad_norm": 0.6322610378265381, + "learning_rate": 7.602463716132041e-06, + "loss": 0.0279, + "step": 14960 + }, + { + "epoch": 15.41709577754892, + "grad_norm": 0.43964508175849915, + "learning_rate": 7.553842899958308e-06, + "loss": 0.032, + "step": 14970 + }, + { + "epoch": 15.427394438722967, + "grad_norm": 0.3598411977291107, + "learning_rate": 7.505365350656812e-06, + "loss": 0.0275, + "step": 14980 + }, + { + "epoch": 15.437693099897013, + "grad_norm": 0.19508050382137299, + "learning_rate": 7.457031231851941e-06, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 15.44799176107106, + "grad_norm": 0.29256248474121094, + "learning_rate": 7.4088407066839784e-06, + "loss": 0.0387, + "step": 15000 + }, + { + "epoch": 15.458290422245108, + "grad_norm": 0.2301289290189743, + "learning_rate": 7.36079393780853e-06, + "loss": 0.0311, + "step": 15010 + }, + { + "epoch": 15.468589083419156, + "grad_norm": 0.29095834493637085, + "learning_rate": 7.312891087396034e-06, + "loss": 0.0259, + "step": 15020 + }, + { + "epoch": 15.478887744593203, + "grad_norm": 0.2932276129722595, + "learning_rate": 7.2651323171310795e-06, + "loss": 0.0293, + "step": 15030 + }, + { + "epoch": 15.489186405767251, + "grad_norm": 0.24277035892009735, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0334, + "step": 15040 + }, + { + "epoch": 15.499485066941297, + "grad_norm": 0.23208442330360413, + "learning_rate": 7.170047661350349e-06, + "loss": 0.0296, + "step": 15050 + }, + { + "epoch": 15.509783728115345, + "grad_norm": 0.1625526398420334, + "learning_rate": 7.122722096770123e-06, + "loss": 0.0283, + "step": 15060 + }, + { + "epoch": 15.520082389289392, + "grad_norm": 0.29437604546546936, + "learning_rate": 7.075541254207502e-06, + "loss": 0.0284, + "step": 15070 + }, + { + "epoch": 15.53038105046344, + "grad_norm": 0.3337920308113098, + "learning_rate": 7.028505292910154e-06, + "loss": 0.0235, + "step": 15080 + }, + { + "epoch": 15.540679711637488, + "grad_norm": 0.16761137545108795, + "learning_rate": 6.981614371636747e-06, + "loss": 0.0261, + "step": 15090 + }, + { + "epoch": 15.550978372811535, + "grad_norm": 0.18191471695899963, + "learning_rate": 6.934868648656373e-06, + "loss": 0.0273, + "step": 15100 + }, + { + "epoch": 15.561277033985581, + "grad_norm": 0.2083984911441803, + "learning_rate": 6.8882682817481006e-06, + "loss": 0.0339, + "step": 15110 + }, + { + "epoch": 15.571575695159629, + "grad_norm": 0.33254730701446533, + "learning_rate": 6.841813428200306e-06, + "loss": 0.0335, + "step": 15120 + }, + { + "epoch": 15.581874356333676, + "grad_norm": 0.22721487283706665, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0284, + "step": 15130 + }, + { + "epoch": 15.592173017507724, + "grad_norm": 0.3968798816204071, + "learning_rate": 6.749340887883626e-06, + "loss": 0.0326, + "step": 15140 + }, + { + "epoch": 15.602471678681772, + "grad_norm": 0.1721322387456894, + "learning_rate": 6.7033235132337225e-06, + "loss": 0.0267, + "step": 15150 + }, + { + "epoch": 15.61277033985582, + "grad_norm": 0.3585062026977539, + "learning_rate": 6.6574522761812366e-06, + "loss": 0.0297, + "step": 15160 + }, + { + "epoch": 15.623069001029865, + "grad_norm": 0.45918750762939453, + "learning_rate": 6.611727331553586e-06, + "loss": 0.0275, + "step": 15170 + }, + { + "epoch": 15.633367662203913, + "grad_norm": 0.3067721724510193, + "learning_rate": 6.566148833684399e-06, + "loss": 0.0287, + "step": 15180 + }, + { + "epoch": 15.64366632337796, + "grad_norm": 0.2751639187335968, + "learning_rate": 6.520716936413018e-06, + "loss": 0.0295, + "step": 15190 + }, + { + "epoch": 15.653964984552008, + "grad_norm": 0.21889840066432953, + "learning_rate": 6.475431793083974e-06, + "loss": 0.0321, + "step": 15200 + }, + { + "epoch": 15.664263645726056, + "grad_norm": 0.3290077745914459, + "learning_rate": 6.4302935565464514e-06, + "loss": 0.031, + "step": 15210 + }, + { + "epoch": 15.674562306900103, + "grad_norm": 0.5243391394615173, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0248, + "step": 15220 + }, + { + "epoch": 15.684860968074151, + "grad_norm": 1.0162177085876465, + "learning_rate": 6.3404584127630115e-06, + "loss": 0.0243, + "step": 15230 + }, + { + "epoch": 15.695159629248197, + "grad_norm": 0.33608901500701904, + "learning_rate": 6.295761808734174e-06, + "loss": 0.0307, + "step": 15240 + }, + { + "epoch": 15.705458290422245, + "grad_norm": 0.2736285626888275, + "learning_rate": 6.251212717930017e-06, + "loss": 0.0341, + "step": 15250 + }, + { + "epoch": 15.715756951596292, + "grad_norm": 0.3048650920391083, + "learning_rate": 6.206811290715353e-06, + "loss": 0.035, + "step": 15260 + }, + { + "epoch": 15.72605561277034, + "grad_norm": 0.2898007929325104, + "learning_rate": 6.16255767695661e-06, + "loss": 0.0304, + "step": 15270 + }, + { + "epoch": 15.736354273944388, + "grad_norm": 0.2866269052028656, + "learning_rate": 6.118452026021299e-06, + "loss": 0.0344, + "step": 15280 + }, + { + "epoch": 15.746652935118435, + "grad_norm": 0.29790258407592773, + "learning_rate": 6.07449448677751e-06, + "loss": 0.0333, + "step": 15290 + }, + { + "epoch": 15.756951596292481, + "grad_norm": 0.33838725090026855, + "learning_rate": 6.030685207593423e-06, + "loss": 0.0345, + "step": 15300 + }, + { + "epoch": 15.767250257466529, + "grad_norm": 0.28657403588294983, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0321, + "step": 15310 + }, + { + "epoch": 15.777548918640576, + "grad_norm": 0.34499257802963257, + "learning_rate": 5.943512020374537e-06, + "loss": 0.0367, + "step": 15320 + }, + { + "epoch": 15.787847579814624, + "grad_norm": 0.2314077764749527, + "learning_rate": 5.90014840657202e-06, + "loss": 0.0351, + "step": 15330 + }, + { + "epoch": 15.798146240988672, + "grad_norm": 0.40013644099235535, + "learning_rate": 5.856933641292789e-06, + "loss": 0.0305, + "step": 15340 + }, + { + "epoch": 15.80844490216272, + "grad_norm": 0.6308583617210388, + "learning_rate": 5.813867870397977e-06, + "loss": 0.0331, + "step": 15350 + }, + { + "epoch": 15.818743563336767, + "grad_norm": 0.3136028051376343, + "learning_rate": 5.770951239245803e-06, + "loss": 0.0313, + "step": 15360 + }, + { + "epoch": 15.829042224510813, + "grad_norm": 0.18756185472011566, + "learning_rate": 5.72818389269113e-06, + "loss": 0.0261, + "step": 15370 + }, + { + "epoch": 15.83934088568486, + "grad_norm": 0.22854579985141754, + "learning_rate": 5.685565975084911e-06, + "loss": 0.0307, + "step": 15380 + }, + { + "epoch": 15.849639546858908, + "grad_norm": 0.18659406900405884, + "learning_rate": 5.643097630273769e-06, + "loss": 0.0293, + "step": 15390 + }, + { + "epoch": 15.859938208032956, + "grad_norm": 0.2682023048400879, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0339, + "step": 15400 + }, + { + "epoch": 15.870236869207003, + "grad_norm": 0.29009154438972473, + "learning_rate": 5.558610231898393e-06, + "loss": 0.037, + "step": 15410 + }, + { + "epoch": 15.880535530381051, + "grad_norm": 0.32601863145828247, + "learning_rate": 5.516591463501231e-06, + "loss": 0.0322, + "step": 15420 + }, + { + "epoch": 15.890834191555097, + "grad_norm": 0.25241759419441223, + "learning_rate": 5.474722838232254e-06, + "loss": 0.0335, + "step": 15430 + }, + { + "epoch": 15.901132852729145, + "grad_norm": 0.34431523084640503, + "learning_rate": 5.433004497409039e-06, + "loss": 0.027, + "step": 15440 + }, + { + "epoch": 15.911431513903192, + "grad_norm": 0.24490360915660858, + "learning_rate": 5.391436581841886e-06, + "loss": 0.0287, + "step": 15450 + }, + { + "epoch": 15.92173017507724, + "grad_norm": 0.25288495421409607, + "learning_rate": 5.350019231833364e-06, + "loss": 0.0301, + "step": 15460 + }, + { + "epoch": 15.932028836251288, + "grad_norm": 0.23814049363136292, + "learning_rate": 5.3087525871778565e-06, + "loss": 0.0291, + "step": 15470 + }, + { + "epoch": 15.942327497425335, + "grad_norm": 0.2367774397134781, + "learning_rate": 5.2676367871610675e-06, + "loss": 0.0325, + "step": 15480 + }, + { + "epoch": 15.952626158599383, + "grad_norm": 0.20925898849964142, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0307, + "step": 15490 + }, + { + "epoch": 15.962924819773429, + "grad_norm": 0.36154627799987793, + "learning_rate": 5.185858275640332e-06, + "loss": 0.0328, + "step": 15500 + }, + { + "epoch": 15.973223480947476, + "grad_norm": 0.25385522842407227, + "learning_rate": 5.145195840160239e-06, + "loss": 0.0299, + "step": 15510 + }, + { + "epoch": 15.983522142121524, + "grad_norm": 0.25496914982795715, + "learning_rate": 5.1046848013656165e-06, + "loss": 0.0292, + "step": 15520 + }, + { + "epoch": 15.993820803295572, + "grad_norm": 0.2563509941101074, + "learning_rate": 5.064325295991829e-06, + "loss": 0.0284, + "step": 15530 + }, + { + "epoch": 16.004119464469618, + "grad_norm": 0.2616461217403412, + "learning_rate": 5.024117460262751e-06, + "loss": 0.0439, + "step": 15540 + }, + { + "epoch": 16.014418125643665, + "grad_norm": 0.3009835481643677, + "learning_rate": 4.984061429890324e-06, + "loss": 0.0304, + "step": 15550 + }, + { + "epoch": 16.024716786817713, + "grad_norm": 0.29534780979156494, + "learning_rate": 4.94415734007413e-06, + "loss": 0.0319, + "step": 15560 + }, + { + "epoch": 16.03501544799176, + "grad_norm": 0.21110209822654724, + "learning_rate": 4.9044053255008935e-06, + "loss": 0.0309, + "step": 15570 + }, + { + "epoch": 16.04531410916581, + "grad_norm": 0.257237046957016, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0274, + "step": 15580 + }, + { + "epoch": 16.055612770339856, + "grad_norm": 0.3104022741317749, + "learning_rate": 4.8253580582632906e-06, + "loss": 0.0294, + "step": 15590 + }, + { + "epoch": 16.065911431513904, + "grad_norm": 0.1543678343296051, + "learning_rate": 4.786063072404112e-06, + "loss": 0.0247, + "step": 15600 + }, + { + "epoch": 16.07621009268795, + "grad_norm": 0.18241259455680847, + "learning_rate": 4.7469206953973495e-06, + "loss": 0.0245, + "step": 15610 + }, + { + "epoch": 16.086508753862, + "grad_norm": 0.18561235070228577, + "learning_rate": 4.707931059358783e-06, + "loss": 0.0282, + "step": 15620 + }, + { + "epoch": 16.096807415036047, + "grad_norm": 0.36796221137046814, + "learning_rate": 4.669094295888588e-06, + "loss": 0.0323, + "step": 15630 + }, + { + "epoch": 16.107106076210094, + "grad_norm": 0.21030554175376892, + "learning_rate": 4.630410536071006e-06, + "loss": 0.0271, + "step": 15640 + }, + { + "epoch": 16.117404737384142, + "grad_norm": 0.23774808645248413, + "learning_rate": 4.59187991047384e-06, + "loss": 0.0319, + "step": 15650 + }, + { + "epoch": 16.127703398558186, + "grad_norm": 0.16403083503246307, + "learning_rate": 4.553502549148009e-06, + "loss": 0.0339, + "step": 15660 + }, + { + "epoch": 16.138002059732234, + "grad_norm": 0.23186904191970825, + "learning_rate": 4.515278581627141e-06, + "loss": 0.0301, + "step": 15670 + }, + { + "epoch": 16.14830072090628, + "grad_norm": 0.24327369034290314, + "learning_rate": 4.477208136927119e-06, + "loss": 0.0308, + "step": 15680 + }, + { + "epoch": 16.15859938208033, + "grad_norm": 0.2953716814517975, + "learning_rate": 4.439291343545643e-06, + "loss": 0.0281, + "step": 15690 + }, + { + "epoch": 16.168898043254377, + "grad_norm": 0.24078382551670074, + "learning_rate": 4.401528329461779e-06, + "loss": 0.0304, + "step": 15700 + }, + { + "epoch": 16.179196704428424, + "grad_norm": 0.3598305583000183, + "learning_rate": 4.363919222135604e-06, + "loss": 0.0279, + "step": 15710 + }, + { + "epoch": 16.189495365602472, + "grad_norm": 0.18711034953594208, + "learning_rate": 4.326464148507647e-06, + "loss": 0.0289, + "step": 15720 + }, + { + "epoch": 16.19979402677652, + "grad_norm": 0.3203088045120239, + "learning_rate": 4.289163234998589e-06, + "loss": 0.0334, + "step": 15730 + }, + { + "epoch": 16.210092687950567, + "grad_norm": 0.2985017001628876, + "learning_rate": 4.2520166075087635e-06, + "loss": 0.0246, + "step": 15740 + }, + { + "epoch": 16.220391349124615, + "grad_norm": 0.25471287965774536, + "learning_rate": 4.2150243914177325e-06, + "loss": 0.029, + "step": 15750 + }, + { + "epoch": 16.230690010298662, + "grad_norm": 0.22707876563072205, + "learning_rate": 4.178186711583904e-06, + "loss": 0.0258, + "step": 15760 + }, + { + "epoch": 16.24098867147271, + "grad_norm": 0.2530466914176941, + "learning_rate": 4.141503692344062e-06, + "loss": 0.0324, + "step": 15770 + }, + { + "epoch": 16.251287332646754, + "grad_norm": 0.23593966662883759, + "learning_rate": 4.1049754575129935e-06, + "loss": 0.0299, + "step": 15780 + }, + { + "epoch": 16.261585993820802, + "grad_norm": 0.26746660470962524, + "learning_rate": 4.068602130383031e-06, + "loss": 0.025, + "step": 15790 + }, + { + "epoch": 16.27188465499485, + "grad_norm": 0.3687654733657837, + "learning_rate": 4.032383833723657e-06, + "loss": 0.0344, + "step": 15800 + }, + { + "epoch": 16.282183316168897, + "grad_norm": 0.26962026953697205, + "learning_rate": 3.99632068978108e-06, + "loss": 0.0315, + "step": 15810 + }, + { + "epoch": 16.292481977342945, + "grad_norm": 0.3096659779548645, + "learning_rate": 3.960412820277865e-06, + "loss": 0.0241, + "step": 15820 + }, + { + "epoch": 16.302780638516992, + "grad_norm": 0.3644077777862549, + "learning_rate": 3.924660346412418e-06, + "loss": 0.0348, + "step": 15830 + }, + { + "epoch": 16.31307929969104, + "grad_norm": 0.2755933701992035, + "learning_rate": 3.8890633888587046e-06, + "loss": 0.0309, + "step": 15840 + }, + { + "epoch": 16.323377960865088, + "grad_norm": 0.5915675163269043, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0314, + "step": 15850 + }, + { + "epoch": 16.333676622039135, + "grad_norm": 0.2403060346841812, + "learning_rate": 3.8183365027572805e-06, + "loss": 0.0304, + "step": 15860 + }, + { + "epoch": 16.343975283213183, + "grad_norm": 0.24288389086723328, + "learning_rate": 3.783206812931289e-06, + "loss": 0.0291, + "step": 15870 + }, + { + "epoch": 16.35427394438723, + "grad_norm": 0.3532700836658478, + "learning_rate": 3.7482331168596675e-06, + "loss": 0.0289, + "step": 15880 + }, + { + "epoch": 16.36457260556128, + "grad_norm": 0.18153394758701324, + "learning_rate": 3.7134155325877772e-06, + "loss": 0.0329, + "step": 15890 + }, + { + "epoch": 16.374871266735326, + "grad_norm": 0.4066762924194336, + "learning_rate": 3.678754177634053e-06, + "loss": 0.0293, + "step": 15900 + }, + { + "epoch": 16.38516992790937, + "grad_norm": 0.33672627806663513, + "learning_rate": 3.64424916898965e-06, + "loss": 0.0303, + "step": 15910 + }, + { + "epoch": 16.395468589083418, + "grad_norm": 0.273366242647171, + "learning_rate": 3.6099006231179622e-06, + "loss": 0.0307, + "step": 15920 + }, + { + "epoch": 16.405767250257465, + "grad_norm": 0.22325216233730316, + "learning_rate": 3.575708655954324e-06, + "loss": 0.0327, + "step": 15930 + }, + { + "epoch": 16.416065911431513, + "grad_norm": 0.18643653392791748, + "learning_rate": 3.541673382905558e-06, + "loss": 0.0346, + "step": 15940 + }, + { + "epoch": 16.42636457260556, + "grad_norm": 0.2503977119922638, + "learning_rate": 3.5077949188495996e-06, + "loss": 0.033, + "step": 15950 + }, + { + "epoch": 16.43666323377961, + "grad_norm": 0.29063940048217773, + "learning_rate": 3.474073378135123e-06, + "loss": 0.0286, + "step": 15960 + }, + { + "epoch": 16.446961894953656, + "grad_norm": 0.2275126725435257, + "learning_rate": 3.440508874581139e-06, + "loss": 0.0321, + "step": 15970 + }, + { + "epoch": 16.457260556127704, + "grad_norm": 0.24945175647735596, + "learning_rate": 3.4071015214766134e-06, + "loss": 0.0312, + "step": 15980 + }, + { + "epoch": 16.46755921730175, + "grad_norm": 0.4091668725013733, + "learning_rate": 3.3738514315800995e-06, + "loss": 0.0351, + "step": 15990 + }, + { + "epoch": 16.4778578784758, + "grad_norm": 0.20869703590869904, + "learning_rate": 3.3407587171193354e-06, + "loss": 0.0262, + "step": 16000 + }, + { + "epoch": 16.488156539649847, + "grad_norm": 0.19803866744041443, + "learning_rate": 3.3078234897908788e-06, + "loss": 0.0293, + "step": 16010 + }, + { + "epoch": 16.498455200823894, + "grad_norm": 0.24785685539245605, + "learning_rate": 3.2750458607597457e-06, + "loss": 0.0295, + "step": 16020 + }, + { + "epoch": 16.508753861997942, + "grad_norm": 0.23679105937480927, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0269, + "step": 16030 + }, + { + "epoch": 16.519052523171986, + "grad_norm": 0.21375852823257446, + "learning_rate": 3.209963839589325e-06, + "loss": 0.0236, + "step": 16040 + }, + { + "epoch": 16.529351184346034, + "grad_norm": 0.1723773181438446, + "learning_rate": 3.177659667118882e-06, + "loss": 0.0312, + "step": 16050 + }, + { + "epoch": 16.53964984552008, + "grad_norm": 0.24385997653007507, + "learning_rate": 3.1455135322826678e-06, + "loss": 0.0301, + "step": 16060 + }, + { + "epoch": 16.54994850669413, + "grad_norm": 0.2073340266942978, + "learning_rate": 3.1135255435822796e-06, + "loss": 0.0286, + "step": 16070 + }, + { + "epoch": 16.560247167868177, + "grad_norm": 0.2794674336910248, + "learning_rate": 3.0816958089855462e-06, + "loss": 0.0265, + "step": 16080 + }, + { + "epoch": 16.570545829042224, + "grad_norm": 0.2308894544839859, + "learning_rate": 3.0500244359261355e-06, + "loss": 0.0284, + "step": 16090 + }, + { + "epoch": 16.580844490216272, + "grad_norm": 0.2674751579761505, + "learning_rate": 3.018511531303203e-06, + "loss": 0.0282, + "step": 16100 + }, + { + "epoch": 16.59114315139032, + "grad_norm": 0.20278188586235046, + "learning_rate": 2.9871572014810555e-06, + "loss": 0.0272, + "step": 16110 + }, + { + "epoch": 16.601441812564367, + "grad_norm": 0.20840872824192047, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0358, + "step": 16120 + }, + { + "epoch": 16.611740473738415, + "grad_norm": 0.26591232419013977, + "learning_rate": 2.924924689019698e-06, + "loss": 0.0262, + "step": 16130 + }, + { + "epoch": 16.622039134912463, + "grad_norm": 0.22082144021987915, + "learning_rate": 2.8940467164314924e-06, + "loss": 0.0321, + "step": 16140 + }, + { + "epoch": 16.63233779608651, + "grad_norm": 0.2413538098335266, + "learning_rate": 2.8633277387453308e-06, + "loss": 0.0377, + "step": 16150 + }, + { + "epoch": 16.642636457260558, + "grad_norm": 0.2731287479400635, + "learning_rate": 2.8327678596457963e-06, + "loss": 0.031, + "step": 16160 + }, + { + "epoch": 16.652935118434602, + "grad_norm": 0.18613195419311523, + "learning_rate": 2.802367182280463e-06, + "loss": 0.0367, + "step": 16170 + }, + { + "epoch": 16.66323377960865, + "grad_norm": 0.19616888463497162, + "learning_rate": 2.7721258092595627e-06, + "loss": 0.0265, + "step": 16180 + }, + { + "epoch": 16.673532440782697, + "grad_norm": 0.20527370274066925, + "learning_rate": 2.7420438426556338e-06, + "loss": 0.0331, + "step": 16190 + }, + { + "epoch": 16.683831101956745, + "grad_norm": 0.21385008096694946, + "learning_rate": 2.712121384003169e-06, + "loss": 0.0271, + "step": 16200 + }, + { + "epoch": 16.694129763130793, + "grad_norm": 0.2785768210887909, + "learning_rate": 2.682358534298285e-06, + "loss": 0.0365, + "step": 16210 + }, + { + "epoch": 16.70442842430484, + "grad_norm": 0.2710186243057251, + "learning_rate": 2.652755393998396e-06, + "loss": 0.0245, + "step": 16220 + }, + { + "epoch": 16.714727085478888, + "grad_norm": 0.2453254610300064, + "learning_rate": 2.6233120630218045e-06, + "loss": 0.0327, + "step": 16230 + }, + { + "epoch": 16.725025746652936, + "grad_norm": 0.2788352072238922, + "learning_rate": 2.594028640747476e-06, + "loss": 0.0292, + "step": 16240 + }, + { + "epoch": 16.735324407826983, + "grad_norm": 0.4019950032234192, + "learning_rate": 2.564905226014597e-06, + "loss": 0.029, + "step": 16250 + }, + { + "epoch": 16.74562306900103, + "grad_norm": 0.2551436424255371, + "learning_rate": 2.5359419171223086e-06, + "loss": 0.0296, + "step": 16260 + }, + { + "epoch": 16.75592173017508, + "grad_norm": 0.2889397442340851, + "learning_rate": 2.507138811829346e-06, + "loss": 0.033, + "step": 16270 + }, + { + "epoch": 16.766220391349126, + "grad_norm": 0.25674816966056824, + "learning_rate": 2.4784960073537143e-06, + "loss": 0.0267, + "step": 16280 + }, + { + "epoch": 16.77651905252317, + "grad_norm": 0.21177352964878082, + "learning_rate": 2.4500136003723638e-06, + "loss": 0.0262, + "step": 16290 + }, + { + "epoch": 16.786817713697218, + "grad_norm": 0.21103815734386444, + "learning_rate": 2.421691687020855e-06, + "loss": 0.0295, + "step": 16300 + }, + { + "epoch": 16.797116374871266, + "grad_norm": 0.26780322194099426, + "learning_rate": 2.3935303628930707e-06, + "loss": 0.0327, + "step": 16310 + }, + { + "epoch": 16.807415036045313, + "grad_norm": 0.49311545491218567, + "learning_rate": 2.3655297230408045e-06, + "loss": 0.03, + "step": 16320 + }, + { + "epoch": 16.81771369721936, + "grad_norm": 0.2364225834608078, + "learning_rate": 2.3376898619735577e-06, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 16.82801235839341, + "grad_norm": 0.29716435074806213, + "learning_rate": 2.3100108736581305e-06, + "loss": 0.027, + "step": 16340 + }, + { + "epoch": 16.838311019567456, + "grad_norm": 0.20759916305541992, + "learning_rate": 2.282492851518342e-06, + "loss": 0.0275, + "step": 16350 + }, + { + "epoch": 16.848609680741504, + "grad_norm": 0.1657613217830658, + "learning_rate": 2.2551358884347007e-06, + "loss": 0.0273, + "step": 16360 + }, + { + "epoch": 16.85890834191555, + "grad_norm": 0.16528256237506866, + "learning_rate": 2.227940076744117e-06, + "loss": 0.0309, + "step": 16370 + }, + { + "epoch": 16.8692070030896, + "grad_norm": 0.28386402130126953, + "learning_rate": 2.2009055082395537e-06, + "loss": 0.0324, + "step": 16380 + }, + { + "epoch": 16.879505664263647, + "grad_norm": 0.23188601434230804, + "learning_rate": 2.174032274169746e-06, + "loss": 0.0283, + "step": 16390 + }, + { + "epoch": 16.889804325437694, + "grad_norm": 0.34195181727409363, + "learning_rate": 2.1473204652388834e-06, + "loss": 0.031, + "step": 16400 + }, + { + "epoch": 16.900102986611742, + "grad_norm": 0.19225898385047913, + "learning_rate": 2.1207701716062956e-06, + "loss": 0.0374, + "step": 16410 + }, + { + "epoch": 16.910401647785786, + "grad_norm": 0.4472239911556244, + "learning_rate": 2.0943814828861762e-06, + "loss": 0.0304, + "step": 16420 + }, + { + "epoch": 16.920700308959834, + "grad_norm": 0.26532843708992004, + "learning_rate": 2.0681544881472283e-06, + "loss": 0.0291, + "step": 16430 + }, + { + "epoch": 16.93099897013388, + "grad_norm": 0.27116134762763977, + "learning_rate": 2.0420892759124176e-06, + "loss": 0.0224, + "step": 16440 + }, + { + "epoch": 16.94129763130793, + "grad_norm": 0.3424379825592041, + "learning_rate": 2.0161859341586597e-06, + "loss": 0.0274, + "step": 16450 + }, + { + "epoch": 16.951596292481977, + "grad_norm": 0.23772460222244263, + "learning_rate": 1.9904445503164838e-06, + "loss": 0.0308, + "step": 16460 + }, + { + "epoch": 16.961894953656024, + "grad_norm": 0.23013190925121307, + "learning_rate": 1.964865211269801e-06, + "loss": 0.0265, + "step": 16470 + }, + { + "epoch": 16.972193614830072, + "grad_norm": 0.2528025805950165, + "learning_rate": 1.939448003355554e-06, + "loss": 0.0342, + "step": 16480 + }, + { + "epoch": 16.98249227600412, + "grad_norm": 0.39106324315071106, + "learning_rate": 1.914193012363469e-06, + "loss": 0.0326, + "step": 16490 + }, + { + "epoch": 16.992790937178167, + "grad_norm": 0.4082978069782257, + "learning_rate": 1.8891003235357308e-06, + "loss": 0.0321, + "step": 16500 + }, + { + "epoch": 17.003089598352215, + "grad_norm": 0.1785215586423874, + "learning_rate": 1.8641700215667413e-06, + "loss": 0.0265, + "step": 16510 + }, + { + "epoch": 17.013388259526263, + "grad_norm": 0.5540566444396973, + "learning_rate": 1.839402190602757e-06, + "loss": 0.0281, + "step": 16520 + }, + { + "epoch": 17.02368692070031, + "grad_norm": 0.2588430941104889, + "learning_rate": 1.8147969142417066e-06, + "loss": 0.0284, + "step": 16530 + }, + { + "epoch": 17.033985581874358, + "grad_norm": 0.3563145399093628, + "learning_rate": 1.7903542755328073e-06, + "loss": 0.0308, + "step": 16540 + }, + { + "epoch": 17.044284243048402, + "grad_norm": 0.303353488445282, + "learning_rate": 1.766074356976366e-06, + "loss": 0.0302, + "step": 16550 + }, + { + "epoch": 17.05458290422245, + "grad_norm": 0.24329645931720734, + "learning_rate": 1.7419572405234453e-06, + "loss": 0.0282, + "step": 16560 + }, + { + "epoch": 17.064881565396497, + "grad_norm": 0.212374746799469, + "learning_rate": 1.7180030075756136e-06, + "loss": 0.0298, + "step": 16570 + }, + { + "epoch": 17.075180226570545, + "grad_norm": 0.22339214384555817, + "learning_rate": 1.6942117389846746e-06, + "loss": 0.0314, + "step": 16580 + }, + { + "epoch": 17.085478887744593, + "grad_norm": 0.2897525131702423, + "learning_rate": 1.6705835150523707e-06, + "loss": 0.0331, + "step": 16590 + }, + { + "epoch": 17.09577754891864, + "grad_norm": 0.20139732956886292, + "learning_rate": 1.6471184155301355e-06, + "loss": 0.0271, + "step": 16600 + }, + { + "epoch": 17.106076210092688, + "grad_norm": 0.30817776918411255, + "learning_rate": 1.6238165196188039e-06, + "loss": 0.0288, + "step": 16610 + }, + { + "epoch": 17.116374871266736, + "grad_norm": 0.23742049932479858, + "learning_rate": 1.6006779059683784e-06, + "loss": 0.0317, + "step": 16620 + }, + { + "epoch": 17.126673532440783, + "grad_norm": 0.2712803781032562, + "learning_rate": 1.5777026526777094e-06, + "loss": 0.029, + "step": 16630 + }, + { + "epoch": 17.13697219361483, + "grad_norm": 0.19828765094280243, + "learning_rate": 1.5548908372942983e-06, + "loss": 0.0315, + "step": 16640 + }, + { + "epoch": 17.14727085478888, + "grad_norm": 0.27912184596061707, + "learning_rate": 1.5322425368139714e-06, + "loss": 0.0293, + "step": 16650 + }, + { + "epoch": 17.157569515962926, + "grad_norm": 0.41649627685546875, + "learning_rate": 1.5097578276806633e-06, + "loss": 0.0299, + "step": 16660 + }, + { + "epoch": 17.167868177136974, + "grad_norm": 0.20297054946422577, + "learning_rate": 1.487436785786145e-06, + "loss": 0.0313, + "step": 16670 + }, + { + "epoch": 17.178166838311018, + "grad_norm": 0.38883742690086365, + "learning_rate": 1.4652794864697671e-06, + "loss": 0.0293, + "step": 16680 + }, + { + "epoch": 17.188465499485066, + "grad_norm": 0.2401762455701828, + "learning_rate": 1.4432860045182017e-06, + "loss": 0.0282, + "step": 16690 + }, + { + "epoch": 17.198764160659113, + "grad_norm": 0.3450429141521454, + "learning_rate": 1.4214564141651898e-06, + "loss": 0.0249, + "step": 16700 + }, + { + "epoch": 17.20906282183316, + "grad_norm": 0.17480014264583588, + "learning_rate": 1.3997907890913265e-06, + "loss": 0.0271, + "step": 16710 + }, + { + "epoch": 17.21936148300721, + "grad_norm": 0.2633569538593292, + "learning_rate": 1.3782892024237327e-06, + "loss": 0.0282, + "step": 16720 + }, + { + "epoch": 17.229660144181256, + "grad_norm": 0.22684310376644135, + "learning_rate": 1.3569517267359e-06, + "loss": 0.0325, + "step": 16730 + }, + { + "epoch": 17.239958805355304, + "grad_norm": 0.30432412028312683, + "learning_rate": 1.33577843404738e-06, + "loss": 0.027, + "step": 16740 + }, + { + "epoch": 17.25025746652935, + "grad_norm": 0.3308713734149933, + "learning_rate": 1.3147693958235618e-06, + "loss": 0.0296, + "step": 16750 + }, + { + "epoch": 17.2605561277034, + "grad_norm": 0.2591300904750824, + "learning_rate": 1.2939246829754503e-06, + "loss": 0.0191, + "step": 16760 + }, + { + "epoch": 17.270854788877447, + "grad_norm": 0.3229091763496399, + "learning_rate": 1.2732443658593884e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 17.281153450051495, + "grad_norm": 0.3232883810997009, + "learning_rate": 1.2527285142768574e-06, + "loss": 0.0308, + "step": 16780 + }, + { + "epoch": 17.291452111225542, + "grad_norm": 0.16374994814395905, + "learning_rate": 1.2323771974742104e-06, + "loss": 0.0285, + "step": 16790 + }, + { + "epoch": 17.301750772399586, + "grad_norm": 0.4016587734222412, + "learning_rate": 1.212190484142467e-06, + "loss": 0.0287, + "step": 16800 + }, + { + "epoch": 17.312049433573634, + "grad_norm": 0.7468344569206238, + "learning_rate": 1.192168442417052e-06, + "loss": 0.0318, + "step": 16810 + }, + { + "epoch": 17.32234809474768, + "grad_norm": 0.62845778465271, + "learning_rate": 1.1723111398776077e-06, + "loss": 0.0307, + "step": 16820 + }, + { + "epoch": 17.33264675592173, + "grad_norm": 0.29316961765289307, + "learning_rate": 1.1526186435476927e-06, + "loss": 0.0322, + "step": 16830 + }, + { + "epoch": 17.342945417095777, + "grad_norm": 0.2891688942909241, + "learning_rate": 1.1330910198946442e-06, + "loss": 0.0274, + "step": 16840 + }, + { + "epoch": 17.353244078269825, + "grad_norm": 0.28778383135795593, + "learning_rate": 1.1137283348292892e-06, + "loss": 0.0341, + "step": 16850 + }, + { + "epoch": 17.363542739443872, + "grad_norm": 0.17100463807582855, + "learning_rate": 1.0945306537057555e-06, + "loss": 0.0334, + "step": 16860 + }, + { + "epoch": 17.37384140061792, + "grad_norm": 0.17976661026477814, + "learning_rate": 1.0754980413212268e-06, + "loss": 0.0299, + "step": 16870 + }, + { + "epoch": 17.384140061791967, + "grad_norm": 0.2614526152610779, + "learning_rate": 1.0566305619157502e-06, + "loss": 0.0278, + "step": 16880 + }, + { + "epoch": 17.394438722966015, + "grad_norm": 0.195588618516922, + "learning_rate": 1.0379282791719958e-06, + "loss": 0.028, + "step": 16890 + }, + { + "epoch": 17.404737384140063, + "grad_norm": 1.0282113552093506, + "learning_rate": 1.0193912562150464e-06, + "loss": 0.0291, + "step": 16900 + }, + { + "epoch": 17.41503604531411, + "grad_norm": 0.2868080735206604, + "learning_rate": 1.0010195556122203e-06, + "loss": 0.0329, + "step": 16910 + }, + { + "epoch": 17.425334706488158, + "grad_norm": 0.2227233201265335, + "learning_rate": 9.828132393727875e-07, + "loss": 0.0262, + "step": 16920 + }, + { + "epoch": 17.435633367662202, + "grad_norm": 0.20315021276474, + "learning_rate": 9.647723689478305e-07, + "loss": 0.0324, + "step": 16930 + }, + { + "epoch": 17.44593202883625, + "grad_norm": 0.6371609568595886, + "learning_rate": 9.468970052300019e-07, + "loss": 0.0318, + "step": 16940 + }, + { + "epoch": 17.456230690010297, + "grad_norm": 0.18564990162849426, + "learning_rate": 9.291872085533227e-07, + "loss": 0.0289, + "step": 16950 + }, + { + "epoch": 17.466529351184345, + "grad_norm": 0.22705796360969543, + "learning_rate": 9.116430386929886e-07, + "loss": 0.0249, + "step": 16960 + }, + { + "epoch": 17.476828012358393, + "grad_norm": 0.2133428156375885, + "learning_rate": 8.942645548651541e-07, + "loss": 0.0376, + "step": 16970 + }, + { + "epoch": 17.48712667353244, + "grad_norm": 0.19329524040222168, + "learning_rate": 8.770518157267482e-07, + "loss": 0.0308, + "step": 16980 + }, + { + "epoch": 17.497425334706488, + "grad_norm": 0.2410387098789215, + "learning_rate": 8.60004879375259e-07, + "loss": 0.0273, + "step": 16990 + }, + { + "epoch": 17.507723995880536, + "grad_norm": 0.20141083002090454, + "learning_rate": 8.4312380334855e-07, + "loss": 0.0336, + "step": 17000 + }, + { + "epoch": 17.518022657054583, + "grad_norm": 0.27098795771598816, + "learning_rate": 8.264086446246655e-07, + "loss": 0.0313, + "step": 17010 + }, + { + "epoch": 17.52832131822863, + "grad_norm": 0.35340428352355957, + "learning_rate": 8.098594596216424e-07, + "loss": 0.0348, + "step": 17020 + }, + { + "epoch": 17.53861997940268, + "grad_norm": 0.3264867663383484, + "learning_rate": 7.934763041972937e-07, + "loss": 0.0302, + "step": 17030 + }, + { + "epoch": 17.548918640576726, + "grad_norm": 0.2895232141017914, + "learning_rate": 7.772592336490525e-07, + "loss": 0.0325, + "step": 17040 + }, + { + "epoch": 17.559217301750774, + "grad_norm": 0.24770499765872955, + "learning_rate": 7.612083027137728e-07, + "loss": 0.0319, + "step": 17050 + }, + { + "epoch": 17.569515962924818, + "grad_norm": 0.4487510323524475, + "learning_rate": 7.453235655675406e-07, + "loss": 0.0258, + "step": 17060 + }, + { + "epoch": 17.579814624098866, + "grad_norm": 0.38243043422698975, + "learning_rate": 7.296050758254957e-07, + "loss": 0.0308, + "step": 17070 + }, + { + "epoch": 17.590113285272913, + "grad_norm": 0.5216277837753296, + "learning_rate": 7.140528865416441e-07, + "loss": 0.0268, + "step": 17080 + }, + { + "epoch": 17.60041194644696, + "grad_norm": 0.300006240606308, + "learning_rate": 6.986670502086901e-07, + "loss": 0.0324, + "step": 17090 + }, + { + "epoch": 17.61071060762101, + "grad_norm": 0.22057189047336578, + "learning_rate": 6.834476187578543e-07, + "loss": 0.0282, + "step": 17100 + }, + { + "epoch": 17.621009268795056, + "grad_norm": 0.26959654688835144, + "learning_rate": 6.683946435586952e-07, + "loss": 0.0307, + "step": 17110 + }, + { + "epoch": 17.631307929969104, + "grad_norm": 0.28995075821876526, + "learning_rate": 6.535081754189321e-07, + "loss": 0.0318, + "step": 17120 + }, + { + "epoch": 17.64160659114315, + "grad_norm": 0.3135945200920105, + "learning_rate": 6.387882645842947e-07, + "loss": 0.0287, + "step": 17130 + }, + { + "epoch": 17.6519052523172, + "grad_norm": 0.26953238248825073, + "learning_rate": 6.24234960738318e-07, + "loss": 0.0292, + "step": 17140 + }, + { + "epoch": 17.662203913491247, + "grad_norm": 0.2764807343482971, + "learning_rate": 6.098483130022148e-07, + "loss": 0.027, + "step": 17150 + }, + { + "epoch": 17.672502574665295, + "grad_norm": 0.3281687796115875, + "learning_rate": 5.956283699346754e-07, + "loss": 0.0254, + "step": 17160 + }, + { + "epoch": 17.682801235839342, + "grad_norm": 0.17730310559272766, + "learning_rate": 5.815751795317237e-07, + "loss": 0.0277, + "step": 17170 + }, + { + "epoch": 17.69309989701339, + "grad_norm": 0.43514519929885864, + "learning_rate": 5.676887892265559e-07, + "loss": 0.0238, + "step": 17180 + }, + { + "epoch": 17.703398558187434, + "grad_norm": 0.31942808628082275, + "learning_rate": 5.539692458893575e-07, + "loss": 0.027, + "step": 17190 + }, + { + "epoch": 17.71369721936148, + "grad_norm": 1.2527509927749634, + "learning_rate": 5.404165958271811e-07, + "loss": 0.029, + "step": 17200 + }, + { + "epoch": 17.72399588053553, + "grad_norm": 0.2568182051181793, + "learning_rate": 5.270308847837579e-07, + "loss": 0.0316, + "step": 17210 + }, + { + "epoch": 17.734294541709577, + "grad_norm": 0.32886284589767456, + "learning_rate": 5.13812157939364e-07, + "loss": 0.0341, + "step": 17220 + }, + { + "epoch": 17.744593202883625, + "grad_norm": 0.1350669264793396, + "learning_rate": 5.007604599106486e-07, + "loss": 0.0279, + "step": 17230 + }, + { + "epoch": 17.754891864057672, + "grad_norm": 0.24451610445976257, + "learning_rate": 4.878758347505175e-07, + "loss": 0.0261, + "step": 17240 + }, + { + "epoch": 17.76519052523172, + "grad_norm": 0.23091380298137665, + "learning_rate": 4.751583259479331e-07, + "loss": 0.031, + "step": 17250 + }, + { + "epoch": 17.775489186405768, + "grad_norm": 0.311443030834198, + "learning_rate": 4.6260797642782014e-07, + "loss": 0.032, + "step": 17260 + }, + { + "epoch": 17.785787847579815, + "grad_norm": 0.2045062929391861, + "learning_rate": 4.5022482855088255e-07, + "loss": 0.0256, + "step": 17270 + }, + { + "epoch": 17.796086508753863, + "grad_norm": 0.339093953371048, + "learning_rate": 4.380089241134866e-07, + "loss": 0.0306, + "step": 17280 + }, + { + "epoch": 17.80638516992791, + "grad_norm": 0.3019813597202301, + "learning_rate": 4.259603043475002e-07, + "loss": 0.0302, + "step": 17290 + }, + { + "epoch": 17.816683831101958, + "grad_norm": 0.21195490658283234, + "learning_rate": 4.1407900992015414e-07, + "loss": 0.0318, + "step": 17300 + }, + { + "epoch": 17.826982492276002, + "grad_norm": 0.2570505142211914, + "learning_rate": 4.023650809339363e-07, + "loss": 0.0387, + "step": 17310 + }, + { + "epoch": 17.83728115345005, + "grad_norm": 0.36077165603637695, + "learning_rate": 3.9081855692640333e-07, + "loss": 0.0281, + "step": 17320 + }, + { + "epoch": 17.847579814624098, + "grad_norm": 0.24089422821998596, + "learning_rate": 3.7943947687010816e-07, + "loss": 0.0265, + "step": 17330 + }, + { + "epoch": 17.857878475798145, + "grad_norm": 0.3065880835056305, + "learning_rate": 3.6822787917240587e-07, + "loss": 0.0265, + "step": 17340 + }, + { + "epoch": 17.868177136972193, + "grad_norm": 0.20888155698776245, + "learning_rate": 3.571838016753759e-07, + "loss": 0.0345, + "step": 17350 + }, + { + "epoch": 17.87847579814624, + "grad_norm": 0.42461952567100525, + "learning_rate": 3.4630728165566117e-07, + "loss": 0.0334, + "step": 17360 + }, + { + "epoch": 17.888774459320288, + "grad_norm": 0.36267679929733276, + "learning_rate": 3.3559835582435695e-07, + "loss": 0.0306, + "step": 17370 + }, + { + "epoch": 17.899073120494336, + "grad_norm": 0.1654314249753952, + "learning_rate": 3.250570603268943e-07, + "loss": 0.0247, + "step": 17380 + }, + { + "epoch": 17.909371781668384, + "grad_norm": 0.2670270800590515, + "learning_rate": 3.1468343074290143e-07, + "loss": 0.032, + "step": 17390 + }, + { + "epoch": 17.91967044284243, + "grad_norm": 0.2694757878780365, + "learning_rate": 3.0447750208607573e-07, + "loss": 0.0269, + "step": 17400 + }, + { + "epoch": 17.92996910401648, + "grad_norm": 0.34293317794799805, + "learning_rate": 2.944393088041009e-07, + "loss": 0.0234, + "step": 17410 + }, + { + "epoch": 17.940267765190526, + "grad_norm": 0.25010308623313904, + "learning_rate": 2.8456888477850776e-07, + "loss": 0.0294, + "step": 17420 + }, + { + "epoch": 17.950566426364574, + "grad_norm": 0.34105420112609863, + "learning_rate": 2.7486626332455245e-07, + "loss": 0.0292, + "step": 17430 + }, + { + "epoch": 17.96086508753862, + "grad_norm": 0.2277262657880783, + "learning_rate": 2.653314771911108e-07, + "loss": 0.0398, + "step": 17440 + }, + { + "epoch": 17.971163748712666, + "grad_norm": 0.3880465030670166, + "learning_rate": 2.5596455856058963e-07, + "loss": 0.0323, + "step": 17450 + }, + { + "epoch": 17.981462409886714, + "grad_norm": 0.1923012137413025, + "learning_rate": 2.467655390487822e-07, + "loss": 0.0227, + "step": 17460 + }, + { + "epoch": 17.99176107106076, + "grad_norm": 0.24936918914318085, + "learning_rate": 2.3773444970477955e-07, + "loss": 0.0249, + "step": 17470 + }, + { + "epoch": 18.00205973223481, + "grad_norm": 0.2869769334793091, + "learning_rate": 2.2887132101087615e-07, + "loss": 0.0248, + "step": 17480 + }, + { + "epoch": 18.012358393408856, + "grad_norm": 0.25350290536880493, + "learning_rate": 2.201761828824367e-07, + "loss": 0.0327, + "step": 17490 + }, + { + "epoch": 18.022657054582904, + "grad_norm": 0.27213600277900696, + "learning_rate": 2.1164906466783485e-07, + "loss": 0.0285, + "step": 17500 + }, + { + "epoch": 18.03295571575695, + "grad_norm": 0.257794588804245, + "learning_rate": 2.032899951483147e-07, + "loss": 0.0281, + "step": 17510 + }, + { + "epoch": 18.043254376931, + "grad_norm": 0.2469080537557602, + "learning_rate": 1.9509900253792955e-07, + "loss": 0.0259, + "step": 17520 + }, + { + "epoch": 18.053553038105047, + "grad_norm": 0.2920747995376587, + "learning_rate": 1.870761144834088e-07, + "loss": 0.0287, + "step": 17530 + }, + { + "epoch": 18.063851699279095, + "grad_norm": 0.2282969057559967, + "learning_rate": 1.7922135806410778e-07, + "loss": 0.0277, + "step": 17540 + }, + { + "epoch": 18.074150360453142, + "grad_norm": 0.28502708673477173, + "learning_rate": 1.7153475979186927e-07, + "loss": 0.0345, + "step": 17550 + }, + { + "epoch": 18.08444902162719, + "grad_norm": 0.23902451992034912, + "learning_rate": 1.6401634561098444e-07, + "loss": 0.0335, + "step": 17560 + }, + { + "epoch": 18.094747682801234, + "grad_norm": 0.3159581124782562, + "learning_rate": 1.566661408980541e-07, + "loss": 0.0299, + "step": 17570 + }, + { + "epoch": 18.105046343975282, + "grad_norm": 0.12344943732023239, + "learning_rate": 1.4948417046194985e-07, + "loss": 0.0272, + "step": 17580 + }, + { + "epoch": 18.11534500514933, + "grad_norm": 0.3794369101524353, + "learning_rate": 1.42470458543692e-07, + "loss": 0.0338, + "step": 17590 + }, + { + "epoch": 18.125643666323377, + "grad_norm": 0.1987241804599762, + "learning_rate": 1.3562502881639404e-07, + "loss": 0.0223, + "step": 17600 + }, + { + "epoch": 18.135942327497425, + "grad_norm": 0.21883957087993622, + "learning_rate": 1.2894790438516824e-07, + "loss": 0.0275, + "step": 17610 + }, + { + "epoch": 18.146240988671472, + "grad_norm": 0.2665363550186157, + "learning_rate": 1.2243910778705348e-07, + "loss": 0.033, + "step": 17620 + }, + { + "epoch": 18.15653964984552, + "grad_norm": 0.15010571479797363, + "learning_rate": 1.1609866099094313e-07, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 18.166838311019568, + "grad_norm": 0.19142857193946838, + "learning_rate": 1.0992658539750178e-07, + "loss": 0.0279, + "step": 17640 + }, + { + "epoch": 18.177136972193615, + "grad_norm": 0.2638980746269226, + "learning_rate": 1.0392290183909304e-07, + "loss": 0.0265, + "step": 17650 + }, + { + "epoch": 18.187435633367663, + "grad_norm": 0.19933411478996277, + "learning_rate": 9.808763057971849e-08, + "loss": 0.0294, + "step": 17660 + }, + { + "epoch": 18.19773429454171, + "grad_norm": 0.32049107551574707, + "learning_rate": 9.242079131495107e-08, + "loss": 0.0268, + "step": 17670 + }, + { + "epoch": 18.20803295571576, + "grad_norm": 0.22636005282402039, + "learning_rate": 8.69224031718463e-08, + "loss": 0.0359, + "step": 17680 + }, + { + "epoch": 18.218331616889806, + "grad_norm": 0.19072987139225006, + "learning_rate": 8.159248470890334e-08, + "loss": 0.0272, + "step": 17690 + }, + { + "epoch": 18.22863027806385, + "grad_norm": 0.5597253441810608, + "learning_rate": 7.643105391598737e-08, + "loss": 0.0296, + "step": 17700 + }, + { + "epoch": 18.238928939237898, + "grad_norm": 0.20172372460365295, + "learning_rate": 7.143812821427953e-08, + "loss": 0.0321, + "step": 17710 + }, + { + "epoch": 18.249227600411945, + "grad_norm": 0.49044567346572876, + "learning_rate": 6.661372445621039e-08, + "loss": 0.0284, + "step": 17720 + }, + { + "epoch": 18.259526261585993, + "grad_norm": 0.2032887190580368, + "learning_rate": 6.19578589253933e-08, + "loss": 0.03, + "step": 17730 + }, + { + "epoch": 18.26982492276004, + "grad_norm": 0.30425992608070374, + "learning_rate": 5.747054733660773e-08, + "loss": 0.0301, + "step": 17740 + }, + { + "epoch": 18.28012358393409, + "grad_norm": 0.2486412227153778, + "learning_rate": 5.3151804835688267e-08, + "loss": 0.0261, + "step": 17750 + }, + { + "epoch": 18.290422245108136, + "grad_norm": 0.21091780066490173, + "learning_rate": 4.9001645999524613e-08, + "loss": 0.0276, + "step": 17760 + }, + { + "epoch": 18.300720906282184, + "grad_norm": 0.36458486318588257, + "learning_rate": 4.502008483598941e-08, + "loss": 0.0277, + "step": 17770 + }, + { + "epoch": 18.31101956745623, + "grad_norm": 0.21798443794250488, + "learning_rate": 4.1207134783888265e-08, + "loss": 0.0307, + "step": 17780 + }, + { + "epoch": 18.32131822863028, + "grad_norm": 0.27093908190727234, + "learning_rate": 3.756280871293205e-08, + "loss": 0.0328, + "step": 17790 + }, + { + "epoch": 18.331616889804327, + "grad_norm": 0.1765187829732895, + "learning_rate": 3.4087118923659125e-08, + "loss": 0.0305, + "step": 17800 + }, + { + "epoch": 18.341915550978374, + "grad_norm": 0.9125376343727112, + "learning_rate": 3.078007714744646e-08, + "loss": 0.0408, + "step": 17810 + }, + { + "epoch": 18.352214212152422, + "grad_norm": 0.1739547997713089, + "learning_rate": 2.7641694546409746e-08, + "loss": 0.0282, + "step": 17820 + }, + { + "epoch": 18.362512873326466, + "grad_norm": 0.2467593103647232, + "learning_rate": 2.467198171342e-08, + "loss": 0.0266, + "step": 17830 + }, + { + "epoch": 18.372811534500514, + "grad_norm": 0.7820371389389038, + "learning_rate": 2.1870948672036984e-08, + "loss": 0.0263, + "step": 17840 + }, + { + "epoch": 18.38311019567456, + "grad_norm": 0.30878883600234985, + "learning_rate": 1.9238604876470334e-08, + "loss": 0.03, + "step": 17850 + }, + { + "epoch": 18.39340885684861, + "grad_norm": 0.2729048728942871, + "learning_rate": 1.6774959211568465e-08, + "loss": 0.035, + "step": 17860 + }, + { + "epoch": 18.403707518022657, + "grad_norm": 0.33503258228302, + "learning_rate": 1.4480019992785254e-08, + "loss": 0.0261, + "step": 17870 + }, + { + "epoch": 18.414006179196704, + "grad_norm": 0.24983762204647064, + "learning_rate": 1.2353794966135646e-08, + "loss": 0.0265, + "step": 17880 + }, + { + "epoch": 18.424304840370752, + "grad_norm": 0.24591587483882904, + "learning_rate": 1.0396291308190087e-08, + "loss": 0.0248, + "step": 17890 + }, + { + "epoch": 18.4346035015448, + "grad_norm": 0.24605391919612885, + "learning_rate": 8.607515626030128e-09, + "loss": 0.0289, + "step": 17900 + }, + { + "epoch": 18.444902162718847, + "grad_norm": 0.2520316541194916, + "learning_rate": 6.987473957242863e-09, + "loss": 0.0307, + "step": 17910 + }, + { + "epoch": 18.455200823892895, + "grad_norm": 0.46191495656967163, + "learning_rate": 5.536171769887632e-09, + "loss": 0.0303, + "step": 17920 + }, + { + "epoch": 18.465499485066942, + "grad_norm": 0.26452863216400146, + "learning_rate": 4.253613962496017e-09, + "loss": 0.0329, + "step": 17930 + }, + { + "epoch": 18.47579814624099, + "grad_norm": 0.3968678116798401, + "learning_rate": 3.1398048640385315e-09, + "loss": 0.0356, + "step": 17940 + }, + { + "epoch": 18.486096807415038, + "grad_norm": 0.19242151081562042, + "learning_rate": 2.1947482338968705e-09, + "loss": 0.0265, + "step": 17950 + }, + { + "epoch": 18.496395468589082, + "grad_norm": 0.20866911113262177, + "learning_rate": 1.4184472618972154e-09, + "loss": 0.0251, + "step": 17960 + }, + { + "epoch": 18.50669412976313, + "grad_norm": 0.17729917168617249, + "learning_rate": 8.109045682547223e-10, + "loss": 0.0264, + "step": 17970 + }, + { + "epoch": 18.516992790937177, + "grad_norm": 0.19232727587223053, + "learning_rate": 3.721222035846239e-10, + "loss": 0.0366, + "step": 17980 + }, + { + "epoch": 18.527291452111225, + "grad_norm": 0.41915977001190186, + "learning_rate": 1.0210164889112861e-10, + "loss": 0.0288, + "step": 17990 + }, + { + "epoch": 18.537590113285273, + "grad_norm": 0.742242693901062, + "learning_rate": 8.438155674195258e-13, + "loss": 0.0335, + "step": 18000 + }, + { + "epoch": 18.537590113285273, + "step": 18000, + "total_flos": 0.0, + "train_loss": 0.05001054983586073, + "train_runtime": 5749.6082, + "train_samples_per_second": 100.181, + "train_steps_per_second": 3.131 + } + ], + "logging_steps": 10, + "max_steps": 18000, + "num_input_tokens_seen": 0, + "num_train_epochs": 19, + "save_steps": 20000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}