diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16050 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25599901577555206, + "eval_steps": 500, + "global_step": 22889, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00011184368726268168, + "grad_norm": 0.6328383684158325, + "learning_rate": 4.5e-06, + "loss": 1.734, + "step": 10 + }, + { + "epoch": 0.00022368737452536336, + "grad_norm": 0.566952109336853, + "learning_rate": 9.5e-06, + "loss": 1.6903, + "step": 20 + }, + { + "epoch": 0.00033553106178804503, + "grad_norm": 0.5359939932823181, + "learning_rate": 1.4500000000000002e-05, + "loss": 1.6266, + "step": 30 + }, + { + "epoch": 0.0004473747490507267, + "grad_norm": 0.4729914367198944, + "learning_rate": 1.95e-05, + "loss": 1.5731, + "step": 40 + }, + { + "epoch": 0.0005592184363134084, + "grad_norm": 0.42020025849342346, + "learning_rate": 2.4500000000000003e-05, + "loss": 1.5335, + "step": 50 + }, + { + "epoch": 0.0006710621235760901, + "grad_norm": 0.4461672604084015, + "learning_rate": 2.95e-05, + "loss": 1.4851, + "step": 60 + }, + { + "epoch": 0.0007829058108387717, + "grad_norm": 0.4443751275539398, + "learning_rate": 3.4500000000000005e-05, + "loss": 1.4431, + "step": 70 + }, + { + "epoch": 0.0008947494981014534, + "grad_norm": 0.4204632639884949, + "learning_rate": 3.95e-05, + "loss": 1.4036, + "step": 80 + }, + { + "epoch": 0.0010065931853641351, + "grad_norm": 0.3985028862953186, + "learning_rate": 4.45e-05, + "loss": 1.3725, + "step": 90 + }, + { + "epoch": 0.0011184368726268167, + "grad_norm": 0.4111650586128235, + "learning_rate": 4.9500000000000004e-05, + "loss": 1.3527, + "step": 100 + }, + { + "epoch": 0.0012302805598894985, + "grad_norm": 0.4175569713115692, + "learning_rate": 5.45e-05, + "loss": 1.3431, + "step": 110 + }, + { + "epoch": 0.0013421242471521801, + "grad_norm": 0.3871678411960602, + "learning_rate": 5.9499999999999996e-05, + "loss": 1.3322, + "step": 120 + }, + { + "epoch": 0.0014539679344148617, + "grad_norm": 0.39584827423095703, + "learning_rate": 6.450000000000001e-05, + "loss": 1.3075, + "step": 130 + }, + { + "epoch": 0.0015658116216775435, + "grad_norm": 0.4165605902671814, + "learning_rate": 6.950000000000001e-05, + "loss": 1.286, + "step": 140 + }, + { + "epoch": 0.001677655308940225, + "grad_norm": 0.3985513150691986, + "learning_rate": 7.45e-05, + "loss": 1.2567, + "step": 150 + }, + { + "epoch": 0.0017894989962029069, + "grad_norm": 0.39112743735313416, + "learning_rate": 7.950000000000001e-05, + "loss": 1.2448, + "step": 160 + }, + { + "epoch": 0.0019013426834655885, + "grad_norm": 0.3867124915122986, + "learning_rate": 8.450000000000001e-05, + "loss": 1.2405, + "step": 170 + }, + { + "epoch": 0.0020131863707282703, + "grad_norm": 0.3955863416194916, + "learning_rate": 8.95e-05, + "loss": 1.2123, + "step": 180 + }, + { + "epoch": 0.002125030057990952, + "grad_norm": 0.40293410420417786, + "learning_rate": 9.45e-05, + "loss": 1.2081, + "step": 190 + }, + { + "epoch": 0.0022368737452536334, + "grad_norm": 0.3828902542591095, + "learning_rate": 9.95e-05, + "loss": 1.2049, + "step": 200 + }, + { + "epoch": 0.002348717432516315, + "grad_norm": 0.3969178795814514, + "learning_rate": 0.00010449999999999999, + "loss": 1.1892, + "step": 210 + }, + { + "epoch": 0.002460561119778997, + "grad_norm": 0.4122287929058075, + "learning_rate": 0.0001095, + "loss": 1.184, + "step": 220 + }, + { + "epoch": 0.0025724048070416786, + "grad_norm": 0.3793940246105194, + "learning_rate": 0.0001145, + "loss": 1.1809, + "step": 230 + }, + { + "epoch": 0.0026842484943043602, + "grad_norm": 0.4132145643234253, + "learning_rate": 0.00011949999999999999, + "loss": 1.1883, + "step": 240 + }, + { + "epoch": 0.002796092181567042, + "grad_norm": 0.3900831639766693, + "learning_rate": 0.0001245, + "loss": 1.1818, + "step": 250 + }, + { + "epoch": 0.0029079358688297234, + "grad_norm": 0.3898029625415802, + "learning_rate": 0.0001295, + "loss": 1.1693, + "step": 260 + }, + { + "epoch": 0.0030197795560924054, + "grad_norm": 0.40828797221183777, + "learning_rate": 0.00013450000000000002, + "loss": 1.1869, + "step": 270 + }, + { + "epoch": 0.003131623243355087, + "grad_norm": 0.3976770341396332, + "learning_rate": 0.0001395, + "loss": 1.1841, + "step": 280 + }, + { + "epoch": 0.0032434669306177686, + "grad_norm": 0.3902062773704529, + "learning_rate": 0.0001445, + "loss": 1.1843, + "step": 290 + }, + { + "epoch": 0.00335531061788045, + "grad_norm": 0.38051125407218933, + "learning_rate": 0.0001495, + "loss": 1.1662, + "step": 300 + }, + { + "epoch": 0.0034671543051431318, + "grad_norm": 0.3628483712673187, + "learning_rate": 0.00015450000000000001, + "loss": 1.1638, + "step": 310 + }, + { + "epoch": 0.0035789979924058138, + "grad_norm": 0.3693360388278961, + "learning_rate": 0.0001595, + "loss": 1.1606, + "step": 320 + }, + { + "epoch": 0.0036908416796684954, + "grad_norm": 0.38896557688713074, + "learning_rate": 0.00016450000000000001, + "loss": 1.1448, + "step": 330 + }, + { + "epoch": 0.003802685366931177, + "grad_norm": 0.40257108211517334, + "learning_rate": 0.00016950000000000003, + "loss": 1.143, + "step": 340 + }, + { + "epoch": 0.0039145290541938585, + "grad_norm": 0.38656994700431824, + "learning_rate": 0.00017449999999999999, + "loss": 1.141, + "step": 350 + }, + { + "epoch": 0.0040263727414565405, + "grad_norm": 0.3700025677680969, + "learning_rate": 0.0001795, + "loss": 1.136, + "step": 360 + }, + { + "epoch": 0.004138216428719222, + "grad_norm": 0.37222161889076233, + "learning_rate": 0.0001845, + "loss": 1.1292, + "step": 370 + }, + { + "epoch": 0.004250060115981904, + "grad_norm": 0.39386317133903503, + "learning_rate": 0.0001895, + "loss": 1.1139, + "step": 380 + }, + { + "epoch": 0.004361903803244586, + "grad_norm": 0.3776305913925171, + "learning_rate": 0.0001945, + "loss": 1.1125, + "step": 390 + }, + { + "epoch": 0.004473747490507267, + "grad_norm": 0.40314197540283203, + "learning_rate": 0.00019950000000000002, + "loss": 1.0962, + "step": 400 + }, + { + "epoch": 0.004585591177769949, + "grad_norm": 0.37841472029685974, + "learning_rate": 0.00020449999999999998, + "loss": 1.0987, + "step": 410 + }, + { + "epoch": 0.00469743486503263, + "grad_norm": 0.3678649365901947, + "learning_rate": 0.0002095, + "loss": 1.0826, + "step": 420 + }, + { + "epoch": 0.004809278552295312, + "grad_norm": 0.37902751564979553, + "learning_rate": 0.0002145, + "loss": 1.0973, + "step": 430 + }, + { + "epoch": 0.004921122239557994, + "grad_norm": 0.3776302635669708, + "learning_rate": 0.0002195, + "loss": 1.112, + "step": 440 + }, + { + "epoch": 0.005032965926820675, + "grad_norm": 0.43771493434906006, + "learning_rate": 0.0002245, + "loss": 1.1005, + "step": 450 + }, + { + "epoch": 0.005144809614083357, + "grad_norm": 0.3662595748901367, + "learning_rate": 0.00022950000000000002, + "loss": 1.0899, + "step": 460 + }, + { + "epoch": 0.005256653301346038, + "grad_norm": 0.37473002076148987, + "learning_rate": 0.00023449999999999998, + "loss": 1.0982, + "step": 470 + }, + { + "epoch": 0.0053684969886087204, + "grad_norm": 0.35591790080070496, + "learning_rate": 0.0002395, + "loss": 1.1005, + "step": 480 + }, + { + "epoch": 0.0054803406758714025, + "grad_norm": 0.3825643062591553, + "learning_rate": 0.0002445, + "loss": 1.0896, + "step": 490 + }, + { + "epoch": 0.005592184363134084, + "grad_norm": 0.3784261643886566, + "learning_rate": 0.0002495, + "loss": 1.1039, + "step": 500 + }, + { + "epoch": 0.005704028050396766, + "grad_norm": 0.35387158393859863, + "learning_rate": 0.0002545, + "loss": 1.1038, + "step": 510 + }, + { + "epoch": 0.005815871737659447, + "grad_norm": 0.3992142975330353, + "learning_rate": 0.0002595, + "loss": 1.088, + "step": 520 + }, + { + "epoch": 0.005927715424922129, + "grad_norm": 0.36795270442962646, + "learning_rate": 0.00026450000000000003, + "loss": 1.0888, + "step": 530 + }, + { + "epoch": 0.006039559112184811, + "grad_norm": 0.4007701575756073, + "learning_rate": 0.00026950000000000005, + "loss": 1.0838, + "step": 540 + }, + { + "epoch": 0.006151402799447492, + "grad_norm": 0.34527722001075745, + "learning_rate": 0.0002745, + "loss": 1.0892, + "step": 550 + }, + { + "epoch": 0.006263246486710174, + "grad_norm": 0.37232115864753723, + "learning_rate": 0.0002795, + "loss": 1.0939, + "step": 560 + }, + { + "epoch": 0.006375090173972855, + "grad_norm": 0.4048405885696411, + "learning_rate": 0.0002845, + "loss": 1.0863, + "step": 570 + }, + { + "epoch": 0.006486933861235537, + "grad_norm": 0.37317511439323425, + "learning_rate": 0.0002895, + "loss": 1.0711, + "step": 580 + }, + { + "epoch": 0.006598777548498219, + "grad_norm": 0.38564008474349976, + "learning_rate": 0.0002945, + "loss": 1.091, + "step": 590 + }, + { + "epoch": 0.0067106212357609, + "grad_norm": 0.3639361262321472, + "learning_rate": 0.0002995, + "loss": 1.0682, + "step": 600 + }, + { + "epoch": 0.006822464923023582, + "grad_norm": 0.35907182097435, + "learning_rate": 0.0003045, + "loss": 1.0755, + "step": 610 + }, + { + "epoch": 0.0069343086102862635, + "grad_norm": 0.35199785232543945, + "learning_rate": 0.0003095, + "loss": 1.0581, + "step": 620 + }, + { + "epoch": 0.0070461522975489455, + "grad_norm": 0.35156381130218506, + "learning_rate": 0.0003145, + "loss": 1.0651, + "step": 630 + }, + { + "epoch": 0.0071579959848116275, + "grad_norm": 0.3742520213127136, + "learning_rate": 0.0003195, + "loss": 1.0555, + "step": 640 + }, + { + "epoch": 0.007269839672074309, + "grad_norm": 0.3587191700935364, + "learning_rate": 0.00032450000000000003, + "loss": 1.0548, + "step": 650 + }, + { + "epoch": 0.007381683359336991, + "grad_norm": 0.37587791681289673, + "learning_rate": 0.00032950000000000004, + "loss": 1.0437, + "step": 660 + }, + { + "epoch": 0.007493527046599672, + "grad_norm": 0.3410298526287079, + "learning_rate": 0.00033450000000000005, + "loss": 1.0426, + "step": 670 + }, + { + "epoch": 0.007605370733862354, + "grad_norm": 0.3450978696346283, + "learning_rate": 0.0003395, + "loss": 1.0487, + "step": 680 + }, + { + "epoch": 0.007717214421125036, + "grad_norm": 0.3445068299770355, + "learning_rate": 0.00034449999999999997, + "loss": 1.0411, + "step": 690 + }, + { + "epoch": 0.007829058108387717, + "grad_norm": 0.34611567854881287, + "learning_rate": 0.0003495, + "loss": 1.0404, + "step": 700 + }, + { + "epoch": 0.007940901795650398, + "grad_norm": 0.3339330852031708, + "learning_rate": 0.0003545, + "loss": 1.0361, + "step": 710 + }, + { + "epoch": 0.008052745482913081, + "grad_norm": 0.33232080936431885, + "learning_rate": 0.0003595, + "loss": 1.0271, + "step": 720 + }, + { + "epoch": 0.008164589170175762, + "grad_norm": 0.33050498366355896, + "learning_rate": 0.0003645, + "loss": 1.0316, + "step": 730 + }, + { + "epoch": 0.008276432857438443, + "grad_norm": 0.3449972867965698, + "learning_rate": 0.0003695, + "loss": 1.0426, + "step": 740 + }, + { + "epoch": 0.008388276544701126, + "grad_norm": 0.3543892502784729, + "learning_rate": 0.0003745, + "loss": 1.0475, + "step": 750 + }, + { + "epoch": 0.008500120231963807, + "grad_norm": 0.3447831869125366, + "learning_rate": 0.0003795, + "loss": 1.0482, + "step": 760 + }, + { + "epoch": 0.008611963919226489, + "grad_norm": 0.33845630288124084, + "learning_rate": 0.0003845, + "loss": 1.0533, + "step": 770 + }, + { + "epoch": 0.008723807606489171, + "grad_norm": 0.3394622802734375, + "learning_rate": 0.00038950000000000003, + "loss": 1.0803, + "step": 780 + }, + { + "epoch": 0.008835651293751853, + "grad_norm": 0.33649975061416626, + "learning_rate": 0.00039450000000000005, + "loss": 1.0461, + "step": 790 + }, + { + "epoch": 0.008947494981014534, + "grad_norm": 0.3265191912651062, + "learning_rate": 0.0003995, + "loss": 1.0714, + "step": 800 + }, + { + "epoch": 0.009059338668277215, + "grad_norm": 0.34960776567459106, + "learning_rate": 0.0004045, + "loss": 1.0542, + "step": 810 + }, + { + "epoch": 0.009171182355539898, + "grad_norm": 0.3353814482688904, + "learning_rate": 0.0004095, + "loss": 1.0625, + "step": 820 + }, + { + "epoch": 0.009283026042802579, + "grad_norm": 0.3499109148979187, + "learning_rate": 0.0004145, + "loss": 1.0679, + "step": 830 + }, + { + "epoch": 0.00939486973006526, + "grad_norm": 0.33906084299087524, + "learning_rate": 0.0004195, + "loss": 1.0659, + "step": 840 + }, + { + "epoch": 0.009506713417327943, + "grad_norm": 0.3245256543159485, + "learning_rate": 0.0004245, + "loss": 1.078, + "step": 850 + }, + { + "epoch": 0.009618557104590624, + "grad_norm": 0.3364386260509491, + "learning_rate": 0.0004295, + "loss": 1.0771, + "step": 860 + }, + { + "epoch": 0.009730400791853305, + "grad_norm": 0.348718523979187, + "learning_rate": 0.0004345, + "loss": 1.0751, + "step": 870 + }, + { + "epoch": 0.009842244479115988, + "grad_norm": 0.31124839186668396, + "learning_rate": 0.0004395, + "loss": 1.0693, + "step": 880 + }, + { + "epoch": 0.00995408816637867, + "grad_norm": 0.3478352129459381, + "learning_rate": 0.0004445, + "loss": 1.0682, + "step": 890 + }, + { + "epoch": 0.01006593185364135, + "grad_norm": 0.31189802289009094, + "learning_rate": 0.00044950000000000003, + "loss": 1.0608, + "step": 900 + }, + { + "epoch": 0.010177775540904033, + "grad_norm": 0.34715884923934937, + "learning_rate": 0.00045450000000000004, + "loss": 1.0698, + "step": 910 + }, + { + "epoch": 0.010289619228166715, + "grad_norm": 0.3279336988925934, + "learning_rate": 0.00045950000000000006, + "loss": 1.0728, + "step": 920 + }, + { + "epoch": 0.010401462915429396, + "grad_norm": 0.32010868191719055, + "learning_rate": 0.0004645, + "loss": 1.0765, + "step": 930 + }, + { + "epoch": 0.010513306602692077, + "grad_norm": 0.3618028163909912, + "learning_rate": 0.0004695, + "loss": 1.0815, + "step": 940 + }, + { + "epoch": 0.01062515028995476, + "grad_norm": 0.3403186798095703, + "learning_rate": 0.0004745, + "loss": 1.0713, + "step": 950 + }, + { + "epoch": 0.010736993977217441, + "grad_norm": 0.347687691450119, + "learning_rate": 0.0004795, + "loss": 1.0844, + "step": 960 + }, + { + "epoch": 0.010848837664480122, + "grad_norm": 0.3537987768650055, + "learning_rate": 0.0004845, + "loss": 1.0762, + "step": 970 + }, + { + "epoch": 0.010960681351742805, + "grad_norm": 0.42015892267227173, + "learning_rate": 0.0004895, + "loss": 1.0832, + "step": 980 + }, + { + "epoch": 0.011072525039005486, + "grad_norm": 0.35781368613243103, + "learning_rate": 0.0004945, + "loss": 1.0606, + "step": 990 + }, + { + "epoch": 0.011184368726268167, + "grad_norm": 0.3361358344554901, + "learning_rate": 0.0004995, + "loss": 1.0717, + "step": 1000 + }, + { + "epoch": 0.01129621241353085, + "grad_norm": 0.36569204926490784, + "learning_rate": 0.0004997944172872219, + "loss": 1.0602, + "step": 1010 + }, + { + "epoch": 0.011408056100793531, + "grad_norm": 0.31979477405548096, + "learning_rate": 0.0004995659920508017, + "loss": 1.0531, + "step": 1020 + }, + { + "epoch": 0.011519899788056212, + "grad_norm": 0.3295707404613495, + "learning_rate": 0.0004993375668143817, + "loss": 1.0346, + "step": 1030 + }, + { + "epoch": 0.011631743475318894, + "grad_norm": 0.3207838833332062, + "learning_rate": 0.0004991091415779616, + "loss": 1.059, + "step": 1040 + }, + { + "epoch": 0.011743587162581576, + "grad_norm": 0.33032119274139404, + "learning_rate": 0.0004988807163415415, + "loss": 1.0573, + "step": 1050 + }, + { + "epoch": 0.011855430849844258, + "grad_norm": 0.3566173017024994, + "learning_rate": 0.0004986522911051213, + "loss": 1.0501, + "step": 1060 + }, + { + "epoch": 0.011967274537106939, + "grad_norm": 0.31658655405044556, + "learning_rate": 0.0004984238658687012, + "loss": 1.0706, + "step": 1070 + }, + { + "epoch": 0.012079118224369622, + "grad_norm": 0.3438680171966553, + "learning_rate": 0.0004981954406322811, + "loss": 1.0765, + "step": 1080 + }, + { + "epoch": 0.012190961911632303, + "grad_norm": 0.3130144774913788, + "learning_rate": 0.0004979670153958609, + "loss": 1.0588, + "step": 1090 + }, + { + "epoch": 0.012302805598894984, + "grad_norm": 0.31765422224998474, + "learning_rate": 0.0004977385901594408, + "loss": 1.0703, + "step": 1100 + }, + { + "epoch": 0.012414649286157667, + "grad_norm": 0.36112868785858154, + "learning_rate": 0.0004975101649230207, + "loss": 1.0642, + "step": 1110 + }, + { + "epoch": 0.012526492973420348, + "grad_norm": 0.33418065309524536, + "learning_rate": 0.0004972817396866005, + "loss": 1.0572, + "step": 1120 + }, + { + "epoch": 0.01263833666068303, + "grad_norm": 0.34439629316329956, + "learning_rate": 0.0004970533144501805, + "loss": 1.0473, + "step": 1130 + }, + { + "epoch": 0.01275018034794571, + "grad_norm": 0.32954639196395874, + "learning_rate": 0.0004968248892137603, + "loss": 1.054, + "step": 1140 + }, + { + "epoch": 0.012862024035208393, + "grad_norm": 0.3351511061191559, + "learning_rate": 0.0004965964639773402, + "loss": 1.0444, + "step": 1150 + }, + { + "epoch": 0.012973867722471074, + "grad_norm": 0.3065156638622284, + "learning_rate": 0.0004963680387409202, + "loss": 1.0546, + "step": 1160 + }, + { + "epoch": 0.013085711409733755, + "grad_norm": 0.36450672149658203, + "learning_rate": 0.0004961396135045, + "loss": 1.0501, + "step": 1170 + }, + { + "epoch": 0.013197555096996438, + "grad_norm": 0.3020591735839844, + "learning_rate": 0.0004959111882680799, + "loss": 1.052, + "step": 1180 + }, + { + "epoch": 0.01330939878425912, + "grad_norm": 0.3097701966762543, + "learning_rate": 0.0004956827630316598, + "loss": 1.0695, + "step": 1190 + }, + { + "epoch": 0.0134212424715218, + "grad_norm": 0.3410932719707489, + "learning_rate": 0.0004954543377952396, + "loss": 1.0692, + "step": 1200 + }, + { + "epoch": 0.013533086158784484, + "grad_norm": 0.38478952646255493, + "learning_rate": 0.0004952259125588195, + "loss": 1.0592, + "step": 1210 + }, + { + "epoch": 0.013644929846047165, + "grad_norm": 0.3737089931964874, + "learning_rate": 0.0004949974873223994, + "loss": 1.0808, + "step": 1220 + }, + { + "epoch": 0.013756773533309846, + "grad_norm": 0.3264448940753937, + "learning_rate": 0.0004947690620859793, + "loss": 1.0759, + "step": 1230 + }, + { + "epoch": 0.013868617220572527, + "grad_norm": 0.3922732472419739, + "learning_rate": 0.0004945406368495591, + "loss": 1.0634, + "step": 1240 + }, + { + "epoch": 0.01398046090783521, + "grad_norm": 0.36068034172058105, + "learning_rate": 0.000494312211613139, + "loss": 1.0683, + "step": 1250 + }, + { + "epoch": 0.014092304595097891, + "grad_norm": 0.3544798791408539, + "learning_rate": 0.0004940837863767189, + "loss": 1.0687, + "step": 1260 + }, + { + "epoch": 0.014204148282360572, + "grad_norm": 0.31447795033454895, + "learning_rate": 0.0004938553611402987, + "loss": 1.0549, + "step": 1270 + }, + { + "epoch": 0.014315991969623255, + "grad_norm": 0.37639158964157104, + "learning_rate": 0.0004936269359038786, + "loss": 1.0698, + "step": 1280 + }, + { + "epoch": 0.014427835656885936, + "grad_norm": 0.32416418194770813, + "learning_rate": 0.0004933985106674586, + "loss": 1.0617, + "step": 1290 + }, + { + "epoch": 0.014539679344148617, + "grad_norm": 0.3122979998588562, + "learning_rate": 0.0004931700854310385, + "loss": 1.0553, + "step": 1300 + }, + { + "epoch": 0.0146515230314113, + "grad_norm": 0.3574884533882141, + "learning_rate": 0.0004929416601946184, + "loss": 1.0598, + "step": 1310 + }, + { + "epoch": 0.014763366718673981, + "grad_norm": 0.30762428045272827, + "learning_rate": 0.0004927132349581982, + "loss": 1.0642, + "step": 1320 + }, + { + "epoch": 0.014875210405936663, + "grad_norm": 0.34350454807281494, + "learning_rate": 0.0004924848097217781, + "loss": 1.0663, + "step": 1330 + }, + { + "epoch": 0.014987054093199344, + "grad_norm": 0.33486828207969666, + "learning_rate": 0.000492256384485358, + "loss": 1.0479, + "step": 1340 + }, + { + "epoch": 0.015098897780462027, + "grad_norm": 0.3025324046611786, + "learning_rate": 0.0004920279592489378, + "loss": 1.0705, + "step": 1350 + }, + { + "epoch": 0.015210741467724708, + "grad_norm": 0.35260385274887085, + "learning_rate": 0.0004917995340125177, + "loss": 1.0762, + "step": 1360 + }, + { + "epoch": 0.015322585154987389, + "grad_norm": 0.3188925087451935, + "learning_rate": 0.0004915711087760976, + "loss": 1.069, + "step": 1370 + }, + { + "epoch": 0.015434428842250072, + "grad_norm": 0.332660436630249, + "learning_rate": 0.0004913426835396775, + "loss": 1.0749, + "step": 1380 + }, + { + "epoch": 0.015546272529512753, + "grad_norm": 0.31745171546936035, + "learning_rate": 0.0004911142583032573, + "loss": 1.0811, + "step": 1390 + }, + { + "epoch": 0.015658116216775434, + "grad_norm": 0.3237819969654083, + "learning_rate": 0.0004908858330668372, + "loss": 1.0634, + "step": 1400 + }, + { + "epoch": 0.015769959904038115, + "grad_norm": 0.3300880789756775, + "learning_rate": 0.0004906574078304171, + "loss": 1.0554, + "step": 1410 + }, + { + "epoch": 0.015881803591300796, + "grad_norm": 0.32475635409355164, + "learning_rate": 0.0004904289825939969, + "loss": 1.0598, + "step": 1420 + }, + { + "epoch": 0.01599364727856348, + "grad_norm": 0.31278952956199646, + "learning_rate": 0.0004902005573575769, + "loss": 1.0498, + "step": 1430 + }, + { + "epoch": 0.016105490965826162, + "grad_norm": 0.308680921792984, + "learning_rate": 0.0004899721321211568, + "loss": 1.0586, + "step": 1440 + }, + { + "epoch": 0.016217334653088843, + "grad_norm": 0.34637314081192017, + "learning_rate": 0.0004897437068847367, + "loss": 1.0535, + "step": 1450 + }, + { + "epoch": 0.016329178340351524, + "grad_norm": 0.3220643401145935, + "learning_rate": 0.0004895152816483165, + "loss": 1.0624, + "step": 1460 + }, + { + "epoch": 0.016441022027614206, + "grad_norm": 0.31472912430763245, + "learning_rate": 0.0004892868564118964, + "loss": 1.0748, + "step": 1470 + }, + { + "epoch": 0.016552865714876887, + "grad_norm": 0.3416632115840912, + "learning_rate": 0.0004890584311754763, + "loss": 1.0715, + "step": 1480 + }, + { + "epoch": 0.01666470940213957, + "grad_norm": 0.3463667631149292, + "learning_rate": 0.0004888300059390561, + "loss": 1.0914, + "step": 1490 + }, + { + "epoch": 0.016776553089402253, + "grad_norm": 0.3322199881076813, + "learning_rate": 0.000488601580702636, + "loss": 1.0707, + "step": 1500 + }, + { + "epoch": 0.016888396776664934, + "grad_norm": 0.3899800479412079, + "learning_rate": 0.0004883731554662159, + "loss": 1.0883, + "step": 1510 + }, + { + "epoch": 0.017000240463927615, + "grad_norm": 0.3409605324268341, + "learning_rate": 0.0004881447302297958, + "loss": 1.0982, + "step": 1520 + }, + { + "epoch": 0.017112084151190296, + "grad_norm": 0.3720357120037079, + "learning_rate": 0.0004879163049933757, + "loss": 1.0674, + "step": 1530 + }, + { + "epoch": 0.017223927838452977, + "grad_norm": 0.326050728559494, + "learning_rate": 0.00048768787975695554, + "loss": 1.0764, + "step": 1540 + }, + { + "epoch": 0.01733577152571566, + "grad_norm": 0.3238283395767212, + "learning_rate": 0.0004874594545205354, + "loss": 1.0547, + "step": 1550 + }, + { + "epoch": 0.017447615212978343, + "grad_norm": 0.3324073553085327, + "learning_rate": 0.00048723102928411536, + "loss": 1.0608, + "step": 1560 + }, + { + "epoch": 0.017559458900241024, + "grad_norm": 0.3382217586040497, + "learning_rate": 0.0004870026040476952, + "loss": 1.0505, + "step": 1570 + }, + { + "epoch": 0.017671302587503705, + "grad_norm": 0.3409116566181183, + "learning_rate": 0.00048677417881127507, + "loss": 1.0673, + "step": 1580 + }, + { + "epoch": 0.017783146274766386, + "grad_norm": 0.3123399019241333, + "learning_rate": 0.000486545753574855, + "loss": 1.0461, + "step": 1590 + }, + { + "epoch": 0.017894989962029068, + "grad_norm": 0.3178008198738098, + "learning_rate": 0.00048631732833843484, + "loss": 1.0526, + "step": 1600 + }, + { + "epoch": 0.01800683364929175, + "grad_norm": 0.37002459168434143, + "learning_rate": 0.0004860889031020147, + "loss": 1.0483, + "step": 1610 + }, + { + "epoch": 0.01811867733655443, + "grad_norm": 0.31036287546157837, + "learning_rate": 0.0004858604778655946, + "loss": 1.0418, + "step": 1620 + }, + { + "epoch": 0.018230521023817114, + "grad_norm": 0.3027215600013733, + "learning_rate": 0.00048563205262917446, + "loss": 1.0467, + "step": 1630 + }, + { + "epoch": 0.018342364711079796, + "grad_norm": 0.32144612073898315, + "learning_rate": 0.00048540362739275437, + "loss": 1.0437, + "step": 1640 + }, + { + "epoch": 0.018454208398342477, + "grad_norm": 0.3156447410583496, + "learning_rate": 0.0004851752021563343, + "loss": 1.0447, + "step": 1650 + }, + { + "epoch": 0.018566052085605158, + "grad_norm": 0.3228546380996704, + "learning_rate": 0.00048494677691991413, + "loss": 1.056, + "step": 1660 + }, + { + "epoch": 0.01867789577286784, + "grad_norm": 0.3478510081768036, + "learning_rate": 0.000484718351683494, + "loss": 1.0523, + "step": 1670 + }, + { + "epoch": 0.01878973946013052, + "grad_norm": 0.3413507342338562, + "learning_rate": 0.0004844899264470739, + "loss": 1.049, + "step": 1680 + }, + { + "epoch": 0.018901583147393205, + "grad_norm": 0.3277221918106079, + "learning_rate": 0.00048426150121065375, + "loss": 1.0403, + "step": 1690 + }, + { + "epoch": 0.019013426834655886, + "grad_norm": 0.3044646382331848, + "learning_rate": 0.0004840330759742336, + "loss": 1.0518, + "step": 1700 + }, + { + "epoch": 0.019125270521918567, + "grad_norm": 0.31599846482276917, + "learning_rate": 0.0004838046507378135, + "loss": 1.0475, + "step": 1710 + }, + { + "epoch": 0.01923711420918125, + "grad_norm": 0.346741646528244, + "learning_rate": 0.00048357622550139343, + "loss": 1.0515, + "step": 1720 + }, + { + "epoch": 0.01934895789644393, + "grad_norm": 0.32756108045578003, + "learning_rate": 0.0004833478002649733, + "loss": 1.054, + "step": 1730 + }, + { + "epoch": 0.01946080158370661, + "grad_norm": 0.3318345546722412, + "learning_rate": 0.0004831193750285532, + "loss": 1.0575, + "step": 1740 + }, + { + "epoch": 0.019572645270969292, + "grad_norm": 0.3389560282230377, + "learning_rate": 0.00048289094979213305, + "loss": 1.0576, + "step": 1750 + }, + { + "epoch": 0.019684488958231976, + "grad_norm": 0.31532642245292664, + "learning_rate": 0.0004826625245557129, + "loss": 1.0554, + "step": 1760 + }, + { + "epoch": 0.019796332645494658, + "grad_norm": 0.3263496160507202, + "learning_rate": 0.0004824340993192928, + "loss": 1.0697, + "step": 1770 + }, + { + "epoch": 0.01990817633275734, + "grad_norm": 0.328225314617157, + "learning_rate": 0.00048220567408287267, + "loss": 1.0584, + "step": 1780 + }, + { + "epoch": 0.02002002002002002, + "grad_norm": 0.3030998706817627, + "learning_rate": 0.00048197724884645253, + "loss": 1.0555, + "step": 1790 + }, + { + "epoch": 0.0201318637072827, + "grad_norm": 0.32594701647758484, + "learning_rate": 0.0004817488236100325, + "loss": 1.0512, + "step": 1800 + }, + { + "epoch": 0.020243707394545382, + "grad_norm": 0.2882954776287079, + "learning_rate": 0.00048152039837361235, + "loss": 1.0441, + "step": 1810 + }, + { + "epoch": 0.020355551081808067, + "grad_norm": 0.33917129039764404, + "learning_rate": 0.0004812919731371922, + "loss": 1.048, + "step": 1820 + }, + { + "epoch": 0.020467394769070748, + "grad_norm": 0.32748523354530334, + "learning_rate": 0.0004810635479007721, + "loss": 1.042, + "step": 1830 + }, + { + "epoch": 0.02057923845633343, + "grad_norm": 0.32332462072372437, + "learning_rate": 0.00048083512266435197, + "loss": 1.0396, + "step": 1840 + }, + { + "epoch": 0.02069108214359611, + "grad_norm": 0.36977729201316833, + "learning_rate": 0.0004806066974279318, + "loss": 1.0337, + "step": 1850 + }, + { + "epoch": 0.02080292583085879, + "grad_norm": 0.33298948407173157, + "learning_rate": 0.00048037827219151174, + "loss": 1.045, + "step": 1860 + }, + { + "epoch": 0.020914769518121473, + "grad_norm": 0.328861802816391, + "learning_rate": 0.00048014984695509165, + "loss": 1.053, + "step": 1870 + }, + { + "epoch": 0.021026613205384154, + "grad_norm": 0.3438888490200043, + "learning_rate": 0.0004799214217186715, + "loss": 1.0385, + "step": 1880 + }, + { + "epoch": 0.02113845689264684, + "grad_norm": 0.3251883387565613, + "learning_rate": 0.00047969299648225136, + "loss": 1.0436, + "step": 1890 + }, + { + "epoch": 0.02125030057990952, + "grad_norm": 0.3300330340862274, + "learning_rate": 0.00047946457124583127, + "loss": 1.0627, + "step": 1900 + }, + { + "epoch": 0.0213621442671722, + "grad_norm": 0.31774377822875977, + "learning_rate": 0.0004792361460094111, + "loss": 1.0491, + "step": 1910 + }, + { + "epoch": 0.021473987954434882, + "grad_norm": 0.36171990633010864, + "learning_rate": 0.000479007720772991, + "loss": 1.0536, + "step": 1920 + }, + { + "epoch": 0.021585831641697563, + "grad_norm": 0.33032888174057007, + "learning_rate": 0.0004787792955365709, + "loss": 1.0327, + "step": 1930 + }, + { + "epoch": 0.021697675328960244, + "grad_norm": 0.34056538343429565, + "learning_rate": 0.00047855087030015074, + "loss": 1.0354, + "step": 1940 + }, + { + "epoch": 0.021809519016222925, + "grad_norm": 0.31768256425857544, + "learning_rate": 0.00047832244506373065, + "loss": 1.0278, + "step": 1950 + }, + { + "epoch": 0.02192136270348561, + "grad_norm": 0.33165955543518066, + "learning_rate": 0.00047809401982731056, + "loss": 1.057, + "step": 1960 + }, + { + "epoch": 0.02203320639074829, + "grad_norm": 0.34456339478492737, + "learning_rate": 0.0004778655945908904, + "loss": 1.0465, + "step": 1970 + }, + { + "epoch": 0.022145050078010972, + "grad_norm": 0.35331544280052185, + "learning_rate": 0.0004776371693544703, + "loss": 1.0509, + "step": 1980 + }, + { + "epoch": 0.022256893765273653, + "grad_norm": 0.3497447669506073, + "learning_rate": 0.0004774087441180502, + "loss": 1.0579, + "step": 1990 + }, + { + "epoch": 0.022368737452536334, + "grad_norm": 0.31631171703338623, + "learning_rate": 0.00047718031888163004, + "loss": 1.0747, + "step": 2000 + }, + { + "epoch": 0.022480581139799016, + "grad_norm": 0.34811535477638245, + "learning_rate": 0.0004769518936452099, + "loss": 1.0443, + "step": 2010 + }, + { + "epoch": 0.0225924248270617, + "grad_norm": 0.350975900888443, + "learning_rate": 0.0004767234684087898, + "loss": 1.0721, + "step": 2020 + }, + { + "epoch": 0.02270426851432438, + "grad_norm": 0.38026875257492065, + "learning_rate": 0.0004764950431723697, + "loss": 1.0502, + "step": 2030 + }, + { + "epoch": 0.022816112201587063, + "grad_norm": 0.3079335391521454, + "learning_rate": 0.00047626661793594957, + "loss": 1.0325, + "step": 2040 + }, + { + "epoch": 0.022927955888849744, + "grad_norm": 0.3412174582481384, + "learning_rate": 0.0004760381926995295, + "loss": 1.026, + "step": 2050 + }, + { + "epoch": 0.023039799576112425, + "grad_norm": 0.31905752420425415, + "learning_rate": 0.00047580976746310934, + "loss": 1.033, + "step": 2060 + }, + { + "epoch": 0.023151643263375106, + "grad_norm": 0.3110033869743347, + "learning_rate": 0.0004755813422266892, + "loss": 1.026, + "step": 2070 + }, + { + "epoch": 0.023263486950637787, + "grad_norm": 0.3087383210659027, + "learning_rate": 0.0004753529169902691, + "loss": 1.0285, + "step": 2080 + }, + { + "epoch": 0.023375330637900472, + "grad_norm": 0.310497522354126, + "learning_rate": 0.00047512449175384896, + "loss": 1.012, + "step": 2090 + }, + { + "epoch": 0.023487174325163153, + "grad_norm": 0.35822993516921997, + "learning_rate": 0.0004748960665174288, + "loss": 1.0124, + "step": 2100 + }, + { + "epoch": 0.023599018012425834, + "grad_norm": 0.3355759084224701, + "learning_rate": 0.0004746676412810088, + "loss": 1.0159, + "step": 2110 + }, + { + "epoch": 0.023710861699688515, + "grad_norm": 0.29633432626724243, + "learning_rate": 0.00047443921604458863, + "loss": 1.0068, + "step": 2120 + }, + { + "epoch": 0.023822705386951196, + "grad_norm": 0.3268597424030304, + "learning_rate": 0.0004742107908081685, + "loss": 1.0029, + "step": 2130 + }, + { + "epoch": 0.023934549074213878, + "grad_norm": 0.32010769844055176, + "learning_rate": 0.0004739823655717484, + "loss": 1.0081, + "step": 2140 + }, + { + "epoch": 0.02404639276147656, + "grad_norm": 0.30638498067855835, + "learning_rate": 0.00047375394033532826, + "loss": 0.9955, + "step": 2150 + }, + { + "epoch": 0.024158236448739243, + "grad_norm": 0.32299259305000305, + "learning_rate": 0.0004735255150989081, + "loss": 1.0028, + "step": 2160 + }, + { + "epoch": 0.024270080136001924, + "grad_norm": 0.30714213848114014, + "learning_rate": 0.000473297089862488, + "loss": 1.0163, + "step": 2170 + }, + { + "epoch": 0.024381923823264606, + "grad_norm": 0.3207940459251404, + "learning_rate": 0.0004730686646260679, + "loss": 1.0053, + "step": 2180 + }, + { + "epoch": 0.024493767510527287, + "grad_norm": 0.3073663115501404, + "learning_rate": 0.0004728402393896478, + "loss": 1.0007, + "step": 2190 + }, + { + "epoch": 0.024605611197789968, + "grad_norm": 0.3209913671016693, + "learning_rate": 0.0004726118141532277, + "loss": 1.0065, + "step": 2200 + }, + { + "epoch": 0.02471745488505265, + "grad_norm": 0.2987804114818573, + "learning_rate": 0.00047238338891680755, + "loss": 1.0015, + "step": 2210 + }, + { + "epoch": 0.024829298572315334, + "grad_norm": 0.31511807441711426, + "learning_rate": 0.0004721549636803874, + "loss": 0.9892, + "step": 2220 + }, + { + "epoch": 0.024941142259578015, + "grad_norm": 0.2840864956378937, + "learning_rate": 0.0004719265384439673, + "loss": 1.0084, + "step": 2230 + }, + { + "epoch": 0.025052985946840696, + "grad_norm": 0.3094743490219116, + "learning_rate": 0.0004716981132075472, + "loss": 1.0169, + "step": 2240 + }, + { + "epoch": 0.025164829634103377, + "grad_norm": 0.2905067205429077, + "learning_rate": 0.00047146968797112703, + "loss": 0.9991, + "step": 2250 + }, + { + "epoch": 0.02527667332136606, + "grad_norm": 0.31322264671325684, + "learning_rate": 0.00047124126273470694, + "loss": 1.0169, + "step": 2260 + }, + { + "epoch": 0.02538851700862874, + "grad_norm": 0.29053428769111633, + "learning_rate": 0.00047101283749828685, + "loss": 0.9942, + "step": 2270 + }, + { + "epoch": 0.02550036069589142, + "grad_norm": 0.2863853871822357, + "learning_rate": 0.0004707844122618667, + "loss": 1.002, + "step": 2280 + }, + { + "epoch": 0.025612204383154105, + "grad_norm": 0.3087761104106903, + "learning_rate": 0.0004705559870254466, + "loss": 1.0025, + "step": 2290 + }, + { + "epoch": 0.025724048070416786, + "grad_norm": 0.3308629095554352, + "learning_rate": 0.00047032756178902647, + "loss": 1.0078, + "step": 2300 + }, + { + "epoch": 0.025835891757679467, + "grad_norm": 0.29703134298324585, + "learning_rate": 0.0004700991365526063, + "loss": 1.006, + "step": 2310 + }, + { + "epoch": 0.02594773544494215, + "grad_norm": 0.27238258719444275, + "learning_rate": 0.0004698707113161862, + "loss": 0.9963, + "step": 2320 + }, + { + "epoch": 0.02605957913220483, + "grad_norm": 0.2795617878437042, + "learning_rate": 0.0004696422860797661, + "loss": 0.9876, + "step": 2330 + }, + { + "epoch": 0.02617142281946751, + "grad_norm": 0.2989327013492584, + "learning_rate": 0.000469413860843346, + "loss": 0.9864, + "step": 2340 + }, + { + "epoch": 0.026283266506730196, + "grad_norm": 0.3229614794254303, + "learning_rate": 0.00046918543560692586, + "loss": 0.9849, + "step": 2350 + }, + { + "epoch": 0.026395110193992877, + "grad_norm": 0.2921406328678131, + "learning_rate": 0.00046895701037050577, + "loss": 0.9764, + "step": 2360 + }, + { + "epoch": 0.026506953881255558, + "grad_norm": 0.2955220639705658, + "learning_rate": 0.0004687285851340856, + "loss": 0.9883, + "step": 2370 + }, + { + "epoch": 0.02661879756851824, + "grad_norm": 0.31378960609436035, + "learning_rate": 0.0004685001598976655, + "loss": 0.9978, + "step": 2380 + }, + { + "epoch": 0.02673064125578092, + "grad_norm": 0.30504587292671204, + "learning_rate": 0.0004682717346612454, + "loss": 0.9912, + "step": 2390 + }, + { + "epoch": 0.0268424849430436, + "grad_norm": 0.3066459000110626, + "learning_rate": 0.00046804330942482524, + "loss": 0.9877, + "step": 2400 + }, + { + "epoch": 0.026954328630306282, + "grad_norm": 0.3198714256286621, + "learning_rate": 0.0004678148841884051, + "loss": 0.98, + "step": 2410 + }, + { + "epoch": 0.027066172317568967, + "grad_norm": 0.27119094133377075, + "learning_rate": 0.00046758645895198506, + "loss": 1.001, + "step": 2420 + }, + { + "epoch": 0.027178016004831648, + "grad_norm": 0.28178098797798157, + "learning_rate": 0.0004673580337155649, + "loss": 0.9605, + "step": 2430 + }, + { + "epoch": 0.02728985969209433, + "grad_norm": 0.29373088479042053, + "learning_rate": 0.0004671296084791448, + "loss": 0.9834, + "step": 2440 + }, + { + "epoch": 0.02740170337935701, + "grad_norm": 0.2861827313899994, + "learning_rate": 0.0004669011832427247, + "loss": 0.9797, + "step": 2450 + }, + { + "epoch": 0.02751354706661969, + "grad_norm": 0.3488409221172333, + "learning_rate": 0.00046667275800630454, + "loss": 0.9682, + "step": 2460 + }, + { + "epoch": 0.027625390753882373, + "grad_norm": 0.29631665349006653, + "learning_rate": 0.0004664443327698844, + "loss": 0.9751, + "step": 2470 + }, + { + "epoch": 0.027737234441145054, + "grad_norm": 0.27299416065216064, + "learning_rate": 0.0004662159075334643, + "loss": 0.9571, + "step": 2480 + }, + { + "epoch": 0.02784907812840774, + "grad_norm": 0.30409684777259827, + "learning_rate": 0.00046598748229704416, + "loss": 0.968, + "step": 2490 + }, + { + "epoch": 0.02796092181567042, + "grad_norm": 0.2957991063594818, + "learning_rate": 0.00046575905706062407, + "loss": 0.9814, + "step": 2500 + }, + { + "epoch": 0.0280727655029331, + "grad_norm": 0.28328225016593933, + "learning_rate": 0.000465530631824204, + "loss": 0.9816, + "step": 2510 + }, + { + "epoch": 0.028184609190195782, + "grad_norm": 0.40670067071914673, + "learning_rate": 0.00046530220658778384, + "loss": 0.9737, + "step": 2520 + }, + { + "epoch": 0.028296452877458463, + "grad_norm": 0.2818649411201477, + "learning_rate": 0.0004650737813513637, + "loss": 0.9891, + "step": 2530 + }, + { + "epoch": 0.028408296564721144, + "grad_norm": 0.3054118752479553, + "learning_rate": 0.0004648453561149436, + "loss": 0.9976, + "step": 2540 + }, + { + "epoch": 0.02852014025198383, + "grad_norm": 0.31439468264579773, + "learning_rate": 0.00046461693087852346, + "loss": 0.9928, + "step": 2550 + }, + { + "epoch": 0.02863198393924651, + "grad_norm": 0.3173445761203766, + "learning_rate": 0.0004643885056421033, + "loss": 1.0002, + "step": 2560 + }, + { + "epoch": 0.02874382762650919, + "grad_norm": 0.32495757937431335, + "learning_rate": 0.0004641600804056832, + "loss": 0.9981, + "step": 2570 + }, + { + "epoch": 0.028855671313771872, + "grad_norm": 0.35957351326942444, + "learning_rate": 0.00046393165516926313, + "loss": 1.0112, + "step": 2580 + }, + { + "epoch": 0.028967515001034554, + "grad_norm": 0.3070557713508606, + "learning_rate": 0.000463703229932843, + "loss": 1.0047, + "step": 2590 + }, + { + "epoch": 0.029079358688297235, + "grad_norm": 0.3227770924568176, + "learning_rate": 0.0004634748046964229, + "loss": 1.0115, + "step": 2600 + }, + { + "epoch": 0.029191202375559916, + "grad_norm": 0.34345880150794983, + "learning_rate": 0.00046324637946000276, + "loss": 0.9984, + "step": 2610 + }, + { + "epoch": 0.0293030460628226, + "grad_norm": 0.34459254145622253, + "learning_rate": 0.0004630179542235826, + "loss": 0.9965, + "step": 2620 + }, + { + "epoch": 0.02941488975008528, + "grad_norm": 0.3396269679069519, + "learning_rate": 0.0004627895289871625, + "loss": 0.9986, + "step": 2630 + }, + { + "epoch": 0.029526733437347963, + "grad_norm": 0.3370846211910248, + "learning_rate": 0.0004625611037507424, + "loss": 0.9987, + "step": 2640 + }, + { + "epoch": 0.029638577124610644, + "grad_norm": 0.30689191818237305, + "learning_rate": 0.00046233267851432223, + "loss": 1.0081, + "step": 2650 + }, + { + "epoch": 0.029750420811873325, + "grad_norm": 0.35536935925483704, + "learning_rate": 0.0004621042532779022, + "loss": 0.9948, + "step": 2660 + }, + { + "epoch": 0.029862264499136006, + "grad_norm": 0.3295105993747711, + "learning_rate": 0.00046187582804148205, + "loss": 1.0115, + "step": 2670 + }, + { + "epoch": 0.029974108186398687, + "grad_norm": 0.34881895780563354, + "learning_rate": 0.0004616474028050619, + "loss": 1.0024, + "step": 2680 + }, + { + "epoch": 0.030085951873661372, + "grad_norm": 0.379261314868927, + "learning_rate": 0.0004614189775686418, + "loss": 0.9965, + "step": 2690 + }, + { + "epoch": 0.030197795560924053, + "grad_norm": 0.34729093313217163, + "learning_rate": 0.0004611905523322217, + "loss": 1.0026, + "step": 2700 + }, + { + "epoch": 0.030309639248186734, + "grad_norm": 0.34687525033950806, + "learning_rate": 0.00046096212709580153, + "loss": 0.9992, + "step": 2710 + }, + { + "epoch": 0.030421482935449416, + "grad_norm": 0.3564583659172058, + "learning_rate": 0.00046073370185938144, + "loss": 0.9859, + "step": 2720 + }, + { + "epoch": 0.030533326622712097, + "grad_norm": 0.3762670159339905, + "learning_rate": 0.0004605052766229613, + "loss": 1.0059, + "step": 2730 + }, + { + "epoch": 0.030645170309974778, + "grad_norm": 0.3470481038093567, + "learning_rate": 0.0004602768513865412, + "loss": 1.0044, + "step": 2740 + }, + { + "epoch": 0.030757013997237462, + "grad_norm": 0.3322189450263977, + "learning_rate": 0.0004600484261501211, + "loss": 0.9811, + "step": 2750 + }, + { + "epoch": 0.030868857684500144, + "grad_norm": 0.3248903751373291, + "learning_rate": 0.00045982000091370097, + "loss": 0.9721, + "step": 2760 + }, + { + "epoch": 0.030980701371762825, + "grad_norm": 0.32881951332092285, + "learning_rate": 0.0004595915756772808, + "loss": 0.9821, + "step": 2770 + }, + { + "epoch": 0.031092545059025506, + "grad_norm": 0.35410797595977783, + "learning_rate": 0.0004593631504408607, + "loss": 0.9786, + "step": 2780 + }, + { + "epoch": 0.031204388746288187, + "grad_norm": 0.3307279050350189, + "learning_rate": 0.0004591347252044406, + "loss": 0.9759, + "step": 2790 + }, + { + "epoch": 0.03131623243355087, + "grad_norm": 0.3207128643989563, + "learning_rate": 0.00045890629996802045, + "loss": 0.9812, + "step": 2800 + }, + { + "epoch": 0.03142807612081355, + "grad_norm": 0.3065459728240967, + "learning_rate": 0.0004586778747316003, + "loss": 0.9596, + "step": 2810 + }, + { + "epoch": 0.03153991980807623, + "grad_norm": 0.3115104138851166, + "learning_rate": 0.00045844944949518027, + "loss": 0.9732, + "step": 2820 + }, + { + "epoch": 0.031651763495338915, + "grad_norm": 0.3136879801750183, + "learning_rate": 0.0004582210242587601, + "loss": 0.9818, + "step": 2830 + }, + { + "epoch": 0.03176360718260159, + "grad_norm": 0.3240731656551361, + "learning_rate": 0.00045799259902234, + "loss": 0.9836, + "step": 2840 + }, + { + "epoch": 0.03187545086986428, + "grad_norm": 0.31390219926834106, + "learning_rate": 0.0004577641737859199, + "loss": 0.9837, + "step": 2850 + }, + { + "epoch": 0.03198729455712696, + "grad_norm": 0.3056069612503052, + "learning_rate": 0.00045753574854949975, + "loss": 0.995, + "step": 2860 + }, + { + "epoch": 0.03209913824438964, + "grad_norm": 0.29556363821029663, + "learning_rate": 0.0004573073233130796, + "loss": 1.0018, + "step": 2870 + }, + { + "epoch": 0.032210981931652324, + "grad_norm": 0.2931666374206543, + "learning_rate": 0.0004570788980766595, + "loss": 1.0124, + "step": 2880 + }, + { + "epoch": 0.032322825618915, + "grad_norm": 0.31029924750328064, + "learning_rate": 0.0004568504728402394, + "loss": 1.0115, + "step": 2890 + }, + { + "epoch": 0.03243466930617769, + "grad_norm": 0.3164144456386566, + "learning_rate": 0.0004566220476038193, + "loss": 0.9966, + "step": 2900 + }, + { + "epoch": 0.032546512993440364, + "grad_norm": 0.31638383865356445, + "learning_rate": 0.0004563936223673992, + "loss": 0.989, + "step": 2910 + }, + { + "epoch": 0.03265835668070305, + "grad_norm": 0.28559473156929016, + "learning_rate": 0.00045616519713097904, + "loss": 1.0038, + "step": 2920 + }, + { + "epoch": 0.032770200367965734, + "grad_norm": 0.285154789686203, + "learning_rate": 0.0004559367718945589, + "loss": 1.0009, + "step": 2930 + }, + { + "epoch": 0.03288204405522841, + "grad_norm": 0.2722555100917816, + "learning_rate": 0.0004557083466581388, + "loss": 0.9977, + "step": 2940 + }, + { + "epoch": 0.032993887742491096, + "grad_norm": 0.2854909896850586, + "learning_rate": 0.00045547992142171866, + "loss": 0.9996, + "step": 2950 + }, + { + "epoch": 0.033105731429753774, + "grad_norm": 0.2726607620716095, + "learning_rate": 0.0004552514961852985, + "loss": 0.9925, + "step": 2960 + }, + { + "epoch": 0.03321757511701646, + "grad_norm": 0.30692654848098755, + "learning_rate": 0.0004550230709488785, + "loss": 0.9776, + "step": 2970 + }, + { + "epoch": 0.03332941880427914, + "grad_norm": 0.2921067774295807, + "learning_rate": 0.00045479464571245834, + "loss": 0.9831, + "step": 2980 + }, + { + "epoch": 0.03344126249154182, + "grad_norm": 0.30490297079086304, + "learning_rate": 0.0004545662204760382, + "loss": 0.9835, + "step": 2990 + }, + { + "epoch": 0.033553106178804505, + "grad_norm": 0.2823980450630188, + "learning_rate": 0.0004543377952396181, + "loss": 0.9859, + "step": 3000 + }, + { + "epoch": 0.03366494986606718, + "grad_norm": 0.31844133138656616, + "learning_rate": 0.00045410937000319796, + "loss": 1.0007, + "step": 3010 + }, + { + "epoch": 0.03377679355332987, + "grad_norm": 0.30595019459724426, + "learning_rate": 0.0004538809447667778, + "loss": 1.0069, + "step": 3020 + }, + { + "epoch": 0.033888637240592545, + "grad_norm": 0.31177419424057007, + "learning_rate": 0.0004536525195303577, + "loss": 1.0068, + "step": 3030 + }, + { + "epoch": 0.03400048092785523, + "grad_norm": 0.33921870589256287, + "learning_rate": 0.0004534240942939376, + "loss": 1.0116, + "step": 3040 + }, + { + "epoch": 0.034112324615117914, + "grad_norm": 0.29299408197402954, + "learning_rate": 0.0004531956690575175, + "loss": 1.0014, + "step": 3050 + }, + { + "epoch": 0.03422416830238059, + "grad_norm": 0.28572002053260803, + "learning_rate": 0.0004529672438210974, + "loss": 0.9976, + "step": 3060 + }, + { + "epoch": 0.03433601198964328, + "grad_norm": 0.30842283368110657, + "learning_rate": 0.00045273881858467726, + "loss": 0.9994, + "step": 3070 + }, + { + "epoch": 0.034447855676905954, + "grad_norm": 0.29677408933639526, + "learning_rate": 0.0004525103933482571, + "loss": 1.0055, + "step": 3080 + }, + { + "epoch": 0.03455969936416864, + "grad_norm": 0.388823926448822, + "learning_rate": 0.000452281968111837, + "loss": 1.0062, + "step": 3090 + }, + { + "epoch": 0.03467154305143132, + "grad_norm": 0.2956707775592804, + "learning_rate": 0.0004520535428754169, + "loss": 0.9794, + "step": 3100 + }, + { + "epoch": 0.034783386738694, + "grad_norm": 0.3179475665092468, + "learning_rate": 0.00045182511763899673, + "loss": 0.9831, + "step": 3110 + }, + { + "epoch": 0.034895230425956686, + "grad_norm": 0.29509803652763367, + "learning_rate": 0.00045159669240257664, + "loss": 0.9851, + "step": 3120 + }, + { + "epoch": 0.035007074113219364, + "grad_norm": 0.31095758080482483, + "learning_rate": 0.00045136826716615655, + "loss": 0.9852, + "step": 3130 + }, + { + "epoch": 0.03511891780048205, + "grad_norm": 0.27768880128860474, + "learning_rate": 0.0004511398419297364, + "loss": 0.9741, + "step": 3140 + }, + { + "epoch": 0.035230761487744726, + "grad_norm": 0.3117106854915619, + "learning_rate": 0.0004509114166933163, + "loss": 0.9987, + "step": 3150 + }, + { + "epoch": 0.03534260517500741, + "grad_norm": 0.30113616585731506, + "learning_rate": 0.0004506829914568962, + "loss": 0.9855, + "step": 3160 + }, + { + "epoch": 0.03545444886227009, + "grad_norm": 0.2842777967453003, + "learning_rate": 0.00045045456622047603, + "loss": 0.9793, + "step": 3170 + }, + { + "epoch": 0.03556629254953277, + "grad_norm": 0.30115559697151184, + "learning_rate": 0.00045022614098405594, + "loss": 0.9854, + "step": 3180 + }, + { + "epoch": 0.03567813623679546, + "grad_norm": 0.3350517153739929, + "learning_rate": 0.0004499977157476358, + "loss": 0.9787, + "step": 3190 + }, + { + "epoch": 0.035789979924058135, + "grad_norm": 0.2736664414405823, + "learning_rate": 0.00044976929051121565, + "loss": 1.0067, + "step": 3200 + }, + { + "epoch": 0.03590182361132082, + "grad_norm": 0.2868112027645111, + "learning_rate": 0.0004495408652747956, + "loss": 1.0002, + "step": 3210 + }, + { + "epoch": 0.0360136672985835, + "grad_norm": 0.27296972274780273, + "learning_rate": 0.00044931244003837547, + "loss": 0.9939, + "step": 3220 + }, + { + "epoch": 0.03612551098584618, + "grad_norm": 0.2894013226032257, + "learning_rate": 0.00044908401480195533, + "loss": 1.0017, + "step": 3230 + }, + { + "epoch": 0.03623735467310886, + "grad_norm": 0.26549386978149414, + "learning_rate": 0.0004488555895655352, + "loss": 0.9953, + "step": 3240 + }, + { + "epoch": 0.036349198360371544, + "grad_norm": 0.27381303906440735, + "learning_rate": 0.0004486271643291151, + "loss": 1.0077, + "step": 3250 + }, + { + "epoch": 0.03646104204763423, + "grad_norm": 0.2829972505569458, + "learning_rate": 0.00044839873909269495, + "loss": 1.0008, + "step": 3260 + }, + { + "epoch": 0.03657288573489691, + "grad_norm": 0.29023584723472595, + "learning_rate": 0.0004481703138562748, + "loss": 0.9999, + "step": 3270 + }, + { + "epoch": 0.03668472942215959, + "grad_norm": 0.29526880383491516, + "learning_rate": 0.00044794188861985477, + "loss": 0.9982, + "step": 3280 + }, + { + "epoch": 0.03679657310942227, + "grad_norm": 0.27724817395210266, + "learning_rate": 0.0004477134633834346, + "loss": 1.0109, + "step": 3290 + }, + { + "epoch": 0.036908416796684954, + "grad_norm": 0.2780180275440216, + "learning_rate": 0.0004474850381470145, + "loss": 0.997, + "step": 3300 + }, + { + "epoch": 0.03702026048394764, + "grad_norm": 0.29814234375953674, + "learning_rate": 0.0004472566129105944, + "loss": 1.0056, + "step": 3310 + }, + { + "epoch": 0.037132104171210316, + "grad_norm": 0.3131207823753357, + "learning_rate": 0.00044702818767417425, + "loss": 0.999, + "step": 3320 + }, + { + "epoch": 0.037243947858473, + "grad_norm": 0.2865641415119171, + "learning_rate": 0.0004467997624377541, + "loss": 0.9938, + "step": 3330 + }, + { + "epoch": 0.03735579154573568, + "grad_norm": 0.31247007846832275, + "learning_rate": 0.000446571337201334, + "loss": 1.0029, + "step": 3340 + }, + { + "epoch": 0.03746763523299836, + "grad_norm": 0.3432846665382385, + "learning_rate": 0.00044634291196491387, + "loss": 0.9861, + "step": 3350 + }, + { + "epoch": 0.03757947892026104, + "grad_norm": 0.3200684189796448, + "learning_rate": 0.0004461144867284938, + "loss": 0.9958, + "step": 3360 + }, + { + "epoch": 0.037691322607523725, + "grad_norm": 0.3280775547027588, + "learning_rate": 0.0004458860614920737, + "loss": 0.9972, + "step": 3370 + }, + { + "epoch": 0.03780316629478641, + "grad_norm": 0.3129955232143402, + "learning_rate": 0.00044565763625565354, + "loss": 0.9947, + "step": 3380 + }, + { + "epoch": 0.03791500998204909, + "grad_norm": 0.27574583888053894, + "learning_rate": 0.0004454292110192334, + "loss": 1.0004, + "step": 3390 + }, + { + "epoch": 0.03802685366931177, + "grad_norm": 0.3088320791721344, + "learning_rate": 0.0004452007857828133, + "loss": 0.9907, + "step": 3400 + }, + { + "epoch": 0.03813869735657445, + "grad_norm": 0.3232235908508301, + "learning_rate": 0.00044497236054639316, + "loss": 0.9956, + "step": 3410 + }, + { + "epoch": 0.038250541043837134, + "grad_norm": 0.3009951114654541, + "learning_rate": 0.000444743935309973, + "loss": 0.9899, + "step": 3420 + }, + { + "epoch": 0.03836238473109981, + "grad_norm": 0.2987104058265686, + "learning_rate": 0.00044451551007355293, + "loss": 0.9852, + "step": 3430 + }, + { + "epoch": 0.0384742284183625, + "grad_norm": 0.2890870273113251, + "learning_rate": 0.00044428708483713284, + "loss": 0.9775, + "step": 3440 + }, + { + "epoch": 0.03858607210562518, + "grad_norm": 0.2704969048500061, + "learning_rate": 0.0004440586596007127, + "loss": 0.9745, + "step": 3450 + }, + { + "epoch": 0.03869791579288786, + "grad_norm": 0.3041844964027405, + "learning_rate": 0.0004438302343642926, + "loss": 0.977, + "step": 3460 + }, + { + "epoch": 0.038809759480150544, + "grad_norm": 0.2794378995895386, + "learning_rate": 0.00044360180912787246, + "loss": 0.9818, + "step": 3470 + }, + { + "epoch": 0.03892160316741322, + "grad_norm": 0.2784910798072815, + "learning_rate": 0.0004433733838914523, + "loss": 0.9655, + "step": 3480 + }, + { + "epoch": 0.039033446854675906, + "grad_norm": 0.2610478103160858, + "learning_rate": 0.0004431449586550322, + "loss": 0.975, + "step": 3490 + }, + { + "epoch": 0.039145290541938584, + "grad_norm": 0.2646799087524414, + "learning_rate": 0.0004429165334186121, + "loss": 0.9767, + "step": 3500 + }, + { + "epoch": 0.03925713422920127, + "grad_norm": 0.2622663676738739, + "learning_rate": 0.00044268810818219194, + "loss": 0.98, + "step": 3510 + }, + { + "epoch": 0.03936897791646395, + "grad_norm": 0.26897987723350525, + "learning_rate": 0.0004424596829457719, + "loss": 0.9718, + "step": 3520 + }, + { + "epoch": 0.03948082160372663, + "grad_norm": 0.29816752672195435, + "learning_rate": 0.00044223125770935176, + "loss": 1.0074, + "step": 3530 + }, + { + "epoch": 0.039592665290989315, + "grad_norm": 0.2652198076248169, + "learning_rate": 0.0004420028324729316, + "loss": 0.9789, + "step": 3540 + }, + { + "epoch": 0.03970450897825199, + "grad_norm": 0.2648336887359619, + "learning_rate": 0.0004417744072365115, + "loss": 0.9794, + "step": 3550 + }, + { + "epoch": 0.03981635266551468, + "grad_norm": 0.25409677624702454, + "learning_rate": 0.0004415459820000914, + "loss": 0.9868, + "step": 3560 + }, + { + "epoch": 0.039928196352777355, + "grad_norm": 0.25675469636917114, + "learning_rate": 0.00044131755676367123, + "loss": 0.9827, + "step": 3570 + }, + { + "epoch": 0.04004004004004004, + "grad_norm": 0.2915634214878082, + "learning_rate": 0.00044108913152725114, + "loss": 0.9833, + "step": 3580 + }, + { + "epoch": 0.040151883727302724, + "grad_norm": 0.29538393020629883, + "learning_rate": 0.000440860706290831, + "loss": 0.9848, + "step": 3590 + }, + { + "epoch": 0.0402637274145654, + "grad_norm": 0.3026215732097626, + "learning_rate": 0.0004406322810544109, + "loss": 0.9778, + "step": 3600 + }, + { + "epoch": 0.04037557110182809, + "grad_norm": 0.30865418910980225, + "learning_rate": 0.0004404038558179908, + "loss": 0.9743, + "step": 3610 + }, + { + "epoch": 0.040487414789090764, + "grad_norm": 0.28092265129089355, + "learning_rate": 0.0004401754305815707, + "loss": 0.9795, + "step": 3620 + }, + { + "epoch": 0.04059925847635345, + "grad_norm": 0.27747923135757446, + "learning_rate": 0.00043994700534515053, + "loss": 0.9642, + "step": 3630 + }, + { + "epoch": 0.040711102163616134, + "grad_norm": 0.28192010521888733, + "learning_rate": 0.00043971858010873044, + "loss": 0.9742, + "step": 3640 + }, + { + "epoch": 0.04082294585087881, + "grad_norm": 0.2670564651489258, + "learning_rate": 0.0004394901548723103, + "loss": 0.9544, + "step": 3650 + }, + { + "epoch": 0.040934789538141496, + "grad_norm": 0.3089617192745209, + "learning_rate": 0.00043926172963589015, + "loss": 0.9563, + "step": 3660 + }, + { + "epoch": 0.041046633225404174, + "grad_norm": 0.26768213510513306, + "learning_rate": 0.00043903330439947, + "loss": 0.9531, + "step": 3670 + }, + { + "epoch": 0.04115847691266686, + "grad_norm": 0.28865131735801697, + "learning_rate": 0.00043880487916305, + "loss": 0.9579, + "step": 3680 + }, + { + "epoch": 0.041270320599929536, + "grad_norm": 0.27369582653045654, + "learning_rate": 0.00043857645392662983, + "loss": 0.9679, + "step": 3690 + }, + { + "epoch": 0.04138216428719222, + "grad_norm": 0.2889108955860138, + "learning_rate": 0.0004383480286902097, + "loss": 0.9561, + "step": 3700 + }, + { + "epoch": 0.041494007974454905, + "grad_norm": 0.2701929211616516, + "learning_rate": 0.0004381196034537896, + "loss": 0.9642, + "step": 3710 + }, + { + "epoch": 0.04160585166171758, + "grad_norm": 0.2817586064338684, + "learning_rate": 0.00043789117821736945, + "loss": 0.9701, + "step": 3720 + }, + { + "epoch": 0.04171769534898027, + "grad_norm": 0.2924664318561554, + "learning_rate": 0.0004376627529809493, + "loss": 0.9617, + "step": 3730 + }, + { + "epoch": 0.041829539036242945, + "grad_norm": 0.28590497374534607, + "learning_rate": 0.0004374343277445292, + "loss": 0.9646, + "step": 3740 + }, + { + "epoch": 0.04194138272350563, + "grad_norm": 0.270046591758728, + "learning_rate": 0.0004372059025081091, + "loss": 0.95, + "step": 3750 + }, + { + "epoch": 0.04205322641076831, + "grad_norm": 0.2508755326271057, + "learning_rate": 0.000436977477271689, + "loss": 0.9525, + "step": 3760 + }, + { + "epoch": 0.04216507009803099, + "grad_norm": 0.26878127455711365, + "learning_rate": 0.0004367490520352689, + "loss": 0.9609, + "step": 3770 + }, + { + "epoch": 0.04227691378529368, + "grad_norm": 0.26882994174957275, + "learning_rate": 0.00043652062679884875, + "loss": 0.9671, + "step": 3780 + }, + { + "epoch": 0.042388757472556354, + "grad_norm": 0.28049325942993164, + "learning_rate": 0.0004362922015624286, + "loss": 0.9492, + "step": 3790 + }, + { + "epoch": 0.04250060115981904, + "grad_norm": 0.33502647280693054, + "learning_rate": 0.0004360637763260085, + "loss": 0.9537, + "step": 3800 + }, + { + "epoch": 0.04261244484708172, + "grad_norm": 0.321997731924057, + "learning_rate": 0.00043583535108958837, + "loss": 0.9646, + "step": 3810 + }, + { + "epoch": 0.0427242885343444, + "grad_norm": 0.29477357864379883, + "learning_rate": 0.0004356069258531682, + "loss": 0.9794, + "step": 3820 + }, + { + "epoch": 0.04283613222160708, + "grad_norm": 0.2989972233772278, + "learning_rate": 0.0004353785006167482, + "loss": 0.9645, + "step": 3830 + }, + { + "epoch": 0.042947975908869764, + "grad_norm": 0.33459851145744324, + "learning_rate": 0.00043515007538032804, + "loss": 0.9556, + "step": 3840 + }, + { + "epoch": 0.04305981959613245, + "grad_norm": 0.2941781282424927, + "learning_rate": 0.0004349216501439079, + "loss": 0.9507, + "step": 3850 + }, + { + "epoch": 0.043171663283395126, + "grad_norm": 0.27801111340522766, + "learning_rate": 0.0004346932249074878, + "loss": 0.9623, + "step": 3860 + }, + { + "epoch": 0.04328350697065781, + "grad_norm": 0.2765832841396332, + "learning_rate": 0.00043446479967106767, + "loss": 0.9815, + "step": 3870 + }, + { + "epoch": 0.04339535065792049, + "grad_norm": 0.303786039352417, + "learning_rate": 0.0004342363744346475, + "loss": 0.9575, + "step": 3880 + }, + { + "epoch": 0.04350719434518317, + "grad_norm": 0.29517048597335815, + "learning_rate": 0.00043400794919822743, + "loss": 0.9554, + "step": 3890 + }, + { + "epoch": 0.04361903803244585, + "grad_norm": 0.28657206892967224, + "learning_rate": 0.0004337795239618073, + "loss": 0.9631, + "step": 3900 + }, + { + "epoch": 0.043730881719708535, + "grad_norm": 0.2933245003223419, + "learning_rate": 0.0004335510987253872, + "loss": 0.987, + "step": 3910 + }, + { + "epoch": 0.04384272540697122, + "grad_norm": 0.31331002712249756, + "learning_rate": 0.0004333226734889671, + "loss": 0.971, + "step": 3920 + }, + { + "epoch": 0.0439545690942339, + "grad_norm": 0.32431700825691223, + "learning_rate": 0.00043309424825254696, + "loss": 0.9603, + "step": 3930 + }, + { + "epoch": 0.04406641278149658, + "grad_norm": 0.3346642851829529, + "learning_rate": 0.0004328658230161268, + "loss": 0.9721, + "step": 3940 + }, + { + "epoch": 0.04417825646875926, + "grad_norm": 0.33921241760253906, + "learning_rate": 0.00043263739777970673, + "loss": 0.9639, + "step": 3950 + }, + { + "epoch": 0.044290100156021944, + "grad_norm": 0.3068247139453888, + "learning_rate": 0.0004324089725432866, + "loss": 0.9756, + "step": 3960 + }, + { + "epoch": 0.04440194384328462, + "grad_norm": 0.3049049973487854, + "learning_rate": 0.00043218054730686644, + "loss": 0.9693, + "step": 3970 + }, + { + "epoch": 0.04451378753054731, + "grad_norm": 0.30104655027389526, + "learning_rate": 0.00043195212207044635, + "loss": 0.9704, + "step": 3980 + }, + { + "epoch": 0.04462563121780999, + "grad_norm": 0.36955609917640686, + "learning_rate": 0.00043172369683402626, + "loss": 0.9527, + "step": 3990 + }, + { + "epoch": 0.04473747490507267, + "grad_norm": 0.318854957818985, + "learning_rate": 0.0004314952715976061, + "loss": 0.9543, + "step": 4000 + }, + { + "epoch": 0.044849318592335354, + "grad_norm": 0.3166191875934601, + "learning_rate": 0.000431266846361186, + "loss": 0.968, + "step": 4010 + }, + { + "epoch": 0.04496116227959803, + "grad_norm": 0.2976950407028198, + "learning_rate": 0.0004310384211247659, + "loss": 0.9822, + "step": 4020 + }, + { + "epoch": 0.045073005966860716, + "grad_norm": 0.2912284731864929, + "learning_rate": 0.00043080999588834574, + "loss": 0.9759, + "step": 4030 + }, + { + "epoch": 0.0451848496541234, + "grad_norm": 0.31027549505233765, + "learning_rate": 0.00043058157065192565, + "loss": 0.9794, + "step": 4040 + }, + { + "epoch": 0.04529669334138608, + "grad_norm": 0.3182738721370697, + "learning_rate": 0.0004303531454155055, + "loss": 0.9654, + "step": 4050 + }, + { + "epoch": 0.04540853702864876, + "grad_norm": 0.3006060719490051, + "learning_rate": 0.00043012472017908536, + "loss": 0.9548, + "step": 4060 + }, + { + "epoch": 0.04552038071591144, + "grad_norm": 0.2828291654586792, + "learning_rate": 0.0004298962949426653, + "loss": 0.9611, + "step": 4070 + }, + { + "epoch": 0.045632224403174125, + "grad_norm": 0.30988603830337524, + "learning_rate": 0.0004296678697062452, + "loss": 0.9614, + "step": 4080 + }, + { + "epoch": 0.0457440680904368, + "grad_norm": 0.29344943165779114, + "learning_rate": 0.00042943944446982503, + "loss": 0.9522, + "step": 4090 + }, + { + "epoch": 0.04585591177769949, + "grad_norm": 0.29713529348373413, + "learning_rate": 0.00042921101923340494, + "loss": 0.9468, + "step": 4100 + }, + { + "epoch": 0.04596775546496217, + "grad_norm": 0.2815961539745331, + "learning_rate": 0.0004289825939969848, + "loss": 0.9546, + "step": 4110 + }, + { + "epoch": 0.04607959915222485, + "grad_norm": 0.25218480825424194, + "learning_rate": 0.00042875416876056465, + "loss": 0.9372, + "step": 4120 + }, + { + "epoch": 0.046191442839487534, + "grad_norm": 0.2735552191734314, + "learning_rate": 0.0004285257435241445, + "loss": 0.942, + "step": 4130 + }, + { + "epoch": 0.04630328652675021, + "grad_norm": 0.27451473474502563, + "learning_rate": 0.0004282973182877245, + "loss": 0.931, + "step": 4140 + }, + { + "epoch": 0.0464151302140129, + "grad_norm": 0.24361196160316467, + "learning_rate": 0.00042806889305130433, + "loss": 0.924, + "step": 4150 + }, + { + "epoch": 0.046526973901275574, + "grad_norm": 0.25817179679870605, + "learning_rate": 0.0004278404678148842, + "loss": 0.9373, + "step": 4160 + }, + { + "epoch": 0.04663881758853826, + "grad_norm": 0.28722450137138367, + "learning_rate": 0.0004276120425784641, + "loss": 0.9271, + "step": 4170 + }, + { + "epoch": 0.046750661275800943, + "grad_norm": 0.25202882289886475, + "learning_rate": 0.00042738361734204395, + "loss": 0.9187, + "step": 4180 + }, + { + "epoch": 0.04686250496306362, + "grad_norm": 0.2637481391429901, + "learning_rate": 0.0004271551921056238, + "loss": 0.9402, + "step": 4190 + }, + { + "epoch": 0.046974348650326306, + "grad_norm": 0.2684090733528137, + "learning_rate": 0.0004269267668692037, + "loss": 0.9574, + "step": 4200 + }, + { + "epoch": 0.047086192337588983, + "grad_norm": 0.28711873292922974, + "learning_rate": 0.00042669834163278357, + "loss": 0.9551, + "step": 4210 + }, + { + "epoch": 0.04719803602485167, + "grad_norm": 0.2933102250099182, + "learning_rate": 0.0004264699163963635, + "loss": 0.9457, + "step": 4220 + }, + { + "epoch": 0.047309879712114346, + "grad_norm": 0.2875578701496124, + "learning_rate": 0.0004262414911599434, + "loss": 0.9667, + "step": 4230 + }, + { + "epoch": 0.04742172339937703, + "grad_norm": 0.3007104694843292, + "learning_rate": 0.00042601306592352325, + "loss": 0.9672, + "step": 4240 + }, + { + "epoch": 0.047533567086639715, + "grad_norm": 0.30211201310157776, + "learning_rate": 0.0004257846406871031, + "loss": 0.9781, + "step": 4250 + }, + { + "epoch": 0.04764541077390239, + "grad_norm": 0.29263827204704285, + "learning_rate": 0.000425556215450683, + "loss": 0.9923, + "step": 4260 + }, + { + "epoch": 0.04775725446116508, + "grad_norm": 0.29569676518440247, + "learning_rate": 0.00042532779021426287, + "loss": 0.9913, + "step": 4270 + }, + { + "epoch": 0.047869098148427755, + "grad_norm": 0.28223690390586853, + "learning_rate": 0.0004250993649778427, + "loss": 0.9817, + "step": 4280 + }, + { + "epoch": 0.04798094183569044, + "grad_norm": 0.271419882774353, + "learning_rate": 0.00042487093974142263, + "loss": 0.9977, + "step": 4290 + }, + { + "epoch": 0.04809278552295312, + "grad_norm": 0.26362791657447815, + "learning_rate": 0.00042464251450500254, + "loss": 0.9859, + "step": 4300 + }, + { + "epoch": 0.0482046292102158, + "grad_norm": 0.31365934014320374, + "learning_rate": 0.0004244140892685824, + "loss": 0.9862, + "step": 4310 + }, + { + "epoch": 0.04831647289747849, + "grad_norm": 0.26915237307548523, + "learning_rate": 0.0004241856640321623, + "loss": 0.9693, + "step": 4320 + }, + { + "epoch": 0.048428316584741164, + "grad_norm": 0.2639203369617462, + "learning_rate": 0.00042395723879574217, + "loss": 0.9691, + "step": 4330 + }, + { + "epoch": 0.04854016027200385, + "grad_norm": 0.30106601119041443, + "learning_rate": 0.000423728813559322, + "loss": 0.9521, + "step": 4340 + }, + { + "epoch": 0.04865200395926653, + "grad_norm": 0.2807524800300598, + "learning_rate": 0.00042350038832290193, + "loss": 0.9616, + "step": 4350 + }, + { + "epoch": 0.04876384764652921, + "grad_norm": 0.27363407611846924, + "learning_rate": 0.0004232719630864818, + "loss": 0.9538, + "step": 4360 + }, + { + "epoch": 0.048875691333791896, + "grad_norm": 0.29041701555252075, + "learning_rate": 0.00042304353785006164, + "loss": 0.9455, + "step": 4370 + }, + { + "epoch": 0.048987535021054573, + "grad_norm": 0.28237226605415344, + "learning_rate": 0.0004228151126136416, + "loss": 0.9615, + "step": 4380 + }, + { + "epoch": 0.04909937870831726, + "grad_norm": 0.30885329842567444, + "learning_rate": 0.00042258668737722146, + "loss": 0.9691, + "step": 4390 + }, + { + "epoch": 0.049211222395579936, + "grad_norm": 0.2734643220901489, + "learning_rate": 0.0004223582621408013, + "loss": 0.9663, + "step": 4400 + }, + { + "epoch": 0.04932306608284262, + "grad_norm": 0.2652278244495392, + "learning_rate": 0.00042212983690438123, + "loss": 0.9439, + "step": 4410 + }, + { + "epoch": 0.0494349097701053, + "grad_norm": 0.27749761939048767, + "learning_rate": 0.0004219014116679611, + "loss": 0.9623, + "step": 4420 + }, + { + "epoch": 0.04954675345736798, + "grad_norm": 0.2812553942203522, + "learning_rate": 0.00042167298643154094, + "loss": 0.9557, + "step": 4430 + }, + { + "epoch": 0.04965859714463067, + "grad_norm": 0.2762252688407898, + "learning_rate": 0.00042144456119512085, + "loss": 0.945, + "step": 4440 + }, + { + "epoch": 0.049770440831893345, + "grad_norm": 0.277118980884552, + "learning_rate": 0.0004212161359587007, + "loss": 0.93, + "step": 4450 + }, + { + "epoch": 0.04988228451915603, + "grad_norm": 0.2723037004470825, + "learning_rate": 0.0004209877107222806, + "loss": 0.963, + "step": 4460 + }, + { + "epoch": 0.04999412820641871, + "grad_norm": 0.29789137840270996, + "learning_rate": 0.0004207592854858605, + "loss": 0.954, + "step": 4470 + }, + { + "epoch": 0.05010597189368139, + "grad_norm": 0.26940014958381653, + "learning_rate": 0.0004205308602494404, + "loss": 0.9443, + "step": 4480 + }, + { + "epoch": 0.05021781558094407, + "grad_norm": 0.263300359249115, + "learning_rate": 0.00042030243501302024, + "loss": 0.9403, + "step": 4490 + }, + { + "epoch": 0.050329659268206754, + "grad_norm": 0.27823972702026367, + "learning_rate": 0.00042007400977660015, + "loss": 0.95, + "step": 4500 + }, + { + "epoch": 0.05044150295546944, + "grad_norm": 0.2782444357872009, + "learning_rate": 0.00041984558454018, + "loss": 0.953, + "step": 4510 + }, + { + "epoch": 0.05055334664273212, + "grad_norm": 0.277182936668396, + "learning_rate": 0.00041961715930375986, + "loss": 0.9498, + "step": 4520 + }, + { + "epoch": 0.0506651903299948, + "grad_norm": 0.2942575514316559, + "learning_rate": 0.00041938873406733977, + "loss": 0.957, + "step": 4530 + }, + { + "epoch": 0.05077703401725748, + "grad_norm": 0.3258327543735504, + "learning_rate": 0.0004191603088309197, + "loss": 0.9626, + "step": 4540 + }, + { + "epoch": 0.05088887770452016, + "grad_norm": 0.27874353528022766, + "learning_rate": 0.00041893188359449953, + "loss": 0.971, + "step": 4550 + }, + { + "epoch": 0.05100072139178284, + "grad_norm": 0.2981313169002533, + "learning_rate": 0.00041870345835807944, + "loss": 0.965, + "step": 4560 + }, + { + "epoch": 0.051112565079045526, + "grad_norm": 0.30568984150886536, + "learning_rate": 0.0004184750331216593, + "loss": 0.9566, + "step": 4570 + }, + { + "epoch": 0.05122440876630821, + "grad_norm": 0.27867600321769714, + "learning_rate": 0.00041824660788523915, + "loss": 0.94, + "step": 4580 + }, + { + "epoch": 0.05133625245357089, + "grad_norm": 0.30877605080604553, + "learning_rate": 0.000418018182648819, + "loss": 0.9453, + "step": 4590 + }, + { + "epoch": 0.05144809614083357, + "grad_norm": 0.3018844425678253, + "learning_rate": 0.0004177897574123989, + "loss": 0.9511, + "step": 4600 + }, + { + "epoch": 0.05155993982809625, + "grad_norm": 0.27943944931030273, + "learning_rate": 0.0004175613321759788, + "loss": 0.9371, + "step": 4610 + }, + { + "epoch": 0.051671783515358935, + "grad_norm": 0.2654775381088257, + "learning_rate": 0.0004173329069395587, + "loss": 0.9366, + "step": 4620 + }, + { + "epoch": 0.05178362720262161, + "grad_norm": 0.27594050765037537, + "learning_rate": 0.0004171044817031386, + "loss": 0.9229, + "step": 4630 + }, + { + "epoch": 0.0518954708898843, + "grad_norm": 0.26856914162635803, + "learning_rate": 0.00041687605646671845, + "loss": 0.9357, + "step": 4640 + }, + { + "epoch": 0.05200731457714698, + "grad_norm": 0.2956237494945526, + "learning_rate": 0.0004166476312302983, + "loss": 0.9023, + "step": 4650 + }, + { + "epoch": 0.05211915826440966, + "grad_norm": 0.30004164576530457, + "learning_rate": 0.0004164192059938782, + "loss": 0.9273, + "step": 4660 + }, + { + "epoch": 0.052231001951672344, + "grad_norm": 0.2691096365451813, + "learning_rate": 0.0004161907807574581, + "loss": 0.9332, + "step": 4670 + }, + { + "epoch": 0.05234284563893502, + "grad_norm": 0.2551780641078949, + "learning_rate": 0.00041596235552103793, + "loss": 0.9327, + "step": 4680 + }, + { + "epoch": 0.052454689326197707, + "grad_norm": 0.2806546092033386, + "learning_rate": 0.0004157339302846179, + "loss": 0.9355, + "step": 4690 + }, + { + "epoch": 0.05256653301346039, + "grad_norm": 0.27648645639419556, + "learning_rate": 0.00041550550504819775, + "loss": 0.9348, + "step": 4700 + }, + { + "epoch": 0.05267837670072307, + "grad_norm": 0.2816336750984192, + "learning_rate": 0.0004152770798117776, + "loss": 0.9294, + "step": 4710 + }, + { + "epoch": 0.05279022038798575, + "grad_norm": 0.29570698738098145, + "learning_rate": 0.0004150486545753575, + "loss": 0.9317, + "step": 4720 + }, + { + "epoch": 0.05290206407524843, + "grad_norm": 0.26981687545776367, + "learning_rate": 0.00041482022933893737, + "loss": 0.9317, + "step": 4730 + }, + { + "epoch": 0.053013907762511116, + "grad_norm": 0.2586159110069275, + "learning_rate": 0.0004145918041025172, + "loss": 0.9162, + "step": 4740 + }, + { + "epoch": 0.05312575144977379, + "grad_norm": 0.24129503965377808, + "learning_rate": 0.00041436337886609714, + "loss": 0.934, + "step": 4750 + }, + { + "epoch": 0.05323759513703648, + "grad_norm": 0.28072717785835266, + "learning_rate": 0.000414134953629677, + "loss": 0.9089, + "step": 4760 + }, + { + "epoch": 0.05334943882429916, + "grad_norm": 0.2760024964809418, + "learning_rate": 0.0004139065283932569, + "loss": 0.9115, + "step": 4770 + }, + { + "epoch": 0.05346128251156184, + "grad_norm": 0.28894710540771484, + "learning_rate": 0.0004136781031568368, + "loss": 0.9108, + "step": 4780 + }, + { + "epoch": 0.053573126198824525, + "grad_norm": 0.27882319688796997, + "learning_rate": 0.00041344967792041667, + "loss": 0.9184, + "step": 4790 + }, + { + "epoch": 0.0536849698860872, + "grad_norm": 0.27242934703826904, + "learning_rate": 0.0004132212526839965, + "loss": 0.9498, + "step": 4800 + }, + { + "epoch": 0.05379681357334989, + "grad_norm": 0.2809596359729767, + "learning_rate": 0.00041299282744757643, + "loss": 0.9365, + "step": 4810 + }, + { + "epoch": 0.053908657260612565, + "grad_norm": 0.3026556074619293, + "learning_rate": 0.0004127644022111563, + "loss": 0.9433, + "step": 4820 + }, + { + "epoch": 0.05402050094787525, + "grad_norm": 0.2933846116065979, + "learning_rate": 0.00041253597697473614, + "loss": 0.9351, + "step": 4830 + }, + { + "epoch": 0.054132344635137934, + "grad_norm": 0.2774868309497833, + "learning_rate": 0.00041230755173831605, + "loss": 0.9285, + "step": 4840 + }, + { + "epoch": 0.05424418832240061, + "grad_norm": 0.2859903573989868, + "learning_rate": 0.00041207912650189596, + "loss": 0.9344, + "step": 4850 + }, + { + "epoch": 0.054356032009663297, + "grad_norm": 0.26687270402908325, + "learning_rate": 0.0004118507012654758, + "loss": 0.9281, + "step": 4860 + }, + { + "epoch": 0.054467875696925974, + "grad_norm": 0.31075340509414673, + "learning_rate": 0.00041162227602905573, + "loss": 0.9418, + "step": 4870 + }, + { + "epoch": 0.05457971938418866, + "grad_norm": 0.2569184899330139, + "learning_rate": 0.0004113938507926356, + "loss": 0.9394, + "step": 4880 + }, + { + "epoch": 0.054691563071451336, + "grad_norm": 0.26250478625297546, + "learning_rate": 0.00041116542555621544, + "loss": 0.9499, + "step": 4890 + }, + { + "epoch": 0.05480340675871402, + "grad_norm": 0.27604004740715027, + "learning_rate": 0.00041093700031979535, + "loss": 0.9268, + "step": 4900 + }, + { + "epoch": 0.054915250445976706, + "grad_norm": 0.26279163360595703, + "learning_rate": 0.0004107085750833752, + "loss": 0.9313, + "step": 4910 + }, + { + "epoch": 0.05502709413323938, + "grad_norm": 0.29265978932380676, + "learning_rate": 0.00041048014984695506, + "loss": 0.9498, + "step": 4920 + }, + { + "epoch": 0.05513893782050207, + "grad_norm": 0.32107868790626526, + "learning_rate": 0.000410251724610535, + "loss": 0.9708, + "step": 4930 + }, + { + "epoch": 0.055250781507764746, + "grad_norm": 0.32804161310195923, + "learning_rate": 0.0004100232993741149, + "loss": 0.9624, + "step": 4940 + }, + { + "epoch": 0.05536262519502743, + "grad_norm": 0.3207037150859833, + "learning_rate": 0.00040979487413769474, + "loss": 0.9538, + "step": 4950 + }, + { + "epoch": 0.05547446888229011, + "grad_norm": 0.29660555720329285, + "learning_rate": 0.00040956644890127465, + "loss": 0.9677, + "step": 4960 + }, + { + "epoch": 0.05558631256955279, + "grad_norm": 0.34930771589279175, + "learning_rate": 0.0004093380236648545, + "loss": 0.9777, + "step": 4970 + }, + { + "epoch": 0.05569815625681548, + "grad_norm": 0.3037464916706085, + "learning_rate": 0.00040910959842843436, + "loss": 0.9826, + "step": 4980 + }, + { + "epoch": 0.055809999944078155, + "grad_norm": 0.31435292959213257, + "learning_rate": 0.00040888117319201427, + "loss": 0.9677, + "step": 4990 + }, + { + "epoch": 0.05592184363134084, + "grad_norm": 0.29182785749435425, + "learning_rate": 0.0004086527479555941, + "loss": 0.9563, + "step": 5000 + }, + { + "epoch": 0.05603368731860352, + "grad_norm": 0.34796231985092163, + "learning_rate": 0.00040842432271917403, + "loss": 0.957, + "step": 5010 + }, + { + "epoch": 0.0561455310058662, + "grad_norm": 0.3027050495147705, + "learning_rate": 0.00040819589748275394, + "loss": 0.967, + "step": 5020 + }, + { + "epoch": 0.056257374693128887, + "grad_norm": 0.3419332802295685, + "learning_rate": 0.0004079674722463338, + "loss": 0.9654, + "step": 5030 + }, + { + "epoch": 0.056369218380391564, + "grad_norm": 0.29381224513053894, + "learning_rate": 0.00040773904700991366, + "loss": 0.9647, + "step": 5040 + }, + { + "epoch": 0.05648106206765425, + "grad_norm": 0.29206860065460205, + "learning_rate": 0.0004075106217734935, + "loss": 0.9637, + "step": 5050 + }, + { + "epoch": 0.056592905754916926, + "grad_norm": 0.3169795274734497, + "learning_rate": 0.0004072821965370734, + "loss": 0.963, + "step": 5060 + }, + { + "epoch": 0.05670474944217961, + "grad_norm": 0.30713772773742676, + "learning_rate": 0.0004070537713006533, + "loss": 0.9766, + "step": 5070 + }, + { + "epoch": 0.05681659312944229, + "grad_norm": 0.29805994033813477, + "learning_rate": 0.00040682534606423313, + "loss": 0.9597, + "step": 5080 + }, + { + "epoch": 0.05692843681670497, + "grad_norm": 0.33419644832611084, + "learning_rate": 0.0004065969208278131, + "loss": 0.9598, + "step": 5090 + }, + { + "epoch": 0.05704028050396766, + "grad_norm": 0.31769025325775146, + "learning_rate": 0.00040636849559139295, + "loss": 0.942, + "step": 5100 + }, + { + "epoch": 0.057152124191230336, + "grad_norm": 0.3017726242542267, + "learning_rate": 0.0004061400703549728, + "loss": 0.9627, + "step": 5110 + }, + { + "epoch": 0.05726396787849302, + "grad_norm": 0.32213470339775085, + "learning_rate": 0.0004059116451185527, + "loss": 0.9518, + "step": 5120 + }, + { + "epoch": 0.0573758115657557, + "grad_norm": 0.29069948196411133, + "learning_rate": 0.0004056832198821326, + "loss": 0.9337, + "step": 5130 + }, + { + "epoch": 0.05748765525301838, + "grad_norm": 0.32283100485801697, + "learning_rate": 0.00040545479464571243, + "loss": 0.959, + "step": 5140 + }, + { + "epoch": 0.05759949894028106, + "grad_norm": 0.3191847801208496, + "learning_rate": 0.00040522636940929234, + "loss": 0.9439, + "step": 5150 + }, + { + "epoch": 0.057711342627543745, + "grad_norm": 0.565864622592926, + "learning_rate": 0.00040499794417287225, + "loss": 0.9587, + "step": 5160 + }, + { + "epoch": 0.05782318631480643, + "grad_norm": 0.3419003188610077, + "learning_rate": 0.0004047695189364521, + "loss": 0.9466, + "step": 5170 + }, + { + "epoch": 0.05793503000206911, + "grad_norm": 0.28331097960472107, + "learning_rate": 0.000404541093700032, + "loss": 0.9472, + "step": 5180 + }, + { + "epoch": 0.05804687368933179, + "grad_norm": 0.2994554042816162, + "learning_rate": 0.00040431266846361187, + "loss": 0.9434, + "step": 5190 + }, + { + "epoch": 0.05815871737659447, + "grad_norm": 0.30070000886917114, + "learning_rate": 0.0004040842432271917, + "loss": 0.9408, + "step": 5200 + }, + { + "epoch": 0.058270561063857154, + "grad_norm": 0.29924333095550537, + "learning_rate": 0.00040385581799077164, + "loss": 0.9484, + "step": 5210 + }, + { + "epoch": 0.05838240475111983, + "grad_norm": 0.2905283272266388, + "learning_rate": 0.0004036273927543515, + "loss": 0.9636, + "step": 5220 + }, + { + "epoch": 0.058494248438382516, + "grad_norm": 0.3290540874004364, + "learning_rate": 0.00040339896751793135, + "loss": 0.9396, + "step": 5230 + }, + { + "epoch": 0.0586060921256452, + "grad_norm": 0.29686272144317627, + "learning_rate": 0.0004031705422815113, + "loss": 0.9408, + "step": 5240 + }, + { + "epoch": 0.05871793581290788, + "grad_norm": 0.2768057882785797, + "learning_rate": 0.00040294211704509117, + "loss": 0.9328, + "step": 5250 + }, + { + "epoch": 0.05882977950017056, + "grad_norm": 0.2614899277687073, + "learning_rate": 0.000402713691808671, + "loss": 0.9483, + "step": 5260 + }, + { + "epoch": 0.05894162318743324, + "grad_norm": 0.2692766487598419, + "learning_rate": 0.00040248526657225093, + "loss": 0.9479, + "step": 5270 + }, + { + "epoch": 0.059053466874695926, + "grad_norm": 0.3009514808654785, + "learning_rate": 0.0004022568413358308, + "loss": 0.9681, + "step": 5280 + }, + { + "epoch": 0.0591653105619586, + "grad_norm": 0.27767086029052734, + "learning_rate": 0.00040202841609941064, + "loss": 0.9685, + "step": 5290 + }, + { + "epoch": 0.05927715424922129, + "grad_norm": 0.2956901788711548, + "learning_rate": 0.00040179999086299055, + "loss": 0.9609, + "step": 5300 + }, + { + "epoch": 0.05938899793648397, + "grad_norm": 0.3046570420265198, + "learning_rate": 0.0004015715656265704, + "loss": 0.961, + "step": 5310 + }, + { + "epoch": 0.05950084162374665, + "grad_norm": 0.24477365612983704, + "learning_rate": 0.0004013431403901503, + "loss": 0.9501, + "step": 5320 + }, + { + "epoch": 0.059612685311009335, + "grad_norm": 0.25505194067955017, + "learning_rate": 0.00040111471515373023, + "loss": 0.946, + "step": 5330 + }, + { + "epoch": 0.05972452899827201, + "grad_norm": 0.26015251874923706, + "learning_rate": 0.0004008862899173101, + "loss": 0.9372, + "step": 5340 + }, + { + "epoch": 0.0598363726855347, + "grad_norm": 0.24911250174045563, + "learning_rate": 0.00040065786468088994, + "loss": 0.9487, + "step": 5350 + }, + { + "epoch": 0.059948216372797375, + "grad_norm": 0.2779735028743744, + "learning_rate": 0.00040042943944446985, + "loss": 0.9316, + "step": 5360 + }, + { + "epoch": 0.06006006006006006, + "grad_norm": 0.30663251876831055, + "learning_rate": 0.0004002010142080497, + "loss": 0.9461, + "step": 5370 + }, + { + "epoch": 0.060171903747322744, + "grad_norm": 0.2724740505218506, + "learning_rate": 0.00039997258897162956, + "loss": 0.9214, + "step": 5380 + }, + { + "epoch": 0.06028374743458542, + "grad_norm": 0.26819276809692383, + "learning_rate": 0.00039974416373520947, + "loss": 0.9368, + "step": 5390 + }, + { + "epoch": 0.060395591121848106, + "grad_norm": 0.26342320442199707, + "learning_rate": 0.0003995157384987894, + "loss": 0.9332, + "step": 5400 + }, + { + "epoch": 0.060507434809110784, + "grad_norm": 0.32590556144714355, + "learning_rate": 0.00039928731326236924, + "loss": 0.9286, + "step": 5410 + }, + { + "epoch": 0.06061927849637347, + "grad_norm": 0.2747272849082947, + "learning_rate": 0.00039905888802594915, + "loss": 0.932, + "step": 5420 + }, + { + "epoch": 0.06073112218363615, + "grad_norm": 0.23089702427387238, + "learning_rate": 0.000398830462789529, + "loss": 0.9216, + "step": 5430 + }, + { + "epoch": 0.06084296587089883, + "grad_norm": 0.24383346736431122, + "learning_rate": 0.00039860203755310886, + "loss": 0.9333, + "step": 5440 + }, + { + "epoch": 0.060954809558161516, + "grad_norm": 0.23999489843845367, + "learning_rate": 0.00039837361231668877, + "loss": 0.9134, + "step": 5450 + }, + { + "epoch": 0.06106665324542419, + "grad_norm": 0.3041435480117798, + "learning_rate": 0.0003981451870802686, + "loss": 0.9226, + "step": 5460 + }, + { + "epoch": 0.06117849693268688, + "grad_norm": 0.2667579650878906, + "learning_rate": 0.0003979167618438485, + "loss": 0.9148, + "step": 5470 + }, + { + "epoch": 0.061290340619949556, + "grad_norm": 0.2730364203453064, + "learning_rate": 0.0003976883366074284, + "loss": 0.9073, + "step": 5480 + }, + { + "epoch": 0.06140218430721224, + "grad_norm": 0.28175118565559387, + "learning_rate": 0.0003974599113710083, + "loss": 0.9097, + "step": 5490 + }, + { + "epoch": 0.061514027994474925, + "grad_norm": 0.2826266288757324, + "learning_rate": 0.00039723148613458816, + "loss": 0.8972, + "step": 5500 + }, + { + "epoch": 0.0616258716817376, + "grad_norm": 0.25821810960769653, + "learning_rate": 0.000397003060898168, + "loss": 0.8898, + "step": 5510 + }, + { + "epoch": 0.06173771536900029, + "grad_norm": 0.31401073932647705, + "learning_rate": 0.0003967746356617479, + "loss": 0.8986, + "step": 5520 + }, + { + "epoch": 0.061849559056262965, + "grad_norm": 0.2664715349674225, + "learning_rate": 0.0003965462104253278, + "loss": 0.9178, + "step": 5530 + }, + { + "epoch": 0.06196140274352565, + "grad_norm": 0.2725924253463745, + "learning_rate": 0.00039631778518890763, + "loss": 0.8941, + "step": 5540 + }, + { + "epoch": 0.06207324643078833, + "grad_norm": 0.2991993725299835, + "learning_rate": 0.0003960893599524876, + "loss": 0.899, + "step": 5550 + }, + { + "epoch": 0.06218509011805101, + "grad_norm": 0.2683865427970886, + "learning_rate": 0.00039586093471606745, + "loss": 0.9105, + "step": 5560 + }, + { + "epoch": 0.062296933805313696, + "grad_norm": 0.29127469658851624, + "learning_rate": 0.0003956325094796473, + "loss": 0.9091, + "step": 5570 + }, + { + "epoch": 0.062408777492576374, + "grad_norm": 0.28191229701042175, + "learning_rate": 0.0003954040842432272, + "loss": 0.9078, + "step": 5580 + }, + { + "epoch": 0.06252062117983906, + "grad_norm": 0.28319644927978516, + "learning_rate": 0.0003951756590068071, + "loss": 0.9134, + "step": 5590 + }, + { + "epoch": 0.06263246486710174, + "grad_norm": 0.2563108205795288, + "learning_rate": 0.00039494723377038693, + "loss": 0.9166, + "step": 5600 + }, + { + "epoch": 0.06274430855436441, + "grad_norm": 0.29730817675590515, + "learning_rate": 0.00039471880853396684, + "loss": 0.9101, + "step": 5610 + }, + { + "epoch": 0.0628561522416271, + "grad_norm": 0.25925830006599426, + "learning_rate": 0.0003944903832975467, + "loss": 0.9131, + "step": 5620 + }, + { + "epoch": 0.06296799592888978, + "grad_norm": 0.2645208537578583, + "learning_rate": 0.0003942619580611266, + "loss": 0.9203, + "step": 5630 + }, + { + "epoch": 0.06307983961615246, + "grad_norm": 0.2844574749469757, + "learning_rate": 0.0003940335328247065, + "loss": 0.914, + "step": 5640 + }, + { + "epoch": 0.06319168330341515, + "grad_norm": 0.2687402367591858, + "learning_rate": 0.00039380510758828637, + "loss": 0.9095, + "step": 5650 + }, + { + "epoch": 0.06330352699067783, + "grad_norm": 0.22893477976322174, + "learning_rate": 0.00039357668235186623, + "loss": 0.8993, + "step": 5660 + }, + { + "epoch": 0.06341537067794051, + "grad_norm": 0.27271768450737, + "learning_rate": 0.00039334825711544614, + "loss": 0.8989, + "step": 5670 + }, + { + "epoch": 0.06352721436520319, + "grad_norm": 0.27709853649139404, + "learning_rate": 0.000393119831879026, + "loss": 0.8998, + "step": 5680 + }, + { + "epoch": 0.06363905805246588, + "grad_norm": 0.24321520328521729, + "learning_rate": 0.00039289140664260585, + "loss": 0.887, + "step": 5690 + }, + { + "epoch": 0.06375090173972855, + "grad_norm": 0.26779887080192566, + "learning_rate": 0.00039266298140618576, + "loss": 0.9091, + "step": 5700 + }, + { + "epoch": 0.06386274542699123, + "grad_norm": 0.2612350881099701, + "learning_rate": 0.00039243455616976567, + "loss": 0.9043, + "step": 5710 + }, + { + "epoch": 0.06397458911425392, + "grad_norm": 0.26247987151145935, + "learning_rate": 0.0003922061309333455, + "loss": 0.9024, + "step": 5720 + }, + { + "epoch": 0.0640864328015166, + "grad_norm": 0.2605653703212738, + "learning_rate": 0.00039197770569692543, + "loss": 0.9311, + "step": 5730 + }, + { + "epoch": 0.06419827648877928, + "grad_norm": 0.28249841928482056, + "learning_rate": 0.0003917492804605053, + "loss": 0.9265, + "step": 5740 + }, + { + "epoch": 0.06431012017604196, + "grad_norm": 0.2880108654499054, + "learning_rate": 0.00039152085522408515, + "loss": 0.9331, + "step": 5750 + }, + { + "epoch": 0.06442196386330465, + "grad_norm": 0.31626009941101074, + "learning_rate": 0.00039129242998766506, + "loss": 0.9483, + "step": 5760 + }, + { + "epoch": 0.06453380755056733, + "grad_norm": 0.28972744941711426, + "learning_rate": 0.0003910640047512449, + "loss": 0.9239, + "step": 5770 + }, + { + "epoch": 0.06464565123783, + "grad_norm": 0.27140864729881287, + "learning_rate": 0.00039083557951482477, + "loss": 0.9259, + "step": 5780 + }, + { + "epoch": 0.0647574949250927, + "grad_norm": 0.26331818103790283, + "learning_rate": 0.00039060715427840473, + "loss": 0.9383, + "step": 5790 + }, + { + "epoch": 0.06486933861235537, + "grad_norm": 0.26927000284194946, + "learning_rate": 0.0003903787290419846, + "loss": 0.9236, + "step": 5800 + }, + { + "epoch": 0.06498118229961805, + "grad_norm": 0.2833601236343384, + "learning_rate": 0.00039015030380556444, + "loss": 0.9257, + "step": 5810 + }, + { + "epoch": 0.06509302598688073, + "grad_norm": 0.2970174551010132, + "learning_rate": 0.00038992187856914435, + "loss": 0.9164, + "step": 5820 + }, + { + "epoch": 0.06520486967414342, + "grad_norm": 0.27904263138771057, + "learning_rate": 0.0003896934533327242, + "loss": 0.9045, + "step": 5830 + }, + { + "epoch": 0.0653167133614061, + "grad_norm": 0.24879537522792816, + "learning_rate": 0.00038946502809630406, + "loss": 0.9, + "step": 5840 + }, + { + "epoch": 0.06542855704866878, + "grad_norm": 0.2897798717021942, + "learning_rate": 0.000389236602859884, + "loss": 0.919, + "step": 5850 + }, + { + "epoch": 0.06554040073593147, + "grad_norm": 0.26522865891456604, + "learning_rate": 0.00038900817762346383, + "loss": 0.9168, + "step": 5860 + }, + { + "epoch": 0.06565224442319414, + "grad_norm": 0.26405441761016846, + "learning_rate": 0.00038877975238704374, + "loss": 0.9169, + "step": 5870 + }, + { + "epoch": 0.06576408811045682, + "grad_norm": 0.2543514370918274, + "learning_rate": 0.00038855132715062365, + "loss": 0.917, + "step": 5880 + }, + { + "epoch": 0.06587593179771951, + "grad_norm": 0.2683538794517517, + "learning_rate": 0.0003883229019142035, + "loss": 0.9179, + "step": 5890 + }, + { + "epoch": 0.06598777548498219, + "grad_norm": 0.24559274315834045, + "learning_rate": 0.00038809447667778336, + "loss": 0.907, + "step": 5900 + }, + { + "epoch": 0.06609961917224487, + "grad_norm": 0.2604455351829529, + "learning_rate": 0.00038786605144136327, + "loss": 0.9172, + "step": 5910 + }, + { + "epoch": 0.06621146285950755, + "grad_norm": 0.24329319596290588, + "learning_rate": 0.0003876376262049431, + "loss": 0.9171, + "step": 5920 + }, + { + "epoch": 0.06632330654677024, + "grad_norm": 0.237509623169899, + "learning_rate": 0.000387409200968523, + "loss": 0.9272, + "step": 5930 + }, + { + "epoch": 0.06643515023403292, + "grad_norm": 0.2569025754928589, + "learning_rate": 0.00038718077573210284, + "loss": 0.9327, + "step": 5940 + }, + { + "epoch": 0.0665469939212956, + "grad_norm": 0.2908497750759125, + "learning_rate": 0.0003869523504956828, + "loss": 0.9299, + "step": 5950 + }, + { + "epoch": 0.06665883760855829, + "grad_norm": 0.24669544398784637, + "learning_rate": 0.00038672392525926266, + "loss": 0.9036, + "step": 5960 + }, + { + "epoch": 0.06677068129582096, + "grad_norm": 0.23906981945037842, + "learning_rate": 0.0003864955000228425, + "loss": 0.9266, + "step": 5970 + }, + { + "epoch": 0.06688252498308364, + "grad_norm": 0.2822079658508301, + "learning_rate": 0.0003862670747864224, + "loss": 0.9209, + "step": 5980 + }, + { + "epoch": 0.06699436867034632, + "grad_norm": 0.27469775080680847, + "learning_rate": 0.0003860386495500023, + "loss": 0.9385, + "step": 5990 + }, + { + "epoch": 0.06710621235760901, + "grad_norm": 0.24559862911701202, + "learning_rate": 0.00038581022431358213, + "loss": 0.9248, + "step": 6000 + }, + { + "epoch": 0.06721805604487169, + "grad_norm": 0.24427008628845215, + "learning_rate": 0.00038558179907716204, + "loss": 0.9358, + "step": 6010 + }, + { + "epoch": 0.06732989973213437, + "grad_norm": 0.2626965641975403, + "learning_rate": 0.00038535337384074195, + "loss": 0.9211, + "step": 6020 + }, + { + "epoch": 0.06744174341939706, + "grad_norm": 0.226990208029747, + "learning_rate": 0.0003851249486043218, + "loss": 0.9292, + "step": 6030 + }, + { + "epoch": 0.06755358710665973, + "grad_norm": 0.2762834131717682, + "learning_rate": 0.0003848965233679017, + "loss": 0.932, + "step": 6040 + }, + { + "epoch": 0.06766543079392241, + "grad_norm": 0.2799958884716034, + "learning_rate": 0.0003846680981314816, + "loss": 0.943, + "step": 6050 + }, + { + "epoch": 0.06777727448118509, + "grad_norm": 0.26224029064178467, + "learning_rate": 0.00038443967289506143, + "loss": 0.9236, + "step": 6060 + }, + { + "epoch": 0.06788911816844778, + "grad_norm": 0.2897866368293762, + "learning_rate": 0.00038421124765864134, + "loss": 0.95, + "step": 6070 + }, + { + "epoch": 0.06800096185571046, + "grad_norm": 0.2899113893508911, + "learning_rate": 0.0003839828224222212, + "loss": 0.9403, + "step": 6080 + }, + { + "epoch": 0.06811280554297314, + "grad_norm": 0.27765581011772156, + "learning_rate": 0.00038375439718580105, + "loss": 0.9447, + "step": 6090 + }, + { + "epoch": 0.06822464923023583, + "grad_norm": 0.27683207392692566, + "learning_rate": 0.000383525971949381, + "loss": 0.949, + "step": 6100 + }, + { + "epoch": 0.0683364929174985, + "grad_norm": 0.2815559506416321, + "learning_rate": 0.00038329754671296087, + "loss": 0.9627, + "step": 6110 + }, + { + "epoch": 0.06844833660476118, + "grad_norm": 0.2741657793521881, + "learning_rate": 0.00038306912147654073, + "loss": 0.9659, + "step": 6120 + }, + { + "epoch": 0.06856018029202386, + "grad_norm": 0.4103181064128876, + "learning_rate": 0.00038284069624012064, + "loss": 0.9612, + "step": 6130 + }, + { + "epoch": 0.06867202397928655, + "grad_norm": 0.2862701416015625, + "learning_rate": 0.0003826122710037005, + "loss": 0.9393, + "step": 6140 + }, + { + "epoch": 0.06878386766654923, + "grad_norm": 0.2789844274520874, + "learning_rate": 0.00038238384576728035, + "loss": 0.9447, + "step": 6150 + }, + { + "epoch": 0.06889571135381191, + "grad_norm": 0.590391993522644, + "learning_rate": 0.00038215542053086026, + "loss": 0.9525, + "step": 6160 + }, + { + "epoch": 0.0690075550410746, + "grad_norm": 0.2721211016178131, + "learning_rate": 0.0003819269952944401, + "loss": 0.9467, + "step": 6170 + }, + { + "epoch": 0.06911939872833728, + "grad_norm": 0.27576929330825806, + "learning_rate": 0.00038169857005802, + "loss": 0.9428, + "step": 6180 + }, + { + "epoch": 0.06923124241559996, + "grad_norm": 0.28229111433029175, + "learning_rate": 0.00038147014482159993, + "loss": 0.9418, + "step": 6190 + }, + { + "epoch": 0.06934308610286263, + "grad_norm": 0.29595518112182617, + "learning_rate": 0.0003812417195851798, + "loss": 0.9178, + "step": 6200 + }, + { + "epoch": 0.06945492979012532, + "grad_norm": 0.3055596351623535, + "learning_rate": 0.00038101329434875965, + "loss": 0.9464, + "step": 6210 + }, + { + "epoch": 0.069566773477388, + "grad_norm": 0.29212549328804016, + "learning_rate": 0.00038078486911233956, + "loss": 0.9491, + "step": 6220 + }, + { + "epoch": 0.06967861716465068, + "grad_norm": 0.288928359746933, + "learning_rate": 0.0003805564438759194, + "loss": 0.9285, + "step": 6230 + }, + { + "epoch": 0.06979046085191337, + "grad_norm": 0.2759207487106323, + "learning_rate": 0.00038032801863949927, + "loss": 0.9336, + "step": 6240 + }, + { + "epoch": 0.06990230453917605, + "grad_norm": 0.31041648983955383, + "learning_rate": 0.0003800995934030792, + "loss": 0.9317, + "step": 6250 + }, + { + "epoch": 0.07001414822643873, + "grad_norm": 0.29425299167633057, + "learning_rate": 0.0003798711681666591, + "loss": 0.9212, + "step": 6260 + }, + { + "epoch": 0.0701259919137014, + "grad_norm": 0.278062105178833, + "learning_rate": 0.00037964274293023894, + "loss": 0.9291, + "step": 6270 + }, + { + "epoch": 0.0702378356009641, + "grad_norm": 0.2983698546886444, + "learning_rate": 0.00037941431769381885, + "loss": 0.9169, + "step": 6280 + }, + { + "epoch": 0.07034967928822677, + "grad_norm": 0.29595527052879333, + "learning_rate": 0.0003791858924573987, + "loss": 0.9286, + "step": 6290 + }, + { + "epoch": 0.07046152297548945, + "grad_norm": 0.26365020871162415, + "learning_rate": 0.00037895746722097856, + "loss": 0.9312, + "step": 6300 + }, + { + "epoch": 0.07057336666275214, + "grad_norm": 0.27807778120040894, + "learning_rate": 0.0003787290419845585, + "loss": 0.9274, + "step": 6310 + }, + { + "epoch": 0.07068521035001482, + "grad_norm": 0.2585415840148926, + "learning_rate": 0.00037850061674813833, + "loss": 0.9513, + "step": 6320 + }, + { + "epoch": 0.0707970540372775, + "grad_norm": 0.2740543484687805, + "learning_rate": 0.0003782721915117182, + "loss": 0.922, + "step": 6330 + }, + { + "epoch": 0.07090889772454018, + "grad_norm": 0.28271788358688354, + "learning_rate": 0.00037804376627529815, + "loss": 0.94, + "step": 6340 + }, + { + "epoch": 0.07102074141180287, + "grad_norm": 0.28767603635787964, + "learning_rate": 0.000377815341038878, + "loss": 0.9295, + "step": 6350 + }, + { + "epoch": 0.07113258509906555, + "grad_norm": 0.25200092792510986, + "learning_rate": 0.00037758691580245786, + "loss": 0.9219, + "step": 6360 + }, + { + "epoch": 0.07124442878632822, + "grad_norm": 0.27449852228164673, + "learning_rate": 0.00037735849056603777, + "loss": 0.9227, + "step": 6370 + }, + { + "epoch": 0.07135627247359091, + "grad_norm": 0.27951040863990784, + "learning_rate": 0.0003771300653296176, + "loss": 0.9256, + "step": 6380 + }, + { + "epoch": 0.07146811616085359, + "grad_norm": 0.27883175015449524, + "learning_rate": 0.0003769016400931975, + "loss": 0.9244, + "step": 6390 + }, + { + "epoch": 0.07157995984811627, + "grad_norm": 0.27942216396331787, + "learning_rate": 0.00037667321485677734, + "loss": 0.9287, + "step": 6400 + }, + { + "epoch": 0.07169180353537895, + "grad_norm": 0.2605076730251312, + "learning_rate": 0.00037644478962035725, + "loss": 0.9213, + "step": 6410 + }, + { + "epoch": 0.07180364722264164, + "grad_norm": 0.25812190771102905, + "learning_rate": 0.00037621636438393716, + "loss": 0.9268, + "step": 6420 + }, + { + "epoch": 0.07191549090990432, + "grad_norm": 0.27478551864624023, + "learning_rate": 0.000375987939147517, + "loss": 0.9341, + "step": 6430 + }, + { + "epoch": 0.072027334597167, + "grad_norm": 0.2799810469150543, + "learning_rate": 0.0003757595139110969, + "loss": 0.9308, + "step": 6440 + }, + { + "epoch": 0.07213917828442969, + "grad_norm": 0.2494313269853592, + "learning_rate": 0.0003755310886746768, + "loss": 0.9389, + "step": 6450 + }, + { + "epoch": 0.07225102197169236, + "grad_norm": 0.3362772762775421, + "learning_rate": 0.00037530266343825664, + "loss": 0.9362, + "step": 6460 + }, + { + "epoch": 0.07236286565895504, + "grad_norm": 0.28501999378204346, + "learning_rate": 0.00037507423820183655, + "loss": 0.9262, + "step": 6470 + }, + { + "epoch": 0.07247470934621772, + "grad_norm": 0.24787545204162598, + "learning_rate": 0.0003748458129654164, + "loss": 0.9409, + "step": 6480 + }, + { + "epoch": 0.07258655303348041, + "grad_norm": 0.277665913105011, + "learning_rate": 0.0003746173877289963, + "loss": 0.9244, + "step": 6490 + }, + { + "epoch": 0.07269839672074309, + "grad_norm": 0.2613317370414734, + "learning_rate": 0.0003743889624925762, + "loss": 0.9429, + "step": 6500 + }, + { + "epoch": 0.07281024040800577, + "grad_norm": 0.2740306258201599, + "learning_rate": 0.0003741605372561561, + "loss": 0.9422, + "step": 6510 + }, + { + "epoch": 0.07292208409526846, + "grad_norm": 0.3052440881729126, + "learning_rate": 0.00037393211201973593, + "loss": 0.9346, + "step": 6520 + }, + { + "epoch": 0.07303392778253114, + "grad_norm": 0.27979132533073425, + "learning_rate": 0.00037370368678331584, + "loss": 0.9305, + "step": 6530 + }, + { + "epoch": 0.07314577146979381, + "grad_norm": 0.2834227979183197, + "learning_rate": 0.0003734752615468957, + "loss": 0.9305, + "step": 6540 + }, + { + "epoch": 0.07325761515705649, + "grad_norm": 0.28621387481689453, + "learning_rate": 0.00037324683631047555, + "loss": 0.9505, + "step": 6550 + }, + { + "epoch": 0.07336945884431918, + "grad_norm": 0.2539358139038086, + "learning_rate": 0.00037301841107405546, + "loss": 0.9491, + "step": 6560 + }, + { + "epoch": 0.07348130253158186, + "grad_norm": 0.29257437586784363, + "learning_rate": 0.0003727899858376354, + "loss": 0.9428, + "step": 6570 + }, + { + "epoch": 0.07359314621884454, + "grad_norm": 0.25158485770225525, + "learning_rate": 0.00037256156060121523, + "loss": 0.9471, + "step": 6580 + }, + { + "epoch": 0.07370498990610723, + "grad_norm": 0.26301345229148865, + "learning_rate": 0.00037233313536479514, + "loss": 0.928, + "step": 6590 + }, + { + "epoch": 0.07381683359336991, + "grad_norm": 0.2519192397594452, + "learning_rate": 0.000372104710128375, + "loss": 0.9189, + "step": 6600 + }, + { + "epoch": 0.07392867728063258, + "grad_norm": 0.29801836609840393, + "learning_rate": 0.00037187628489195485, + "loss": 0.9218, + "step": 6610 + }, + { + "epoch": 0.07404052096789528, + "grad_norm": 0.30779263377189636, + "learning_rate": 0.00037164785965553476, + "loss": 0.9263, + "step": 6620 + }, + { + "epoch": 0.07415236465515795, + "grad_norm": 0.2758638262748718, + "learning_rate": 0.0003714194344191146, + "loss": 0.904, + "step": 6630 + }, + { + "epoch": 0.07426420834242063, + "grad_norm": 0.26482871174812317, + "learning_rate": 0.00037119100918269447, + "loss": 0.9024, + "step": 6640 + }, + { + "epoch": 0.07437605202968331, + "grad_norm": 0.24001047015190125, + "learning_rate": 0.00037096258394627444, + "loss": 0.914, + "step": 6650 + }, + { + "epoch": 0.074487895716946, + "grad_norm": 0.2694549560546875, + "learning_rate": 0.0003707341587098543, + "loss": 0.921, + "step": 6660 + }, + { + "epoch": 0.07459973940420868, + "grad_norm": 0.25042393803596497, + "learning_rate": 0.00037050573347343415, + "loss": 0.9108, + "step": 6670 + }, + { + "epoch": 0.07471158309147136, + "grad_norm": 0.25945019721984863, + "learning_rate": 0.00037027730823701406, + "loss": 0.912, + "step": 6680 + }, + { + "epoch": 0.07482342677873405, + "grad_norm": 0.2624742090702057, + "learning_rate": 0.0003700488830005939, + "loss": 0.9108, + "step": 6690 + }, + { + "epoch": 0.07493527046599673, + "grad_norm": 0.27438145875930786, + "learning_rate": 0.00036982045776417377, + "loss": 0.9215, + "step": 6700 + }, + { + "epoch": 0.0750471141532594, + "grad_norm": 0.27610865235328674, + "learning_rate": 0.0003695920325277537, + "loss": 0.9053, + "step": 6710 + }, + { + "epoch": 0.07515895784052208, + "grad_norm": 0.2616426944732666, + "learning_rate": 0.00036936360729133353, + "loss": 0.9255, + "step": 6720 + }, + { + "epoch": 0.07527080152778477, + "grad_norm": 0.3146522641181946, + "learning_rate": 0.00036913518205491344, + "loss": 0.9105, + "step": 6730 + }, + { + "epoch": 0.07538264521504745, + "grad_norm": 0.29139819741249084, + "learning_rate": 0.00036890675681849335, + "loss": 0.9324, + "step": 6740 + }, + { + "epoch": 0.07549448890231013, + "grad_norm": 0.3176229000091553, + "learning_rate": 0.0003686783315820732, + "loss": 0.9434, + "step": 6750 + }, + { + "epoch": 0.07560633258957282, + "grad_norm": 0.2786601781845093, + "learning_rate": 0.00036844990634565307, + "loss": 0.9405, + "step": 6760 + }, + { + "epoch": 0.0757181762768355, + "grad_norm": 0.2988050580024719, + "learning_rate": 0.000368221481109233, + "loss": 0.9477, + "step": 6770 + }, + { + "epoch": 0.07583001996409817, + "grad_norm": 0.28120875358581543, + "learning_rate": 0.00036799305587281283, + "loss": 0.9521, + "step": 6780 + }, + { + "epoch": 0.07594186365136085, + "grad_norm": 0.27376359701156616, + "learning_rate": 0.0003677646306363927, + "loss": 0.9405, + "step": 6790 + }, + { + "epoch": 0.07605370733862354, + "grad_norm": 0.2721284329891205, + "learning_rate": 0.0003675362053999726, + "loss": 0.9392, + "step": 6800 + }, + { + "epoch": 0.07616555102588622, + "grad_norm": 0.31443721055984497, + "learning_rate": 0.0003673077801635525, + "loss": 0.939, + "step": 6810 + }, + { + "epoch": 0.0762773947131489, + "grad_norm": 0.27175766229629517, + "learning_rate": 0.00036707935492713236, + "loss": 0.9262, + "step": 6820 + }, + { + "epoch": 0.07638923840041159, + "grad_norm": 0.2984711527824402, + "learning_rate": 0.00036685092969071227, + "loss": 0.9381, + "step": 6830 + }, + { + "epoch": 0.07650108208767427, + "grad_norm": 0.2773591876029968, + "learning_rate": 0.00036662250445429213, + "loss": 0.9217, + "step": 6840 + }, + { + "epoch": 0.07661292577493695, + "grad_norm": 0.29338565468788147, + "learning_rate": 0.000366394079217872, + "loss": 0.9197, + "step": 6850 + }, + { + "epoch": 0.07672476946219962, + "grad_norm": 0.2456415593624115, + "learning_rate": 0.00036616565398145184, + "loss": 0.9191, + "step": 6860 + }, + { + "epoch": 0.07683661314946232, + "grad_norm": 0.324935644865036, + "learning_rate": 0.00036593722874503175, + "loss": 0.8975, + "step": 6870 + }, + { + "epoch": 0.076948456836725, + "grad_norm": 0.6967706680297852, + "learning_rate": 0.0003657088035086116, + "loss": 0.9053, + "step": 6880 + }, + { + "epoch": 0.07706030052398767, + "grad_norm": 0.8192552328109741, + "learning_rate": 0.0003654803782721915, + "loss": 0.9129, + "step": 6890 + }, + { + "epoch": 0.07717214421125036, + "grad_norm": 0.4698907136917114, + "learning_rate": 0.0003652519530357714, + "loss": 0.9128, + "step": 6900 + }, + { + "epoch": 0.07728398789851304, + "grad_norm": 0.3055092990398407, + "learning_rate": 0.0003650235277993513, + "loss": 0.9207, + "step": 6910 + }, + { + "epoch": 0.07739583158577572, + "grad_norm": 0.31879591941833496, + "learning_rate": 0.00036479510256293114, + "loss": 0.9101, + "step": 6920 + }, + { + "epoch": 0.0775076752730384, + "grad_norm": 0.2708083987236023, + "learning_rate": 0.00036456667732651105, + "loss": 0.9036, + "step": 6930 + }, + { + "epoch": 0.07761951896030109, + "grad_norm": 0.2801443040370941, + "learning_rate": 0.0003643382520900909, + "loss": 0.9031, + "step": 6940 + }, + { + "epoch": 0.07773136264756376, + "grad_norm": 0.2481400966644287, + "learning_rate": 0.00036410982685367076, + "loss": 0.8952, + "step": 6950 + }, + { + "epoch": 0.07784320633482644, + "grad_norm": 0.25424808263778687, + "learning_rate": 0.0003638814016172507, + "loss": 0.8846, + "step": 6960 + }, + { + "epoch": 0.07795505002208913, + "grad_norm": 0.2655096650123596, + "learning_rate": 0.0003636529763808306, + "loss": 0.8922, + "step": 6970 + }, + { + "epoch": 0.07806689370935181, + "grad_norm": 0.281180202960968, + "learning_rate": 0.00036342455114441043, + "loss": 0.8934, + "step": 6980 + }, + { + "epoch": 0.07817873739661449, + "grad_norm": 0.2850550413131714, + "learning_rate": 0.00036319612590799034, + "loss": 0.8856, + "step": 6990 + }, + { + "epoch": 0.07829058108387717, + "grad_norm": 0.24838604032993317, + "learning_rate": 0.0003629677006715702, + "loss": 0.905, + "step": 7000 + }, + { + "epoch": 0.07840242477113986, + "grad_norm": 0.2703045606613159, + "learning_rate": 0.00036273927543515005, + "loss": 0.8816, + "step": 7010 + }, + { + "epoch": 0.07851426845840254, + "grad_norm": 0.2786656320095062, + "learning_rate": 0.00036251085019872996, + "loss": 0.8997, + "step": 7020 + }, + { + "epoch": 0.07862611214566521, + "grad_norm": 0.2771463692188263, + "learning_rate": 0.0003622824249623098, + "loss": 0.9033, + "step": 7030 + }, + { + "epoch": 0.0787379558329279, + "grad_norm": 0.2721976339817047, + "learning_rate": 0.00036205399972588973, + "loss": 0.9109, + "step": 7040 + }, + { + "epoch": 0.07884979952019058, + "grad_norm": 0.2606031596660614, + "learning_rate": 0.00036182557448946964, + "loss": 0.9221, + "step": 7050 + }, + { + "epoch": 0.07896164320745326, + "grad_norm": 0.45895281434059143, + "learning_rate": 0.0003615971492530495, + "loss": 0.908, + "step": 7060 + }, + { + "epoch": 0.07907348689471594, + "grad_norm": 0.30524522066116333, + "learning_rate": 0.00036136872401662935, + "loss": 0.9234, + "step": 7070 + }, + { + "epoch": 0.07918533058197863, + "grad_norm": 0.2704319953918457, + "learning_rate": 0.00036114029878020926, + "loss": 0.9003, + "step": 7080 + }, + { + "epoch": 0.07929717426924131, + "grad_norm": 0.2770727872848511, + "learning_rate": 0.0003609118735437891, + "loss": 0.9253, + "step": 7090 + }, + { + "epoch": 0.07940901795650399, + "grad_norm": 0.25288262963294983, + "learning_rate": 0.00036068344830736897, + "loss": 0.9089, + "step": 7100 + }, + { + "epoch": 0.07952086164376668, + "grad_norm": 0.27105236053466797, + "learning_rate": 0.0003604550230709489, + "loss": 0.9138, + "step": 7110 + }, + { + "epoch": 0.07963270533102935, + "grad_norm": 0.2631518840789795, + "learning_rate": 0.0003602265978345288, + "loss": 0.9226, + "step": 7120 + }, + { + "epoch": 0.07974454901829203, + "grad_norm": 0.25269970297813416, + "learning_rate": 0.00035999817259810865, + "loss": 0.9102, + "step": 7130 + }, + { + "epoch": 0.07985639270555471, + "grad_norm": 0.2576468586921692, + "learning_rate": 0.00035976974736168856, + "loss": 0.9075, + "step": 7140 + }, + { + "epoch": 0.0799682363928174, + "grad_norm": 0.26297688484191895, + "learning_rate": 0.0003595413221252684, + "loss": 0.9004, + "step": 7150 + }, + { + "epoch": 0.08008008008008008, + "grad_norm": 0.3029099702835083, + "learning_rate": 0.00035931289688884827, + "loss": 0.9165, + "step": 7160 + }, + { + "epoch": 0.08019192376734276, + "grad_norm": 0.2699684798717499, + "learning_rate": 0.0003590844716524282, + "loss": 0.9232, + "step": 7170 + }, + { + "epoch": 0.08030376745460545, + "grad_norm": 0.26480093598365784, + "learning_rate": 0.00035885604641600804, + "loss": 0.9319, + "step": 7180 + }, + { + "epoch": 0.08041561114186813, + "grad_norm": 0.27503007650375366, + "learning_rate": 0.0003586276211795879, + "loss": 0.9398, + "step": 7190 + }, + { + "epoch": 0.0805274548291308, + "grad_norm": 0.2715147137641907, + "learning_rate": 0.00035839919594316785, + "loss": 0.9307, + "step": 7200 + }, + { + "epoch": 0.08063929851639348, + "grad_norm": 0.2697315812110901, + "learning_rate": 0.0003581707707067477, + "loss": 0.9342, + "step": 7210 + }, + { + "epoch": 0.08075114220365617, + "grad_norm": 0.2833189070224762, + "learning_rate": 0.00035794234547032757, + "loss": 0.9604, + "step": 7220 + }, + { + "epoch": 0.08086298589091885, + "grad_norm": 0.3069300353527069, + "learning_rate": 0.0003577139202339075, + "loss": 0.9397, + "step": 7230 + }, + { + "epoch": 0.08097482957818153, + "grad_norm": 0.28459593653678894, + "learning_rate": 0.00035748549499748733, + "loss": 0.925, + "step": 7240 + }, + { + "epoch": 0.08108667326544422, + "grad_norm": 0.28896769881248474, + "learning_rate": 0.0003572570697610672, + "loss": 0.9245, + "step": 7250 + }, + { + "epoch": 0.0811985169527069, + "grad_norm": 0.2574586272239685, + "learning_rate": 0.0003570286445246471, + "loss": 0.9326, + "step": 7260 + }, + { + "epoch": 0.08131036063996958, + "grad_norm": 0.2965002954006195, + "learning_rate": 0.00035680021928822695, + "loss": 0.9221, + "step": 7270 + }, + { + "epoch": 0.08142220432723227, + "grad_norm": 0.2657724618911743, + "learning_rate": 0.00035657179405180686, + "loss": 0.9143, + "step": 7280 + }, + { + "epoch": 0.08153404801449494, + "grad_norm": 0.2973329424858093, + "learning_rate": 0.0003563433688153867, + "loss": 0.9164, + "step": 7290 + }, + { + "epoch": 0.08164589170175762, + "grad_norm": 0.3032989501953125, + "learning_rate": 0.00035611494357896663, + "loss": 0.9254, + "step": 7300 + }, + { + "epoch": 0.0817577353890203, + "grad_norm": 0.28107839822769165, + "learning_rate": 0.0003558865183425465, + "loss": 0.9155, + "step": 7310 + }, + { + "epoch": 0.08186957907628299, + "grad_norm": 0.30296218395233154, + "learning_rate": 0.00035565809310612634, + "loss": 0.9218, + "step": 7320 + }, + { + "epoch": 0.08198142276354567, + "grad_norm": 0.28191155195236206, + "learning_rate": 0.00035542966786970625, + "loss": 0.9197, + "step": 7330 + }, + { + "epoch": 0.08209326645080835, + "grad_norm": 0.3113023638725281, + "learning_rate": 0.0003552012426332861, + "loss": 0.9228, + "step": 7340 + }, + { + "epoch": 0.08220511013807104, + "grad_norm": 0.3066212832927704, + "learning_rate": 0.00035497281739686596, + "loss": 0.9191, + "step": 7350 + }, + { + "epoch": 0.08231695382533372, + "grad_norm": 0.2658233940601349, + "learning_rate": 0.0003547443921604459, + "loss": 0.918, + "step": 7360 + }, + { + "epoch": 0.0824287975125964, + "grad_norm": 0.28222033381462097, + "learning_rate": 0.0003545159669240258, + "loss": 0.9253, + "step": 7370 + }, + { + "epoch": 0.08254064119985907, + "grad_norm": 0.2917843461036682, + "learning_rate": 0.00035428754168760564, + "loss": 0.9059, + "step": 7380 + }, + { + "epoch": 0.08265248488712176, + "grad_norm": 0.290404349565506, + "learning_rate": 0.00035405911645118555, + "loss": 0.9044, + "step": 7390 + }, + { + "epoch": 0.08276432857438444, + "grad_norm": 0.28990834951400757, + "learning_rate": 0.0003538306912147654, + "loss": 0.9078, + "step": 7400 + }, + { + "epoch": 0.08287617226164712, + "grad_norm": 0.27296292781829834, + "learning_rate": 0.00035360226597834526, + "loss": 0.9081, + "step": 7410 + }, + { + "epoch": 0.08298801594890981, + "grad_norm": 0.25443321466445923, + "learning_rate": 0.00035337384074192517, + "loss": 0.9019, + "step": 7420 + }, + { + "epoch": 0.08309985963617249, + "grad_norm": 0.25014832615852356, + "learning_rate": 0.0003531454155055051, + "loss": 0.8976, + "step": 7430 + }, + { + "epoch": 0.08321170332343517, + "grad_norm": 0.2844237983226776, + "learning_rate": 0.00035291699026908493, + "loss": 0.9039, + "step": 7440 + }, + { + "epoch": 0.08332354701069784, + "grad_norm": 0.26745542883872986, + "learning_rate": 0.00035268856503266484, + "loss": 0.8813, + "step": 7450 + }, + { + "epoch": 0.08343539069796053, + "grad_norm": 0.30750566720962524, + "learning_rate": 0.0003524601397962447, + "loss": 0.8988, + "step": 7460 + }, + { + "epoch": 0.08354723438522321, + "grad_norm": 0.2960536777973175, + "learning_rate": 0.00035223171455982456, + "loss": 0.8966, + "step": 7470 + }, + { + "epoch": 0.08365907807248589, + "grad_norm": 0.28923213481903076, + "learning_rate": 0.00035200328932340447, + "loss": 0.8872, + "step": 7480 + }, + { + "epoch": 0.08377092175974858, + "grad_norm": 0.2762465476989746, + "learning_rate": 0.0003517748640869843, + "loss": 0.8655, + "step": 7490 + }, + { + "epoch": 0.08388276544701126, + "grad_norm": 0.2870965301990509, + "learning_rate": 0.0003515464388505642, + "loss": 0.889, + "step": 7500 + }, + { + "epoch": 0.08399460913427394, + "grad_norm": 0.3135611116886139, + "learning_rate": 0.00035131801361414414, + "loss": 0.8898, + "step": 7510 + }, + { + "epoch": 0.08410645282153661, + "grad_norm": 0.29541128873825073, + "learning_rate": 0.000351089588377724, + "loss": 0.8884, + "step": 7520 + }, + { + "epoch": 0.0842182965087993, + "grad_norm": 0.2667001485824585, + "learning_rate": 0.00035086116314130385, + "loss": 0.8923, + "step": 7530 + }, + { + "epoch": 0.08433014019606198, + "grad_norm": 0.28677645325660706, + "learning_rate": 0.00035063273790488376, + "loss": 0.8862, + "step": 7540 + }, + { + "epoch": 0.08444198388332466, + "grad_norm": 0.26973757147789, + "learning_rate": 0.0003504043126684636, + "loss": 0.8739, + "step": 7550 + }, + { + "epoch": 0.08455382757058735, + "grad_norm": 0.2670735716819763, + "learning_rate": 0.0003501758874320435, + "loss": 0.8843, + "step": 7560 + }, + { + "epoch": 0.08466567125785003, + "grad_norm": 0.2678844928741455, + "learning_rate": 0.0003499474621956234, + "loss": 0.8855, + "step": 7570 + }, + { + "epoch": 0.08477751494511271, + "grad_norm": 0.26894411444664, + "learning_rate": 0.00034971903695920324, + "loss": 0.8828, + "step": 7580 + }, + { + "epoch": 0.08488935863237539, + "grad_norm": 0.28703927993774414, + "learning_rate": 0.00034949061172278315, + "loss": 0.885, + "step": 7590 + }, + { + "epoch": 0.08500120231963808, + "grad_norm": 0.2618086636066437, + "learning_rate": 0.00034926218648636306, + "loss": 0.8777, + "step": 7600 + }, + { + "epoch": 0.08511304600690076, + "grad_norm": 0.28816747665405273, + "learning_rate": 0.0003490337612499429, + "loss": 0.8836, + "step": 7610 + }, + { + "epoch": 0.08522488969416343, + "grad_norm": 0.29172763228416443, + "learning_rate": 0.00034880533601352277, + "loss": 0.8835, + "step": 7620 + }, + { + "epoch": 0.08533673338142612, + "grad_norm": 0.2613106667995453, + "learning_rate": 0.0003485769107771027, + "loss": 0.8736, + "step": 7630 + }, + { + "epoch": 0.0854485770686888, + "grad_norm": 0.2737283408641815, + "learning_rate": 0.00034834848554068254, + "loss": 0.8589, + "step": 7640 + }, + { + "epoch": 0.08556042075595148, + "grad_norm": 0.2709786295890808, + "learning_rate": 0.0003481200603042624, + "loss": 0.8675, + "step": 7650 + }, + { + "epoch": 0.08567226444321416, + "grad_norm": 0.2982759177684784, + "learning_rate": 0.0003478916350678423, + "loss": 0.8827, + "step": 7660 + }, + { + "epoch": 0.08578410813047685, + "grad_norm": 0.21551093459129333, + "learning_rate": 0.0003476632098314222, + "loss": 0.8663, + "step": 7670 + }, + { + "epoch": 0.08589595181773953, + "grad_norm": 0.26418018341064453, + "learning_rate": 0.00034743478459500207, + "loss": 0.8845, + "step": 7680 + }, + { + "epoch": 0.0860077955050022, + "grad_norm": 0.2310175597667694, + "learning_rate": 0.000347206359358582, + "loss": 0.8874, + "step": 7690 + }, + { + "epoch": 0.0861196391922649, + "grad_norm": 0.25112512707710266, + "learning_rate": 0.00034697793412216183, + "loss": 0.8896, + "step": 7700 + }, + { + "epoch": 0.08623148287952757, + "grad_norm": 0.33391082286834717, + "learning_rate": 0.0003467495088857417, + "loss": 0.8765, + "step": 7710 + }, + { + "epoch": 0.08634332656679025, + "grad_norm": 0.24641484022140503, + "learning_rate": 0.0003465210836493216, + "loss": 0.8572, + "step": 7720 + }, + { + "epoch": 0.08645517025405293, + "grad_norm": 0.26017534732818604, + "learning_rate": 0.00034629265841290145, + "loss": 0.8585, + "step": 7730 + }, + { + "epoch": 0.08656701394131562, + "grad_norm": 0.23500847816467285, + "learning_rate": 0.0003460642331764813, + "loss": 0.8797, + "step": 7740 + }, + { + "epoch": 0.0866788576285783, + "grad_norm": 0.25485488772392273, + "learning_rate": 0.0003458358079400612, + "loss": 0.8796, + "step": 7750 + }, + { + "epoch": 0.08679070131584098, + "grad_norm": 0.27644404768943787, + "learning_rate": 0.00034560738270364113, + "loss": 0.8708, + "step": 7760 + }, + { + "epoch": 0.08690254500310367, + "grad_norm": 0.233077734708786, + "learning_rate": 0.000345378957467221, + "loss": 0.8652, + "step": 7770 + }, + { + "epoch": 0.08701438869036635, + "grad_norm": 0.24039144814014435, + "learning_rate": 0.00034515053223080084, + "loss": 0.8723, + "step": 7780 + }, + { + "epoch": 0.08712623237762902, + "grad_norm": 0.23007874190807343, + "learning_rate": 0.00034492210699438075, + "loss": 0.8644, + "step": 7790 + }, + { + "epoch": 0.0872380760648917, + "grad_norm": 0.27570798993110657, + "learning_rate": 0.0003446936817579606, + "loss": 0.872, + "step": 7800 + }, + { + "epoch": 0.08734991975215439, + "grad_norm": 0.24157382547855377, + "learning_rate": 0.00034446525652154046, + "loss": 0.8846, + "step": 7810 + }, + { + "epoch": 0.08746176343941707, + "grad_norm": 0.2703733742237091, + "learning_rate": 0.0003442368312851204, + "loss": 0.889, + "step": 7820 + }, + { + "epoch": 0.08757360712667975, + "grad_norm": 0.26786255836486816, + "learning_rate": 0.0003440084060487003, + "loss": 0.8933, + "step": 7830 + }, + { + "epoch": 0.08768545081394244, + "grad_norm": 0.2595812976360321, + "learning_rate": 0.00034377998081228014, + "loss": 0.9156, + "step": 7840 + }, + { + "epoch": 0.08779729450120512, + "grad_norm": 0.24396800994873047, + "learning_rate": 0.00034355155557586005, + "loss": 0.8849, + "step": 7850 + }, + { + "epoch": 0.0879091381884678, + "grad_norm": 0.24363452196121216, + "learning_rate": 0.0003433231303394399, + "loss": 0.9011, + "step": 7860 + }, + { + "epoch": 0.08802098187573047, + "grad_norm": 0.2666647434234619, + "learning_rate": 0.00034309470510301976, + "loss": 0.8952, + "step": 7870 + }, + { + "epoch": 0.08813282556299316, + "grad_norm": 0.267863005399704, + "learning_rate": 0.00034286627986659967, + "loss": 0.9113, + "step": 7880 + }, + { + "epoch": 0.08824466925025584, + "grad_norm": 0.24397262930870056, + "learning_rate": 0.0003426378546301795, + "loss": 0.8762, + "step": 7890 + }, + { + "epoch": 0.08835651293751852, + "grad_norm": 0.23912496864795685, + "learning_rate": 0.00034240942939375943, + "loss": 0.8865, + "step": 7900 + }, + { + "epoch": 0.08846835662478121, + "grad_norm": 0.2737523913383484, + "learning_rate": 0.00034218100415733934, + "loss": 0.8732, + "step": 7910 + }, + { + "epoch": 0.08858020031204389, + "grad_norm": 0.24978673458099365, + "learning_rate": 0.0003419525789209192, + "loss": 0.8832, + "step": 7920 + }, + { + "epoch": 0.08869204399930657, + "grad_norm": 0.25200751423835754, + "learning_rate": 0.00034172415368449906, + "loss": 0.8952, + "step": 7930 + }, + { + "epoch": 0.08880388768656924, + "grad_norm": 0.7863819003105164, + "learning_rate": 0.00034149572844807897, + "loss": 0.8708, + "step": 7940 + }, + { + "epoch": 0.08891573137383194, + "grad_norm": 0.2560253441333771, + "learning_rate": 0.0003412673032116588, + "loss": 0.8681, + "step": 7950 + }, + { + "epoch": 0.08902757506109461, + "grad_norm": 0.2669181823730469, + "learning_rate": 0.0003410388779752387, + "loss": 0.9007, + "step": 7960 + }, + { + "epoch": 0.08913941874835729, + "grad_norm": 0.27906209230422974, + "learning_rate": 0.0003408104527388186, + "loss": 0.8988, + "step": 7970 + }, + { + "epoch": 0.08925126243561998, + "grad_norm": 0.2506297826766968, + "learning_rate": 0.0003405820275023985, + "loss": 0.8997, + "step": 7980 + }, + { + "epoch": 0.08936310612288266, + "grad_norm": 0.2513269782066345, + "learning_rate": 0.00034035360226597835, + "loss": 0.9215, + "step": 7990 + }, + { + "epoch": 0.08947494981014534, + "grad_norm": 0.2672421634197235, + "learning_rate": 0.00034012517702955826, + "loss": 0.9112, + "step": 8000 + }, + { + "epoch": 0.08958679349740803, + "grad_norm": 0.2553747296333313, + "learning_rate": 0.0003398967517931381, + "loss": 0.9255, + "step": 8010 + }, + { + "epoch": 0.08969863718467071, + "grad_norm": 0.2325398176908493, + "learning_rate": 0.000339668326556718, + "loss": 0.9173, + "step": 8020 + }, + { + "epoch": 0.08981048087193338, + "grad_norm": 0.23461295664310455, + "learning_rate": 0.0003394399013202979, + "loss": 0.9183, + "step": 8030 + }, + { + "epoch": 0.08992232455919606, + "grad_norm": 0.26092031598091125, + "learning_rate": 0.00033921147608387774, + "loss": 0.9106, + "step": 8040 + }, + { + "epoch": 0.09003416824645875, + "grad_norm": 0.26250872015953064, + "learning_rate": 0.0003389830508474576, + "loss": 0.8893, + "step": 8050 + }, + { + "epoch": 0.09014601193372143, + "grad_norm": 0.2501981556415558, + "learning_rate": 0.00033875462561103756, + "loss": 0.8934, + "step": 8060 + }, + { + "epoch": 0.09025785562098411, + "grad_norm": 0.26185476779937744, + "learning_rate": 0.0003385262003746174, + "loss": 0.8855, + "step": 8070 + }, + { + "epoch": 0.0903696993082468, + "grad_norm": 0.26889827847480774, + "learning_rate": 0.00033829777513819727, + "loss": 0.8944, + "step": 8080 + }, + { + "epoch": 0.09048154299550948, + "grad_norm": 0.2473451793193817, + "learning_rate": 0.0003380693499017772, + "loss": 0.8937, + "step": 8090 + }, + { + "epoch": 0.09059338668277216, + "grad_norm": 0.24157559871673584, + "learning_rate": 0.00033784092466535704, + "loss": 0.8903, + "step": 8100 + }, + { + "epoch": 0.09070523037003483, + "grad_norm": 0.2701563239097595, + "learning_rate": 0.0003376124994289369, + "loss": 0.9109, + "step": 8110 + }, + { + "epoch": 0.09081707405729753, + "grad_norm": 0.28706929087638855, + "learning_rate": 0.0003373840741925168, + "loss": 0.8956, + "step": 8120 + }, + { + "epoch": 0.0909289177445602, + "grad_norm": 0.27120909094810486, + "learning_rate": 0.00033715564895609666, + "loss": 0.8947, + "step": 8130 + }, + { + "epoch": 0.09104076143182288, + "grad_norm": 0.2504216432571411, + "learning_rate": 0.00033692722371967657, + "loss": 0.8814, + "step": 8140 + }, + { + "epoch": 0.09115260511908557, + "grad_norm": 0.2921849489212036, + "learning_rate": 0.0003366987984832565, + "loss": 0.8856, + "step": 8150 + }, + { + "epoch": 0.09126444880634825, + "grad_norm": 0.2587922513484955, + "learning_rate": 0.00033647037324683633, + "loss": 0.8778, + "step": 8160 + }, + { + "epoch": 0.09137629249361093, + "grad_norm": 0.2399989813566208, + "learning_rate": 0.0003362419480104162, + "loss": 0.883, + "step": 8170 + }, + { + "epoch": 0.0914881361808736, + "grad_norm": 0.24794407188892365, + "learning_rate": 0.0003360135227739961, + "loss": 0.8935, + "step": 8180 + }, + { + "epoch": 0.0915999798681363, + "grad_norm": 0.26669082045555115, + "learning_rate": 0.00033578509753757595, + "loss": 0.863, + "step": 8190 + }, + { + "epoch": 0.09171182355539897, + "grad_norm": 0.25162795186042786, + "learning_rate": 0.0003355566723011558, + "loss": 0.8887, + "step": 8200 + }, + { + "epoch": 0.09182366724266165, + "grad_norm": 0.28969621658325195, + "learning_rate": 0.00033532824706473567, + "loss": 0.9066, + "step": 8210 + }, + { + "epoch": 0.09193551092992434, + "grad_norm": 0.25944870710372925, + "learning_rate": 0.00033509982182831563, + "loss": 0.8875, + "step": 8220 + }, + { + "epoch": 0.09204735461718702, + "grad_norm": 0.27627986669540405, + "learning_rate": 0.0003348713965918955, + "loss": 0.8895, + "step": 8230 + }, + { + "epoch": 0.0921591983044497, + "grad_norm": 0.2673914134502411, + "learning_rate": 0.00033464297135547534, + "loss": 0.8937, + "step": 8240 + }, + { + "epoch": 0.09227104199171238, + "grad_norm": 0.2810732126235962, + "learning_rate": 0.00033441454611905525, + "loss": 0.9007, + "step": 8250 + }, + { + "epoch": 0.09238288567897507, + "grad_norm": 0.2671091556549072, + "learning_rate": 0.0003341861208826351, + "loss": 0.905, + "step": 8260 + }, + { + "epoch": 0.09249472936623775, + "grad_norm": 0.25006943941116333, + "learning_rate": 0.00033395769564621496, + "loss": 0.8981, + "step": 8270 + }, + { + "epoch": 0.09260657305350042, + "grad_norm": 0.2891542613506317, + "learning_rate": 0.0003337292704097949, + "loss": 0.8978, + "step": 8280 + }, + { + "epoch": 0.09271841674076312, + "grad_norm": 0.29497236013412476, + "learning_rate": 0.0003335008451733748, + "loss": 0.9044, + "step": 8290 + }, + { + "epoch": 0.0928302604280258, + "grad_norm": 0.29290974140167236, + "learning_rate": 0.00033327241993695464, + "loss": 0.9081, + "step": 8300 + }, + { + "epoch": 0.09294210411528847, + "grad_norm": 0.27077415585517883, + "learning_rate": 0.00033304399470053455, + "loss": 0.9184, + "step": 8310 + }, + { + "epoch": 0.09305394780255115, + "grad_norm": 0.26410186290740967, + "learning_rate": 0.0003328155694641144, + "loss": 0.8912, + "step": 8320 + }, + { + "epoch": 0.09316579148981384, + "grad_norm": 0.2818413972854614, + "learning_rate": 0.00033258714422769426, + "loss": 0.9096, + "step": 8330 + }, + { + "epoch": 0.09327763517707652, + "grad_norm": 0.265286386013031, + "learning_rate": 0.00033235871899127417, + "loss": 0.9192, + "step": 8340 + }, + { + "epoch": 0.0933894788643392, + "grad_norm": 0.2714836597442627, + "learning_rate": 0.000332130293754854, + "loss": 0.9122, + "step": 8350 + }, + { + "epoch": 0.09350132255160189, + "grad_norm": 0.2858263850212097, + "learning_rate": 0.0003319018685184339, + "loss": 0.9143, + "step": 8360 + }, + { + "epoch": 0.09361316623886456, + "grad_norm": 0.27788257598876953, + "learning_rate": 0.00033167344328201385, + "loss": 0.9116, + "step": 8370 + }, + { + "epoch": 0.09372500992612724, + "grad_norm": 0.27748674154281616, + "learning_rate": 0.0003314450180455937, + "loss": 0.8934, + "step": 8380 + }, + { + "epoch": 0.09383685361338992, + "grad_norm": 0.4757048785686493, + "learning_rate": 0.00033121659280917356, + "loss": 0.9097, + "step": 8390 + }, + { + "epoch": 0.09394869730065261, + "grad_norm": 0.3016970157623291, + "learning_rate": 0.00033098816757275347, + "loss": 0.8973, + "step": 8400 + }, + { + "epoch": 0.09406054098791529, + "grad_norm": 0.2640211880207062, + "learning_rate": 0.0003307597423363333, + "loss": 0.8914, + "step": 8410 + }, + { + "epoch": 0.09417238467517797, + "grad_norm": 0.2608022391796112, + "learning_rate": 0.0003305313170999132, + "loss": 0.9138, + "step": 8420 + }, + { + "epoch": 0.09428422836244066, + "grad_norm": 0.23691967129707336, + "learning_rate": 0.0003303028918634931, + "loss": 0.9149, + "step": 8430 + }, + { + "epoch": 0.09439607204970334, + "grad_norm": 0.28734761476516724, + "learning_rate": 0.00033007446662707294, + "loss": 0.9056, + "step": 8440 + }, + { + "epoch": 0.09450791573696601, + "grad_norm": 0.2846873700618744, + "learning_rate": 0.00032984604139065285, + "loss": 0.9052, + "step": 8450 + }, + { + "epoch": 0.09461975942422869, + "grad_norm": 0.2613682448863983, + "learning_rate": 0.00032961761615423276, + "loss": 0.9129, + "step": 8460 + }, + { + "epoch": 0.09473160311149138, + "grad_norm": 0.25336501002311707, + "learning_rate": 0.0003293891909178126, + "loss": 0.9048, + "step": 8470 + }, + { + "epoch": 0.09484344679875406, + "grad_norm": 0.2662324905395508, + "learning_rate": 0.0003291607656813925, + "loss": 0.9181, + "step": 8480 + }, + { + "epoch": 0.09495529048601674, + "grad_norm": 0.2482605278491974, + "learning_rate": 0.0003289323404449724, + "loss": 0.8978, + "step": 8490 + }, + { + "epoch": 0.09506713417327943, + "grad_norm": 0.24181032180786133, + "learning_rate": 0.00032870391520855224, + "loss": 0.9121, + "step": 8500 + }, + { + "epoch": 0.09517897786054211, + "grad_norm": 0.276621013879776, + "learning_rate": 0.0003284754899721321, + "loss": 0.9106, + "step": 8510 + }, + { + "epoch": 0.09529082154780479, + "grad_norm": 0.2788410186767578, + "learning_rate": 0.000328247064735712, + "loss": 0.9062, + "step": 8520 + }, + { + "epoch": 0.09540266523506746, + "grad_norm": 0.28387385606765747, + "learning_rate": 0.0003280186394992919, + "loss": 0.9309, + "step": 8530 + }, + { + "epoch": 0.09551450892233015, + "grad_norm": 0.2923261523246765, + "learning_rate": 0.00032779021426287177, + "loss": 0.9278, + "step": 8540 + }, + { + "epoch": 0.09562635260959283, + "grad_norm": 0.3008005917072296, + "learning_rate": 0.0003275617890264517, + "loss": 0.9196, + "step": 8550 + }, + { + "epoch": 0.09573819629685551, + "grad_norm": 0.2849402129650116, + "learning_rate": 0.00032733336379003154, + "loss": 0.9243, + "step": 8560 + }, + { + "epoch": 0.0958500399841182, + "grad_norm": 0.262134313583374, + "learning_rate": 0.0003271049385536114, + "loss": 0.9346, + "step": 8570 + }, + { + "epoch": 0.09596188367138088, + "grad_norm": 0.2891925573348999, + "learning_rate": 0.0003268765133171913, + "loss": 0.9176, + "step": 8580 + }, + { + "epoch": 0.09607372735864356, + "grad_norm": 0.26165837049484253, + "learning_rate": 0.00032664808808077116, + "loss": 0.9229, + "step": 8590 + }, + { + "epoch": 0.09618557104590623, + "grad_norm": 0.2683985233306885, + "learning_rate": 0.000326419662844351, + "loss": 0.9067, + "step": 8600 + }, + { + "epoch": 0.09629741473316893, + "grad_norm": 0.25300973653793335, + "learning_rate": 0.000326191237607931, + "loss": 0.9037, + "step": 8610 + }, + { + "epoch": 0.0964092584204316, + "grad_norm": 0.30520153045654297, + "learning_rate": 0.00032596281237151083, + "loss": 0.9038, + "step": 8620 + }, + { + "epoch": 0.09652110210769428, + "grad_norm": 0.2573854327201843, + "learning_rate": 0.0003257343871350907, + "loss": 0.9062, + "step": 8630 + }, + { + "epoch": 0.09663294579495697, + "grad_norm": 0.2664088308811188, + "learning_rate": 0.0003255059618986706, + "loss": 0.8864, + "step": 8640 + }, + { + "epoch": 0.09674478948221965, + "grad_norm": 0.26375049352645874, + "learning_rate": 0.00032527753666225046, + "loss": 0.8804, + "step": 8650 + }, + { + "epoch": 0.09685663316948233, + "grad_norm": 0.25367647409439087, + "learning_rate": 0.0003250491114258303, + "loss": 0.8987, + "step": 8660 + }, + { + "epoch": 0.09696847685674502, + "grad_norm": 0.2764420807361603, + "learning_rate": 0.00032482068618941017, + "loss": 0.9078, + "step": 8670 + }, + { + "epoch": 0.0970803205440077, + "grad_norm": 0.2663860023021698, + "learning_rate": 0.0003245922609529901, + "loss": 0.8838, + "step": 8680 + }, + { + "epoch": 0.09719216423127038, + "grad_norm": 0.25380998849868774, + "learning_rate": 0.00032436383571657, + "loss": 0.8949, + "step": 8690 + }, + { + "epoch": 0.09730400791853305, + "grad_norm": 0.29428210854530334, + "learning_rate": 0.00032413541048014984, + "loss": 0.883, + "step": 8700 + }, + { + "epoch": 0.09741585160579574, + "grad_norm": 0.25604331493377686, + "learning_rate": 0.00032390698524372975, + "loss": 0.8891, + "step": 8710 + }, + { + "epoch": 0.09752769529305842, + "grad_norm": 0.26663005352020264, + "learning_rate": 0.0003236785600073096, + "loss": 0.8763, + "step": 8720 + }, + { + "epoch": 0.0976395389803211, + "grad_norm": 0.27305158972740173, + "learning_rate": 0.00032345013477088946, + "loss": 0.8877, + "step": 8730 + }, + { + "epoch": 0.09775138266758379, + "grad_norm": 0.27395525574684143, + "learning_rate": 0.0003232217095344694, + "loss": 0.871, + "step": 8740 + }, + { + "epoch": 0.09786322635484647, + "grad_norm": 0.26152902841567993, + "learning_rate": 0.00032299328429804923, + "loss": 0.8714, + "step": 8750 + }, + { + "epoch": 0.09797507004210915, + "grad_norm": 0.2872631847858429, + "learning_rate": 0.0003227648590616291, + "loss": 0.8754, + "step": 8760 + }, + { + "epoch": 0.09808691372937182, + "grad_norm": 0.2681150436401367, + "learning_rate": 0.00032253643382520905, + "loss": 0.8699, + "step": 8770 + }, + { + "epoch": 0.09819875741663452, + "grad_norm": 0.27205002307891846, + "learning_rate": 0.0003223080085887889, + "loss": 0.8743, + "step": 8780 + }, + { + "epoch": 0.0983106011038972, + "grad_norm": 0.27747979760169983, + "learning_rate": 0.00032207958335236876, + "loss": 0.8607, + "step": 8790 + }, + { + "epoch": 0.09842244479115987, + "grad_norm": 0.2963927984237671, + "learning_rate": 0.00032185115811594867, + "loss": 0.8676, + "step": 8800 + }, + { + "epoch": 0.09853428847842256, + "grad_norm": 0.26414602994918823, + "learning_rate": 0.0003216227328795285, + "loss": 0.8556, + "step": 8810 + }, + { + "epoch": 0.09864613216568524, + "grad_norm": 0.3005480170249939, + "learning_rate": 0.0003213943076431084, + "loss": 0.8816, + "step": 8820 + }, + { + "epoch": 0.09875797585294792, + "grad_norm": 0.29625314474105835, + "learning_rate": 0.0003211658824066883, + "loss": 0.8747, + "step": 8830 + }, + { + "epoch": 0.0988698195402106, + "grad_norm": 0.2900589108467102, + "learning_rate": 0.0003209374571702682, + "loss": 0.8697, + "step": 8840 + }, + { + "epoch": 0.09898166322747329, + "grad_norm": 0.2951551675796509, + "learning_rate": 0.00032070903193384806, + "loss": 0.8756, + "step": 8850 + }, + { + "epoch": 0.09909350691473597, + "grad_norm": 0.3049459159374237, + "learning_rate": 0.00032048060669742797, + "loss": 0.8767, + "step": 8860 + }, + { + "epoch": 0.09920535060199864, + "grad_norm": 0.30216872692108154, + "learning_rate": 0.0003202521814610078, + "loss": 0.8687, + "step": 8870 + }, + { + "epoch": 0.09931719428926133, + "grad_norm": 0.2913934290409088, + "learning_rate": 0.0003200237562245877, + "loss": 0.8616, + "step": 8880 + }, + { + "epoch": 0.09942903797652401, + "grad_norm": 0.26879578828811646, + "learning_rate": 0.0003197953309881676, + "loss": 0.8681, + "step": 8890 + }, + { + "epoch": 0.09954088166378669, + "grad_norm": 0.28092971444129944, + "learning_rate": 0.00031956690575174744, + "loss": 0.8765, + "step": 8900 + }, + { + "epoch": 0.09965272535104937, + "grad_norm": 0.3074035048484802, + "learning_rate": 0.0003193384805153273, + "loss": 0.881, + "step": 8910 + }, + { + "epoch": 0.09976456903831206, + "grad_norm": 0.2945140600204468, + "learning_rate": 0.00031911005527890726, + "loss": 0.8913, + "step": 8920 + }, + { + "epoch": 0.09987641272557474, + "grad_norm": 0.2707176208496094, + "learning_rate": 0.0003188816300424871, + "loss": 0.8822, + "step": 8930 + }, + { + "epoch": 0.09998825641283741, + "grad_norm": 0.2639947235584259, + "learning_rate": 0.000318653204806067, + "loss": 0.8892, + "step": 8940 + }, + { + "epoch": 0.1001001001001001, + "grad_norm": 0.2709505558013916, + "learning_rate": 0.0003184247795696469, + "loss": 0.8654, + "step": 8950 + }, + { + "epoch": 0.10021194378736278, + "grad_norm": 0.27803289890289307, + "learning_rate": 0.00031819635433322674, + "loss": 0.8887, + "step": 8960 + }, + { + "epoch": 0.10032378747462546, + "grad_norm": 0.25851163268089294, + "learning_rate": 0.0003179679290968066, + "loss": 0.8662, + "step": 8970 + }, + { + "epoch": 0.10043563116188814, + "grad_norm": 0.261068731546402, + "learning_rate": 0.0003177395038603865, + "loss": 0.8641, + "step": 8980 + }, + { + "epoch": 0.10054747484915083, + "grad_norm": 0.25510483980178833, + "learning_rate": 0.00031751107862396636, + "loss": 0.8762, + "step": 8990 + }, + { + "epoch": 0.10065931853641351, + "grad_norm": 0.25765854120254517, + "learning_rate": 0.00031728265338754627, + "loss": 0.8837, + "step": 9000 + }, + { + "epoch": 0.10077116222367619, + "grad_norm": 0.24198535084724426, + "learning_rate": 0.0003170542281511262, + "loss": 0.8791, + "step": 9010 + }, + { + "epoch": 0.10088300591093888, + "grad_norm": 0.2673517167568207, + "learning_rate": 0.00031682580291470604, + "loss": 0.8795, + "step": 9020 + }, + { + "epoch": 0.10099484959820156, + "grad_norm": 0.26392221450805664, + "learning_rate": 0.0003165973776782859, + "loss": 0.8788, + "step": 9030 + }, + { + "epoch": 0.10110669328546423, + "grad_norm": 0.2698739171028137, + "learning_rate": 0.0003163689524418658, + "loss": 0.8959, + "step": 9040 + }, + { + "epoch": 0.10121853697272691, + "grad_norm": 0.2800233066082001, + "learning_rate": 0.00031614052720544566, + "loss": 0.8945, + "step": 9050 + }, + { + "epoch": 0.1013303806599896, + "grad_norm": 0.29603493213653564, + "learning_rate": 0.0003159121019690255, + "loss": 0.892, + "step": 9060 + }, + { + "epoch": 0.10144222434725228, + "grad_norm": 0.26462167501449585, + "learning_rate": 0.0003156836767326054, + "loss": 0.8849, + "step": 9070 + }, + { + "epoch": 0.10155406803451496, + "grad_norm": 0.27941739559173584, + "learning_rate": 0.00031545525149618534, + "loss": 0.8782, + "step": 9080 + }, + { + "epoch": 0.10166591172177765, + "grad_norm": 0.2777186334133148, + "learning_rate": 0.0003152268262597652, + "loss": 0.8787, + "step": 9090 + }, + { + "epoch": 0.10177775540904033, + "grad_norm": 0.25893428921699524, + "learning_rate": 0.00031499840102334505, + "loss": 0.8629, + "step": 9100 + }, + { + "epoch": 0.101889599096303, + "grad_norm": 0.27407601475715637, + "learning_rate": 0.00031476997578692496, + "loss": 0.8619, + "step": 9110 + }, + { + "epoch": 0.10200144278356568, + "grad_norm": 0.2663459777832031, + "learning_rate": 0.0003145415505505048, + "loss": 0.8474, + "step": 9120 + }, + { + "epoch": 0.10211328647082837, + "grad_norm": 0.2621177136898041, + "learning_rate": 0.00031431312531408467, + "loss": 0.8565, + "step": 9130 + }, + { + "epoch": 0.10222513015809105, + "grad_norm": 0.26687386631965637, + "learning_rate": 0.0003140847000776646, + "loss": 0.8438, + "step": 9140 + }, + { + "epoch": 0.10233697384535373, + "grad_norm": 0.24772432446479797, + "learning_rate": 0.00031385627484124443, + "loss": 0.8511, + "step": 9150 + }, + { + "epoch": 0.10244881753261642, + "grad_norm": 0.278730183839798, + "learning_rate": 0.00031362784960482434, + "loss": 0.8499, + "step": 9160 + }, + { + "epoch": 0.1025606612198791, + "grad_norm": 0.28657999634742737, + "learning_rate": 0.00031339942436840425, + "loss": 0.85, + "step": 9170 + }, + { + "epoch": 0.10267250490714178, + "grad_norm": 0.2848927676677704, + "learning_rate": 0.0003131709991319841, + "loss": 0.8411, + "step": 9180 + }, + { + "epoch": 0.10278434859440445, + "grad_norm": 0.28381872177124023, + "learning_rate": 0.00031294257389556396, + "loss": 0.8508, + "step": 9190 + }, + { + "epoch": 0.10289619228166715, + "grad_norm": 0.26624616980552673, + "learning_rate": 0.0003127141486591439, + "loss": 0.8658, + "step": 9200 + }, + { + "epoch": 0.10300803596892982, + "grad_norm": 0.2605401277542114, + "learning_rate": 0.00031248572342272373, + "loss": 0.8602, + "step": 9210 + }, + { + "epoch": 0.1031198796561925, + "grad_norm": 0.2819276750087738, + "learning_rate": 0.0003122572981863036, + "loss": 0.8614, + "step": 9220 + }, + { + "epoch": 0.10323172334345519, + "grad_norm": 0.27677878737449646, + "learning_rate": 0.00031202887294988355, + "loss": 0.8556, + "step": 9230 + }, + { + "epoch": 0.10334356703071787, + "grad_norm": 0.25589799880981445, + "learning_rate": 0.0003118004477134634, + "loss": 0.8704, + "step": 9240 + }, + { + "epoch": 0.10345541071798055, + "grad_norm": 0.2731853425502777, + "learning_rate": 0.00031157202247704326, + "loss": 0.8428, + "step": 9250 + }, + { + "epoch": 0.10356725440524323, + "grad_norm": 0.3047199547290802, + "learning_rate": 0.00031134359724062317, + "loss": 0.8508, + "step": 9260 + }, + { + "epoch": 0.10367909809250592, + "grad_norm": 0.28696686029434204, + "learning_rate": 0.00031111517200420303, + "loss": 0.8571, + "step": 9270 + }, + { + "epoch": 0.1037909417797686, + "grad_norm": 0.23354049026966095, + "learning_rate": 0.0003108867467677829, + "loss": 0.8518, + "step": 9280 + }, + { + "epoch": 0.10390278546703127, + "grad_norm": 0.27123787999153137, + "learning_rate": 0.0003106583215313628, + "loss": 0.8621, + "step": 9290 + }, + { + "epoch": 0.10401462915429396, + "grad_norm": 0.2509523332118988, + "learning_rate": 0.00031042989629494265, + "loss": 0.8568, + "step": 9300 + }, + { + "epoch": 0.10412647284155664, + "grad_norm": 0.2359481155872345, + "learning_rate": 0.00031020147105852256, + "loss": 0.8598, + "step": 9310 + }, + { + "epoch": 0.10423831652881932, + "grad_norm": 0.27097463607788086, + "learning_rate": 0.00030997304582210247, + "loss": 0.8615, + "step": 9320 + }, + { + "epoch": 0.104350160216082, + "grad_norm": 0.2616114020347595, + "learning_rate": 0.0003097446205856823, + "loss": 0.8462, + "step": 9330 + }, + { + "epoch": 0.10446200390334469, + "grad_norm": 0.30027398467063904, + "learning_rate": 0.0003095161953492622, + "loss": 0.8683, + "step": 9340 + }, + { + "epoch": 0.10457384759060737, + "grad_norm": 0.28468623757362366, + "learning_rate": 0.0003092877701128421, + "loss": 0.856, + "step": 9350 + }, + { + "epoch": 0.10468569127787004, + "grad_norm": 0.318521112203598, + "learning_rate": 0.00030905934487642195, + "loss": 0.8532, + "step": 9360 + }, + { + "epoch": 0.10479753496513274, + "grad_norm": 0.3118298351764679, + "learning_rate": 0.0003088309196400018, + "loss": 0.8546, + "step": 9370 + }, + { + "epoch": 0.10490937865239541, + "grad_norm": 0.28549399971961975, + "learning_rate": 0.0003086024944035817, + "loss": 0.8718, + "step": 9380 + }, + { + "epoch": 0.10502122233965809, + "grad_norm": 0.24803526699543, + "learning_rate": 0.0003083740691671616, + "loss": 0.8489, + "step": 9390 + }, + { + "epoch": 0.10513306602692078, + "grad_norm": 0.26765918731689453, + "learning_rate": 0.0003081456439307415, + "loss": 0.8617, + "step": 9400 + }, + { + "epoch": 0.10524490971418346, + "grad_norm": 0.26363757252693176, + "learning_rate": 0.0003079172186943214, + "loss": 0.8648, + "step": 9410 + }, + { + "epoch": 0.10535675340144614, + "grad_norm": 0.2734963595867157, + "learning_rate": 0.00030768879345790124, + "loss": 0.8556, + "step": 9420 + }, + { + "epoch": 0.10546859708870882, + "grad_norm": 0.2773530185222626, + "learning_rate": 0.0003074603682214811, + "loss": 0.8737, + "step": 9430 + }, + { + "epoch": 0.1055804407759715, + "grad_norm": 0.2684498429298401, + "learning_rate": 0.000307231942985061, + "loss": 0.8657, + "step": 9440 + }, + { + "epoch": 0.10569228446323418, + "grad_norm": 0.26110732555389404, + "learning_rate": 0.00030700351774864086, + "loss": 0.8618, + "step": 9450 + }, + { + "epoch": 0.10580412815049686, + "grad_norm": 0.27595090866088867, + "learning_rate": 0.0003067750925122207, + "loss": 0.8654, + "step": 9460 + }, + { + "epoch": 0.10591597183775955, + "grad_norm": 0.2799736559391022, + "learning_rate": 0.0003065466672758007, + "loss": 0.8583, + "step": 9470 + }, + { + "epoch": 0.10602781552502223, + "grad_norm": 0.2729387879371643, + "learning_rate": 0.00030631824203938054, + "loss": 0.8628, + "step": 9480 + }, + { + "epoch": 0.10613965921228491, + "grad_norm": 0.30332332849502563, + "learning_rate": 0.0003060898168029604, + "loss": 0.8512, + "step": 9490 + }, + { + "epoch": 0.10625150289954759, + "grad_norm": 0.276753306388855, + "learning_rate": 0.0003058613915665403, + "loss": 0.85, + "step": 9500 + }, + { + "epoch": 0.10636334658681028, + "grad_norm": 0.3190478980541229, + "learning_rate": 0.00030563296633012016, + "loss": 0.8534, + "step": 9510 + }, + { + "epoch": 0.10647519027407296, + "grad_norm": 0.2926968038082123, + "learning_rate": 0.0003054045410937, + "loss": 0.8309, + "step": 9520 + }, + { + "epoch": 0.10658703396133563, + "grad_norm": 0.29631507396698, + "learning_rate": 0.0003051761158572799, + "loss": 0.8406, + "step": 9530 + }, + { + "epoch": 0.10669887764859833, + "grad_norm": 0.2881840765476227, + "learning_rate": 0.0003049476906208598, + "loss": 0.8274, + "step": 9540 + }, + { + "epoch": 0.106810721335861, + "grad_norm": 0.2623940408229828, + "learning_rate": 0.0003047192653844397, + "loss": 0.8346, + "step": 9550 + }, + { + "epoch": 0.10692256502312368, + "grad_norm": 0.29798468947410583, + "learning_rate": 0.00030449084014801955, + "loss": 0.8362, + "step": 9560 + }, + { + "epoch": 0.10703440871038636, + "grad_norm": 0.2976382076740265, + "learning_rate": 0.00030426241491159946, + "loss": 0.8179, + "step": 9570 + }, + { + "epoch": 0.10714625239764905, + "grad_norm": 0.28637486696243286, + "learning_rate": 0.0003040339896751793, + "loss": 0.8363, + "step": 9580 + }, + { + "epoch": 0.10725809608491173, + "grad_norm": 0.3023325204849243, + "learning_rate": 0.00030380556443875917, + "loss": 0.8382, + "step": 9590 + }, + { + "epoch": 0.1073699397721744, + "grad_norm": 0.2889160215854645, + "learning_rate": 0.0003035771392023391, + "loss": 0.8476, + "step": 9600 + }, + { + "epoch": 0.1074817834594371, + "grad_norm": 0.2868768572807312, + "learning_rate": 0.00030334871396591893, + "loss": 0.8482, + "step": 9610 + }, + { + "epoch": 0.10759362714669977, + "grad_norm": 0.2773813307285309, + "learning_rate": 0.0003031202887294988, + "loss": 0.8577, + "step": 9620 + }, + { + "epoch": 0.10770547083396245, + "grad_norm": 0.28698423504829407, + "learning_rate": 0.00030289186349307875, + "loss": 0.8663, + "step": 9630 + }, + { + "epoch": 0.10781731452122513, + "grad_norm": 0.26839759945869446, + "learning_rate": 0.0003026634382566586, + "loss": 0.8649, + "step": 9640 + }, + { + "epoch": 0.10792915820848782, + "grad_norm": 0.2686857283115387, + "learning_rate": 0.00030243501302023847, + "loss": 0.8563, + "step": 9650 + }, + { + "epoch": 0.1080410018957505, + "grad_norm": 0.2815250754356384, + "learning_rate": 0.0003022065877838184, + "loss": 0.8538, + "step": 9660 + }, + { + "epoch": 0.10815284558301318, + "grad_norm": 0.24625800549983978, + "learning_rate": 0.00030197816254739823, + "loss": 0.87, + "step": 9670 + }, + { + "epoch": 0.10826468927027587, + "grad_norm": 0.27051877975463867, + "learning_rate": 0.0003017497373109781, + "loss": 0.8692, + "step": 9680 + }, + { + "epoch": 0.10837653295753855, + "grad_norm": 0.253892183303833, + "learning_rate": 0.000301521312074558, + "loss": 0.8583, + "step": 9690 + }, + { + "epoch": 0.10848837664480122, + "grad_norm": 0.26951879262924194, + "learning_rate": 0.0003012928868381379, + "loss": 0.8699, + "step": 9700 + }, + { + "epoch": 0.1086002203320639, + "grad_norm": 0.27741488814353943, + "learning_rate": 0.00030106446160171776, + "loss": 0.8673, + "step": 9710 + }, + { + "epoch": 0.10871206401932659, + "grad_norm": 0.2655075788497925, + "learning_rate": 0.00030083603636529767, + "loss": 0.8628, + "step": 9720 + }, + { + "epoch": 0.10882390770658927, + "grad_norm": 0.298532098531723, + "learning_rate": 0.00030060761112887753, + "loss": 0.8707, + "step": 9730 + }, + { + "epoch": 0.10893575139385195, + "grad_norm": 0.3105684816837311, + "learning_rate": 0.0003003791858924574, + "loss": 0.8661, + "step": 9740 + }, + { + "epoch": 0.10904759508111464, + "grad_norm": 0.27781355381011963, + "learning_rate": 0.0003001507606560373, + "loss": 0.8871, + "step": 9750 + }, + { + "epoch": 0.10915943876837732, + "grad_norm": 0.2966761589050293, + "learning_rate": 0.00029992233541961715, + "loss": 0.875, + "step": 9760 + }, + { + "epoch": 0.10927128245564, + "grad_norm": 0.3010736405849457, + "learning_rate": 0.000299693910183197, + "loss": 0.8746, + "step": 9770 + }, + { + "epoch": 0.10938312614290267, + "grad_norm": 0.31352171301841736, + "learning_rate": 0.00029946548494677697, + "loss": 0.8733, + "step": 9780 + }, + { + "epoch": 0.10949496983016536, + "grad_norm": 0.30627313256263733, + "learning_rate": 0.0002992370597103568, + "loss": 0.8675, + "step": 9790 + }, + { + "epoch": 0.10960681351742804, + "grad_norm": 0.23990577459335327, + "learning_rate": 0.0002990086344739367, + "loss": 0.8614, + "step": 9800 + }, + { + "epoch": 0.10971865720469072, + "grad_norm": 0.2856599688529968, + "learning_rate": 0.0002987802092375166, + "loss": 0.8454, + "step": 9810 + }, + { + "epoch": 0.10983050089195341, + "grad_norm": 0.26476389169692993, + "learning_rate": 0.00029855178400109645, + "loss": 0.8616, + "step": 9820 + }, + { + "epoch": 0.10994234457921609, + "grad_norm": 0.2871752381324768, + "learning_rate": 0.0002983233587646763, + "loss": 0.8444, + "step": 9830 + }, + { + "epoch": 0.11005418826647877, + "grad_norm": 0.27318039536476135, + "learning_rate": 0.0002980949335282562, + "loss": 0.8487, + "step": 9840 + }, + { + "epoch": 0.11016603195374144, + "grad_norm": 0.25630125403404236, + "learning_rate": 0.00029786650829183607, + "loss": 0.846, + "step": 9850 + }, + { + "epoch": 0.11027787564100414, + "grad_norm": 0.23908184468746185, + "learning_rate": 0.000297638083055416, + "loss": 0.8403, + "step": 9860 + }, + { + "epoch": 0.11038971932826681, + "grad_norm": 0.2978418469429016, + "learning_rate": 0.0002974096578189959, + "loss": 0.8652, + "step": 9870 + }, + { + "epoch": 0.11050156301552949, + "grad_norm": 0.2503781318664551, + "learning_rate": 0.00029718123258257574, + "loss": 0.8657, + "step": 9880 + }, + { + "epoch": 0.11061340670279218, + "grad_norm": 0.28556469082832336, + "learning_rate": 0.0002969528073461556, + "loss": 0.8501, + "step": 9890 + }, + { + "epoch": 0.11072525039005486, + "grad_norm": 0.2643977701663971, + "learning_rate": 0.0002967243821097355, + "loss": 0.8742, + "step": 9900 + }, + { + "epoch": 0.11083709407731754, + "grad_norm": 0.2757241725921631, + "learning_rate": 0.00029649595687331536, + "loss": 0.8837, + "step": 9910 + }, + { + "epoch": 0.11094893776458022, + "grad_norm": 0.28263452649116516, + "learning_rate": 0.0002962675316368952, + "loss": 0.8793, + "step": 9920 + }, + { + "epoch": 0.11106078145184291, + "grad_norm": 0.27624276280403137, + "learning_rate": 0.00029603910640047513, + "loss": 0.8669, + "step": 9930 + }, + { + "epoch": 0.11117262513910559, + "grad_norm": 0.2814600467681885, + "learning_rate": 0.00029581068116405504, + "loss": 0.8858, + "step": 9940 + }, + { + "epoch": 0.11128446882636826, + "grad_norm": 0.2871972918510437, + "learning_rate": 0.0002955822559276349, + "loss": 0.8714, + "step": 9950 + }, + { + "epoch": 0.11139631251363095, + "grad_norm": 0.2885976731777191, + "learning_rate": 0.0002953538306912148, + "loss": 0.8675, + "step": 9960 + }, + { + "epoch": 0.11150815620089363, + "grad_norm": 0.281021386384964, + "learning_rate": 0.00029512540545479466, + "loss": 0.8762, + "step": 9970 + }, + { + "epoch": 0.11161999988815631, + "grad_norm": 0.2923888862133026, + "learning_rate": 0.0002948969802183745, + "loss": 0.87, + "step": 9980 + }, + { + "epoch": 0.11173184357541899, + "grad_norm": 0.2596036195755005, + "learning_rate": 0.00029466855498195443, + "loss": 0.8696, + "step": 9990 + }, + { + "epoch": 0.11184368726268168, + "grad_norm": 0.2749873697757721, + "learning_rate": 0.0002944401297455343, + "loss": 0.8604, + "step": 10000 + }, + { + "epoch": 0.11195553094994436, + "grad_norm": 0.2696766257286072, + "learning_rate": 0.00029421170450911414, + "loss": 0.8743, + "step": 10010 + }, + { + "epoch": 0.11206737463720703, + "grad_norm": 0.2824450731277466, + "learning_rate": 0.00029398327927269405, + "loss": 0.8734, + "step": 10020 + }, + { + "epoch": 0.11217921832446973, + "grad_norm": 0.2795054614543915, + "learning_rate": 0.00029375485403627396, + "loss": 0.865, + "step": 10030 + }, + { + "epoch": 0.1122910620117324, + "grad_norm": 0.2974453866481781, + "learning_rate": 0.0002935264287998538, + "loss": 0.8762, + "step": 10040 + }, + { + "epoch": 0.11240290569899508, + "grad_norm": 0.27134743332862854, + "learning_rate": 0.00029329800356343367, + "loss": 0.8616, + "step": 10050 + }, + { + "epoch": 0.11251474938625777, + "grad_norm": 0.2651810348033905, + "learning_rate": 0.0002930695783270136, + "loss": 0.8653, + "step": 10060 + }, + { + "epoch": 0.11262659307352045, + "grad_norm": 0.29161420464515686, + "learning_rate": 0.00029284115309059344, + "loss": 0.8583, + "step": 10070 + }, + { + "epoch": 0.11273843676078313, + "grad_norm": 0.27624139189720154, + "learning_rate": 0.0002926127278541733, + "loss": 0.8447, + "step": 10080 + }, + { + "epoch": 0.1128502804480458, + "grad_norm": 0.290632039308548, + "learning_rate": 0.00029238430261775326, + "loss": 0.8568, + "step": 10090 + }, + { + "epoch": 0.1129621241353085, + "grad_norm": 0.2906644940376282, + "learning_rate": 0.0002921558773813331, + "loss": 0.8566, + "step": 10100 + }, + { + "epoch": 0.11307396782257118, + "grad_norm": 0.29284584522247314, + "learning_rate": 0.00029192745214491297, + "loss": 0.8679, + "step": 10110 + }, + { + "epoch": 0.11318581150983385, + "grad_norm": 0.29635393619537354, + "learning_rate": 0.0002916990269084929, + "loss": 0.8648, + "step": 10120 + }, + { + "epoch": 0.11329765519709654, + "grad_norm": 0.2560585141181946, + "learning_rate": 0.00029147060167207273, + "loss": 0.8565, + "step": 10130 + }, + { + "epoch": 0.11340949888435922, + "grad_norm": 0.2480679154396057, + "learning_rate": 0.0002912421764356526, + "loss": 0.8574, + "step": 10140 + }, + { + "epoch": 0.1135213425716219, + "grad_norm": 0.28708118200302124, + "learning_rate": 0.0002910137511992325, + "loss": 0.8658, + "step": 10150 + }, + { + "epoch": 0.11363318625888458, + "grad_norm": 0.2553873062133789, + "learning_rate": 0.00029078532596281235, + "loss": 0.8721, + "step": 10160 + }, + { + "epoch": 0.11374502994614727, + "grad_norm": 0.26742488145828247, + "learning_rate": 0.00029055690072639226, + "loss": 0.8608, + "step": 10170 + }, + { + "epoch": 0.11385687363340995, + "grad_norm": 0.2674279510974884, + "learning_rate": 0.0002903284754899722, + "loss": 0.8763, + "step": 10180 + }, + { + "epoch": 0.11396871732067262, + "grad_norm": 0.2484348863363266, + "learning_rate": 0.00029010005025355203, + "loss": 0.8799, + "step": 10190 + }, + { + "epoch": 0.11408056100793532, + "grad_norm": 0.2603932321071625, + "learning_rate": 0.0002898716250171319, + "loss": 0.8922, + "step": 10200 + }, + { + "epoch": 0.114192404695198, + "grad_norm": 0.2510204613208771, + "learning_rate": 0.0002896431997807118, + "loss": 0.8851, + "step": 10210 + }, + { + "epoch": 0.11430424838246067, + "grad_norm": 0.26795732975006104, + "learning_rate": 0.00028941477454429165, + "loss": 0.8917, + "step": 10220 + }, + { + "epoch": 0.11441609206972335, + "grad_norm": 0.2880701422691345, + "learning_rate": 0.0002891863493078715, + "loss": 0.8903, + "step": 10230 + }, + { + "epoch": 0.11452793575698604, + "grad_norm": 0.23970642685890198, + "learning_rate": 0.0002889579240714514, + "loss": 0.8882, + "step": 10240 + }, + { + "epoch": 0.11463977944424872, + "grad_norm": 0.2786742150783539, + "learning_rate": 0.0002887294988350313, + "loss": 0.8827, + "step": 10250 + }, + { + "epoch": 0.1147516231315114, + "grad_norm": 0.2780776619911194, + "learning_rate": 0.0002885010735986112, + "loss": 0.8879, + "step": 10260 + }, + { + "epoch": 0.11486346681877409, + "grad_norm": 0.26984742283821106, + "learning_rate": 0.0002882726483621911, + "loss": 0.8732, + "step": 10270 + }, + { + "epoch": 0.11497531050603677, + "grad_norm": 0.26902884244918823, + "learning_rate": 0.00028804422312577095, + "loss": 0.878, + "step": 10280 + }, + { + "epoch": 0.11508715419329944, + "grad_norm": 0.24787285923957825, + "learning_rate": 0.0002878157978893508, + "loss": 0.8573, + "step": 10290 + }, + { + "epoch": 0.11519899788056212, + "grad_norm": 0.22702965140342712, + "learning_rate": 0.0002875873726529307, + "loss": 0.8621, + "step": 10300 + }, + { + "epoch": 0.11531084156782481, + "grad_norm": 0.27474096417427063, + "learning_rate": 0.00028735894741651057, + "loss": 0.8763, + "step": 10310 + }, + { + "epoch": 0.11542268525508749, + "grad_norm": 0.2605912983417511, + "learning_rate": 0.0002871305221800904, + "loss": 0.8706, + "step": 10320 + }, + { + "epoch": 0.11553452894235017, + "grad_norm": 0.25281742215156555, + "learning_rate": 0.0002869020969436704, + "loss": 0.855, + "step": 10330 + }, + { + "epoch": 0.11564637262961286, + "grad_norm": 0.2559000849723816, + "learning_rate": 0.00028667367170725024, + "loss": 0.8549, + "step": 10340 + }, + { + "epoch": 0.11575821631687554, + "grad_norm": 0.2439345121383667, + "learning_rate": 0.0002864452464708301, + "loss": 0.8639, + "step": 10350 + }, + { + "epoch": 0.11587006000413821, + "grad_norm": 0.2690776288509369, + "learning_rate": 0.00028621682123441, + "loss": 0.8487, + "step": 10360 + }, + { + "epoch": 0.11598190369140089, + "grad_norm": 0.25111067295074463, + "learning_rate": 0.00028598839599798987, + "loss": 0.8558, + "step": 10370 + }, + { + "epoch": 0.11609374737866358, + "grad_norm": 0.26838451623916626, + "learning_rate": 0.0002857599707615697, + "loss": 0.8603, + "step": 10380 + }, + { + "epoch": 0.11620559106592626, + "grad_norm": 0.2401856780052185, + "learning_rate": 0.00028553154552514963, + "loss": 0.8286, + "step": 10390 + }, + { + "epoch": 0.11631743475318894, + "grad_norm": 0.26284924149513245, + "learning_rate": 0.0002853031202887295, + "loss": 0.8402, + "step": 10400 + }, + { + "epoch": 0.11642927844045163, + "grad_norm": 0.28734955191612244, + "learning_rate": 0.0002850746950523094, + "loss": 0.8358, + "step": 10410 + }, + { + "epoch": 0.11654112212771431, + "grad_norm": 0.2564549446105957, + "learning_rate": 0.0002848462698158893, + "loss": 0.8458, + "step": 10420 + }, + { + "epoch": 0.11665296581497699, + "grad_norm": 0.2507050633430481, + "learning_rate": 0.00028461784457946916, + "loss": 0.8371, + "step": 10430 + }, + { + "epoch": 0.11676480950223966, + "grad_norm": 0.25748834013938904, + "learning_rate": 0.000284389419343049, + "loss": 0.8527, + "step": 10440 + }, + { + "epoch": 0.11687665318950236, + "grad_norm": 0.24484454095363617, + "learning_rate": 0.00028416099410662893, + "loss": 0.8372, + "step": 10450 + }, + { + "epoch": 0.11698849687676503, + "grad_norm": 0.24171967804431915, + "learning_rate": 0.0002839325688702088, + "loss": 0.8327, + "step": 10460 + }, + { + "epoch": 0.11710034056402771, + "grad_norm": 0.30423420667648315, + "learning_rate": 0.00028370414363378864, + "loss": 0.8271, + "step": 10470 + }, + { + "epoch": 0.1172121842512904, + "grad_norm": 0.2598424553871155, + "learning_rate": 0.0002834757183973685, + "loss": 0.8169, + "step": 10480 + }, + { + "epoch": 0.11732402793855308, + "grad_norm": 0.2608656585216522, + "learning_rate": 0.00028324729316094846, + "loss": 0.8261, + "step": 10490 + }, + { + "epoch": 0.11743587162581576, + "grad_norm": 0.25370126962661743, + "learning_rate": 0.0002830188679245283, + "loss": 0.8227, + "step": 10500 + }, + { + "epoch": 0.11754771531307844, + "grad_norm": 0.2760542333126068, + "learning_rate": 0.00028279044268810817, + "loss": 0.8413, + "step": 10510 + }, + { + "epoch": 0.11765955900034113, + "grad_norm": 0.24994856119155884, + "learning_rate": 0.0002825620174516881, + "loss": 0.8288, + "step": 10520 + }, + { + "epoch": 0.1177714026876038, + "grad_norm": 0.25439032912254333, + "learning_rate": 0.00028233359221526794, + "loss": 0.8318, + "step": 10530 + }, + { + "epoch": 0.11788324637486648, + "grad_norm": 0.28182244300842285, + "learning_rate": 0.0002821051669788478, + "loss": 0.8437, + "step": 10540 + }, + { + "epoch": 0.11799509006212917, + "grad_norm": 0.2419012039899826, + "learning_rate": 0.0002818767417424277, + "loss": 0.8446, + "step": 10550 + }, + { + "epoch": 0.11810693374939185, + "grad_norm": 0.2598857581615448, + "learning_rate": 0.0002816483165060076, + "loss": 0.8428, + "step": 10560 + }, + { + "epoch": 0.11821877743665453, + "grad_norm": 0.25206229090690613, + "learning_rate": 0.00028141989126958747, + "loss": 0.8533, + "step": 10570 + }, + { + "epoch": 0.1183306211239172, + "grad_norm": 0.25155991315841675, + "learning_rate": 0.0002811914660331674, + "loss": 0.8538, + "step": 10580 + }, + { + "epoch": 0.1184424648111799, + "grad_norm": 0.2342199832201004, + "learning_rate": 0.00028096304079674723, + "loss": 0.8519, + "step": 10590 + }, + { + "epoch": 0.11855430849844258, + "grad_norm": 0.25823327898979187, + "learning_rate": 0.0002807346155603271, + "loss": 0.8483, + "step": 10600 + }, + { + "epoch": 0.11866615218570525, + "grad_norm": 0.26428598165512085, + "learning_rate": 0.000280506190323907, + "loss": 0.86, + "step": 10610 + }, + { + "epoch": 0.11877799587296795, + "grad_norm": 0.25176918506622314, + "learning_rate": 0.00028027776508748685, + "loss": 0.8589, + "step": 10620 + }, + { + "epoch": 0.11888983956023062, + "grad_norm": 0.28826919198036194, + "learning_rate": 0.0002800493398510667, + "loss": 0.8627, + "step": 10630 + }, + { + "epoch": 0.1190016832474933, + "grad_norm": 0.24679958820343018, + "learning_rate": 0.0002798209146146467, + "loss": 0.8563, + "step": 10640 + }, + { + "epoch": 0.11911352693475598, + "grad_norm": 0.2550687789916992, + "learning_rate": 0.00027959248937822653, + "loss": 0.8535, + "step": 10650 + }, + { + "epoch": 0.11922537062201867, + "grad_norm": 0.2506476640701294, + "learning_rate": 0.0002793640641418064, + "loss": 0.8553, + "step": 10660 + }, + { + "epoch": 0.11933721430928135, + "grad_norm": 0.24980700016021729, + "learning_rate": 0.0002791356389053863, + "loss": 0.854, + "step": 10670 + }, + { + "epoch": 0.11944905799654403, + "grad_norm": 0.2280970811843872, + "learning_rate": 0.00027890721366896615, + "loss": 0.8569, + "step": 10680 + }, + { + "epoch": 0.11956090168380672, + "grad_norm": 0.25191232562065125, + "learning_rate": 0.000278678788432546, + "loss": 0.8566, + "step": 10690 + }, + { + "epoch": 0.1196727453710694, + "grad_norm": 0.2748493552207947, + "learning_rate": 0.0002784503631961259, + "loss": 0.8573, + "step": 10700 + }, + { + "epoch": 0.11978458905833207, + "grad_norm": 0.25123515725135803, + "learning_rate": 0.00027822193795970577, + "loss": 0.8473, + "step": 10710 + }, + { + "epoch": 0.11989643274559475, + "grad_norm": 0.25573378801345825, + "learning_rate": 0.0002779935127232857, + "loss": 0.8469, + "step": 10720 + }, + { + "epoch": 0.12000827643285744, + "grad_norm": 0.23367713391780853, + "learning_rate": 0.0002777650874868656, + "loss": 0.8452, + "step": 10730 + }, + { + "epoch": 0.12012012012012012, + "grad_norm": 0.24593010544776917, + "learning_rate": 0.00027753666225044545, + "loss": 0.838, + "step": 10740 + }, + { + "epoch": 0.1202319638073828, + "grad_norm": 0.2422724962234497, + "learning_rate": 0.0002773082370140253, + "loss": 0.8398, + "step": 10750 + }, + { + "epoch": 0.12034380749464549, + "grad_norm": 0.24471783638000488, + "learning_rate": 0.0002770798117776052, + "loss": 0.8409, + "step": 10760 + }, + { + "epoch": 0.12045565118190817, + "grad_norm": 0.25523480772972107, + "learning_rate": 0.00027685138654118507, + "loss": 0.835, + "step": 10770 + }, + { + "epoch": 0.12056749486917084, + "grad_norm": 0.24846532940864563, + "learning_rate": 0.0002766229613047649, + "loss": 0.842, + "step": 10780 + }, + { + "epoch": 0.12067933855643354, + "grad_norm": 0.26955240964889526, + "learning_rate": 0.00027639453606834484, + "loss": 0.8525, + "step": 10790 + }, + { + "epoch": 0.12079118224369621, + "grad_norm": 0.2711884081363678, + "learning_rate": 0.00027616611083192475, + "loss": 0.8352, + "step": 10800 + }, + { + "epoch": 0.12090302593095889, + "grad_norm": 0.24954953789710999, + "learning_rate": 0.0002759376855955046, + "loss": 0.8257, + "step": 10810 + }, + { + "epoch": 0.12101486961822157, + "grad_norm": 0.27029111981391907, + "learning_rate": 0.0002757092603590845, + "loss": 0.8147, + "step": 10820 + }, + { + "epoch": 0.12112671330548426, + "grad_norm": 0.2440258413553238, + "learning_rate": 0.00027548083512266437, + "loss": 0.8239, + "step": 10830 + }, + { + "epoch": 0.12123855699274694, + "grad_norm": 0.27082934975624084, + "learning_rate": 0.0002752524098862442, + "loss": 0.8391, + "step": 10840 + }, + { + "epoch": 0.12135040068000962, + "grad_norm": 0.27641886472702026, + "learning_rate": 0.00027502398464982413, + "loss": 0.8276, + "step": 10850 + }, + { + "epoch": 0.1214622443672723, + "grad_norm": 0.24772177636623383, + "learning_rate": 0.000274795559413404, + "loss": 0.8226, + "step": 10860 + }, + { + "epoch": 0.12157408805453498, + "grad_norm": 0.2585364580154419, + "learning_rate": 0.00027456713417698384, + "loss": 0.8096, + "step": 10870 + }, + { + "epoch": 0.12168593174179766, + "grad_norm": 0.2730146050453186, + "learning_rate": 0.0002743387089405638, + "loss": 0.8156, + "step": 10880 + }, + { + "epoch": 0.12179777542906034, + "grad_norm": 0.2693599760532379, + "learning_rate": 0.00027411028370414366, + "loss": 0.8125, + "step": 10890 + }, + { + "epoch": 0.12190961911632303, + "grad_norm": 0.26071295142173767, + "learning_rate": 0.0002738818584677235, + "loss": 0.8106, + "step": 10900 + }, + { + "epoch": 0.12202146280358571, + "grad_norm": 0.2560258209705353, + "learning_rate": 0.0002736534332313034, + "loss": 0.8195, + "step": 10910 + }, + { + "epoch": 0.12213330649084839, + "grad_norm": 0.27529552578926086, + "learning_rate": 0.0002734250079948833, + "loss": 0.8104, + "step": 10920 + }, + { + "epoch": 0.12224515017811108, + "grad_norm": 0.2782133221626282, + "learning_rate": 0.00027319658275846314, + "loss": 0.8105, + "step": 10930 + }, + { + "epoch": 0.12235699386537376, + "grad_norm": 0.27981024980545044, + "learning_rate": 0.000272968157522043, + "loss": 0.8085, + "step": 10940 + }, + { + "epoch": 0.12246883755263643, + "grad_norm": 0.2741667926311493, + "learning_rate": 0.0002727397322856229, + "loss": 0.8042, + "step": 10950 + }, + { + "epoch": 0.12258068123989911, + "grad_norm": 0.2468159943819046, + "learning_rate": 0.0002725113070492028, + "loss": 0.8198, + "step": 10960 + }, + { + "epoch": 0.1226925249271618, + "grad_norm": 0.26167941093444824, + "learning_rate": 0.00027228288181278267, + "loss": 0.8176, + "step": 10970 + }, + { + "epoch": 0.12280436861442448, + "grad_norm": 0.26660802960395813, + "learning_rate": 0.0002720544565763626, + "loss": 0.8036, + "step": 10980 + }, + { + "epoch": 0.12291621230168716, + "grad_norm": 0.301575243473053, + "learning_rate": 0.00027182603133994244, + "loss": 0.8049, + "step": 10990 + }, + { + "epoch": 0.12302805598894985, + "grad_norm": 0.2759682834148407, + "learning_rate": 0.0002715976061035223, + "loss": 0.8024, + "step": 11000 + }, + { + "epoch": 0.12313989967621253, + "grad_norm": 0.25659626722335815, + "learning_rate": 0.0002713691808671022, + "loss": 0.8229, + "step": 11010 + }, + { + "epoch": 0.1232517433634752, + "grad_norm": 0.2672923505306244, + "learning_rate": 0.00027114075563068206, + "loss": 0.8018, + "step": 11020 + }, + { + "epoch": 0.12336358705073788, + "grad_norm": 0.25423988699913025, + "learning_rate": 0.0002709123303942619, + "loss": 0.836, + "step": 11030 + }, + { + "epoch": 0.12347543073800057, + "grad_norm": 0.28428804874420166, + "learning_rate": 0.0002706839051578419, + "loss": 0.8299, + "step": 11040 + }, + { + "epoch": 0.12358727442526325, + "grad_norm": 0.2924467921257019, + "learning_rate": 0.00027045547992142173, + "loss": 0.8236, + "step": 11050 + }, + { + "epoch": 0.12369911811252593, + "grad_norm": 0.25230658054351807, + "learning_rate": 0.0002702270546850016, + "loss": 0.8274, + "step": 11060 + }, + { + "epoch": 0.12381096179978862, + "grad_norm": 0.27876734733581543, + "learning_rate": 0.0002699986294485815, + "loss": 0.8244, + "step": 11070 + }, + { + "epoch": 0.1239228054870513, + "grad_norm": 0.29841694235801697, + "learning_rate": 0.00026977020421216136, + "loss": 0.8327, + "step": 11080 + }, + { + "epoch": 0.12403464917431398, + "grad_norm": 0.3055926263332367, + "learning_rate": 0.0002695417789757412, + "loss": 0.8247, + "step": 11090 + }, + { + "epoch": 0.12414649286157665, + "grad_norm": 0.275919109582901, + "learning_rate": 0.0002693133537393211, + "loss": 0.8263, + "step": 11100 + }, + { + "epoch": 0.12425833654883935, + "grad_norm": 0.3069559931755066, + "learning_rate": 0.00026908492850290103, + "loss": 0.8242, + "step": 11110 + }, + { + "epoch": 0.12437018023610202, + "grad_norm": 0.2574029564857483, + "learning_rate": 0.0002688565032664809, + "loss": 0.819, + "step": 11120 + }, + { + "epoch": 0.1244820239233647, + "grad_norm": 0.25053170323371887, + "learning_rate": 0.0002686280780300608, + "loss": 0.8022, + "step": 11130 + }, + { + "epoch": 0.12459386761062739, + "grad_norm": 0.27337634563446045, + "learning_rate": 0.00026839965279364065, + "loss": 0.8127, + "step": 11140 + }, + { + "epoch": 0.12470571129789007, + "grad_norm": 0.2531510889530182, + "learning_rate": 0.0002681712275572205, + "loss": 0.8138, + "step": 11150 + }, + { + "epoch": 0.12481755498515275, + "grad_norm": 0.27455076575279236, + "learning_rate": 0.0002679428023208004, + "loss": 0.7974, + "step": 11160 + }, + { + "epoch": 0.12492939867241543, + "grad_norm": 0.2515604496002197, + "learning_rate": 0.0002677143770843803, + "loss": 0.8077, + "step": 11170 + }, + { + "epoch": 0.12504124235967812, + "grad_norm": 0.27941974997520447, + "learning_rate": 0.00026748595184796013, + "loss": 0.8099, + "step": 11180 + }, + { + "epoch": 0.1251530860469408, + "grad_norm": 0.2508449852466583, + "learning_rate": 0.0002672575266115401, + "loss": 0.8077, + "step": 11190 + }, + { + "epoch": 0.12526492973420347, + "grad_norm": 0.24805410206317902, + "learning_rate": 0.00026702910137511995, + "loss": 0.8029, + "step": 11200 + }, + { + "epoch": 0.12537677342146616, + "grad_norm": 0.2730201184749603, + "learning_rate": 0.0002668006761386998, + "loss": 0.8383, + "step": 11210 + }, + { + "epoch": 0.12548861710872883, + "grad_norm": 0.24301932752132416, + "learning_rate": 0.0002665722509022797, + "loss": 0.8245, + "step": 11220 + }, + { + "epoch": 0.12560046079599152, + "grad_norm": 0.270059734582901, + "learning_rate": 0.00026634382566585957, + "loss": 0.8287, + "step": 11230 + }, + { + "epoch": 0.1257123044832542, + "grad_norm": 0.24491746723651886, + "learning_rate": 0.0002661154004294394, + "loss": 0.8283, + "step": 11240 + }, + { + "epoch": 0.12582414817051688, + "grad_norm": 0.2461182177066803, + "learning_rate": 0.00026588697519301934, + "loss": 0.8285, + "step": 11250 + }, + { + "epoch": 0.12593599185777957, + "grad_norm": 0.26306700706481934, + "learning_rate": 0.0002656585499565992, + "loss": 0.8366, + "step": 11260 + }, + { + "epoch": 0.12604783554504226, + "grad_norm": 0.2317613661289215, + "learning_rate": 0.0002654301247201791, + "loss": 0.8373, + "step": 11270 + }, + { + "epoch": 0.12615967923230492, + "grad_norm": 0.25218284130096436, + "learning_rate": 0.000265201699483759, + "loss": 0.8163, + "step": 11280 + }, + { + "epoch": 0.1262715229195676, + "grad_norm": 0.2527898848056793, + "learning_rate": 0.00026497327424733887, + "loss": 0.819, + "step": 11290 + }, + { + "epoch": 0.1263833666068303, + "grad_norm": 0.2344309389591217, + "learning_rate": 0.0002647448490109187, + "loss": 0.8335, + "step": 11300 + }, + { + "epoch": 0.12649521029409297, + "grad_norm": 0.23913320899009705, + "learning_rate": 0.00026451642377449863, + "loss": 0.8289, + "step": 11310 + }, + { + "epoch": 0.12660705398135566, + "grad_norm": 0.24901095032691956, + "learning_rate": 0.0002642879985380785, + "loss": 0.8159, + "step": 11320 + }, + { + "epoch": 0.12671889766861835, + "grad_norm": 0.2503173351287842, + "learning_rate": 0.00026405957330165834, + "loss": 0.8372, + "step": 11330 + }, + { + "epoch": 0.12683074135588102, + "grad_norm": 0.2341470569372177, + "learning_rate": 0.00026383114806523825, + "loss": 0.8264, + "step": 11340 + }, + { + "epoch": 0.1269425850431437, + "grad_norm": 0.23143555223941803, + "learning_rate": 0.00026360272282881816, + "loss": 0.824, + "step": 11350 + }, + { + "epoch": 0.12705442873040637, + "grad_norm": 0.24911652505397797, + "learning_rate": 0.000263374297592398, + "loss": 0.82, + "step": 11360 + }, + { + "epoch": 0.12716627241766906, + "grad_norm": 0.21931353211402893, + "learning_rate": 0.0002631458723559779, + "loss": 0.8194, + "step": 11370 + }, + { + "epoch": 0.12727811610493175, + "grad_norm": 0.2432345151901245, + "learning_rate": 0.0002629174471195578, + "loss": 0.8371, + "step": 11380 + }, + { + "epoch": 0.12738995979219442, + "grad_norm": 0.24188277125358582, + "learning_rate": 0.00026268902188313764, + "loss": 0.8096, + "step": 11390 + }, + { + "epoch": 0.1275018034794571, + "grad_norm": 0.2522214651107788, + "learning_rate": 0.0002624605966467175, + "loss": 0.8187, + "step": 11400 + }, + { + "epoch": 0.1276136471667198, + "grad_norm": 0.2596495449542999, + "learning_rate": 0.0002622321714102974, + "loss": 0.8138, + "step": 11410 + }, + { + "epoch": 0.12772549085398247, + "grad_norm": 0.2708049714565277, + "learning_rate": 0.00026200374617387726, + "loss": 0.8066, + "step": 11420 + }, + { + "epoch": 0.12783733454124516, + "grad_norm": 0.27820831537246704, + "learning_rate": 0.00026177532093745717, + "loss": 0.8112, + "step": 11430 + }, + { + "epoch": 0.12794917822850785, + "grad_norm": 0.23918400704860687, + "learning_rate": 0.0002615468957010371, + "loss": 0.8148, + "step": 11440 + }, + { + "epoch": 0.1280610219157705, + "grad_norm": 0.22054031491279602, + "learning_rate": 0.00026131847046461694, + "loss": 0.8183, + "step": 11450 + }, + { + "epoch": 0.1281728656030332, + "grad_norm": 0.25998455286026, + "learning_rate": 0.0002610900452281968, + "loss": 0.8242, + "step": 11460 + }, + { + "epoch": 0.1282847092902959, + "grad_norm": 0.26852914690971375, + "learning_rate": 0.0002608616199917767, + "loss": 0.8161, + "step": 11470 + }, + { + "epoch": 0.12839655297755856, + "grad_norm": 0.24028563499450684, + "learning_rate": 0.00026063319475535656, + "loss": 0.8083, + "step": 11480 + }, + { + "epoch": 0.12850839666482125, + "grad_norm": 0.24944745004177094, + "learning_rate": 0.0002604047695189364, + "loss": 0.8168, + "step": 11490 + }, + { + "epoch": 0.12862024035208391, + "grad_norm": 0.26595303416252136, + "learning_rate": 0.0002601763442825164, + "loss": 0.8178, + "step": 11500 + }, + { + "epoch": 0.1287320840393466, + "grad_norm": 0.24556541442871094, + "learning_rate": 0.00025994791904609623, + "loss": 0.8229, + "step": 11510 + }, + { + "epoch": 0.1288439277266093, + "grad_norm": 0.24716900289058685, + "learning_rate": 0.0002597194938096761, + "loss": 0.809, + "step": 11520 + }, + { + "epoch": 0.12895577141387196, + "grad_norm": 0.24745820462703705, + "learning_rate": 0.000259491068573256, + "loss": 0.8293, + "step": 11530 + }, + { + "epoch": 0.12906761510113465, + "grad_norm": 0.2732492983341217, + "learning_rate": 0.00025926264333683586, + "loss": 0.8, + "step": 11540 + }, + { + "epoch": 0.12917945878839734, + "grad_norm": 0.23239663243293762, + "learning_rate": 0.0002590342181004157, + "loss": 0.8175, + "step": 11550 + }, + { + "epoch": 0.12929130247566, + "grad_norm": 0.24953389167785645, + "learning_rate": 0.0002588057928639956, + "loss": 0.8152, + "step": 11560 + }, + { + "epoch": 0.1294031461629227, + "grad_norm": 0.25258156657218933, + "learning_rate": 0.0002585773676275755, + "loss": 0.8301, + "step": 11570 + }, + { + "epoch": 0.1295149898501854, + "grad_norm": 0.2609168291091919, + "learning_rate": 0.0002583489423911554, + "loss": 0.8197, + "step": 11580 + }, + { + "epoch": 0.12962683353744806, + "grad_norm": 0.2484872192144394, + "learning_rate": 0.0002581205171547353, + "loss": 0.8362, + "step": 11590 + }, + { + "epoch": 0.12973867722471075, + "grad_norm": 0.2833307385444641, + "learning_rate": 0.00025789209191831515, + "loss": 0.8338, + "step": 11600 + }, + { + "epoch": 0.12985052091197344, + "grad_norm": 0.24657459557056427, + "learning_rate": 0.000257663666681895, + "loss": 0.8205, + "step": 11610 + }, + { + "epoch": 0.1299623645992361, + "grad_norm": 0.2499598115682602, + "learning_rate": 0.0002574352414454749, + "loss": 0.8406, + "step": 11620 + }, + { + "epoch": 0.1300742082864988, + "grad_norm": 0.2757512629032135, + "learning_rate": 0.0002572068162090548, + "loss": 0.8247, + "step": 11630 + }, + { + "epoch": 0.13018605197376146, + "grad_norm": 0.25661805272102356, + "learning_rate": 0.00025697839097263463, + "loss": 0.8384, + "step": 11640 + }, + { + "epoch": 0.13029789566102415, + "grad_norm": 0.27651283144950867, + "learning_rate": 0.00025674996573621454, + "loss": 0.818, + "step": 11650 + }, + { + "epoch": 0.13040973934828684, + "grad_norm": 0.247050940990448, + "learning_rate": 0.00025652154049979445, + "loss": 0.8261, + "step": 11660 + }, + { + "epoch": 0.1305215830355495, + "grad_norm": 0.23124581575393677, + "learning_rate": 0.0002562931152633743, + "loss": 0.8259, + "step": 11670 + }, + { + "epoch": 0.1306334267228122, + "grad_norm": 0.2694045603275299, + "learning_rate": 0.0002560646900269542, + "loss": 0.8304, + "step": 11680 + }, + { + "epoch": 0.1307452704100749, + "grad_norm": 0.26821568608283997, + "learning_rate": 0.00025583626479053407, + "loss": 0.8441, + "step": 11690 + }, + { + "epoch": 0.13085711409733755, + "grad_norm": 0.2747989892959595, + "learning_rate": 0.0002556078395541139, + "loss": 0.841, + "step": 11700 + }, + { + "epoch": 0.13096895778460024, + "grad_norm": 0.28248855471611023, + "learning_rate": 0.00025537941431769384, + "loss": 0.857, + "step": 11710 + }, + { + "epoch": 0.13108080147186293, + "grad_norm": 0.25378182530403137, + "learning_rate": 0.0002551509890812737, + "loss": 0.8437, + "step": 11720 + }, + { + "epoch": 0.1311926451591256, + "grad_norm": 0.25950944423675537, + "learning_rate": 0.00025492256384485355, + "loss": 0.8497, + "step": 11730 + }, + { + "epoch": 0.1313044888463883, + "grad_norm": 0.26261699199676514, + "learning_rate": 0.0002546941386084335, + "loss": 0.8477, + "step": 11740 + }, + { + "epoch": 0.13141633253365098, + "grad_norm": 0.30151599645614624, + "learning_rate": 0.00025446571337201337, + "loss": 0.8405, + "step": 11750 + }, + { + "epoch": 0.13152817622091365, + "grad_norm": 0.2556060254573822, + "learning_rate": 0.0002542372881355932, + "loss": 0.831, + "step": 11760 + }, + { + "epoch": 0.13164001990817634, + "grad_norm": 0.26560309529304504, + "learning_rate": 0.00025400886289917313, + "loss": 0.8445, + "step": 11770 + }, + { + "epoch": 0.13175186359543903, + "grad_norm": 0.28504636883735657, + "learning_rate": 0.000253780437662753, + "loss": 0.8432, + "step": 11780 + }, + { + "epoch": 0.1318637072827017, + "grad_norm": 0.2985188663005829, + "learning_rate": 0.00025355201242633285, + "loss": 0.8584, + "step": 11790 + }, + { + "epoch": 0.13197555096996438, + "grad_norm": 0.28022414445877075, + "learning_rate": 0.00025332358718991276, + "loss": 0.8393, + "step": 11800 + }, + { + "epoch": 0.13208739465722705, + "grad_norm": 0.28535568714141846, + "learning_rate": 0.0002530951619534926, + "loss": 0.8369, + "step": 11810 + }, + { + "epoch": 0.13219923834448974, + "grad_norm": 0.27764952182769775, + "learning_rate": 0.0002528667367170725, + "loss": 0.8435, + "step": 11820 + }, + { + "epoch": 0.13231108203175243, + "grad_norm": 0.28943151235580444, + "learning_rate": 0.0002526383114806524, + "loss": 0.8334, + "step": 11830 + }, + { + "epoch": 0.1324229257190151, + "grad_norm": 0.28240668773651123, + "learning_rate": 0.0002524098862442323, + "loss": 0.8338, + "step": 11840 + }, + { + "epoch": 0.13253476940627779, + "grad_norm": 0.27650541067123413, + "learning_rate": 0.00025218146100781214, + "loss": 0.8275, + "step": 11850 + }, + { + "epoch": 0.13264661309354048, + "grad_norm": 0.27569788694381714, + "learning_rate": 0.000251953035771392, + "loss": 0.8323, + "step": 11860 + }, + { + "epoch": 0.13275845678080314, + "grad_norm": 0.29103782773017883, + "learning_rate": 0.0002517246105349719, + "loss": 0.8401, + "step": 11870 + }, + { + "epoch": 0.13287030046806583, + "grad_norm": 0.28769806027412415, + "learning_rate": 0.00025149618529855176, + "loss": 0.8369, + "step": 11880 + }, + { + "epoch": 0.13298214415532852, + "grad_norm": 0.2803378701210022, + "learning_rate": 0.0002512677600621316, + "loss": 0.8308, + "step": 11890 + }, + { + "epoch": 0.1330939878425912, + "grad_norm": 0.29264572262763977, + "learning_rate": 0.0002510393348257116, + "loss": 0.8314, + "step": 11900 + }, + { + "epoch": 0.13320583152985388, + "grad_norm": 0.27434802055358887, + "learning_rate": 0.00025081090958929144, + "loss": 0.8337, + "step": 11910 + }, + { + "epoch": 0.13331767521711657, + "grad_norm": 0.270589143037796, + "learning_rate": 0.0002505824843528713, + "loss": 0.8503, + "step": 11920 + }, + { + "epoch": 0.13342951890437924, + "grad_norm": 0.27260124683380127, + "learning_rate": 0.0002503540591164512, + "loss": 0.8293, + "step": 11930 + }, + { + "epoch": 0.13354136259164193, + "grad_norm": 0.2684808075428009, + "learning_rate": 0.00025012563388003106, + "loss": 0.8339, + "step": 11940 + }, + { + "epoch": 0.1336532062789046, + "grad_norm": 0.2510156035423279, + "learning_rate": 0.00024989720864361097, + "loss": 0.8464, + "step": 11950 + }, + { + "epoch": 0.13376504996616728, + "grad_norm": 0.24331960082054138, + "learning_rate": 0.0002496687834071908, + "loss": 0.8443, + "step": 11960 + }, + { + "epoch": 0.13387689365342997, + "grad_norm": 0.2688249349594116, + "learning_rate": 0.00024944035817077074, + "loss": 0.8483, + "step": 11970 + }, + { + "epoch": 0.13398873734069264, + "grad_norm": 0.2608729898929596, + "learning_rate": 0.0002492119329343506, + "loss": 0.852, + "step": 11980 + }, + { + "epoch": 0.13410058102795533, + "grad_norm": 0.28415507078170776, + "learning_rate": 0.00024898350769793045, + "loss": 0.8449, + "step": 11990 + }, + { + "epoch": 0.13421242471521802, + "grad_norm": 0.2920886278152466, + "learning_rate": 0.00024875508246151036, + "loss": 0.8281, + "step": 12000 + }, + { + "epoch": 0.13432426840248068, + "grad_norm": 0.2763430178165436, + "learning_rate": 0.00024852665722509027, + "loss": 0.8492, + "step": 12010 + }, + { + "epoch": 0.13443611208974338, + "grad_norm": 0.26460400223731995, + "learning_rate": 0.0002482982319886701, + "loss": 0.8409, + "step": 12020 + }, + { + "epoch": 0.13454795577700607, + "grad_norm": 0.2698183059692383, + "learning_rate": 0.00024806980675225, + "loss": 0.8295, + "step": 12030 + }, + { + "epoch": 0.13465979946426873, + "grad_norm": 0.2728478014469147, + "learning_rate": 0.0002478413815158299, + "loss": 0.837, + "step": 12040 + }, + { + "epoch": 0.13477164315153142, + "grad_norm": 0.282924085855484, + "learning_rate": 0.00024761295627940974, + "loss": 0.8482, + "step": 12050 + }, + { + "epoch": 0.13488348683879411, + "grad_norm": 0.264614999294281, + "learning_rate": 0.00024738453104298965, + "loss": 0.8432, + "step": 12060 + }, + { + "epoch": 0.13499533052605678, + "grad_norm": 0.2475707232952118, + "learning_rate": 0.0002471561058065695, + "loss": 0.8387, + "step": 12070 + }, + { + "epoch": 0.13510717421331947, + "grad_norm": 0.2620779573917389, + "learning_rate": 0.00024692768057014937, + "loss": 0.8559, + "step": 12080 + }, + { + "epoch": 0.13521901790058213, + "grad_norm": 0.2645311951637268, + "learning_rate": 0.0002466992553337293, + "loss": 0.8363, + "step": 12090 + }, + { + "epoch": 0.13533086158784483, + "grad_norm": 0.27586236596107483, + "learning_rate": 0.0002464708300973092, + "loss": 0.8365, + "step": 12100 + }, + { + "epoch": 0.13544270527510752, + "grad_norm": 0.2695125341415405, + "learning_rate": 0.00024624240486088904, + "loss": 0.8412, + "step": 12110 + }, + { + "epoch": 0.13555454896237018, + "grad_norm": 0.2473846971988678, + "learning_rate": 0.0002460139796244689, + "loss": 0.8362, + "step": 12120 + }, + { + "epoch": 0.13566639264963287, + "grad_norm": 0.28001588582992554, + "learning_rate": 0.0002457855543880488, + "loss": 0.8462, + "step": 12130 + }, + { + "epoch": 0.13577823633689556, + "grad_norm": 0.29486599564552307, + "learning_rate": 0.00024555712915162866, + "loss": 0.8607, + "step": 12140 + }, + { + "epoch": 0.13589008002415823, + "grad_norm": 0.2761843204498291, + "learning_rate": 0.00024532870391520857, + "loss": 0.8668, + "step": 12150 + }, + { + "epoch": 0.13600192371142092, + "grad_norm": 0.25779953598976135, + "learning_rate": 0.00024510027867878843, + "loss": 0.853, + "step": 12160 + }, + { + "epoch": 0.1361137673986836, + "grad_norm": 0.27593857049942017, + "learning_rate": 0.00024487185344236834, + "loss": 0.8506, + "step": 12170 + }, + { + "epoch": 0.13622561108594627, + "grad_norm": 0.24426791071891785, + "learning_rate": 0.0002446434282059482, + "loss": 0.8623, + "step": 12180 + }, + { + "epoch": 0.13633745477320897, + "grad_norm": 0.25555628538131714, + "learning_rate": 0.00024441500296952805, + "loss": 0.8493, + "step": 12190 + }, + { + "epoch": 0.13644929846047166, + "grad_norm": 0.2234913557767868, + "learning_rate": 0.00024418657773310796, + "loss": 0.8644, + "step": 12200 + }, + { + "epoch": 0.13656114214773432, + "grad_norm": 0.27130651473999023, + "learning_rate": 0.00024395815249668784, + "loss": 0.8791, + "step": 12210 + }, + { + "epoch": 0.136672985834997, + "grad_norm": 0.24734824895858765, + "learning_rate": 0.0002437297272602677, + "loss": 0.8719, + "step": 12220 + }, + { + "epoch": 0.13678482952225968, + "grad_norm": 0.24316945672035217, + "learning_rate": 0.0002435013020238476, + "loss": 0.8546, + "step": 12230 + }, + { + "epoch": 0.13689667320952237, + "grad_norm": 0.2349976748228073, + "learning_rate": 0.0002432728767874275, + "loss": 0.8458, + "step": 12240 + }, + { + "epoch": 0.13700851689678506, + "grad_norm": 0.26791033148765564, + "learning_rate": 0.00024304445155100735, + "loss": 0.8485, + "step": 12250 + }, + { + "epoch": 0.13712036058404772, + "grad_norm": 0.23598451912403107, + "learning_rate": 0.00024281602631458723, + "loss": 0.8451, + "step": 12260 + }, + { + "epoch": 0.13723220427131042, + "grad_norm": 0.23012129962444305, + "learning_rate": 0.00024258760107816714, + "loss": 0.8332, + "step": 12270 + }, + { + "epoch": 0.1373440479585731, + "grad_norm": 0.22834524512290955, + "learning_rate": 0.000242359175841747, + "loss": 0.8203, + "step": 12280 + }, + { + "epoch": 0.13745589164583577, + "grad_norm": 0.2247861921787262, + "learning_rate": 0.00024213075060532688, + "loss": 0.8303, + "step": 12290 + }, + { + "epoch": 0.13756773533309846, + "grad_norm": 0.2438284307718277, + "learning_rate": 0.00024190232536890676, + "loss": 0.8216, + "step": 12300 + }, + { + "epoch": 0.13767957902036115, + "grad_norm": 0.24075888097286224, + "learning_rate": 0.00024167390013248664, + "loss": 0.7964, + "step": 12310 + }, + { + "epoch": 0.13779142270762382, + "grad_norm": 0.24668976664543152, + "learning_rate": 0.00024144547489606653, + "loss": 0.8028, + "step": 12320 + }, + { + "epoch": 0.1379032663948865, + "grad_norm": 0.26727405190467834, + "learning_rate": 0.0002412170496596464, + "loss": 0.8081, + "step": 12330 + }, + { + "epoch": 0.1380151100821492, + "grad_norm": 0.2645564377307892, + "learning_rate": 0.00024098862442322626, + "loss": 0.8116, + "step": 12340 + }, + { + "epoch": 0.13812695376941186, + "grad_norm": 0.25368645787239075, + "learning_rate": 0.00024076019918680617, + "loss": 0.8105, + "step": 12350 + }, + { + "epoch": 0.13823879745667456, + "grad_norm": 0.26823967695236206, + "learning_rate": 0.00024053177395038606, + "loss": 0.8249, + "step": 12360 + }, + { + "epoch": 0.13835064114393722, + "grad_norm": 0.2827225625514984, + "learning_rate": 0.0002403033487139659, + "loss": 0.8191, + "step": 12370 + }, + { + "epoch": 0.1384624848311999, + "grad_norm": 0.23261433839797974, + "learning_rate": 0.00024007492347754582, + "loss": 0.8215, + "step": 12380 + }, + { + "epoch": 0.1385743285184626, + "grad_norm": 0.27331966161727905, + "learning_rate": 0.00023984649824112568, + "loss": 0.8232, + "step": 12390 + }, + { + "epoch": 0.13868617220572527, + "grad_norm": 0.2801966369152069, + "learning_rate": 0.00023961807300470556, + "loss": 0.8074, + "step": 12400 + }, + { + "epoch": 0.13879801589298796, + "grad_norm": 0.2379591315984726, + "learning_rate": 0.00023938964776828544, + "loss": 0.8209, + "step": 12410 + }, + { + "epoch": 0.13890985958025065, + "grad_norm": 0.27151694893836975, + "learning_rate": 0.00023916122253186533, + "loss": 0.8258, + "step": 12420 + }, + { + "epoch": 0.1390217032675133, + "grad_norm": 0.21429865062236786, + "learning_rate": 0.0002389327972954452, + "loss": 0.8178, + "step": 12430 + }, + { + "epoch": 0.139133546954776, + "grad_norm": 0.2777722477912903, + "learning_rate": 0.0002387043720590251, + "loss": 0.826, + "step": 12440 + }, + { + "epoch": 0.1392453906420387, + "grad_norm": 0.2514742910861969, + "learning_rate": 0.00023847594682260495, + "loss": 0.8362, + "step": 12450 + }, + { + "epoch": 0.13935723432930136, + "grad_norm": 0.23247656226158142, + "learning_rate": 0.00023824752158618486, + "loss": 0.8049, + "step": 12460 + }, + { + "epoch": 0.13946907801656405, + "grad_norm": 0.2391313910484314, + "learning_rate": 0.00023801909634976474, + "loss": 0.8082, + "step": 12470 + }, + { + "epoch": 0.13958092170382674, + "grad_norm": 0.2366340011358261, + "learning_rate": 0.0002377906711133446, + "loss": 0.8214, + "step": 12480 + }, + { + "epoch": 0.1396927653910894, + "grad_norm": 0.2570713758468628, + "learning_rate": 0.00023756224587692448, + "loss": 0.827, + "step": 12490 + }, + { + "epoch": 0.1398046090783521, + "grad_norm": 0.22823789715766907, + "learning_rate": 0.0002373338206405044, + "loss": 0.8314, + "step": 12500 + }, + { + "epoch": 0.1399164527656148, + "grad_norm": 0.24660278856754303, + "learning_rate": 0.00023710539540408424, + "loss": 0.838, + "step": 12510 + }, + { + "epoch": 0.14002829645287745, + "grad_norm": 0.25041723251342773, + "learning_rate": 0.00023687697016766413, + "loss": 0.8371, + "step": 12520 + }, + { + "epoch": 0.14014014014014015, + "grad_norm": 0.23942531645298004, + "learning_rate": 0.000236648544931244, + "loss": 0.8282, + "step": 12530 + }, + { + "epoch": 0.1402519838274028, + "grad_norm": 0.2445865273475647, + "learning_rate": 0.0002364201196948239, + "loss": 0.8307, + "step": 12540 + }, + { + "epoch": 0.1403638275146655, + "grad_norm": 0.25278452038764954, + "learning_rate": 0.00023619169445840378, + "loss": 0.8483, + "step": 12550 + }, + { + "epoch": 0.1404756712019282, + "grad_norm": 0.22890037298202515, + "learning_rate": 0.00023596326922198366, + "loss": 0.8328, + "step": 12560 + }, + { + "epoch": 0.14058751488919086, + "grad_norm": 0.2360977679491043, + "learning_rate": 0.00023573484398556351, + "loss": 0.8373, + "step": 12570 + }, + { + "epoch": 0.14069935857645355, + "grad_norm": 0.22873692214488983, + "learning_rate": 0.00023550641874914342, + "loss": 0.8399, + "step": 12580 + }, + { + "epoch": 0.14081120226371624, + "grad_norm": 0.228402242064476, + "learning_rate": 0.0002352779935127233, + "loss": 0.8272, + "step": 12590 + }, + { + "epoch": 0.1409230459509789, + "grad_norm": 0.2625369131565094, + "learning_rate": 0.00023504956827630316, + "loss": 0.8413, + "step": 12600 + }, + { + "epoch": 0.1410348896382416, + "grad_norm": 0.2744843363761902, + "learning_rate": 0.00023482114303988305, + "loss": 0.823, + "step": 12610 + }, + { + "epoch": 0.1411467333255043, + "grad_norm": 0.24845914542675018, + "learning_rate": 0.00023459271780346293, + "loss": 0.8089, + "step": 12620 + }, + { + "epoch": 0.14125857701276695, + "grad_norm": 0.2431713193655014, + "learning_rate": 0.0002343642925670428, + "loss": 0.8204, + "step": 12630 + }, + { + "epoch": 0.14137042070002964, + "grad_norm": 0.2636731266975403, + "learning_rate": 0.0002341358673306227, + "loss": 0.8241, + "step": 12640 + }, + { + "epoch": 0.14148226438729233, + "grad_norm": 0.24605631828308105, + "learning_rate": 0.00023390744209420255, + "loss": 0.837, + "step": 12650 + }, + { + "epoch": 0.141594108074555, + "grad_norm": 0.25722581148147583, + "learning_rate": 0.00023367901685778246, + "loss": 0.8338, + "step": 12660 + }, + { + "epoch": 0.1417059517618177, + "grad_norm": 0.2628157138824463, + "learning_rate": 0.00023345059162136234, + "loss": 0.8271, + "step": 12670 + }, + { + "epoch": 0.14181779544908035, + "grad_norm": 0.24534687399864197, + "learning_rate": 0.0002332221663849422, + "loss": 0.8281, + "step": 12680 + }, + { + "epoch": 0.14192963913634304, + "grad_norm": 0.24370639026165009, + "learning_rate": 0.00023299374114852208, + "loss": 0.8243, + "step": 12690 + }, + { + "epoch": 0.14204148282360574, + "grad_norm": 0.2993674576282501, + "learning_rate": 0.000232765315912102, + "loss": 0.8191, + "step": 12700 + }, + { + "epoch": 0.1421533265108684, + "grad_norm": 0.2372383326292038, + "learning_rate": 0.00023253689067568185, + "loss": 0.8115, + "step": 12710 + }, + { + "epoch": 0.1422651701981311, + "grad_norm": 0.2405237853527069, + "learning_rate": 0.00023230846543926173, + "loss": 0.8012, + "step": 12720 + }, + { + "epoch": 0.14237701388539378, + "grad_norm": 0.23501497507095337, + "learning_rate": 0.0002320800402028416, + "loss": 0.8272, + "step": 12730 + }, + { + "epoch": 0.14248885757265645, + "grad_norm": 0.2573966085910797, + "learning_rate": 0.0002318516149664215, + "loss": 0.8231, + "step": 12740 + }, + { + "epoch": 0.14260070125991914, + "grad_norm": 0.25884565711021423, + "learning_rate": 0.00023162318973000138, + "loss": 0.8293, + "step": 12750 + }, + { + "epoch": 0.14271254494718183, + "grad_norm": 0.24788953363895416, + "learning_rate": 0.00023139476449358126, + "loss": 0.8338, + "step": 12760 + }, + { + "epoch": 0.1428243886344445, + "grad_norm": 0.23874413967132568, + "learning_rate": 0.00023116633925716112, + "loss": 0.8184, + "step": 12770 + }, + { + "epoch": 0.14293623232170719, + "grad_norm": 0.2358027547597885, + "learning_rate": 0.00023093791402074103, + "loss": 0.8143, + "step": 12780 + }, + { + "epoch": 0.14304807600896988, + "grad_norm": 0.22447925806045532, + "learning_rate": 0.0002307094887843209, + "loss": 0.8093, + "step": 12790 + }, + { + "epoch": 0.14315991969623254, + "grad_norm": 0.25550246238708496, + "learning_rate": 0.00023048106354790077, + "loss": 0.8178, + "step": 12800 + }, + { + "epoch": 0.14327176338349523, + "grad_norm": 0.2370327264070511, + "learning_rate": 0.00023025263831148065, + "loss": 0.8035, + "step": 12810 + }, + { + "epoch": 0.1433836070707579, + "grad_norm": 0.24910229444503784, + "learning_rate": 0.00023002421307506056, + "loss": 0.7965, + "step": 12820 + }, + { + "epoch": 0.1434954507580206, + "grad_norm": 0.23592302203178406, + "learning_rate": 0.0002297957878386404, + "loss": 0.808, + "step": 12830 + }, + { + "epoch": 0.14360729444528328, + "grad_norm": 0.24010522663593292, + "learning_rate": 0.0002295673626022203, + "loss": 0.8047, + "step": 12840 + }, + { + "epoch": 0.14371913813254594, + "grad_norm": 0.26334619522094727, + "learning_rate": 0.00022933893736580015, + "loss": 0.8011, + "step": 12850 + }, + { + "epoch": 0.14383098181980863, + "grad_norm": 0.23162928223609924, + "learning_rate": 0.00022911051212938006, + "loss": 0.811, + "step": 12860 + }, + { + "epoch": 0.14394282550707133, + "grad_norm": 0.24273565411567688, + "learning_rate": 0.00022888208689295994, + "loss": 0.8249, + "step": 12870 + }, + { + "epoch": 0.144054669194334, + "grad_norm": 0.239716574549675, + "learning_rate": 0.0002286536616565398, + "loss": 0.8146, + "step": 12880 + }, + { + "epoch": 0.14416651288159668, + "grad_norm": 0.22947145998477936, + "learning_rate": 0.0002284252364201197, + "loss": 0.8037, + "step": 12890 + }, + { + "epoch": 0.14427835656885937, + "grad_norm": 0.2369975745677948, + "learning_rate": 0.0002281968111836996, + "loss": 0.7938, + "step": 12900 + }, + { + "epoch": 0.14439020025612204, + "grad_norm": 0.23150302469730377, + "learning_rate": 0.00022796838594727945, + "loss": 0.7971, + "step": 12910 + }, + { + "epoch": 0.14450204394338473, + "grad_norm": 0.25659120082855225, + "learning_rate": 0.00022773996071085933, + "loss": 0.7897, + "step": 12920 + }, + { + "epoch": 0.14461388763064742, + "grad_norm": 0.26838308572769165, + "learning_rate": 0.00022751153547443924, + "loss": 0.8025, + "step": 12930 + }, + { + "epoch": 0.14472573131791008, + "grad_norm": 0.2421617954969406, + "learning_rate": 0.0002272831102380191, + "loss": 0.7937, + "step": 12940 + }, + { + "epoch": 0.14483757500517278, + "grad_norm": 0.22780479490756989, + "learning_rate": 0.00022705468500159898, + "loss": 0.7861, + "step": 12950 + }, + { + "epoch": 0.14494941869243544, + "grad_norm": 0.2561044692993164, + "learning_rate": 0.00022682625976517886, + "loss": 0.7817, + "step": 12960 + }, + { + "epoch": 0.14506126237969813, + "grad_norm": 0.24073092639446259, + "learning_rate": 0.00022659783452875875, + "loss": 0.8024, + "step": 12970 + }, + { + "epoch": 0.14517310606696082, + "grad_norm": 0.24959658086299896, + "learning_rate": 0.00022636940929233863, + "loss": 0.7994, + "step": 12980 + }, + { + "epoch": 0.14528494975422349, + "grad_norm": 0.2711149752140045, + "learning_rate": 0.0002261409840559185, + "loss": 0.8011, + "step": 12990 + }, + { + "epoch": 0.14539679344148618, + "grad_norm": 0.2447725236415863, + "learning_rate": 0.00022591255881949837, + "loss": 0.7957, + "step": 13000 + }, + { + "epoch": 0.14550863712874887, + "grad_norm": 0.26505330204963684, + "learning_rate": 0.00022568413358307828, + "loss": 0.7932, + "step": 13010 + }, + { + "epoch": 0.14562048081601153, + "grad_norm": 0.256712943315506, + "learning_rate": 0.00022545570834665816, + "loss": 0.7919, + "step": 13020 + }, + { + "epoch": 0.14573232450327422, + "grad_norm": 0.23816627264022827, + "learning_rate": 0.00022522728311023802, + "loss": 0.7942, + "step": 13030 + }, + { + "epoch": 0.14584416819053692, + "grad_norm": 0.25607794523239136, + "learning_rate": 0.0002249988578738179, + "loss": 0.8058, + "step": 13040 + }, + { + "epoch": 0.14595601187779958, + "grad_norm": 0.2644692361354828, + "learning_rate": 0.0002247704326373978, + "loss": 0.8026, + "step": 13050 + }, + { + "epoch": 0.14606785556506227, + "grad_norm": 0.24160505831241608, + "learning_rate": 0.00022454200740097766, + "loss": 0.8013, + "step": 13060 + }, + { + "epoch": 0.14617969925232496, + "grad_norm": 0.25321200489997864, + "learning_rate": 0.00022431358216455755, + "loss": 0.802, + "step": 13070 + }, + { + "epoch": 0.14629154293958763, + "grad_norm": 0.38834208250045776, + "learning_rate": 0.0002240851569281374, + "loss": 0.8053, + "step": 13080 + }, + { + "epoch": 0.14640338662685032, + "grad_norm": 0.2638767957687378, + "learning_rate": 0.0002238567316917173, + "loss": 0.803, + "step": 13090 + }, + { + "epoch": 0.14651523031411298, + "grad_norm": 0.33412685990333557, + "learning_rate": 0.0002236283064552972, + "loss": 0.8091, + "step": 13100 + }, + { + "epoch": 0.14662707400137567, + "grad_norm": 0.27539852261543274, + "learning_rate": 0.00022339988121887705, + "loss": 0.8019, + "step": 13110 + }, + { + "epoch": 0.14673891768863837, + "grad_norm": 0.25128626823425293, + "learning_rate": 0.00022317145598245693, + "loss": 0.7961, + "step": 13120 + }, + { + "epoch": 0.14685076137590103, + "grad_norm": 0.27428579330444336, + "learning_rate": 0.00022294303074603684, + "loss": 0.792, + "step": 13130 + }, + { + "epoch": 0.14696260506316372, + "grad_norm": 0.25421425700187683, + "learning_rate": 0.0002227146055096167, + "loss": 0.8139, + "step": 13140 + }, + { + "epoch": 0.1470744487504264, + "grad_norm": 0.23709440231323242, + "learning_rate": 0.00022248618027319658, + "loss": 0.8147, + "step": 13150 + }, + { + "epoch": 0.14718629243768908, + "grad_norm": 0.2693617641925812, + "learning_rate": 0.00022225775503677646, + "loss": 0.8174, + "step": 13160 + }, + { + "epoch": 0.14729813612495177, + "grad_norm": 0.26674261689186096, + "learning_rate": 0.00022202932980035635, + "loss": 0.8105, + "step": 13170 + }, + { + "epoch": 0.14740997981221446, + "grad_norm": 0.2656268775463104, + "learning_rate": 0.00022180090456393623, + "loss": 0.8355, + "step": 13180 + }, + { + "epoch": 0.14752182349947712, + "grad_norm": 0.2587822377681732, + "learning_rate": 0.0002215724793275161, + "loss": 0.8311, + "step": 13190 + }, + { + "epoch": 0.14763366718673981, + "grad_norm": 0.29723209142684937, + "learning_rate": 0.00022134405409109597, + "loss": 0.8664, + "step": 13200 + }, + { + "epoch": 0.1477455108740025, + "grad_norm": 0.2579325735569, + "learning_rate": 0.00022111562885467588, + "loss": 0.8515, + "step": 13210 + }, + { + "epoch": 0.14785735456126517, + "grad_norm": 0.28357258439064026, + "learning_rate": 0.00022088720361825576, + "loss": 0.8562, + "step": 13220 + }, + { + "epoch": 0.14796919824852786, + "grad_norm": 0.26742318272590637, + "learning_rate": 0.00022065877838183562, + "loss": 0.8571, + "step": 13230 + }, + { + "epoch": 0.14808104193579055, + "grad_norm": 0.2750874161720276, + "learning_rate": 0.0002204303531454155, + "loss": 0.8449, + "step": 13240 + }, + { + "epoch": 0.14819288562305322, + "grad_norm": 0.3043031692504883, + "learning_rate": 0.0002202019279089954, + "loss": 0.8472, + "step": 13250 + }, + { + "epoch": 0.1483047293103159, + "grad_norm": 0.27216988801956177, + "learning_rate": 0.00021997350267257527, + "loss": 0.8732, + "step": 13260 + }, + { + "epoch": 0.14841657299757857, + "grad_norm": 0.2818603515625, + "learning_rate": 0.00021974507743615515, + "loss": 0.8333, + "step": 13270 + }, + { + "epoch": 0.14852841668484126, + "grad_norm": 0.2604407072067261, + "learning_rate": 0.000219516652199735, + "loss": 0.8467, + "step": 13280 + }, + { + "epoch": 0.14864026037210396, + "grad_norm": 0.28342294692993164, + "learning_rate": 0.00021928822696331491, + "loss": 0.8292, + "step": 13290 + }, + { + "epoch": 0.14875210405936662, + "grad_norm": 0.2564396262168884, + "learning_rate": 0.0002190598017268948, + "loss": 0.8355, + "step": 13300 + }, + { + "epoch": 0.1488639477466293, + "grad_norm": 0.2528108060359955, + "learning_rate": 0.00021883137649047465, + "loss": 0.8269, + "step": 13310 + }, + { + "epoch": 0.148975791433892, + "grad_norm": 0.26454785466194153, + "learning_rate": 0.00021860295125405456, + "loss": 0.8425, + "step": 13320 + }, + { + "epoch": 0.14908763512115467, + "grad_norm": 0.25204601883888245, + "learning_rate": 0.00021837452601763445, + "loss": 0.8251, + "step": 13330 + }, + { + "epoch": 0.14919947880841736, + "grad_norm": 0.24680152535438538, + "learning_rate": 0.0002181461007812143, + "loss": 0.8247, + "step": 13340 + }, + { + "epoch": 0.14931132249568005, + "grad_norm": 0.27356913685798645, + "learning_rate": 0.00021791767554479418, + "loss": 0.811, + "step": 13350 + }, + { + "epoch": 0.1494231661829427, + "grad_norm": 0.24703428149223328, + "learning_rate": 0.0002176892503083741, + "loss": 0.8145, + "step": 13360 + }, + { + "epoch": 0.1495350098702054, + "grad_norm": 0.27793166041374207, + "learning_rate": 0.00021746082507195395, + "loss": 0.8162, + "step": 13370 + }, + { + "epoch": 0.1496468535574681, + "grad_norm": 0.28826582431793213, + "learning_rate": 0.00021723239983553383, + "loss": 0.8258, + "step": 13380 + }, + { + "epoch": 0.14975869724473076, + "grad_norm": 0.24826544523239136, + "learning_rate": 0.00021700397459911372, + "loss": 0.8131, + "step": 13390 + }, + { + "epoch": 0.14987054093199345, + "grad_norm": 0.29015326499938965, + "learning_rate": 0.0002167755493626936, + "loss": 0.8241, + "step": 13400 + }, + { + "epoch": 0.14998238461925611, + "grad_norm": 0.2692265510559082, + "learning_rate": 0.00021654712412627348, + "loss": 0.8046, + "step": 13410 + }, + { + "epoch": 0.1500942283065188, + "grad_norm": 0.28277263045310974, + "learning_rate": 0.00021631869888985336, + "loss": 0.8075, + "step": 13420 + }, + { + "epoch": 0.1502060719937815, + "grad_norm": 0.25920721888542175, + "learning_rate": 0.00021609027365343322, + "loss": 0.8146, + "step": 13430 + }, + { + "epoch": 0.15031791568104416, + "grad_norm": 0.2548248767852783, + "learning_rate": 0.00021586184841701313, + "loss": 0.82, + "step": 13440 + }, + { + "epoch": 0.15042975936830685, + "grad_norm": 0.3121783435344696, + "learning_rate": 0.000215633423180593, + "loss": 0.796, + "step": 13450 + }, + { + "epoch": 0.15054160305556955, + "grad_norm": 0.2799825370311737, + "learning_rate": 0.00021540499794417287, + "loss": 0.8073, + "step": 13460 + }, + { + "epoch": 0.1506534467428322, + "grad_norm": 0.24525675177574158, + "learning_rate": 0.00021517657270775275, + "loss": 0.804, + "step": 13470 + }, + { + "epoch": 0.1507652904300949, + "grad_norm": 0.26799294352531433, + "learning_rate": 0.00021494814747133266, + "loss": 0.8086, + "step": 13480 + }, + { + "epoch": 0.1508771341173576, + "grad_norm": 0.24744056165218353, + "learning_rate": 0.00021471972223491252, + "loss": 0.7972, + "step": 13490 + }, + { + "epoch": 0.15098897780462026, + "grad_norm": 0.27284878492355347, + "learning_rate": 0.0002144912969984924, + "loss": 0.8048, + "step": 13500 + }, + { + "epoch": 0.15110082149188295, + "grad_norm": 0.2427281141281128, + "learning_rate": 0.00021426287176207225, + "loss": 0.8043, + "step": 13510 + }, + { + "epoch": 0.15121266517914564, + "grad_norm": 0.27432921528816223, + "learning_rate": 0.00021403444652565216, + "loss": 0.8198, + "step": 13520 + }, + { + "epoch": 0.1513245088664083, + "grad_norm": 0.26843661069869995, + "learning_rate": 0.00021380602128923205, + "loss": 0.8156, + "step": 13530 + }, + { + "epoch": 0.151436352553671, + "grad_norm": 0.2460176795721054, + "learning_rate": 0.0002135775960528119, + "loss": 0.806, + "step": 13540 + }, + { + "epoch": 0.15154819624093366, + "grad_norm": 0.24147658050060272, + "learning_rate": 0.00021334917081639179, + "loss": 0.8146, + "step": 13550 + }, + { + "epoch": 0.15166003992819635, + "grad_norm": 0.2715270221233368, + "learning_rate": 0.0002131207455799717, + "loss": 0.8065, + "step": 13560 + }, + { + "epoch": 0.15177188361545904, + "grad_norm": 0.2851991653442383, + "learning_rate": 0.00021289232034355155, + "loss": 0.8042, + "step": 13570 + }, + { + "epoch": 0.1518837273027217, + "grad_norm": 0.2779170870780945, + "learning_rate": 0.00021266389510713143, + "loss": 0.8163, + "step": 13580 + }, + { + "epoch": 0.1519955709899844, + "grad_norm": 0.2853197455406189, + "learning_rate": 0.00021243546987071132, + "loss": 0.8025, + "step": 13590 + }, + { + "epoch": 0.1521074146772471, + "grad_norm": 0.2753603160381317, + "learning_rate": 0.0002122070446342912, + "loss": 0.8187, + "step": 13600 + }, + { + "epoch": 0.15221925836450975, + "grad_norm": 0.29546552896499634, + "learning_rate": 0.00021197861939787108, + "loss": 0.8189, + "step": 13610 + }, + { + "epoch": 0.15233110205177244, + "grad_norm": 0.2799798250198364, + "learning_rate": 0.00021175019416145097, + "loss": 0.8098, + "step": 13620 + }, + { + "epoch": 0.15244294573903514, + "grad_norm": 0.23527085781097412, + "learning_rate": 0.00021152176892503082, + "loss": 0.8212, + "step": 13630 + }, + { + "epoch": 0.1525547894262978, + "grad_norm": 0.27207401394844055, + "learning_rate": 0.00021129334368861073, + "loss": 0.808, + "step": 13640 + }, + { + "epoch": 0.1526666331135605, + "grad_norm": 0.26520609855651855, + "learning_rate": 0.00021106491845219061, + "loss": 0.8133, + "step": 13650 + }, + { + "epoch": 0.15277847680082318, + "grad_norm": 0.2750151455402374, + "learning_rate": 0.00021083649321577047, + "loss": 0.8248, + "step": 13660 + }, + { + "epoch": 0.15289032048808585, + "grad_norm": 0.28339120745658875, + "learning_rate": 0.00021060806797935035, + "loss": 0.8175, + "step": 13670 + }, + { + "epoch": 0.15300216417534854, + "grad_norm": 0.27611440420150757, + "learning_rate": 0.00021037964274293026, + "loss": 0.8232, + "step": 13680 + }, + { + "epoch": 0.1531140078626112, + "grad_norm": 0.264113187789917, + "learning_rate": 0.00021015121750651012, + "loss": 0.8217, + "step": 13690 + }, + { + "epoch": 0.1532258515498739, + "grad_norm": 0.27031853795051575, + "learning_rate": 0.00020992279227009, + "loss": 0.8242, + "step": 13700 + }, + { + "epoch": 0.15333769523713658, + "grad_norm": 0.2753359079360962, + "learning_rate": 0.00020969436703366988, + "loss": 0.8311, + "step": 13710 + }, + { + "epoch": 0.15344953892439925, + "grad_norm": 0.24859648942947388, + "learning_rate": 0.00020946594179724977, + "loss": 0.8285, + "step": 13720 + }, + { + "epoch": 0.15356138261166194, + "grad_norm": 0.2773294448852539, + "learning_rate": 0.00020923751656082965, + "loss": 0.8201, + "step": 13730 + }, + { + "epoch": 0.15367322629892463, + "grad_norm": 0.23855488002300262, + "learning_rate": 0.0002090090913244095, + "loss": 0.8145, + "step": 13740 + }, + { + "epoch": 0.1537850699861873, + "grad_norm": 0.27641457319259644, + "learning_rate": 0.0002087806660879894, + "loss": 0.8233, + "step": 13750 + }, + { + "epoch": 0.15389691367345, + "grad_norm": 0.26556023955345154, + "learning_rate": 0.0002085522408515693, + "loss": 0.8309, + "step": 13760 + }, + { + "epoch": 0.15400875736071268, + "grad_norm": 0.2980164885520935, + "learning_rate": 0.00020832381561514915, + "loss": 0.8585, + "step": 13770 + }, + { + "epoch": 0.15412060104797534, + "grad_norm": 0.21802592277526855, + "learning_rate": 0.00020809539037872904, + "loss": 0.8385, + "step": 13780 + }, + { + "epoch": 0.15423244473523803, + "grad_norm": 0.3153620958328247, + "learning_rate": 0.00020786696514230895, + "loss": 0.8423, + "step": 13790 + }, + { + "epoch": 0.15434428842250072, + "grad_norm": 0.2928372621536255, + "learning_rate": 0.0002076385399058888, + "loss": 0.8399, + "step": 13800 + }, + { + "epoch": 0.1544561321097634, + "grad_norm": 0.3015557527542114, + "learning_rate": 0.00020741011466946868, + "loss": 0.843, + "step": 13810 + }, + { + "epoch": 0.15456797579702608, + "grad_norm": 0.2243575006723404, + "learning_rate": 0.00020718168943304857, + "loss": 0.8302, + "step": 13820 + }, + { + "epoch": 0.15467981948428874, + "grad_norm": 0.23281534016132355, + "learning_rate": 0.00020695326419662845, + "loss": 0.8268, + "step": 13830 + }, + { + "epoch": 0.15479166317155144, + "grad_norm": 0.2412877380847931, + "learning_rate": 0.00020672483896020833, + "loss": 0.849, + "step": 13840 + }, + { + "epoch": 0.15490350685881413, + "grad_norm": 0.2762492001056671, + "learning_rate": 0.00020649641372378822, + "loss": 0.8324, + "step": 13850 + }, + { + "epoch": 0.1550153505460768, + "grad_norm": 0.27976560592651367, + "learning_rate": 0.00020626798848736807, + "loss": 0.843, + "step": 13860 + }, + { + "epoch": 0.15512719423333948, + "grad_norm": 0.29076194763183594, + "learning_rate": 0.00020603956325094798, + "loss": 0.8575, + "step": 13870 + }, + { + "epoch": 0.15523903792060217, + "grad_norm": 0.2367868423461914, + "learning_rate": 0.00020581113801452786, + "loss": 0.8465, + "step": 13880 + }, + { + "epoch": 0.15535088160786484, + "grad_norm": 0.26191186904907227, + "learning_rate": 0.00020558271277810772, + "loss": 0.8291, + "step": 13890 + }, + { + "epoch": 0.15546272529512753, + "grad_norm": 0.27254414558410645, + "learning_rate": 0.0002053542875416876, + "loss": 0.8347, + "step": 13900 + }, + { + "epoch": 0.15557456898239022, + "grad_norm": 0.2718988060951233, + "learning_rate": 0.0002051258623052675, + "loss": 0.8319, + "step": 13910 + }, + { + "epoch": 0.15568641266965288, + "grad_norm": 0.24478264153003693, + "learning_rate": 0.00020489743706884737, + "loss": 0.8369, + "step": 13920 + }, + { + "epoch": 0.15579825635691558, + "grad_norm": 0.27791038155555725, + "learning_rate": 0.00020466901183242725, + "loss": 0.8486, + "step": 13930 + }, + { + "epoch": 0.15591010004417827, + "grad_norm": 0.27220630645751953, + "learning_rate": 0.00020444058659600713, + "loss": 0.8335, + "step": 13940 + }, + { + "epoch": 0.15602194373144093, + "grad_norm": 0.2945479154586792, + "learning_rate": 0.00020421216135958702, + "loss": 0.8234, + "step": 13950 + }, + { + "epoch": 0.15613378741870362, + "grad_norm": 0.2911258041858673, + "learning_rate": 0.0002039837361231669, + "loss": 0.8279, + "step": 13960 + }, + { + "epoch": 0.15624563110596631, + "grad_norm": 0.3039700984954834, + "learning_rate": 0.00020375531088674676, + "loss": 0.8409, + "step": 13970 + }, + { + "epoch": 0.15635747479322898, + "grad_norm": 0.27290788292884827, + "learning_rate": 0.00020352688565032664, + "loss": 0.8394, + "step": 13980 + }, + { + "epoch": 0.15646931848049167, + "grad_norm": 0.28534916043281555, + "learning_rate": 0.00020329846041390655, + "loss": 0.8431, + "step": 13990 + }, + { + "epoch": 0.15658116216775433, + "grad_norm": 0.304221510887146, + "learning_rate": 0.0002030700351774864, + "loss": 0.8476, + "step": 14000 + }, + { + "epoch": 0.15669300585501703, + "grad_norm": 0.3151461184024811, + "learning_rate": 0.0002028416099410663, + "loss": 0.852, + "step": 14010 + }, + { + "epoch": 0.15680484954227972, + "grad_norm": 0.2947019040584564, + "learning_rate": 0.00020261318470464617, + "loss": 0.8396, + "step": 14020 + }, + { + "epoch": 0.15691669322954238, + "grad_norm": 0.2737627625465393, + "learning_rate": 0.00020238475946822605, + "loss": 0.8337, + "step": 14030 + }, + { + "epoch": 0.15702853691680507, + "grad_norm": 0.28257089853286743, + "learning_rate": 0.00020215633423180594, + "loss": 0.8475, + "step": 14040 + }, + { + "epoch": 0.15714038060406776, + "grad_norm": 0.3102625608444214, + "learning_rate": 0.00020192790899538582, + "loss": 0.8451, + "step": 14050 + }, + { + "epoch": 0.15725222429133043, + "grad_norm": 0.2839931845664978, + "learning_rate": 0.00020169948375896567, + "loss": 0.8365, + "step": 14060 + }, + { + "epoch": 0.15736406797859312, + "grad_norm": 0.25566980242729187, + "learning_rate": 0.00020147105852254558, + "loss": 0.8287, + "step": 14070 + }, + { + "epoch": 0.1574759116658558, + "grad_norm": 0.267791211605072, + "learning_rate": 0.00020124263328612547, + "loss": 0.8289, + "step": 14080 + }, + { + "epoch": 0.15758775535311847, + "grad_norm": 0.267635703086853, + "learning_rate": 0.00020101420804970532, + "loss": 0.8357, + "step": 14090 + }, + { + "epoch": 0.15769959904038117, + "grad_norm": 0.28065699338912964, + "learning_rate": 0.0002007857828132852, + "loss": 0.8363, + "step": 14100 + }, + { + "epoch": 0.15781144272764386, + "grad_norm": 0.26585736870765686, + "learning_rate": 0.00020055735757686512, + "loss": 0.8409, + "step": 14110 + }, + { + "epoch": 0.15792328641490652, + "grad_norm": 0.2562732398509979, + "learning_rate": 0.00020032893234044497, + "loss": 0.8374, + "step": 14120 + }, + { + "epoch": 0.1580351301021692, + "grad_norm": 0.2572222650051117, + "learning_rate": 0.00020010050710402485, + "loss": 0.8405, + "step": 14130 + }, + { + "epoch": 0.15814697378943188, + "grad_norm": 0.3075050413608551, + "learning_rate": 0.00019987208186760474, + "loss": 0.825, + "step": 14140 + }, + { + "epoch": 0.15825881747669457, + "grad_norm": 0.2630293071269989, + "learning_rate": 0.00019964365663118462, + "loss": 0.8326, + "step": 14150 + }, + { + "epoch": 0.15837066116395726, + "grad_norm": 0.255015105009079, + "learning_rate": 0.0001994152313947645, + "loss": 0.8181, + "step": 14160 + }, + { + "epoch": 0.15848250485121992, + "grad_norm": 0.25929179787635803, + "learning_rate": 0.00019918680615834438, + "loss": 0.8067, + "step": 14170 + }, + { + "epoch": 0.15859434853848262, + "grad_norm": 0.27078965306282043, + "learning_rate": 0.00019895838092192424, + "loss": 0.8043, + "step": 14180 + }, + { + "epoch": 0.1587061922257453, + "grad_norm": 0.2618376612663269, + "learning_rate": 0.00019872995568550415, + "loss": 0.8191, + "step": 14190 + }, + { + "epoch": 0.15881803591300797, + "grad_norm": 0.246153324842453, + "learning_rate": 0.000198501530449084, + "loss": 0.8251, + "step": 14200 + }, + { + "epoch": 0.15892987960027066, + "grad_norm": 0.25498026609420776, + "learning_rate": 0.0001982731052126639, + "loss": 0.8319, + "step": 14210 + }, + { + "epoch": 0.15904172328753335, + "grad_norm": 0.2517942190170288, + "learning_rate": 0.0001980446799762438, + "loss": 0.8106, + "step": 14220 + }, + { + "epoch": 0.15915356697479602, + "grad_norm": 0.2659161388874054, + "learning_rate": 0.00019781625473982365, + "loss": 0.8163, + "step": 14230 + }, + { + "epoch": 0.1592654106620587, + "grad_norm": 0.24527288973331451, + "learning_rate": 0.00019758782950340354, + "loss": 0.8359, + "step": 14240 + }, + { + "epoch": 0.1593772543493214, + "grad_norm": 0.23943792283535004, + "learning_rate": 0.00019735940426698342, + "loss": 0.8253, + "step": 14250 + }, + { + "epoch": 0.15948909803658406, + "grad_norm": 0.30401650071144104, + "learning_rate": 0.0001971309790305633, + "loss": 0.8369, + "step": 14260 + }, + { + "epoch": 0.15960094172384676, + "grad_norm": 0.25001001358032227, + "learning_rate": 0.00019690255379414319, + "loss": 0.8354, + "step": 14270 + }, + { + "epoch": 0.15971278541110942, + "grad_norm": 0.2378586083650589, + "learning_rate": 0.00019667412855772307, + "loss": 0.8324, + "step": 14280 + }, + { + "epoch": 0.1598246290983721, + "grad_norm": 0.26216059923171997, + "learning_rate": 0.00019644570332130292, + "loss": 0.8227, + "step": 14290 + }, + { + "epoch": 0.1599364727856348, + "grad_norm": 0.24156969785690308, + "learning_rate": 0.00019621727808488283, + "loss": 0.8362, + "step": 14300 + }, + { + "epoch": 0.16004831647289747, + "grad_norm": 0.24192091822624207, + "learning_rate": 0.00019598885284846272, + "loss": 0.835, + "step": 14310 + }, + { + "epoch": 0.16016016016016016, + "grad_norm": 0.24861887097358704, + "learning_rate": 0.00019576042761204257, + "loss": 0.8232, + "step": 14320 + }, + { + "epoch": 0.16027200384742285, + "grad_norm": 0.27175864577293396, + "learning_rate": 0.00019553200237562246, + "loss": 0.8303, + "step": 14330 + }, + { + "epoch": 0.16038384753468551, + "grad_norm": 0.272334486246109, + "learning_rate": 0.00019530357713920237, + "loss": 0.8217, + "step": 14340 + }, + { + "epoch": 0.1604956912219482, + "grad_norm": 0.28357213735580444, + "learning_rate": 0.00019507515190278222, + "loss": 0.8343, + "step": 14350 + }, + { + "epoch": 0.1606075349092109, + "grad_norm": 0.272276371717453, + "learning_rate": 0.0001948467266663621, + "loss": 0.8235, + "step": 14360 + }, + { + "epoch": 0.16071937859647356, + "grad_norm": 0.26771044731140137, + "learning_rate": 0.000194618301429942, + "loss": 0.8292, + "step": 14370 + }, + { + "epoch": 0.16083122228373625, + "grad_norm": 0.27449774742126465, + "learning_rate": 0.00019438987619352187, + "loss": 0.8485, + "step": 14380 + }, + { + "epoch": 0.16094306597099894, + "grad_norm": 0.26026156544685364, + "learning_rate": 0.00019416145095710175, + "loss": 0.8458, + "step": 14390 + }, + { + "epoch": 0.1610549096582616, + "grad_norm": 0.2667345404624939, + "learning_rate": 0.00019393302572068164, + "loss": 0.8519, + "step": 14400 + }, + { + "epoch": 0.1611667533455243, + "grad_norm": 0.26302048563957214, + "learning_rate": 0.0001937046004842615, + "loss": 0.8353, + "step": 14410 + }, + { + "epoch": 0.16127859703278696, + "grad_norm": 0.24420003592967987, + "learning_rate": 0.0001934761752478414, + "loss": 0.8464, + "step": 14420 + }, + { + "epoch": 0.16139044072004965, + "grad_norm": 0.2739315629005432, + "learning_rate": 0.00019324775001142126, + "loss": 0.8257, + "step": 14430 + }, + { + "epoch": 0.16150228440731235, + "grad_norm": 0.2370629757642746, + "learning_rate": 0.00019301932477500114, + "loss": 0.8324, + "step": 14440 + }, + { + "epoch": 0.161614128094575, + "grad_norm": 0.2616153955459595, + "learning_rate": 0.00019279089953858102, + "loss": 0.8513, + "step": 14450 + }, + { + "epoch": 0.1617259717818377, + "grad_norm": 0.2527558207511902, + "learning_rate": 0.0001925624743021609, + "loss": 0.8435, + "step": 14460 + }, + { + "epoch": 0.1618378154691004, + "grad_norm": 0.28255122900009155, + "learning_rate": 0.0001923340490657408, + "loss": 0.8497, + "step": 14470 + }, + { + "epoch": 0.16194965915636306, + "grad_norm": 0.23198026418685913, + "learning_rate": 0.00019210562382932067, + "loss": 0.8357, + "step": 14480 + }, + { + "epoch": 0.16206150284362575, + "grad_norm": 0.2534460127353668, + "learning_rate": 0.00019187719859290053, + "loss": 0.8396, + "step": 14490 + }, + { + "epoch": 0.16217334653088844, + "grad_norm": 0.2693686783313751, + "learning_rate": 0.00019164877335648044, + "loss": 0.8438, + "step": 14500 + }, + { + "epoch": 0.1622851902181511, + "grad_norm": 0.26181599497795105, + "learning_rate": 0.00019142034812006032, + "loss": 0.8452, + "step": 14510 + }, + { + "epoch": 0.1623970339054138, + "grad_norm": 0.2268761545419693, + "learning_rate": 0.00019119192288364017, + "loss": 0.8496, + "step": 14520 + }, + { + "epoch": 0.1625088775926765, + "grad_norm": 0.27698907256126404, + "learning_rate": 0.00019096349764722006, + "loss": 0.8265, + "step": 14530 + }, + { + "epoch": 0.16262072127993915, + "grad_norm": 0.30570700764656067, + "learning_rate": 0.00019073507241079997, + "loss": 0.8399, + "step": 14540 + }, + { + "epoch": 0.16273256496720184, + "grad_norm": 0.2894477844238281, + "learning_rate": 0.00019050664717437982, + "loss": 0.8488, + "step": 14550 + }, + { + "epoch": 0.16284440865446453, + "grad_norm": 0.3094457685947418, + "learning_rate": 0.0001902782219379597, + "loss": 0.8243, + "step": 14560 + }, + { + "epoch": 0.1629562523417272, + "grad_norm": 0.2908037602901459, + "learning_rate": 0.0001900497967015396, + "loss": 0.835, + "step": 14570 + }, + { + "epoch": 0.1630680960289899, + "grad_norm": 0.27222102880477905, + "learning_rate": 0.00018982137146511947, + "loss": 0.8306, + "step": 14580 + }, + { + "epoch": 0.16317993971625255, + "grad_norm": 0.2542339563369751, + "learning_rate": 0.00018959294622869935, + "loss": 0.8259, + "step": 14590 + }, + { + "epoch": 0.16329178340351524, + "grad_norm": 0.28288012742996216, + "learning_rate": 0.00018936452099227924, + "loss": 0.8243, + "step": 14600 + }, + { + "epoch": 0.16340362709077794, + "grad_norm": 0.2584143877029419, + "learning_rate": 0.0001891360957558591, + "loss": 0.8224, + "step": 14610 + }, + { + "epoch": 0.1635154707780406, + "grad_norm": 0.26679450273513794, + "learning_rate": 0.000188907670519439, + "loss": 0.8142, + "step": 14620 + }, + { + "epoch": 0.1636273144653033, + "grad_norm": 0.24589306116104126, + "learning_rate": 0.00018867924528301889, + "loss": 0.81, + "step": 14630 + }, + { + "epoch": 0.16373915815256598, + "grad_norm": 0.28474611043930054, + "learning_rate": 0.00018845082004659874, + "loss": 0.7989, + "step": 14640 + }, + { + "epoch": 0.16385100183982865, + "grad_norm": 0.27567991614341736, + "learning_rate": 0.00018822239481017862, + "loss": 0.8049, + "step": 14650 + }, + { + "epoch": 0.16396284552709134, + "grad_norm": 0.2509905695915222, + "learning_rate": 0.0001879939695737585, + "loss": 0.8168, + "step": 14660 + }, + { + "epoch": 0.16407468921435403, + "grad_norm": 0.30284953117370605, + "learning_rate": 0.0001877655443373384, + "loss": 0.8055, + "step": 14670 + }, + { + "epoch": 0.1641865329016167, + "grad_norm": 0.27638325095176697, + "learning_rate": 0.00018753711910091827, + "loss": 0.8368, + "step": 14680 + }, + { + "epoch": 0.16429837658887939, + "grad_norm": 0.29546642303466797, + "learning_rate": 0.00018730869386449816, + "loss": 0.8161, + "step": 14690 + }, + { + "epoch": 0.16441022027614208, + "grad_norm": 0.2483370304107666, + "learning_rate": 0.00018708026862807804, + "loss": 0.8136, + "step": 14700 + }, + { + "epoch": 0.16452206396340474, + "grad_norm": 0.2862898111343384, + "learning_rate": 0.00018685184339165792, + "loss": 0.836, + "step": 14710 + }, + { + "epoch": 0.16463390765066743, + "grad_norm": 0.2730434238910675, + "learning_rate": 0.00018662341815523778, + "loss": 0.8279, + "step": 14720 + }, + { + "epoch": 0.1647457513379301, + "grad_norm": 0.2846275269985199, + "learning_rate": 0.0001863949929188177, + "loss": 0.7991, + "step": 14730 + }, + { + "epoch": 0.1648575950251928, + "grad_norm": 0.2455524355173111, + "learning_rate": 0.00018616656768239757, + "loss": 0.7931, + "step": 14740 + }, + { + "epoch": 0.16496943871245548, + "grad_norm": 0.25060829520225525, + "learning_rate": 0.00018593814244597743, + "loss": 0.8009, + "step": 14750 + }, + { + "epoch": 0.16508128239971814, + "grad_norm": 0.2687000334262848, + "learning_rate": 0.0001857097172095573, + "loss": 0.7968, + "step": 14760 + }, + { + "epoch": 0.16519312608698083, + "grad_norm": 0.28619691729545593, + "learning_rate": 0.00018548129197313722, + "loss": 0.7818, + "step": 14770 + }, + { + "epoch": 0.16530496977424353, + "grad_norm": 0.2549494206905365, + "learning_rate": 0.00018525286673671707, + "loss": 0.7877, + "step": 14780 + }, + { + "epoch": 0.1654168134615062, + "grad_norm": 0.2419700175523758, + "learning_rate": 0.00018502444150029696, + "loss": 0.7899, + "step": 14790 + }, + { + "epoch": 0.16552865714876888, + "grad_norm": 0.2636066675186157, + "learning_rate": 0.00018479601626387684, + "loss": 0.7893, + "step": 14800 + }, + { + "epoch": 0.16564050083603157, + "grad_norm": 0.264072984457016, + "learning_rate": 0.00018456759102745672, + "loss": 0.7984, + "step": 14810 + }, + { + "epoch": 0.16575234452329424, + "grad_norm": 0.2661677598953247, + "learning_rate": 0.0001843391657910366, + "loss": 0.8085, + "step": 14820 + }, + { + "epoch": 0.16586418821055693, + "grad_norm": 0.28324052691459656, + "learning_rate": 0.0001841107405546165, + "loss": 0.8066, + "step": 14830 + }, + { + "epoch": 0.16597603189781962, + "grad_norm": 0.277761310338974, + "learning_rate": 0.00018388231531819634, + "loss": 0.8008, + "step": 14840 + }, + { + "epoch": 0.16608787558508228, + "grad_norm": 0.2669602036476135, + "learning_rate": 0.00018365389008177625, + "loss": 0.8285, + "step": 14850 + }, + { + "epoch": 0.16619971927234498, + "grad_norm": 0.28757140040397644, + "learning_rate": 0.00018342546484535614, + "loss": 0.8121, + "step": 14860 + }, + { + "epoch": 0.16631156295960764, + "grad_norm": 0.2616439163684845, + "learning_rate": 0.000183197039608936, + "loss": 0.8185, + "step": 14870 + }, + { + "epoch": 0.16642340664687033, + "grad_norm": 0.28334370255470276, + "learning_rate": 0.00018296861437251587, + "loss": 0.8229, + "step": 14880 + }, + { + "epoch": 0.16653525033413302, + "grad_norm": 0.2659022808074951, + "learning_rate": 0.00018274018913609576, + "loss": 0.82, + "step": 14890 + }, + { + "epoch": 0.1666470940213957, + "grad_norm": 0.2544262111186981, + "learning_rate": 0.00018251176389967564, + "loss": 0.84, + "step": 14900 + }, + { + "epoch": 0.16675893770865838, + "grad_norm": 0.27492937445640564, + "learning_rate": 0.00018228333866325552, + "loss": 0.8411, + "step": 14910 + }, + { + "epoch": 0.16687078139592107, + "grad_norm": 0.2961216866970062, + "learning_rate": 0.00018205491342683538, + "loss": 0.8178, + "step": 14920 + }, + { + "epoch": 0.16698262508318373, + "grad_norm": 0.2704416811466217, + "learning_rate": 0.0001818264881904153, + "loss": 0.8264, + "step": 14930 + }, + { + "epoch": 0.16709446877044642, + "grad_norm": 0.261704683303833, + "learning_rate": 0.00018159806295399517, + "loss": 0.8307, + "step": 14940 + }, + { + "epoch": 0.16720631245770912, + "grad_norm": 0.26157405972480774, + "learning_rate": 0.00018136963771757503, + "loss": 0.8064, + "step": 14950 + }, + { + "epoch": 0.16731815614497178, + "grad_norm": 0.2589896023273468, + "learning_rate": 0.0001811412124811549, + "loss": 0.8195, + "step": 14960 + }, + { + "epoch": 0.16742999983223447, + "grad_norm": 0.24691319465637207, + "learning_rate": 0.00018091278724473482, + "loss": 0.8283, + "step": 14970 + }, + { + "epoch": 0.16754184351949716, + "grad_norm": 0.2527819871902466, + "learning_rate": 0.00018068436200831468, + "loss": 0.8229, + "step": 14980 + }, + { + "epoch": 0.16765368720675983, + "grad_norm": 0.2639094293117523, + "learning_rate": 0.00018045593677189456, + "loss": 0.8393, + "step": 14990 + }, + { + "epoch": 0.16776553089402252, + "grad_norm": 0.24417634308338165, + "learning_rate": 0.00018022751153547444, + "loss": 0.8204, + "step": 15000 + }, + { + "epoch": 0.16787737458128518, + "grad_norm": 0.25673115253448486, + "learning_rate": 0.00017999908629905432, + "loss": 0.8184, + "step": 15010 + }, + { + "epoch": 0.16798921826854787, + "grad_norm": 0.254077285528183, + "learning_rate": 0.0001797706610626342, + "loss": 0.8195, + "step": 15020 + }, + { + "epoch": 0.16810106195581057, + "grad_norm": 0.2455417662858963, + "learning_rate": 0.0001795422358262141, + "loss": 0.8255, + "step": 15030 + }, + { + "epoch": 0.16821290564307323, + "grad_norm": 0.27918189764022827, + "learning_rate": 0.00017931381058979395, + "loss": 0.8345, + "step": 15040 + }, + { + "epoch": 0.16832474933033592, + "grad_norm": 0.2272186279296875, + "learning_rate": 0.00017908538535337386, + "loss": 0.8178, + "step": 15050 + }, + { + "epoch": 0.1684365930175986, + "grad_norm": 0.269189715385437, + "learning_rate": 0.00017885696011695374, + "loss": 0.8343, + "step": 15060 + }, + { + "epoch": 0.16854843670486128, + "grad_norm": 0.2805529832839966, + "learning_rate": 0.0001786285348805336, + "loss": 0.8126, + "step": 15070 + }, + { + "epoch": 0.16866028039212397, + "grad_norm": 0.28788769245147705, + "learning_rate": 0.00017840010964411348, + "loss": 0.8278, + "step": 15080 + }, + { + "epoch": 0.16877212407938666, + "grad_norm": 0.2439277619123459, + "learning_rate": 0.00017817168440769336, + "loss": 0.8272, + "step": 15090 + }, + { + "epoch": 0.16888396776664932, + "grad_norm": 0.3151440918445587, + "learning_rate": 0.00017794325917127324, + "loss": 0.8201, + "step": 15100 + }, + { + "epoch": 0.16899581145391201, + "grad_norm": 0.2562885880470276, + "learning_rate": 0.00017771483393485313, + "loss": 0.8275, + "step": 15110 + }, + { + "epoch": 0.1691076551411747, + "grad_norm": 0.2718476355075836, + "learning_rate": 0.00017748640869843298, + "loss": 0.821, + "step": 15120 + }, + { + "epoch": 0.16921949882843737, + "grad_norm": 0.2699459493160248, + "learning_rate": 0.0001772579834620129, + "loss": 0.8352, + "step": 15130 + }, + { + "epoch": 0.16933134251570006, + "grad_norm": 0.29737600684165955, + "learning_rate": 0.00017702955822559277, + "loss": 0.8279, + "step": 15140 + }, + { + "epoch": 0.16944318620296273, + "grad_norm": 0.3075369894504547, + "learning_rate": 0.00017680113298917263, + "loss": 0.8037, + "step": 15150 + }, + { + "epoch": 0.16955502989022542, + "grad_norm": 0.27061593532562256, + "learning_rate": 0.00017657270775275254, + "loss": 0.8149, + "step": 15160 + }, + { + "epoch": 0.1696668735774881, + "grad_norm": 0.26719844341278076, + "learning_rate": 0.00017634428251633242, + "loss": 0.7896, + "step": 15170 + }, + { + "epoch": 0.16977871726475077, + "grad_norm": 0.2871409058570862, + "learning_rate": 0.00017611585727991228, + "loss": 0.7863, + "step": 15180 + }, + { + "epoch": 0.16989056095201346, + "grad_norm": 0.2502906620502472, + "learning_rate": 0.00017588743204349216, + "loss": 0.7817, + "step": 15190 + }, + { + "epoch": 0.17000240463927616, + "grad_norm": 0.2579248547554016, + "learning_rate": 0.00017565900680707207, + "loss": 0.796, + "step": 15200 + }, + { + "epoch": 0.17011424832653882, + "grad_norm": 0.2537415325641632, + "learning_rate": 0.00017543058157065193, + "loss": 0.78, + "step": 15210 + }, + { + "epoch": 0.1702260920138015, + "grad_norm": 0.2420157790184021, + "learning_rate": 0.0001752021563342318, + "loss": 0.7946, + "step": 15220 + }, + { + "epoch": 0.1703379357010642, + "grad_norm": 0.2423790544271469, + "learning_rate": 0.0001749737310978117, + "loss": 0.797, + "step": 15230 + }, + { + "epoch": 0.17044977938832687, + "grad_norm": 0.2521071434020996, + "learning_rate": 0.00017474530586139157, + "loss": 0.8073, + "step": 15240 + }, + { + "epoch": 0.17056162307558956, + "grad_norm": 0.22921273112297058, + "learning_rate": 0.00017451688062497146, + "loss": 0.7916, + "step": 15250 + }, + { + "epoch": 0.17067346676285225, + "grad_norm": 0.35150206089019775, + "learning_rate": 0.00017428845538855134, + "loss": 0.8001, + "step": 15260 + }, + { + "epoch": 0.1707853104501149, + "grad_norm": 0.27637869119644165, + "learning_rate": 0.0001740600301521312, + "loss": 0.7948, + "step": 15270 + }, + { + "epoch": 0.1708971541373776, + "grad_norm": 0.22480230033397675, + "learning_rate": 0.0001738316049157111, + "loss": 0.7932, + "step": 15280 + }, + { + "epoch": 0.1710089978246403, + "grad_norm": 0.27264508605003357, + "learning_rate": 0.000173603179679291, + "loss": 0.8083, + "step": 15290 + }, + { + "epoch": 0.17112084151190296, + "grad_norm": 0.2647417485713959, + "learning_rate": 0.00017337475444287084, + "loss": 0.8177, + "step": 15300 + }, + { + "epoch": 0.17123268519916565, + "grad_norm": 0.23619987070560455, + "learning_rate": 0.00017314632920645073, + "loss": 0.8068, + "step": 15310 + }, + { + "epoch": 0.17134452888642832, + "grad_norm": 0.22450131177902222, + "learning_rate": 0.0001729179039700306, + "loss": 0.8004, + "step": 15320 + }, + { + "epoch": 0.171456372573691, + "grad_norm": 0.2784859240055084, + "learning_rate": 0.0001726894787336105, + "loss": 0.7938, + "step": 15330 + }, + { + "epoch": 0.1715682162609537, + "grad_norm": 0.25513574481010437, + "learning_rate": 0.00017246105349719038, + "loss": 0.7844, + "step": 15340 + }, + { + "epoch": 0.17168005994821636, + "grad_norm": 0.27425146102905273, + "learning_rate": 0.00017223262826077023, + "loss": 0.7906, + "step": 15350 + }, + { + "epoch": 0.17179190363547905, + "grad_norm": 0.2500791847705841, + "learning_rate": 0.00017200420302435014, + "loss": 0.7834, + "step": 15360 + }, + { + "epoch": 0.17190374732274175, + "grad_norm": 0.2550630271434784, + "learning_rate": 0.00017177577778793002, + "loss": 0.7736, + "step": 15370 + }, + { + "epoch": 0.1720155910100044, + "grad_norm": 0.25209444761276245, + "learning_rate": 0.00017154735255150988, + "loss": 0.773, + "step": 15380 + }, + { + "epoch": 0.1721274346972671, + "grad_norm": 0.2347812056541443, + "learning_rate": 0.00017131892731508976, + "loss": 0.7745, + "step": 15390 + }, + { + "epoch": 0.1722392783845298, + "grad_norm": 0.2858305871486664, + "learning_rate": 0.00017109050207866967, + "loss": 0.7776, + "step": 15400 + }, + { + "epoch": 0.17235112207179246, + "grad_norm": 0.30414941906929016, + "learning_rate": 0.00017086207684224953, + "loss": 0.7701, + "step": 15410 + }, + { + "epoch": 0.17246296575905515, + "grad_norm": 0.2645011842250824, + "learning_rate": 0.0001706336516058294, + "loss": 0.7746, + "step": 15420 + }, + { + "epoch": 0.17257480944631784, + "grad_norm": 0.2984048128128052, + "learning_rate": 0.0001704052263694093, + "loss": 0.771, + "step": 15430 + }, + { + "epoch": 0.1726866531335805, + "grad_norm": 0.2734147906303406, + "learning_rate": 0.00017017680113298918, + "loss": 0.7769, + "step": 15440 + }, + { + "epoch": 0.1727984968208432, + "grad_norm": 0.2632124125957489, + "learning_rate": 0.00016994837589656906, + "loss": 0.7754, + "step": 15450 + }, + { + "epoch": 0.17291034050810586, + "grad_norm": 0.29384443163871765, + "learning_rate": 0.00016971995066014894, + "loss": 0.7833, + "step": 15460 + }, + { + "epoch": 0.17302218419536855, + "grad_norm": 0.3194182813167572, + "learning_rate": 0.0001694915254237288, + "loss": 0.7813, + "step": 15470 + }, + { + "epoch": 0.17313402788263124, + "grad_norm": 0.25995251536369324, + "learning_rate": 0.0001692631001873087, + "loss": 0.7796, + "step": 15480 + }, + { + "epoch": 0.1732458715698939, + "grad_norm": 0.272419810295105, + "learning_rate": 0.0001690346749508886, + "loss": 0.7839, + "step": 15490 + }, + { + "epoch": 0.1733577152571566, + "grad_norm": 0.26239413022994995, + "learning_rate": 0.00016880624971446845, + "loss": 0.7807, + "step": 15500 + }, + { + "epoch": 0.1734695589444193, + "grad_norm": 0.29991698265075684, + "learning_rate": 0.00016857782447804833, + "loss": 0.7941, + "step": 15510 + }, + { + "epoch": 0.17358140263168195, + "grad_norm": 0.2812528908252716, + "learning_rate": 0.00016834939924162824, + "loss": 0.7863, + "step": 15520 + }, + { + "epoch": 0.17369324631894464, + "grad_norm": 0.2557685077190399, + "learning_rate": 0.0001681209740052081, + "loss": 0.7953, + "step": 15530 + }, + { + "epoch": 0.17380509000620734, + "grad_norm": 0.28565913438796997, + "learning_rate": 0.00016789254876878798, + "loss": 0.7934, + "step": 15540 + }, + { + "epoch": 0.17391693369347, + "grad_norm": 0.25316086411476135, + "learning_rate": 0.00016766412353236783, + "loss": 0.7969, + "step": 15550 + }, + { + "epoch": 0.1740287773807327, + "grad_norm": 0.2636478543281555, + "learning_rate": 0.00016743569829594774, + "loss": 0.8021, + "step": 15560 + }, + { + "epoch": 0.17414062106799538, + "grad_norm": 0.28839442133903503, + "learning_rate": 0.00016720727305952763, + "loss": 0.8108, + "step": 15570 + }, + { + "epoch": 0.17425246475525805, + "grad_norm": 0.2453639954328537, + "learning_rate": 0.00016697884782310748, + "loss": 0.8034, + "step": 15580 + }, + { + "epoch": 0.17436430844252074, + "grad_norm": 0.2550848424434662, + "learning_rate": 0.0001667504225866874, + "loss": 0.8169, + "step": 15590 + }, + { + "epoch": 0.1744761521297834, + "grad_norm": 0.24949923157691956, + "learning_rate": 0.00016652199735026727, + "loss": 0.8167, + "step": 15600 + }, + { + "epoch": 0.1745879958170461, + "grad_norm": 0.24357125163078308, + "learning_rate": 0.00016629357211384713, + "loss": 0.821, + "step": 15610 + }, + { + "epoch": 0.17469983950430878, + "grad_norm": 0.2246461659669876, + "learning_rate": 0.000166065146877427, + "loss": 0.82, + "step": 15620 + }, + { + "epoch": 0.17481168319157145, + "grad_norm": 0.26160740852355957, + "learning_rate": 0.00016583672164100692, + "loss": 0.8167, + "step": 15630 + }, + { + "epoch": 0.17492352687883414, + "grad_norm": 0.25773337483406067, + "learning_rate": 0.00016560829640458678, + "loss": 0.8305, + "step": 15640 + }, + { + "epoch": 0.17503537056609683, + "grad_norm": 0.24051527678966522, + "learning_rate": 0.00016537987116816666, + "loss": 0.8201, + "step": 15650 + }, + { + "epoch": 0.1751472142533595, + "grad_norm": 0.2507860064506531, + "learning_rate": 0.00016515144593174654, + "loss": 0.8444, + "step": 15660 + }, + { + "epoch": 0.1752590579406222, + "grad_norm": 0.24071821570396423, + "learning_rate": 0.00016492302069532643, + "loss": 0.8071, + "step": 15670 + }, + { + "epoch": 0.17537090162788488, + "grad_norm": 0.2533905506134033, + "learning_rate": 0.0001646945954589063, + "loss": 0.8164, + "step": 15680 + }, + { + "epoch": 0.17548274531514754, + "grad_norm": 0.2546316683292389, + "learning_rate": 0.0001644661702224862, + "loss": 0.8237, + "step": 15690 + }, + { + "epoch": 0.17559458900241023, + "grad_norm": 0.25692155957221985, + "learning_rate": 0.00016423774498606605, + "loss": 0.8198, + "step": 15700 + }, + { + "epoch": 0.17570643268967293, + "grad_norm": 0.254535436630249, + "learning_rate": 0.00016400931974964596, + "loss": 0.8061, + "step": 15710 + }, + { + "epoch": 0.1758182763769356, + "grad_norm": 0.2557326555252075, + "learning_rate": 0.00016378089451322584, + "loss": 0.8194, + "step": 15720 + }, + { + "epoch": 0.17593012006419828, + "grad_norm": 0.24234241247177124, + "learning_rate": 0.0001635524692768057, + "loss": 0.8183, + "step": 15730 + }, + { + "epoch": 0.17604196375146094, + "grad_norm": 0.2597709596157074, + "learning_rate": 0.00016332404404038558, + "loss": 0.7957, + "step": 15740 + }, + { + "epoch": 0.17615380743872364, + "grad_norm": 0.2896418273448944, + "learning_rate": 0.0001630956188039655, + "loss": 0.8146, + "step": 15750 + }, + { + "epoch": 0.17626565112598633, + "grad_norm": 0.2686966061592102, + "learning_rate": 0.00016286719356754535, + "loss": 0.7988, + "step": 15760 + }, + { + "epoch": 0.176377494813249, + "grad_norm": 0.26220840215682983, + "learning_rate": 0.00016263876833112523, + "loss": 0.7936, + "step": 15770 + }, + { + "epoch": 0.17648933850051168, + "grad_norm": 0.260547012090683, + "learning_rate": 0.00016241034309470508, + "loss": 0.8002, + "step": 15780 + }, + { + "epoch": 0.17660118218777437, + "grad_norm": 0.22341471910476685, + "learning_rate": 0.000162181917858285, + "loss": 0.7935, + "step": 15790 + }, + { + "epoch": 0.17671302587503704, + "grad_norm": 0.24994009733200073, + "learning_rate": 0.00016195349262186488, + "loss": 0.7971, + "step": 15800 + }, + { + "epoch": 0.17682486956229973, + "grad_norm": 0.24070651829242706, + "learning_rate": 0.00016172506738544473, + "loss": 0.7844, + "step": 15810 + }, + { + "epoch": 0.17693671324956242, + "grad_norm": 0.23858696222305298, + "learning_rate": 0.00016149664214902461, + "loss": 0.7687, + "step": 15820 + }, + { + "epoch": 0.17704855693682509, + "grad_norm": 0.24684946238994598, + "learning_rate": 0.00016126821691260452, + "loss": 0.7848, + "step": 15830 + }, + { + "epoch": 0.17716040062408778, + "grad_norm": 0.2525545656681061, + "learning_rate": 0.00016103979167618438, + "loss": 0.773, + "step": 15840 + }, + { + "epoch": 0.17727224431135047, + "grad_norm": 0.2485392689704895, + "learning_rate": 0.00016081136643976426, + "loss": 0.7787, + "step": 15850 + }, + { + "epoch": 0.17738408799861313, + "grad_norm": 0.2384241223335266, + "learning_rate": 0.00016058294120334415, + "loss": 0.7732, + "step": 15860 + }, + { + "epoch": 0.17749593168587582, + "grad_norm": 0.25029659271240234, + "learning_rate": 0.00016035451596692403, + "loss": 0.7819, + "step": 15870 + }, + { + "epoch": 0.1776077753731385, + "grad_norm": 0.2988499701023102, + "learning_rate": 0.0001601260907305039, + "loss": 0.7815, + "step": 15880 + }, + { + "epoch": 0.17771961906040118, + "grad_norm": 0.25840380787849426, + "learning_rate": 0.0001598976654940838, + "loss": 0.7899, + "step": 15890 + }, + { + "epoch": 0.17783146274766387, + "grad_norm": 0.2870889902114868, + "learning_rate": 0.00015966924025766365, + "loss": 0.7964, + "step": 15900 + }, + { + "epoch": 0.17794330643492653, + "grad_norm": 0.270702987909317, + "learning_rate": 0.00015944081502124356, + "loss": 0.7907, + "step": 15910 + }, + { + "epoch": 0.17805515012218923, + "grad_norm": 0.24939289689064026, + "learning_rate": 0.00015921238978482344, + "loss": 0.7909, + "step": 15920 + }, + { + "epoch": 0.17816699380945192, + "grad_norm": 0.25692620873451233, + "learning_rate": 0.0001589839645484033, + "loss": 0.7864, + "step": 15930 + }, + { + "epoch": 0.17827883749671458, + "grad_norm": 0.25667235255241394, + "learning_rate": 0.00015875553931198318, + "loss": 0.7792, + "step": 15940 + }, + { + "epoch": 0.17839068118397727, + "grad_norm": 0.27988189458847046, + "learning_rate": 0.0001585271140755631, + "loss": 0.78, + "step": 15950 + }, + { + "epoch": 0.17850252487123996, + "grad_norm": 0.26706936955451965, + "learning_rate": 0.00015829868883914295, + "loss": 0.7764, + "step": 15960 + }, + { + "epoch": 0.17861436855850263, + "grad_norm": 0.25825801491737366, + "learning_rate": 0.00015807026360272283, + "loss": 0.7798, + "step": 15970 + }, + { + "epoch": 0.17872621224576532, + "grad_norm": 0.26630404591560364, + "learning_rate": 0.0001578418383663027, + "loss": 0.7877, + "step": 15980 + }, + { + "epoch": 0.178838055933028, + "grad_norm": 0.24562442302703857, + "learning_rate": 0.0001576134131298826, + "loss": 0.7761, + "step": 15990 + }, + { + "epoch": 0.17894989962029068, + "grad_norm": 0.2607520818710327, + "learning_rate": 0.00015738498789346248, + "loss": 0.7844, + "step": 16000 + }, + { + "epoch": 0.17906174330755337, + "grad_norm": 0.25256794691085815, + "learning_rate": 0.00015715656265704233, + "loss": 0.7712, + "step": 16010 + }, + { + "epoch": 0.17917358699481606, + "grad_norm": 0.24657808244228363, + "learning_rate": 0.00015692813742062222, + "loss": 0.7766, + "step": 16020 + }, + { + "epoch": 0.17928543068207872, + "grad_norm": 0.2546744644641876, + "learning_rate": 0.00015669971218420213, + "loss": 0.781, + "step": 16030 + }, + { + "epoch": 0.17939727436934141, + "grad_norm": 0.24849241971969604, + "learning_rate": 0.00015647128694778198, + "loss": 0.786, + "step": 16040 + }, + { + "epoch": 0.17950911805660408, + "grad_norm": 0.2447352409362793, + "learning_rate": 0.00015624286171136187, + "loss": 0.7805, + "step": 16050 + }, + { + "epoch": 0.17962096174386677, + "grad_norm": 0.3004114031791687, + "learning_rate": 0.00015601443647494178, + "loss": 0.7748, + "step": 16060 + }, + { + "epoch": 0.17973280543112946, + "grad_norm": 0.24974007904529572, + "learning_rate": 0.00015578601123852163, + "loss": 0.7823, + "step": 16070 + }, + { + "epoch": 0.17984464911839212, + "grad_norm": 0.2995624542236328, + "learning_rate": 0.00015555758600210151, + "loss": 0.7894, + "step": 16080 + }, + { + "epoch": 0.17995649280565482, + "grad_norm": 0.2560220956802368, + "learning_rate": 0.0001553291607656814, + "loss": 0.7849, + "step": 16090 + }, + { + "epoch": 0.1800683364929175, + "grad_norm": 0.24940122663974762, + "learning_rate": 0.00015510073552926128, + "loss": 0.7903, + "step": 16100 + }, + { + "epoch": 0.18018018018018017, + "grad_norm": 0.22082312405109406, + "learning_rate": 0.00015487231029284116, + "loss": 0.783, + "step": 16110 + }, + { + "epoch": 0.18029202386744286, + "grad_norm": 0.2670224606990814, + "learning_rate": 0.00015464388505642104, + "loss": 0.7919, + "step": 16120 + }, + { + "epoch": 0.18040386755470555, + "grad_norm": 0.2533135414123535, + "learning_rate": 0.0001544154598200009, + "loss": 0.8007, + "step": 16130 + }, + { + "epoch": 0.18051571124196822, + "grad_norm": 0.2660861909389496, + "learning_rate": 0.0001541870345835808, + "loss": 0.7913, + "step": 16140 + }, + { + "epoch": 0.1806275549292309, + "grad_norm": 0.2556677460670471, + "learning_rate": 0.0001539586093471607, + "loss": 0.7826, + "step": 16150 + }, + { + "epoch": 0.1807393986164936, + "grad_norm": 0.275900661945343, + "learning_rate": 0.00015373018411074055, + "loss": 0.8048, + "step": 16160 + }, + { + "epoch": 0.18085124230375627, + "grad_norm": 0.29176998138427734, + "learning_rate": 0.00015350175887432043, + "loss": 0.8241, + "step": 16170 + }, + { + "epoch": 0.18096308599101896, + "grad_norm": 0.2635776996612549, + "learning_rate": 0.00015327333363790034, + "loss": 0.8211, + "step": 16180 + }, + { + "epoch": 0.18107492967828162, + "grad_norm": 0.27744734287261963, + "learning_rate": 0.0001530449084014802, + "loss": 0.8254, + "step": 16190 + }, + { + "epoch": 0.1811867733655443, + "grad_norm": 0.28162074089050293, + "learning_rate": 0.00015281648316506008, + "loss": 0.8182, + "step": 16200 + }, + { + "epoch": 0.181298617052807, + "grad_norm": 0.29347339272499084, + "learning_rate": 0.00015258805792863996, + "loss": 0.812, + "step": 16210 + }, + { + "epoch": 0.18141046074006967, + "grad_norm": 0.26170992851257324, + "learning_rate": 0.00015235963269221985, + "loss": 0.8221, + "step": 16220 + }, + { + "epoch": 0.18152230442733236, + "grad_norm": 0.27848196029663086, + "learning_rate": 0.00015213120745579973, + "loss": 0.825, + "step": 16230 + }, + { + "epoch": 0.18163414811459505, + "grad_norm": 0.2994973659515381, + "learning_rate": 0.00015190278221937958, + "loss": 0.8158, + "step": 16240 + }, + { + "epoch": 0.18174599180185771, + "grad_norm": 0.27873843908309937, + "learning_rate": 0.00015167435698295947, + "loss": 0.816, + "step": 16250 + }, + { + "epoch": 0.1818578354891204, + "grad_norm": 0.3014775812625885, + "learning_rate": 0.00015144593174653938, + "loss": 0.8174, + "step": 16260 + }, + { + "epoch": 0.1819696791763831, + "grad_norm": 0.29963594675064087, + "learning_rate": 0.00015121750651011923, + "loss": 0.8104, + "step": 16270 + }, + { + "epoch": 0.18208152286364576, + "grad_norm": 0.3388141393661499, + "learning_rate": 0.00015098908127369912, + "loss": 0.826, + "step": 16280 + }, + { + "epoch": 0.18219336655090845, + "grad_norm": 0.29143062233924866, + "learning_rate": 0.000150760656037279, + "loss": 0.8222, + "step": 16290 + }, + { + "epoch": 0.18230521023817114, + "grad_norm": 0.327824205160141, + "learning_rate": 0.00015053223080085888, + "loss": 0.8186, + "step": 16300 + }, + { + "epoch": 0.1824170539254338, + "grad_norm": 0.3053797483444214, + "learning_rate": 0.00015030380556443876, + "loss": 0.8214, + "step": 16310 + }, + { + "epoch": 0.1825288976126965, + "grad_norm": 0.3030015230178833, + "learning_rate": 0.00015007538032801865, + "loss": 0.8198, + "step": 16320 + }, + { + "epoch": 0.18264074129995916, + "grad_norm": 0.3147192597389221, + "learning_rate": 0.0001498469550915985, + "loss": 0.8224, + "step": 16330 + }, + { + "epoch": 0.18275258498722186, + "grad_norm": 0.2838999927043915, + "learning_rate": 0.0001496185298551784, + "loss": 0.8142, + "step": 16340 + }, + { + "epoch": 0.18286442867448455, + "grad_norm": 0.27273476123809814, + "learning_rate": 0.0001493901046187583, + "loss": 0.8054, + "step": 16350 + }, + { + "epoch": 0.1829762723617472, + "grad_norm": 0.2754770517349243, + "learning_rate": 0.00014916167938233815, + "loss": 0.8131, + "step": 16360 + }, + { + "epoch": 0.1830881160490099, + "grad_norm": 0.29061514139175415, + "learning_rate": 0.00014893325414591803, + "loss": 0.7988, + "step": 16370 + }, + { + "epoch": 0.1831999597362726, + "grad_norm": 0.2525017559528351, + "learning_rate": 0.00014870482890949794, + "loss": 0.8023, + "step": 16380 + }, + { + "epoch": 0.18331180342353526, + "grad_norm": 0.3019058108329773, + "learning_rate": 0.0001484764036730778, + "loss": 0.8077, + "step": 16390 + }, + { + "epoch": 0.18342364711079795, + "grad_norm": 0.302090048789978, + "learning_rate": 0.00014824797843665768, + "loss": 0.812, + "step": 16400 + }, + { + "epoch": 0.18353549079806064, + "grad_norm": 0.29742154479026794, + "learning_rate": 0.00014801955320023757, + "loss": 0.7911, + "step": 16410 + }, + { + "epoch": 0.1836473344853233, + "grad_norm": 0.31950804591178894, + "learning_rate": 0.00014779112796381745, + "loss": 0.7875, + "step": 16420 + }, + { + "epoch": 0.183759178172586, + "grad_norm": 0.32971978187561035, + "learning_rate": 0.00014756270272739733, + "loss": 0.7788, + "step": 16430 + }, + { + "epoch": 0.1838710218598487, + "grad_norm": 0.2941220700740814, + "learning_rate": 0.00014733427749097721, + "loss": 0.7772, + "step": 16440 + }, + { + "epoch": 0.18398286554711135, + "grad_norm": 0.2639923393726349, + "learning_rate": 0.00014710585225455707, + "loss": 0.7708, + "step": 16450 + }, + { + "epoch": 0.18409470923437404, + "grad_norm": 0.2483467161655426, + "learning_rate": 0.00014687742701813698, + "loss": 0.7846, + "step": 16460 + }, + { + "epoch": 0.1842065529216367, + "grad_norm": 0.31150713562965393, + "learning_rate": 0.00014664900178171683, + "loss": 0.7853, + "step": 16470 + }, + { + "epoch": 0.1843183966088994, + "grad_norm": 0.30439406633377075, + "learning_rate": 0.00014642057654529672, + "loss": 0.7779, + "step": 16480 + }, + { + "epoch": 0.1844302402961621, + "grad_norm": 0.29318898916244507, + "learning_rate": 0.00014619215130887663, + "loss": 0.7911, + "step": 16490 + }, + { + "epoch": 0.18454208398342475, + "grad_norm": 0.2726874053478241, + "learning_rate": 0.00014596372607245648, + "loss": 0.7869, + "step": 16500 + }, + { + "epoch": 0.18465392767068745, + "grad_norm": 0.2978016436100006, + "learning_rate": 0.00014573530083603637, + "loss": 0.783, + "step": 16510 + }, + { + "epoch": 0.18476577135795014, + "grad_norm": 0.3107501268386841, + "learning_rate": 0.00014550687559961625, + "loss": 0.801, + "step": 16520 + }, + { + "epoch": 0.1848776150452128, + "grad_norm": 0.2848517894744873, + "learning_rate": 0.00014527845036319613, + "loss": 0.8063, + "step": 16530 + }, + { + "epoch": 0.1849894587324755, + "grad_norm": 0.2625429332256317, + "learning_rate": 0.00014505002512677601, + "loss": 0.8074, + "step": 16540 + }, + { + "epoch": 0.18510130241973818, + "grad_norm": 0.2805044949054718, + "learning_rate": 0.0001448215998903559, + "loss": 0.8013, + "step": 16550 + }, + { + "epoch": 0.18521314610700085, + "grad_norm": 0.27657589316368103, + "learning_rate": 0.00014459317465393575, + "loss": 0.8012, + "step": 16560 + }, + { + "epoch": 0.18532498979426354, + "grad_norm": 0.2780141532421112, + "learning_rate": 0.00014436474941751566, + "loss": 0.8161, + "step": 16570 + }, + { + "epoch": 0.18543683348152623, + "grad_norm": 0.2871207892894745, + "learning_rate": 0.00014413632418109555, + "loss": 0.7899, + "step": 16580 + }, + { + "epoch": 0.1855486771687889, + "grad_norm": 0.2656658887863159, + "learning_rate": 0.0001439078989446754, + "loss": 0.7985, + "step": 16590 + }, + { + "epoch": 0.1856605208560516, + "grad_norm": 0.2766350209712982, + "learning_rate": 0.00014367947370825528, + "loss": 0.7999, + "step": 16600 + }, + { + "epoch": 0.18577236454331428, + "grad_norm": 0.2616749107837677, + "learning_rate": 0.0001434510484718352, + "loss": 0.8002, + "step": 16610 + }, + { + "epoch": 0.18588420823057694, + "grad_norm": 0.25887414813041687, + "learning_rate": 0.00014322262323541505, + "loss": 0.8112, + "step": 16620 + }, + { + "epoch": 0.18599605191783963, + "grad_norm": 0.2594297528266907, + "learning_rate": 0.00014299419799899493, + "loss": 0.802, + "step": 16630 + }, + { + "epoch": 0.1861078956051023, + "grad_norm": 0.2535499036312103, + "learning_rate": 0.00014276577276257482, + "loss": 0.7867, + "step": 16640 + }, + { + "epoch": 0.186219739292365, + "grad_norm": 0.25161436200141907, + "learning_rate": 0.0001425373475261547, + "loss": 0.8059, + "step": 16650 + }, + { + "epoch": 0.18633158297962768, + "grad_norm": 0.22897444665431976, + "learning_rate": 0.00014230892228973458, + "loss": 0.7864, + "step": 16660 + }, + { + "epoch": 0.18644342666689034, + "grad_norm": 0.27164047956466675, + "learning_rate": 0.00014208049705331446, + "loss": 0.796, + "step": 16670 + }, + { + "epoch": 0.18655527035415304, + "grad_norm": 0.2717941701412201, + "learning_rate": 0.00014185207181689432, + "loss": 0.7801, + "step": 16680 + }, + { + "epoch": 0.18666711404141573, + "grad_norm": 0.27144837379455566, + "learning_rate": 0.00014162364658047423, + "loss": 0.7758, + "step": 16690 + }, + { + "epoch": 0.1867789577286784, + "grad_norm": 0.2357831746339798, + "learning_rate": 0.00014139522134405409, + "loss": 0.7674, + "step": 16700 + }, + { + "epoch": 0.18689080141594108, + "grad_norm": 0.23233544826507568, + "learning_rate": 0.00014116679610763397, + "loss": 0.7827, + "step": 16710 + }, + { + "epoch": 0.18700264510320377, + "grad_norm": 0.2399321347475052, + "learning_rate": 0.00014093837087121385, + "loss": 0.7811, + "step": 16720 + }, + { + "epoch": 0.18711448879046644, + "grad_norm": 0.2493642419576645, + "learning_rate": 0.00014070994563479373, + "loss": 0.7762, + "step": 16730 + }, + { + "epoch": 0.18722633247772913, + "grad_norm": 0.23383350670337677, + "learning_rate": 0.00014048152039837362, + "loss": 0.7754, + "step": 16740 + }, + { + "epoch": 0.18733817616499182, + "grad_norm": 0.2624364197254181, + "learning_rate": 0.0001402530951619535, + "loss": 0.7766, + "step": 16750 + }, + { + "epoch": 0.18745001985225448, + "grad_norm": 0.24138151109218597, + "learning_rate": 0.00014002466992553336, + "loss": 0.7869, + "step": 16760 + }, + { + "epoch": 0.18756186353951718, + "grad_norm": 0.2397204041481018, + "learning_rate": 0.00013979624468911326, + "loss": 0.7974, + "step": 16770 + }, + { + "epoch": 0.18767370722677984, + "grad_norm": 0.27491655945777893, + "learning_rate": 0.00013956781945269315, + "loss": 0.8011, + "step": 16780 + }, + { + "epoch": 0.18778555091404253, + "grad_norm": 0.2321402132511139, + "learning_rate": 0.000139339394216273, + "loss": 0.803, + "step": 16790 + }, + { + "epoch": 0.18789739460130522, + "grad_norm": 0.24487042427062988, + "learning_rate": 0.00013911096897985289, + "loss": 0.7975, + "step": 16800 + }, + { + "epoch": 0.1880092382885679, + "grad_norm": 0.23328396677970886, + "learning_rate": 0.0001388825437434328, + "loss": 0.795, + "step": 16810 + }, + { + "epoch": 0.18812108197583058, + "grad_norm": 0.22705566883087158, + "learning_rate": 0.00013865411850701265, + "loss": 0.7895, + "step": 16820 + }, + { + "epoch": 0.18823292566309327, + "grad_norm": 0.24339929223060608, + "learning_rate": 0.00013842569327059253, + "loss": 0.7931, + "step": 16830 + }, + { + "epoch": 0.18834476935035593, + "grad_norm": 0.2613057494163513, + "learning_rate": 0.00013819726803417242, + "loss": 0.7785, + "step": 16840 + }, + { + "epoch": 0.18845661303761863, + "grad_norm": 0.27011603116989136, + "learning_rate": 0.0001379688427977523, + "loss": 0.7853, + "step": 16850 + }, + { + "epoch": 0.18856845672488132, + "grad_norm": 0.26589342951774597, + "learning_rate": 0.00013774041756133218, + "loss": 0.7893, + "step": 16860 + }, + { + "epoch": 0.18868030041214398, + "grad_norm": 0.26286208629608154, + "learning_rate": 0.00013751199232491207, + "loss": 0.7707, + "step": 16870 + }, + { + "epoch": 0.18879214409940667, + "grad_norm": 0.3021993637084961, + "learning_rate": 0.00013728356708849192, + "loss": 0.7896, + "step": 16880 + }, + { + "epoch": 0.18890398778666936, + "grad_norm": 0.30742523074150085, + "learning_rate": 0.00013705514185207183, + "loss": 0.7895, + "step": 16890 + }, + { + "epoch": 0.18901583147393203, + "grad_norm": 0.3027999699115753, + "learning_rate": 0.0001368267166156517, + "loss": 0.7839, + "step": 16900 + }, + { + "epoch": 0.18912767516119472, + "grad_norm": 0.29199281334877014, + "learning_rate": 0.00013659829137923157, + "loss": 0.7771, + "step": 16910 + }, + { + "epoch": 0.18923951884845738, + "grad_norm": 0.2460477203130722, + "learning_rate": 0.00013636986614281145, + "loss": 0.7823, + "step": 16920 + }, + { + "epoch": 0.18935136253572007, + "grad_norm": 0.2608555853366852, + "learning_rate": 0.00013614144090639134, + "loss": 0.7664, + "step": 16930 + }, + { + "epoch": 0.18946320622298277, + "grad_norm": 0.2723162770271301, + "learning_rate": 0.00013591301566997122, + "loss": 0.7768, + "step": 16940 + }, + { + "epoch": 0.18957504991024543, + "grad_norm": 0.2690962255001068, + "learning_rate": 0.0001356845904335511, + "loss": 0.7697, + "step": 16950 + }, + { + "epoch": 0.18968689359750812, + "grad_norm": 0.2892717719078064, + "learning_rate": 0.00013545616519713096, + "loss": 0.769, + "step": 16960 + }, + { + "epoch": 0.1897987372847708, + "grad_norm": 0.2581406533718109, + "learning_rate": 0.00013522773996071087, + "loss": 0.7766, + "step": 16970 + }, + { + "epoch": 0.18991058097203348, + "grad_norm": 0.2944723963737488, + "learning_rate": 0.00013499931472429075, + "loss": 0.7638, + "step": 16980 + }, + { + "epoch": 0.19002242465929617, + "grad_norm": 0.2776504158973694, + "learning_rate": 0.0001347708894878706, + "loss": 0.7731, + "step": 16990 + }, + { + "epoch": 0.19013426834655886, + "grad_norm": 0.267098993062973, + "learning_rate": 0.00013454246425145052, + "loss": 0.7772, + "step": 17000 + }, + { + "epoch": 0.19024611203382152, + "grad_norm": 0.2806127071380615, + "learning_rate": 0.0001343140390150304, + "loss": 0.772, + "step": 17010 + }, + { + "epoch": 0.19035795572108422, + "grad_norm": 0.2872319519519806, + "learning_rate": 0.00013408561377861025, + "loss": 0.7695, + "step": 17020 + }, + { + "epoch": 0.1904697994083469, + "grad_norm": 0.24477818608283997, + "learning_rate": 0.00013385718854219014, + "loss": 0.7764, + "step": 17030 + }, + { + "epoch": 0.19058164309560957, + "grad_norm": 0.2637476623058319, + "learning_rate": 0.00013362876330577005, + "loss": 0.7712, + "step": 17040 + }, + { + "epoch": 0.19069348678287226, + "grad_norm": 0.2676442861557007, + "learning_rate": 0.0001334003380693499, + "loss": 0.7707, + "step": 17050 + }, + { + "epoch": 0.19080533047013493, + "grad_norm": 0.2592306435108185, + "learning_rate": 0.00013317191283292979, + "loss": 0.7808, + "step": 17060 + }, + { + "epoch": 0.19091717415739762, + "grad_norm": 0.3543199896812439, + "learning_rate": 0.00013294348759650967, + "loss": 0.7928, + "step": 17070 + }, + { + "epoch": 0.1910290178446603, + "grad_norm": 0.26262548565864563, + "learning_rate": 0.00013271506236008955, + "loss": 0.7677, + "step": 17080 + }, + { + "epoch": 0.19114086153192297, + "grad_norm": 0.2845424711704254, + "learning_rate": 0.00013248663712366943, + "loss": 0.7758, + "step": 17090 + }, + { + "epoch": 0.19125270521918566, + "grad_norm": 0.2694297730922699, + "learning_rate": 0.00013225821188724932, + "loss": 0.7857, + "step": 17100 + }, + { + "epoch": 0.19136454890644836, + "grad_norm": 0.2682325839996338, + "learning_rate": 0.00013202978665082917, + "loss": 0.782, + "step": 17110 + }, + { + "epoch": 0.19147639259371102, + "grad_norm": 0.26535049080848694, + "learning_rate": 0.00013180136141440908, + "loss": 0.7796, + "step": 17120 + }, + { + "epoch": 0.1915882362809737, + "grad_norm": 0.2759861946105957, + "learning_rate": 0.00013157293617798894, + "loss": 0.7732, + "step": 17130 + }, + { + "epoch": 0.1917000799682364, + "grad_norm": 0.24873244762420654, + "learning_rate": 0.00013134451094156882, + "loss": 0.7763, + "step": 17140 + }, + { + "epoch": 0.19181192365549907, + "grad_norm": 0.2826152443885803, + "learning_rate": 0.0001311160857051487, + "loss": 0.7748, + "step": 17150 + }, + { + "epoch": 0.19192376734276176, + "grad_norm": 0.2823798358440399, + "learning_rate": 0.00013088766046872859, + "loss": 0.768, + "step": 17160 + }, + { + "epoch": 0.19203561103002445, + "grad_norm": 0.2591745853424072, + "learning_rate": 0.00013065923523230847, + "loss": 0.7831, + "step": 17170 + }, + { + "epoch": 0.19214745471728711, + "grad_norm": 0.24773742258548737, + "learning_rate": 0.00013043080999588835, + "loss": 0.7799, + "step": 17180 + }, + { + "epoch": 0.1922592984045498, + "grad_norm": 0.28184765577316284, + "learning_rate": 0.0001302023847594682, + "loss": 0.787, + "step": 17190 + }, + { + "epoch": 0.19237114209181247, + "grad_norm": 0.24396668374538422, + "learning_rate": 0.00012997395952304812, + "loss": 0.7777, + "step": 17200 + }, + { + "epoch": 0.19248298577907516, + "grad_norm": 0.25493332743644714, + "learning_rate": 0.000129745534286628, + "loss": 0.7842, + "step": 17210 + }, + { + "epoch": 0.19259482946633785, + "grad_norm": 0.2615022361278534, + "learning_rate": 0.00012951710905020786, + "loss": 0.788, + "step": 17220 + }, + { + "epoch": 0.19270667315360052, + "grad_norm": 0.28270524740219116, + "learning_rate": 0.00012928868381378774, + "loss": 0.7788, + "step": 17230 + }, + { + "epoch": 0.1928185168408632, + "grad_norm": 0.24917210638523102, + "learning_rate": 0.00012906025857736765, + "loss": 0.7731, + "step": 17240 + }, + { + "epoch": 0.1929303605281259, + "grad_norm": 0.2589946985244751, + "learning_rate": 0.0001288318333409475, + "loss": 0.7781, + "step": 17250 + }, + { + "epoch": 0.19304220421538856, + "grad_norm": 0.23770585656166077, + "learning_rate": 0.0001286034081045274, + "loss": 0.7902, + "step": 17260 + }, + { + "epoch": 0.19315404790265125, + "grad_norm": 0.22782771289348602, + "learning_rate": 0.00012837498286810727, + "loss": 0.7875, + "step": 17270 + }, + { + "epoch": 0.19326589158991395, + "grad_norm": 0.2611001431941986, + "learning_rate": 0.00012814655763168715, + "loss": 0.794, + "step": 17280 + }, + { + "epoch": 0.1933777352771766, + "grad_norm": 0.2642746865749359, + "learning_rate": 0.00012791813239526704, + "loss": 0.8005, + "step": 17290 + }, + { + "epoch": 0.1934895789644393, + "grad_norm": 0.2470688372850418, + "learning_rate": 0.00012768970715884692, + "loss": 0.7854, + "step": 17300 + }, + { + "epoch": 0.193601422651702, + "grad_norm": 0.24735964834690094, + "learning_rate": 0.00012746128192242677, + "loss": 0.7918, + "step": 17310 + }, + { + "epoch": 0.19371326633896466, + "grad_norm": 0.2734208405017853, + "learning_rate": 0.00012723285668600668, + "loss": 0.7719, + "step": 17320 + }, + { + "epoch": 0.19382511002622735, + "grad_norm": 0.28373652696609497, + "learning_rate": 0.00012700443144958657, + "loss": 0.7743, + "step": 17330 + }, + { + "epoch": 0.19393695371349004, + "grad_norm": 0.25755295157432556, + "learning_rate": 0.00012677600621316642, + "loss": 0.7761, + "step": 17340 + }, + { + "epoch": 0.1940487974007527, + "grad_norm": 0.2918241322040558, + "learning_rate": 0.0001265475809767463, + "loss": 0.7885, + "step": 17350 + }, + { + "epoch": 0.1941606410880154, + "grad_norm": 0.2589518427848816, + "learning_rate": 0.0001263191557403262, + "loss": 0.7781, + "step": 17360 + }, + { + "epoch": 0.19427248477527806, + "grad_norm": 0.2941739857196808, + "learning_rate": 0.00012609073050390607, + "loss": 0.7896, + "step": 17370 + }, + { + "epoch": 0.19438432846254075, + "grad_norm": 0.2625831663608551, + "learning_rate": 0.00012586230526748595, + "loss": 0.7797, + "step": 17380 + }, + { + "epoch": 0.19449617214980344, + "grad_norm": 0.2731517255306244, + "learning_rate": 0.0001256338800310658, + "loss": 0.7861, + "step": 17390 + }, + { + "epoch": 0.1946080158370661, + "grad_norm": 0.2802453637123108, + "learning_rate": 0.00012540545479464572, + "loss": 0.8066, + "step": 17400 + }, + { + "epoch": 0.1947198595243288, + "grad_norm": 0.24151596426963806, + "learning_rate": 0.0001251770295582256, + "loss": 0.7746, + "step": 17410 + }, + { + "epoch": 0.1948317032115915, + "grad_norm": 0.27006617188453674, + "learning_rate": 0.00012494860432180549, + "loss": 0.7796, + "step": 17420 + }, + { + "epoch": 0.19494354689885415, + "grad_norm": 0.2574283480644226, + "learning_rate": 0.00012472017908538537, + "loss": 0.7809, + "step": 17430 + }, + { + "epoch": 0.19505539058611684, + "grad_norm": 0.25741514563560486, + "learning_rate": 0.00012449175384896522, + "loss": 0.7792, + "step": 17440 + }, + { + "epoch": 0.19516723427337954, + "grad_norm": 0.2619360685348511, + "learning_rate": 0.00012426332861254513, + "loss": 0.7768, + "step": 17450 + }, + { + "epoch": 0.1952790779606422, + "grad_norm": 0.28053224086761475, + "learning_rate": 0.000124034903376125, + "loss": 0.7841, + "step": 17460 + }, + { + "epoch": 0.1953909216479049, + "grad_norm": 0.24019859731197357, + "learning_rate": 0.00012380647813970487, + "loss": 0.783, + "step": 17470 + }, + { + "epoch": 0.19550276533516758, + "grad_norm": 0.2747540771961212, + "learning_rate": 0.00012357805290328475, + "loss": 0.7911, + "step": 17480 + }, + { + "epoch": 0.19561460902243025, + "grad_norm": 0.28044483065605164, + "learning_rate": 0.00012334962766686464, + "loss": 0.7986, + "step": 17490 + }, + { + "epoch": 0.19572645270969294, + "grad_norm": 0.24908137321472168, + "learning_rate": 0.00012312120243044452, + "loss": 0.8087, + "step": 17500 + }, + { + "epoch": 0.1958382963969556, + "grad_norm": 0.29041793942451477, + "learning_rate": 0.0001228927771940244, + "loss": 0.8063, + "step": 17510 + }, + { + "epoch": 0.1959501400842183, + "grad_norm": 0.3020537495613098, + "learning_rate": 0.00012266435195760429, + "loss": 0.8004, + "step": 17520 + }, + { + "epoch": 0.19606198377148099, + "grad_norm": 0.29414400458335876, + "learning_rate": 0.00012243592672118417, + "loss": 0.7846, + "step": 17530 + }, + { + "epoch": 0.19617382745874365, + "grad_norm": 0.2648397386074066, + "learning_rate": 0.00012220750148476402, + "loss": 0.7708, + "step": 17540 + }, + { + "epoch": 0.19628567114600634, + "grad_norm": 0.2834302484989166, + "learning_rate": 0.00012197907624834392, + "loss": 0.7818, + "step": 17550 + }, + { + "epoch": 0.19639751483326903, + "grad_norm": 0.2748505175113678, + "learning_rate": 0.0001217506510119238, + "loss": 0.7642, + "step": 17560 + }, + { + "epoch": 0.1965093585205317, + "grad_norm": 0.32425326108932495, + "learning_rate": 0.00012152222577550367, + "loss": 0.7765, + "step": 17570 + }, + { + "epoch": 0.1966212022077944, + "grad_norm": 0.27183324098587036, + "learning_rate": 0.00012129380053908357, + "loss": 0.7572, + "step": 17580 + }, + { + "epoch": 0.19673304589505708, + "grad_norm": 0.28190943598747253, + "learning_rate": 0.00012106537530266344, + "loss": 0.7571, + "step": 17590 + }, + { + "epoch": 0.19684488958231974, + "grad_norm": 0.5151196718215942, + "learning_rate": 0.00012083695006624332, + "loss": 0.7565, + "step": 17600 + }, + { + "epoch": 0.19695673326958243, + "grad_norm": 0.2523132264614105, + "learning_rate": 0.0001206085248298232, + "loss": 0.7597, + "step": 17610 + }, + { + "epoch": 0.19706857695684513, + "grad_norm": 0.27336063981056213, + "learning_rate": 0.00012038009959340309, + "loss": 0.7546, + "step": 17620 + }, + { + "epoch": 0.1971804206441078, + "grad_norm": 0.25119057297706604, + "learning_rate": 0.00012015167435698296, + "loss": 0.7519, + "step": 17630 + }, + { + "epoch": 0.19729226433137048, + "grad_norm": 0.281147301197052, + "learning_rate": 0.00011992324912056284, + "loss": 0.7623, + "step": 17640 + }, + { + "epoch": 0.19740410801863315, + "grad_norm": 0.2463361769914627, + "learning_rate": 0.00011969482388414272, + "loss": 0.754, + "step": 17650 + }, + { + "epoch": 0.19751595170589584, + "grad_norm": 0.2902059853076935, + "learning_rate": 0.0001194663986477226, + "loss": 0.7578, + "step": 17660 + }, + { + "epoch": 0.19762779539315853, + "grad_norm": 0.2590588629245758, + "learning_rate": 0.00011923797341130247, + "loss": 0.7427, + "step": 17670 + }, + { + "epoch": 0.1977396390804212, + "grad_norm": 0.24349506199359894, + "learning_rate": 0.00011900954817488237, + "loss": 0.7599, + "step": 17680 + }, + { + "epoch": 0.19785148276768388, + "grad_norm": 0.2568139135837555, + "learning_rate": 0.00011878112293846224, + "loss": 0.7673, + "step": 17690 + }, + { + "epoch": 0.19796332645494658, + "grad_norm": 0.2617419958114624, + "learning_rate": 0.00011855269770204212, + "loss": 0.7637, + "step": 17700 + }, + { + "epoch": 0.19807517014220924, + "grad_norm": 0.24309082329273224, + "learning_rate": 0.000118324272465622, + "loss": 0.7583, + "step": 17710 + }, + { + "epoch": 0.19818701382947193, + "grad_norm": 0.22027656435966492, + "learning_rate": 0.00011809584722920189, + "loss": 0.7479, + "step": 17720 + }, + { + "epoch": 0.19829885751673462, + "grad_norm": 0.27296265959739685, + "learning_rate": 0.00011786742199278176, + "loss": 0.765, + "step": 17730 + }, + { + "epoch": 0.1984107012039973, + "grad_norm": 0.2589128613471985, + "learning_rate": 0.00011763899675636165, + "loss": 0.777, + "step": 17740 + }, + { + "epoch": 0.19852254489125998, + "grad_norm": 0.27665242552757263, + "learning_rate": 0.00011741057151994152, + "loss": 0.7656, + "step": 17750 + }, + { + "epoch": 0.19863438857852267, + "grad_norm": 0.27103251218795776, + "learning_rate": 0.0001171821462835214, + "loss": 0.7716, + "step": 17760 + }, + { + "epoch": 0.19874623226578533, + "grad_norm": 0.2768172025680542, + "learning_rate": 0.00011695372104710127, + "loss": 0.7738, + "step": 17770 + }, + { + "epoch": 0.19885807595304802, + "grad_norm": 0.2424757182598114, + "learning_rate": 0.00011672529581068117, + "loss": 0.7793, + "step": 17780 + }, + { + "epoch": 0.1989699196403107, + "grad_norm": 0.2821860909461975, + "learning_rate": 0.00011649687057426104, + "loss": 0.7771, + "step": 17790 + }, + { + "epoch": 0.19908176332757338, + "grad_norm": 0.28263264894485474, + "learning_rate": 0.00011626844533784092, + "loss": 0.7812, + "step": 17800 + }, + { + "epoch": 0.19919360701483607, + "grad_norm": 0.24835869669914246, + "learning_rate": 0.0001160400201014208, + "loss": 0.7753, + "step": 17810 + }, + { + "epoch": 0.19930545070209874, + "grad_norm": 0.23325562477111816, + "learning_rate": 0.00011581159486500069, + "loss": 0.7763, + "step": 17820 + }, + { + "epoch": 0.19941729438936143, + "grad_norm": 0.2520182132720947, + "learning_rate": 0.00011558316962858056, + "loss": 0.791, + "step": 17830 + }, + { + "epoch": 0.19952913807662412, + "grad_norm": 0.2478768676519394, + "learning_rate": 0.00011535474439216045, + "loss": 0.7819, + "step": 17840 + }, + { + "epoch": 0.19964098176388678, + "grad_norm": 0.2749478220939636, + "learning_rate": 0.00011512631915574032, + "loss": 0.7805, + "step": 17850 + }, + { + "epoch": 0.19975282545114947, + "grad_norm": 0.2417723685503006, + "learning_rate": 0.0001148978939193202, + "loss": 0.766, + "step": 17860 + }, + { + "epoch": 0.19986466913841217, + "grad_norm": 0.25219354033470154, + "learning_rate": 0.00011466946868290008, + "loss": 0.758, + "step": 17870 + }, + { + "epoch": 0.19997651282567483, + "grad_norm": 0.24644000828266144, + "learning_rate": 0.00011444104344647997, + "loss": 0.7569, + "step": 17880 + }, + { + "epoch": 0.20008835651293752, + "grad_norm": 0.2683338224887848, + "learning_rate": 0.00011421261821005986, + "loss": 0.7509, + "step": 17890 + }, + { + "epoch": 0.2002002002002002, + "grad_norm": 0.29149681329727173, + "learning_rate": 0.00011398419297363972, + "loss": 0.7611, + "step": 17900 + }, + { + "epoch": 0.20031204388746288, + "grad_norm": 0.2651118338108063, + "learning_rate": 0.00011375576773721962, + "loss": 0.756, + "step": 17910 + }, + { + "epoch": 0.20042388757472557, + "grad_norm": 0.26990607380867004, + "learning_rate": 0.00011352734250079949, + "loss": 0.7726, + "step": 17920 + }, + { + "epoch": 0.20053573126198823, + "grad_norm": 0.23897935450077057, + "learning_rate": 0.00011329891726437937, + "loss": 0.7875, + "step": 17930 + }, + { + "epoch": 0.20064757494925092, + "grad_norm": 0.2300727218389511, + "learning_rate": 0.00011307049202795926, + "loss": 0.7697, + "step": 17940 + }, + { + "epoch": 0.20075941863651361, + "grad_norm": 0.2873596251010895, + "learning_rate": 0.00011284206679153914, + "loss": 0.7776, + "step": 17950 + }, + { + "epoch": 0.20087126232377628, + "grad_norm": 0.29036712646484375, + "learning_rate": 0.00011261364155511901, + "loss": 0.7794, + "step": 17960 + }, + { + "epoch": 0.20098310601103897, + "grad_norm": 0.2837420701980591, + "learning_rate": 0.0001123852163186989, + "loss": 0.7818, + "step": 17970 + }, + { + "epoch": 0.20109494969830166, + "grad_norm": 0.2920686602592468, + "learning_rate": 0.00011215679108227877, + "loss": 0.7851, + "step": 17980 + }, + { + "epoch": 0.20120679338556433, + "grad_norm": 0.27664583921432495, + "learning_rate": 0.00011192836584585866, + "loss": 0.7601, + "step": 17990 + }, + { + "epoch": 0.20131863707282702, + "grad_norm": 0.26870399713516235, + "learning_rate": 0.00011169994060943853, + "loss": 0.7961, + "step": 18000 + }, + { + "epoch": 0.2014304807600897, + "grad_norm": 0.2502228021621704, + "learning_rate": 0.00011147151537301842, + "loss": 0.7827, + "step": 18010 + }, + { + "epoch": 0.20154232444735237, + "grad_norm": 0.2473440319299698, + "learning_rate": 0.00011124309013659829, + "loss": 0.7815, + "step": 18020 + }, + { + "epoch": 0.20165416813461506, + "grad_norm": 0.2513076663017273, + "learning_rate": 0.00011101466490017817, + "loss": 0.7675, + "step": 18030 + }, + { + "epoch": 0.20176601182187776, + "grad_norm": 0.2829226851463318, + "learning_rate": 0.00011078623966375806, + "loss": 0.7669, + "step": 18040 + }, + { + "epoch": 0.20187785550914042, + "grad_norm": 0.25758418440818787, + "learning_rate": 0.00011055781442733794, + "loss": 0.7707, + "step": 18050 + }, + { + "epoch": 0.2019896991964031, + "grad_norm": 0.27185285091400146, + "learning_rate": 0.00011032938919091781, + "loss": 0.7742, + "step": 18060 + }, + { + "epoch": 0.2021015428836658, + "grad_norm": 0.2802230417728424, + "learning_rate": 0.0001101009639544977, + "loss": 0.7821, + "step": 18070 + }, + { + "epoch": 0.20221338657092847, + "grad_norm": 0.2882921099662781, + "learning_rate": 0.00010987253871807757, + "loss": 0.779, + "step": 18080 + }, + { + "epoch": 0.20232523025819116, + "grad_norm": 0.2569839358329773, + "learning_rate": 0.00010964411348165746, + "loss": 0.7694, + "step": 18090 + }, + { + "epoch": 0.20243707394545382, + "grad_norm": 0.2600938379764557, + "learning_rate": 0.00010941568824523733, + "loss": 0.7781, + "step": 18100 + }, + { + "epoch": 0.2025489176327165, + "grad_norm": 0.28083154559135437, + "learning_rate": 0.00010918726300881722, + "loss": 0.7799, + "step": 18110 + }, + { + "epoch": 0.2026607613199792, + "grad_norm": 0.22990182042121887, + "learning_rate": 0.00010895883777239709, + "loss": 0.7883, + "step": 18120 + }, + { + "epoch": 0.20277260500724187, + "grad_norm": 0.27432581782341003, + "learning_rate": 0.00010873041253597697, + "loss": 0.7942, + "step": 18130 + }, + { + "epoch": 0.20288444869450456, + "grad_norm": 0.2607738971710205, + "learning_rate": 0.00010850198729955686, + "loss": 0.7877, + "step": 18140 + }, + { + "epoch": 0.20299629238176725, + "grad_norm": 0.2818219065666199, + "learning_rate": 0.00010827356206313674, + "loss": 0.7948, + "step": 18150 + }, + { + "epoch": 0.20310813606902992, + "grad_norm": 0.2751563489437103, + "learning_rate": 0.00010804513682671661, + "loss": 0.7836, + "step": 18160 + }, + { + "epoch": 0.2032199797562926, + "grad_norm": 0.2746957242488861, + "learning_rate": 0.0001078167115902965, + "loss": 0.7693, + "step": 18170 + }, + { + "epoch": 0.2033318234435553, + "grad_norm": 0.24990054965019226, + "learning_rate": 0.00010758828635387638, + "loss": 0.7869, + "step": 18180 + }, + { + "epoch": 0.20344366713081796, + "grad_norm": 0.24581623077392578, + "learning_rate": 0.00010735986111745626, + "loss": 0.768, + "step": 18190 + }, + { + "epoch": 0.20355551081808065, + "grad_norm": 0.26637768745422363, + "learning_rate": 0.00010713143588103613, + "loss": 0.7711, + "step": 18200 + }, + { + "epoch": 0.20366735450534335, + "grad_norm": 0.2510250508785248, + "learning_rate": 0.00010690301064461602, + "loss": 0.7748, + "step": 18210 + }, + { + "epoch": 0.203779198192606, + "grad_norm": 0.2378496378660202, + "learning_rate": 0.00010667458540819589, + "loss": 0.7622, + "step": 18220 + }, + { + "epoch": 0.2038910418798687, + "grad_norm": 0.2507869601249695, + "learning_rate": 0.00010644616017177578, + "loss": 0.7739, + "step": 18230 + }, + { + "epoch": 0.20400288556713136, + "grad_norm": 0.24733096361160278, + "learning_rate": 0.00010621773493535566, + "loss": 0.7508, + "step": 18240 + }, + { + "epoch": 0.20411472925439406, + "grad_norm": 0.23383109271526337, + "learning_rate": 0.00010598930969893554, + "loss": 0.7507, + "step": 18250 + }, + { + "epoch": 0.20422657294165675, + "grad_norm": 0.2543237805366516, + "learning_rate": 0.00010576088446251541, + "loss": 0.7578, + "step": 18260 + }, + { + "epoch": 0.2043384166289194, + "grad_norm": 0.25807520747184753, + "learning_rate": 0.00010553245922609531, + "loss": 0.7513, + "step": 18270 + }, + { + "epoch": 0.2044502603161821, + "grad_norm": 0.23354406654834747, + "learning_rate": 0.00010530403398967518, + "loss": 0.7566, + "step": 18280 + }, + { + "epoch": 0.2045621040034448, + "grad_norm": 0.2685154676437378, + "learning_rate": 0.00010507560875325506, + "loss": 0.758, + "step": 18290 + }, + { + "epoch": 0.20467394769070746, + "grad_norm": 0.24349918961524963, + "learning_rate": 0.00010484718351683494, + "loss": 0.7686, + "step": 18300 + }, + { + "epoch": 0.20478579137797015, + "grad_norm": 0.24823498725891113, + "learning_rate": 0.00010461875828041482, + "loss": 0.7659, + "step": 18310 + }, + { + "epoch": 0.20489763506523284, + "grad_norm": 0.2511804401874542, + "learning_rate": 0.0001043903330439947, + "loss": 0.77, + "step": 18320 + }, + { + "epoch": 0.2050094787524955, + "grad_norm": 0.24065516889095306, + "learning_rate": 0.00010416190780757458, + "loss": 0.7677, + "step": 18330 + }, + { + "epoch": 0.2051213224397582, + "grad_norm": 0.2819323241710663, + "learning_rate": 0.00010393348257115447, + "loss": 0.753, + "step": 18340 + }, + { + "epoch": 0.2052331661270209, + "grad_norm": 0.26467952132225037, + "learning_rate": 0.00010370505733473434, + "loss": 0.7826, + "step": 18350 + }, + { + "epoch": 0.20534500981428355, + "grad_norm": 0.22962163388729095, + "learning_rate": 0.00010347663209831423, + "loss": 0.7683, + "step": 18360 + }, + { + "epoch": 0.20545685350154624, + "grad_norm": 0.2582736611366272, + "learning_rate": 0.00010324820686189411, + "loss": 0.7951, + "step": 18370 + }, + { + "epoch": 0.2055686971888089, + "grad_norm": 0.2352149486541748, + "learning_rate": 0.00010301978162547399, + "loss": 0.7577, + "step": 18380 + }, + { + "epoch": 0.2056805408760716, + "grad_norm": 0.25687554478645325, + "learning_rate": 0.00010279135638905386, + "loss": 0.7696, + "step": 18390 + }, + { + "epoch": 0.2057923845633343, + "grad_norm": 0.2579772472381592, + "learning_rate": 0.00010256293115263376, + "loss": 0.7837, + "step": 18400 + }, + { + "epoch": 0.20590422825059695, + "grad_norm": 0.24537009000778198, + "learning_rate": 0.00010233450591621363, + "loss": 0.7799, + "step": 18410 + }, + { + "epoch": 0.20601607193785965, + "grad_norm": 0.2636966109275818, + "learning_rate": 0.00010210608067979351, + "loss": 0.7588, + "step": 18420 + }, + { + "epoch": 0.20612791562512234, + "grad_norm": 0.30670562386512756, + "learning_rate": 0.00010187765544337338, + "loss": 0.771, + "step": 18430 + }, + { + "epoch": 0.206239759312385, + "grad_norm": 0.28400668501853943, + "learning_rate": 0.00010164923020695327, + "loss": 0.7686, + "step": 18440 + }, + { + "epoch": 0.2063516029996477, + "grad_norm": 0.27395951747894287, + "learning_rate": 0.00010142080497053314, + "loss": 0.776, + "step": 18450 + }, + { + "epoch": 0.20646344668691038, + "grad_norm": 0.284868061542511, + "learning_rate": 0.00010119237973411303, + "loss": 0.7864, + "step": 18460 + }, + { + "epoch": 0.20657529037417305, + "grad_norm": 0.2859087586402893, + "learning_rate": 0.00010096395449769291, + "loss": 0.7749, + "step": 18470 + }, + { + "epoch": 0.20668713406143574, + "grad_norm": 0.28758034110069275, + "learning_rate": 0.00010073552926127279, + "loss": 0.7919, + "step": 18480 + }, + { + "epoch": 0.20679897774869843, + "grad_norm": 0.2752404510974884, + "learning_rate": 0.00010050710402485266, + "loss": 0.7808, + "step": 18490 + }, + { + "epoch": 0.2069108214359611, + "grad_norm": 0.30756843090057373, + "learning_rate": 0.00010027867878843256, + "loss": 0.7734, + "step": 18500 + }, + { + "epoch": 0.2070226651232238, + "grad_norm": 0.2694368064403534, + "learning_rate": 0.00010005025355201243, + "loss": 0.7751, + "step": 18510 + }, + { + "epoch": 0.20713450881048645, + "grad_norm": 0.25838834047317505, + "learning_rate": 9.982182831559231e-05, + "loss": 0.7686, + "step": 18520 + }, + { + "epoch": 0.20724635249774914, + "grad_norm": 0.257729709148407, + "learning_rate": 9.959340307917219e-05, + "loss": 0.7827, + "step": 18530 + }, + { + "epoch": 0.20735819618501183, + "grad_norm": 0.2938844859600067, + "learning_rate": 9.936497784275208e-05, + "loss": 0.7685, + "step": 18540 + }, + { + "epoch": 0.2074700398722745, + "grad_norm": 0.25894027948379517, + "learning_rate": 9.913655260633194e-05, + "loss": 0.7738, + "step": 18550 + }, + { + "epoch": 0.2075818835595372, + "grad_norm": 0.2751148045063019, + "learning_rate": 9.890812736991183e-05, + "loss": 0.7594, + "step": 18560 + }, + { + "epoch": 0.20769372724679988, + "grad_norm": 0.28643253445625305, + "learning_rate": 9.867970213349171e-05, + "loss": 0.7737, + "step": 18570 + }, + { + "epoch": 0.20780557093406254, + "grad_norm": 0.2575749158859253, + "learning_rate": 9.845127689707159e-05, + "loss": 0.7778, + "step": 18580 + }, + { + "epoch": 0.20791741462132524, + "grad_norm": 0.27625295519828796, + "learning_rate": 9.822285166065146e-05, + "loss": 0.7716, + "step": 18590 + }, + { + "epoch": 0.20802925830858793, + "grad_norm": 0.2803322672843933, + "learning_rate": 9.799442642423136e-05, + "loss": 0.7805, + "step": 18600 + }, + { + "epoch": 0.2081411019958506, + "grad_norm": 0.2567484676837921, + "learning_rate": 9.776600118781123e-05, + "loss": 0.7633, + "step": 18610 + }, + { + "epoch": 0.20825294568311328, + "grad_norm": 0.28193768858909607, + "learning_rate": 9.753757595139111e-05, + "loss": 0.7895, + "step": 18620 + }, + { + "epoch": 0.20836478937037597, + "grad_norm": 0.28459542989730835, + "learning_rate": 9.7309150714971e-05, + "loss": 0.7741, + "step": 18630 + }, + { + "epoch": 0.20847663305763864, + "grad_norm": 0.28346261382102966, + "learning_rate": 9.708072547855088e-05, + "loss": 0.7813, + "step": 18640 + }, + { + "epoch": 0.20858847674490133, + "grad_norm": 0.2818828523159027, + "learning_rate": 9.685230024213075e-05, + "loss": 0.7755, + "step": 18650 + }, + { + "epoch": 0.208700320432164, + "grad_norm": 0.28914326429367065, + "learning_rate": 9.662387500571063e-05, + "loss": 0.7798, + "step": 18660 + }, + { + "epoch": 0.20881216411942669, + "grad_norm": 0.2600755989551544, + "learning_rate": 9.639544976929051e-05, + "loss": 0.7758, + "step": 18670 + }, + { + "epoch": 0.20892400780668938, + "grad_norm": 0.2726733088493347, + "learning_rate": 9.61670245328704e-05, + "loss": 0.7769, + "step": 18680 + }, + { + "epoch": 0.20903585149395204, + "grad_norm": 0.23421594500541687, + "learning_rate": 9.593859929645026e-05, + "loss": 0.758, + "step": 18690 + }, + { + "epoch": 0.20914769518121473, + "grad_norm": 0.29468339681625366, + "learning_rate": 9.571017406003016e-05, + "loss": 0.7746, + "step": 18700 + }, + { + "epoch": 0.20925953886847742, + "grad_norm": 0.29477235674858093, + "learning_rate": 9.548174882361003e-05, + "loss": 0.7633, + "step": 18710 + }, + { + "epoch": 0.2093713825557401, + "grad_norm": 0.2564197778701782, + "learning_rate": 9.525332358718991e-05, + "loss": 0.7541, + "step": 18720 + }, + { + "epoch": 0.20948322624300278, + "grad_norm": 0.2745250165462494, + "learning_rate": 9.50248983507698e-05, + "loss": 0.7887, + "step": 18730 + }, + { + "epoch": 0.20959506993026547, + "grad_norm": 0.2572060525417328, + "learning_rate": 9.479647311434968e-05, + "loss": 0.774, + "step": 18740 + }, + { + "epoch": 0.20970691361752813, + "grad_norm": 0.28513193130493164, + "learning_rate": 9.456804787792955e-05, + "loss": 0.7871, + "step": 18750 + }, + { + "epoch": 0.20981875730479083, + "grad_norm": 0.2643887400627136, + "learning_rate": 9.433962264150944e-05, + "loss": 0.77, + "step": 18760 + }, + { + "epoch": 0.20993060099205352, + "grad_norm": 0.27534207701683044, + "learning_rate": 9.411119740508931e-05, + "loss": 0.7775, + "step": 18770 + }, + { + "epoch": 0.21004244467931618, + "grad_norm": 0.2620585858821869, + "learning_rate": 9.38827721686692e-05, + "loss": 0.7808, + "step": 18780 + }, + { + "epoch": 0.21015428836657887, + "grad_norm": 0.2759549915790558, + "learning_rate": 9.365434693224908e-05, + "loss": 0.7642, + "step": 18790 + }, + { + "epoch": 0.21026613205384156, + "grad_norm": 0.2919774353504181, + "learning_rate": 9.342592169582896e-05, + "loss": 0.7828, + "step": 18800 + }, + { + "epoch": 0.21037797574110423, + "grad_norm": 0.2717173099517822, + "learning_rate": 9.319749645940884e-05, + "loss": 0.7513, + "step": 18810 + }, + { + "epoch": 0.21048981942836692, + "grad_norm": 0.2662122845649719, + "learning_rate": 9.296907122298871e-05, + "loss": 0.7668, + "step": 18820 + }, + { + "epoch": 0.21060166311562958, + "grad_norm": 0.26051005721092224, + "learning_rate": 9.274064598656861e-05, + "loss": 0.7676, + "step": 18830 + }, + { + "epoch": 0.21071350680289228, + "grad_norm": 0.27510005235671997, + "learning_rate": 9.251222075014848e-05, + "loss": 0.7507, + "step": 18840 + }, + { + "epoch": 0.21082535049015497, + "grad_norm": 0.23877868056297302, + "learning_rate": 9.228379551372836e-05, + "loss": 0.7535, + "step": 18850 + }, + { + "epoch": 0.21093719417741763, + "grad_norm": 0.256104439496994, + "learning_rate": 9.205537027730824e-05, + "loss": 0.7546, + "step": 18860 + }, + { + "epoch": 0.21104903786468032, + "grad_norm": 0.2829015552997589, + "learning_rate": 9.182694504088813e-05, + "loss": 0.7588, + "step": 18870 + }, + { + "epoch": 0.211160881551943, + "grad_norm": 0.22898368537425995, + "learning_rate": 9.1598519804468e-05, + "loss": 0.7551, + "step": 18880 + }, + { + "epoch": 0.21127272523920568, + "grad_norm": 0.23679418861865997, + "learning_rate": 9.137009456804788e-05, + "loss": 0.7718, + "step": 18890 + }, + { + "epoch": 0.21138456892646837, + "grad_norm": 0.2878457009792328, + "learning_rate": 9.114166933162776e-05, + "loss": 0.7593, + "step": 18900 + }, + { + "epoch": 0.21149641261373106, + "grad_norm": 0.2936013638973236, + "learning_rate": 9.091324409520764e-05, + "loss": 0.7713, + "step": 18910 + }, + { + "epoch": 0.21160825630099372, + "grad_norm": 0.26062774658203125, + "learning_rate": 9.068481885878751e-05, + "loss": 0.7763, + "step": 18920 + }, + { + "epoch": 0.21172009998825642, + "grad_norm": 0.3092271685600281, + "learning_rate": 9.045639362236741e-05, + "loss": 0.7807, + "step": 18930 + }, + { + "epoch": 0.2118319436755191, + "grad_norm": 0.23566113412380219, + "learning_rate": 9.022796838594728e-05, + "loss": 0.7779, + "step": 18940 + }, + { + "epoch": 0.21194378736278177, + "grad_norm": 0.27366477251052856, + "learning_rate": 8.999954314952716e-05, + "loss": 0.77, + "step": 18950 + }, + { + "epoch": 0.21205563105004446, + "grad_norm": 0.23270778357982635, + "learning_rate": 8.977111791310704e-05, + "loss": 0.7549, + "step": 18960 + }, + { + "epoch": 0.21216747473730713, + "grad_norm": 0.28785306215286255, + "learning_rate": 8.954269267668693e-05, + "loss": 0.7677, + "step": 18970 + }, + { + "epoch": 0.21227931842456982, + "grad_norm": 0.2588510811328888, + "learning_rate": 8.93142674402668e-05, + "loss": 0.7715, + "step": 18980 + }, + { + "epoch": 0.2123911621118325, + "grad_norm": 0.248029887676239, + "learning_rate": 8.908584220384668e-05, + "loss": 0.7749, + "step": 18990 + }, + { + "epoch": 0.21250300579909517, + "grad_norm": 0.2579936981201172, + "learning_rate": 8.885741696742656e-05, + "loss": 0.7552, + "step": 19000 + }, + { + "epoch": 0.21261484948635787, + "grad_norm": 0.26293206214904785, + "learning_rate": 8.862899173100645e-05, + "loss": 0.7657, + "step": 19010 + }, + { + "epoch": 0.21272669317362056, + "grad_norm": 0.24589793384075165, + "learning_rate": 8.840056649458631e-05, + "loss": 0.7598, + "step": 19020 + }, + { + "epoch": 0.21283853686088322, + "grad_norm": 0.2315252274274826, + "learning_rate": 8.817214125816621e-05, + "loss": 0.7637, + "step": 19030 + }, + { + "epoch": 0.2129503805481459, + "grad_norm": 0.2538358271121979, + "learning_rate": 8.794371602174608e-05, + "loss": 0.7587, + "step": 19040 + }, + { + "epoch": 0.2130622242354086, + "grad_norm": 0.2626616060733795, + "learning_rate": 8.771529078532596e-05, + "loss": 0.7597, + "step": 19050 + }, + { + "epoch": 0.21317406792267127, + "grad_norm": 0.2557279169559479, + "learning_rate": 8.748686554890585e-05, + "loss": 0.7499, + "step": 19060 + }, + { + "epoch": 0.21328591160993396, + "grad_norm": 0.25008153915405273, + "learning_rate": 8.725844031248573e-05, + "loss": 0.7466, + "step": 19070 + }, + { + "epoch": 0.21339775529719665, + "grad_norm": 0.2647120952606201, + "learning_rate": 8.70300150760656e-05, + "loss": 0.7574, + "step": 19080 + }, + { + "epoch": 0.21350959898445931, + "grad_norm": 0.2535738945007324, + "learning_rate": 8.68015898396455e-05, + "loss": 0.7672, + "step": 19090 + }, + { + "epoch": 0.213621442671722, + "grad_norm": 0.28925755620002747, + "learning_rate": 8.657316460322536e-05, + "loss": 0.7692, + "step": 19100 + }, + { + "epoch": 0.21373328635898467, + "grad_norm": 0.26770591735839844, + "learning_rate": 8.634473936680525e-05, + "loss": 0.7511, + "step": 19110 + }, + { + "epoch": 0.21384513004624736, + "grad_norm": 0.25162947177886963, + "learning_rate": 8.611631413038512e-05, + "loss": 0.7573, + "step": 19120 + }, + { + "epoch": 0.21395697373351005, + "grad_norm": 0.253324031829834, + "learning_rate": 8.588788889396501e-05, + "loss": 0.7516, + "step": 19130 + }, + { + "epoch": 0.21406881742077272, + "grad_norm": 0.2784843146800995, + "learning_rate": 8.565946365754488e-05, + "loss": 0.7522, + "step": 19140 + }, + { + "epoch": 0.2141806611080354, + "grad_norm": 0.2869722247123718, + "learning_rate": 8.543103842112476e-05, + "loss": 0.7525, + "step": 19150 + }, + { + "epoch": 0.2142925047952981, + "grad_norm": 0.2467101663351059, + "learning_rate": 8.520261318470465e-05, + "loss": 0.7336, + "step": 19160 + }, + { + "epoch": 0.21440434848256076, + "grad_norm": 0.26108691096305847, + "learning_rate": 8.497418794828453e-05, + "loss": 0.751, + "step": 19170 + }, + { + "epoch": 0.21451619216982346, + "grad_norm": 0.2992580533027649, + "learning_rate": 8.47457627118644e-05, + "loss": 0.7599, + "step": 19180 + }, + { + "epoch": 0.21462803585708615, + "grad_norm": 0.2573351562023163, + "learning_rate": 8.45173374754443e-05, + "loss": 0.752, + "step": 19190 + }, + { + "epoch": 0.2147398795443488, + "grad_norm": 0.30148234963417053, + "learning_rate": 8.428891223902416e-05, + "loss": 0.7536, + "step": 19200 + }, + { + "epoch": 0.2148517232316115, + "grad_norm": 0.2811321020126343, + "learning_rate": 8.406048700260405e-05, + "loss": 0.761, + "step": 19210 + }, + { + "epoch": 0.2149635669188742, + "grad_norm": 0.2792038321495056, + "learning_rate": 8.383206176618392e-05, + "loss": 0.7558, + "step": 19220 + }, + { + "epoch": 0.21507541060613686, + "grad_norm": 0.30432426929473877, + "learning_rate": 8.360363652976381e-05, + "loss": 0.7541, + "step": 19230 + }, + { + "epoch": 0.21518725429339955, + "grad_norm": 0.28335481882095337, + "learning_rate": 8.33752112933437e-05, + "loss": 0.7628, + "step": 19240 + }, + { + "epoch": 0.2152990979806622, + "grad_norm": 0.28402864933013916, + "learning_rate": 8.314678605692357e-05, + "loss": 0.7835, + "step": 19250 + }, + { + "epoch": 0.2154109416679249, + "grad_norm": 0.2914164662361145, + "learning_rate": 8.291836082050346e-05, + "loss": 0.7705, + "step": 19260 + }, + { + "epoch": 0.2155227853551876, + "grad_norm": 0.27296769618988037, + "learning_rate": 8.268993558408333e-05, + "loss": 0.7791, + "step": 19270 + }, + { + "epoch": 0.21563462904245026, + "grad_norm": 0.2987435460090637, + "learning_rate": 8.246151034766321e-05, + "loss": 0.7918, + "step": 19280 + }, + { + "epoch": 0.21574647272971295, + "grad_norm": 0.2743736207485199, + "learning_rate": 8.22330851112431e-05, + "loss": 0.7777, + "step": 19290 + }, + { + "epoch": 0.21585831641697564, + "grad_norm": 0.2775188982486725, + "learning_rate": 8.200465987482298e-05, + "loss": 0.7811, + "step": 19300 + }, + { + "epoch": 0.2159701601042383, + "grad_norm": 0.2942585349082947, + "learning_rate": 8.177623463840285e-05, + "loss": 0.7748, + "step": 19310 + }, + { + "epoch": 0.216082003791501, + "grad_norm": 0.2545025050640106, + "learning_rate": 8.154780940198274e-05, + "loss": 0.77, + "step": 19320 + }, + { + "epoch": 0.2161938474787637, + "grad_norm": 0.2571526765823364, + "learning_rate": 8.131938416556261e-05, + "loss": 0.7735, + "step": 19330 + }, + { + "epoch": 0.21630569116602635, + "grad_norm": 0.2687735855579376, + "learning_rate": 8.10909589291425e-05, + "loss": 0.7703, + "step": 19340 + }, + { + "epoch": 0.21641753485328905, + "grad_norm": 0.27332374453544617, + "learning_rate": 8.086253369272237e-05, + "loss": 0.7645, + "step": 19350 + }, + { + "epoch": 0.21652937854055174, + "grad_norm": 0.25585636496543884, + "learning_rate": 8.063410845630226e-05, + "loss": 0.7651, + "step": 19360 + }, + { + "epoch": 0.2166412222278144, + "grad_norm": 0.25861334800720215, + "learning_rate": 8.040568321988213e-05, + "loss": 0.7788, + "step": 19370 + }, + { + "epoch": 0.2167530659150771, + "grad_norm": 0.26126453280448914, + "learning_rate": 8.017725798346201e-05, + "loss": 0.7631, + "step": 19380 + }, + { + "epoch": 0.21686490960233978, + "grad_norm": 0.27623289823532104, + "learning_rate": 7.99488327470419e-05, + "loss": 0.7555, + "step": 19390 + }, + { + "epoch": 0.21697675328960245, + "grad_norm": 0.256489634513855, + "learning_rate": 7.972040751062178e-05, + "loss": 0.7565, + "step": 19400 + }, + { + "epoch": 0.21708859697686514, + "grad_norm": 0.26825475692749023, + "learning_rate": 7.949198227420165e-05, + "loss": 0.7619, + "step": 19410 + }, + { + "epoch": 0.2172004406641278, + "grad_norm": 0.2633214294910431, + "learning_rate": 7.926355703778155e-05, + "loss": 0.7576, + "step": 19420 + }, + { + "epoch": 0.2173122843513905, + "grad_norm": 0.24602185189723969, + "learning_rate": 7.903513180136141e-05, + "loss": 0.748, + "step": 19430 + }, + { + "epoch": 0.21742412803865319, + "grad_norm": 0.24769659340381622, + "learning_rate": 7.88067065649413e-05, + "loss": 0.749, + "step": 19440 + }, + { + "epoch": 0.21753597172591585, + "grad_norm": 0.22824670374393463, + "learning_rate": 7.857828132852117e-05, + "loss": 0.7439, + "step": 19450 + }, + { + "epoch": 0.21764781541317854, + "grad_norm": 0.24848710000514984, + "learning_rate": 7.834985609210106e-05, + "loss": 0.7422, + "step": 19460 + }, + { + "epoch": 0.21775965910044123, + "grad_norm": 0.25875037908554077, + "learning_rate": 7.812143085568093e-05, + "loss": 0.7411, + "step": 19470 + }, + { + "epoch": 0.2178715027877039, + "grad_norm": 0.24616488814353943, + "learning_rate": 7.789300561926082e-05, + "loss": 0.723, + "step": 19480 + }, + { + "epoch": 0.2179833464749666, + "grad_norm": 0.26018476486206055, + "learning_rate": 7.76645803828407e-05, + "loss": 0.7388, + "step": 19490 + }, + { + "epoch": 0.21809519016222928, + "grad_norm": 0.24355724453926086, + "learning_rate": 7.743615514642058e-05, + "loss": 0.7337, + "step": 19500 + }, + { + "epoch": 0.21820703384949194, + "grad_norm": 0.24908235669136047, + "learning_rate": 7.720772991000045e-05, + "loss": 0.7378, + "step": 19510 + }, + { + "epoch": 0.21831887753675464, + "grad_norm": 0.2710162401199341, + "learning_rate": 7.697930467358035e-05, + "loss": 0.7336, + "step": 19520 + }, + { + "epoch": 0.21843072122401733, + "grad_norm": 0.24222905933856964, + "learning_rate": 7.675087943716022e-05, + "loss": 0.7386, + "step": 19530 + }, + { + "epoch": 0.21854256491128, + "grad_norm": 0.23762881755828857, + "learning_rate": 7.65224542007401e-05, + "loss": 0.7354, + "step": 19540 + }, + { + "epoch": 0.21865440859854268, + "grad_norm": 0.25905948877334595, + "learning_rate": 7.629402896431998e-05, + "loss": 0.7453, + "step": 19550 + }, + { + "epoch": 0.21876625228580535, + "grad_norm": 0.24563716351985931, + "learning_rate": 7.606560372789986e-05, + "loss": 0.7422, + "step": 19560 + }, + { + "epoch": 0.21887809597306804, + "grad_norm": 0.2649664878845215, + "learning_rate": 7.583717849147973e-05, + "loss": 0.7301, + "step": 19570 + }, + { + "epoch": 0.21898993966033073, + "grad_norm": 0.24720273911952972, + "learning_rate": 7.560875325505962e-05, + "loss": 0.7321, + "step": 19580 + }, + { + "epoch": 0.2191017833475934, + "grad_norm": 0.23652884364128113, + "learning_rate": 7.53803280186395e-05, + "loss": 0.7296, + "step": 19590 + }, + { + "epoch": 0.21921362703485608, + "grad_norm": 0.23715312778949738, + "learning_rate": 7.515190278221938e-05, + "loss": 0.7237, + "step": 19600 + }, + { + "epoch": 0.21932547072211878, + "grad_norm": 0.2500048577785492, + "learning_rate": 7.492347754579925e-05, + "loss": 0.7372, + "step": 19610 + }, + { + "epoch": 0.21943731440938144, + "grad_norm": 0.2575337886810303, + "learning_rate": 7.469505230937915e-05, + "loss": 0.7393, + "step": 19620 + }, + { + "epoch": 0.21954915809664413, + "grad_norm": 0.255375474691391, + "learning_rate": 7.446662707295902e-05, + "loss": 0.75, + "step": 19630 + }, + { + "epoch": 0.21966100178390682, + "grad_norm": 0.2793714106082916, + "learning_rate": 7.42382018365389e-05, + "loss": 0.7585, + "step": 19640 + }, + { + "epoch": 0.2197728454711695, + "grad_norm": 0.2588786482810974, + "learning_rate": 7.400977660011878e-05, + "loss": 0.7661, + "step": 19650 + }, + { + "epoch": 0.21988468915843218, + "grad_norm": 0.27130866050720215, + "learning_rate": 7.378135136369867e-05, + "loss": 0.7579, + "step": 19660 + }, + { + "epoch": 0.21999653284569487, + "grad_norm": 0.2730309069156647, + "learning_rate": 7.355292612727853e-05, + "loss": 0.7463, + "step": 19670 + }, + { + "epoch": 0.22010837653295753, + "grad_norm": 0.24330918490886688, + "learning_rate": 7.332450089085842e-05, + "loss": 0.7388, + "step": 19680 + }, + { + "epoch": 0.22022022022022023, + "grad_norm": 0.30004703998565674, + "learning_rate": 7.309607565443831e-05, + "loss": 0.7633, + "step": 19690 + }, + { + "epoch": 0.2203320639074829, + "grad_norm": 0.2754705548286438, + "learning_rate": 7.286765041801818e-05, + "loss": 0.7587, + "step": 19700 + }, + { + "epoch": 0.22044390759474558, + "grad_norm": 0.27601394057273865, + "learning_rate": 7.263922518159807e-05, + "loss": 0.7468, + "step": 19710 + }, + { + "epoch": 0.22055575128200827, + "grad_norm": 0.2328653633594513, + "learning_rate": 7.241079994517795e-05, + "loss": 0.7432, + "step": 19720 + }, + { + "epoch": 0.22066759496927094, + "grad_norm": 0.23960436880588531, + "learning_rate": 7.218237470875783e-05, + "loss": 0.7384, + "step": 19730 + }, + { + "epoch": 0.22077943865653363, + "grad_norm": 0.2687484323978424, + "learning_rate": 7.19539494723377e-05, + "loss": 0.738, + "step": 19740 + }, + { + "epoch": 0.22089128234379632, + "grad_norm": 0.2243189811706543, + "learning_rate": 7.17255242359176e-05, + "loss": 0.7467, + "step": 19750 + }, + { + "epoch": 0.22100312603105898, + "grad_norm": 0.26094529032707214, + "learning_rate": 7.149709899949747e-05, + "loss": 0.7579, + "step": 19760 + }, + { + "epoch": 0.22111496971832167, + "grad_norm": 0.2761390507221222, + "learning_rate": 7.126867376307735e-05, + "loss": 0.7491, + "step": 19770 + }, + { + "epoch": 0.22122681340558437, + "grad_norm": 0.2523578405380249, + "learning_rate": 7.104024852665723e-05, + "loss": 0.7358, + "step": 19780 + }, + { + "epoch": 0.22133865709284703, + "grad_norm": 0.25612056255340576, + "learning_rate": 7.081182329023711e-05, + "loss": 0.7322, + "step": 19790 + }, + { + "epoch": 0.22145050078010972, + "grad_norm": 0.24379362165927887, + "learning_rate": 7.058339805381698e-05, + "loss": 0.7438, + "step": 19800 + }, + { + "epoch": 0.2215623444673724, + "grad_norm": 0.2315502017736435, + "learning_rate": 7.035497281739687e-05, + "loss": 0.7349, + "step": 19810 + }, + { + "epoch": 0.22167418815463508, + "grad_norm": 0.41941365599632263, + "learning_rate": 7.012654758097675e-05, + "loss": 0.743, + "step": 19820 + }, + { + "epoch": 0.22178603184189777, + "grad_norm": 0.23147599399089813, + "learning_rate": 6.989812234455663e-05, + "loss": 0.7381, + "step": 19830 + }, + { + "epoch": 0.22189787552916043, + "grad_norm": 0.25920864939689636, + "learning_rate": 6.96696971081365e-05, + "loss": 0.7469, + "step": 19840 + }, + { + "epoch": 0.22200971921642312, + "grad_norm": 0.23870904743671417, + "learning_rate": 6.94412718717164e-05, + "loss": 0.7476, + "step": 19850 + }, + { + "epoch": 0.22212156290368582, + "grad_norm": 0.2372673749923706, + "learning_rate": 6.921284663529627e-05, + "loss": 0.7468, + "step": 19860 + }, + { + "epoch": 0.22223340659094848, + "grad_norm": 0.2703365683555603, + "learning_rate": 6.898442139887615e-05, + "loss": 0.742, + "step": 19870 + }, + { + "epoch": 0.22234525027821117, + "grad_norm": 0.24437329173088074, + "learning_rate": 6.875599616245603e-05, + "loss": 0.7217, + "step": 19880 + }, + { + "epoch": 0.22245709396547386, + "grad_norm": 0.21680840849876404, + "learning_rate": 6.852757092603592e-05, + "loss": 0.7547, + "step": 19890 + }, + { + "epoch": 0.22256893765273653, + "grad_norm": 0.29101526737213135, + "learning_rate": 6.829914568961579e-05, + "loss": 0.7389, + "step": 19900 + }, + { + "epoch": 0.22268078133999922, + "grad_norm": 0.2821531891822815, + "learning_rate": 6.807072045319567e-05, + "loss": 0.731, + "step": 19910 + }, + { + "epoch": 0.2227926250272619, + "grad_norm": 0.2773050367832184, + "learning_rate": 6.784229521677555e-05, + "loss": 0.7369, + "step": 19920 + }, + { + "epoch": 0.22290446871452457, + "grad_norm": 0.2531367838382721, + "learning_rate": 6.761386998035543e-05, + "loss": 0.7399, + "step": 19930 + }, + { + "epoch": 0.22301631240178726, + "grad_norm": 0.28158465027809143, + "learning_rate": 6.73854447439353e-05, + "loss": 0.7523, + "step": 19940 + }, + { + "epoch": 0.22312815608904996, + "grad_norm": 0.25612935423851013, + "learning_rate": 6.71570195075152e-05, + "loss": 0.7725, + "step": 19950 + }, + { + "epoch": 0.22323999977631262, + "grad_norm": 0.26996153593063354, + "learning_rate": 6.692859427109507e-05, + "loss": 0.7823, + "step": 19960 + }, + { + "epoch": 0.2233518434635753, + "grad_norm": 0.28008782863616943, + "learning_rate": 6.670016903467495e-05, + "loss": 0.7679, + "step": 19970 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 0.27016493678092957, + "learning_rate": 6.647174379825483e-05, + "loss": 0.7617, + "step": 19980 + }, + { + "epoch": 0.22357553083810067, + "grad_norm": 0.2679850459098816, + "learning_rate": 6.624331856183472e-05, + "loss": 0.7737, + "step": 19990 + }, + { + "epoch": 0.22368737452536336, + "grad_norm": 0.2570480406284332, + "learning_rate": 6.601489332541459e-05, + "loss": 0.758, + "step": 20000 + }, + { + "epoch": 0.22379921821262602, + "grad_norm": 0.2503785490989685, + "learning_rate": 6.578646808899447e-05, + "loss": 0.761, + "step": 20010 + }, + { + "epoch": 0.2239110618998887, + "grad_norm": 0.2648092210292816, + "learning_rate": 6.555804285257435e-05, + "loss": 0.7532, + "step": 20020 + }, + { + "epoch": 0.2240229055871514, + "grad_norm": 0.26829221844673157, + "learning_rate": 6.532961761615423e-05, + "loss": 0.7542, + "step": 20030 + }, + { + "epoch": 0.22413474927441407, + "grad_norm": 0.27535539865493774, + "learning_rate": 6.51011923797341e-05, + "loss": 0.7578, + "step": 20040 + }, + { + "epoch": 0.22424659296167676, + "grad_norm": 0.28674209117889404, + "learning_rate": 6.4872767143314e-05, + "loss": 0.756, + "step": 20050 + }, + { + "epoch": 0.22435843664893945, + "grad_norm": 0.2523026466369629, + "learning_rate": 6.464434190689387e-05, + "loss": 0.7514, + "step": 20060 + }, + { + "epoch": 0.22447028033620212, + "grad_norm": 0.24213305115699768, + "learning_rate": 6.441591667047375e-05, + "loss": 0.7546, + "step": 20070 + }, + { + "epoch": 0.2245821240234648, + "grad_norm": 0.2779023349285126, + "learning_rate": 6.418749143405363e-05, + "loss": 0.7654, + "step": 20080 + }, + { + "epoch": 0.2246939677107275, + "grad_norm": 0.28806111216545105, + "learning_rate": 6.395906619763352e-05, + "loss": 0.7612, + "step": 20090 + }, + { + "epoch": 0.22480581139799016, + "grad_norm": 0.2637580931186676, + "learning_rate": 6.373064096121339e-05, + "loss": 0.7659, + "step": 20100 + }, + { + "epoch": 0.22491765508525285, + "grad_norm": 0.2683275043964386, + "learning_rate": 6.350221572479328e-05, + "loss": 0.753, + "step": 20110 + }, + { + "epoch": 0.22502949877251555, + "grad_norm": 0.2693597078323364, + "learning_rate": 6.327379048837315e-05, + "loss": 0.7697, + "step": 20120 + }, + { + "epoch": 0.2251413424597782, + "grad_norm": 0.26335635781288147, + "learning_rate": 6.304536525195304e-05, + "loss": 0.7644, + "step": 20130 + }, + { + "epoch": 0.2252531861470409, + "grad_norm": 0.29237446188926697, + "learning_rate": 6.28169400155329e-05, + "loss": 0.7721, + "step": 20140 + }, + { + "epoch": 0.22536502983430357, + "grad_norm": 0.3080182373523712, + "learning_rate": 6.25885147791128e-05, + "loss": 0.7666, + "step": 20150 + }, + { + "epoch": 0.22547687352156626, + "grad_norm": 0.2831542193889618, + "learning_rate": 6.236008954269268e-05, + "loss": 0.7805, + "step": 20160 + }, + { + "epoch": 0.22558871720882895, + "grad_norm": 0.2860835790634155, + "learning_rate": 6.213166430627257e-05, + "loss": 0.7816, + "step": 20170 + }, + { + "epoch": 0.2257005608960916, + "grad_norm": 0.28273066878318787, + "learning_rate": 6.190323906985244e-05, + "loss": 0.7812, + "step": 20180 + }, + { + "epoch": 0.2258124045833543, + "grad_norm": 0.29203614592552185, + "learning_rate": 6.167481383343232e-05, + "loss": 0.7699, + "step": 20190 + }, + { + "epoch": 0.225924248270617, + "grad_norm": 0.2811570167541504, + "learning_rate": 6.14463885970122e-05, + "loss": 0.7833, + "step": 20200 + }, + { + "epoch": 0.22603609195787966, + "grad_norm": 0.30047500133514404, + "learning_rate": 6.121796336059208e-05, + "loss": 0.7594, + "step": 20210 + }, + { + "epoch": 0.22614793564514235, + "grad_norm": 0.2838903069496155, + "learning_rate": 6.098953812417196e-05, + "loss": 0.7678, + "step": 20220 + }, + { + "epoch": 0.22625977933240504, + "grad_norm": 0.2840651273727417, + "learning_rate": 6.0761112887751836e-05, + "loss": 0.7546, + "step": 20230 + }, + { + "epoch": 0.2263716230196677, + "grad_norm": 0.31575652956962585, + "learning_rate": 6.053268765133172e-05, + "loss": 0.7533, + "step": 20240 + }, + { + "epoch": 0.2264834667069304, + "grad_norm": 0.2692145109176636, + "learning_rate": 6.03042624149116e-05, + "loss": 0.744, + "step": 20250 + }, + { + "epoch": 0.2265953103941931, + "grad_norm": 0.3094116449356079, + "learning_rate": 6.007583717849148e-05, + "loss": 0.7708, + "step": 20260 + }, + { + "epoch": 0.22670715408145575, + "grad_norm": 0.3123047947883606, + "learning_rate": 5.984741194207136e-05, + "loss": 0.7431, + "step": 20270 + }, + { + "epoch": 0.22681899776871844, + "grad_norm": 0.2733646631240845, + "learning_rate": 5.961898670565124e-05, + "loss": 0.762, + "step": 20280 + }, + { + "epoch": 0.2269308414559811, + "grad_norm": 0.23944342136383057, + "learning_rate": 5.939056146923112e-05, + "loss": 0.7488, + "step": 20290 + }, + { + "epoch": 0.2270426851432438, + "grad_norm": 0.2459600865840912, + "learning_rate": 5.9162136232811e-05, + "loss": 0.7443, + "step": 20300 + }, + { + "epoch": 0.2271545288305065, + "grad_norm": 0.2502724826335907, + "learning_rate": 5.893371099639088e-05, + "loss": 0.7417, + "step": 20310 + }, + { + "epoch": 0.22726637251776916, + "grad_norm": 0.23721522092819214, + "learning_rate": 5.870528575997076e-05, + "loss": 0.7393, + "step": 20320 + }, + { + "epoch": 0.22737821620503185, + "grad_norm": 0.2526785135269165, + "learning_rate": 5.847686052355064e-05, + "loss": 0.7346, + "step": 20330 + }, + { + "epoch": 0.22749005989229454, + "grad_norm": 0.2573647201061249, + "learning_rate": 5.824843528713052e-05, + "loss": 0.7192, + "step": 20340 + }, + { + "epoch": 0.2276019035795572, + "grad_norm": 0.2632768750190735, + "learning_rate": 5.80200100507104e-05, + "loss": 0.7234, + "step": 20350 + }, + { + "epoch": 0.2277137472668199, + "grad_norm": 0.2589345872402191, + "learning_rate": 5.779158481429028e-05, + "loss": 0.7165, + "step": 20360 + }, + { + "epoch": 0.22782559095408259, + "grad_norm": 0.2480648308992386, + "learning_rate": 5.756315957787016e-05, + "loss": 0.7099, + "step": 20370 + }, + { + "epoch": 0.22793743464134525, + "grad_norm": 0.24949654936790466, + "learning_rate": 5.733473434145004e-05, + "loss": 0.7187, + "step": 20380 + }, + { + "epoch": 0.22804927832860794, + "grad_norm": 0.25637611746788025, + "learning_rate": 5.710630910502993e-05, + "loss": 0.7098, + "step": 20390 + }, + { + "epoch": 0.22816112201587063, + "grad_norm": 0.28809231519699097, + "learning_rate": 5.687788386860981e-05, + "loss": 0.7315, + "step": 20400 + }, + { + "epoch": 0.2282729657031333, + "grad_norm": 0.25564566254615784, + "learning_rate": 5.6649458632189686e-05, + "loss": 0.7319, + "step": 20410 + }, + { + "epoch": 0.228384809390396, + "grad_norm": 0.2693794369697571, + "learning_rate": 5.642103339576957e-05, + "loss": 0.7173, + "step": 20420 + }, + { + "epoch": 0.22849665307765865, + "grad_norm": 0.24680989980697632, + "learning_rate": 5.619260815934945e-05, + "loss": 0.708, + "step": 20430 + }, + { + "epoch": 0.22860849676492134, + "grad_norm": 0.2790026068687439, + "learning_rate": 5.596418292292933e-05, + "loss": 0.7023, + "step": 20440 + }, + { + "epoch": 0.22872034045218403, + "grad_norm": 0.2656199038028717, + "learning_rate": 5.573575768650921e-05, + "loss": 0.7113, + "step": 20450 + }, + { + "epoch": 0.2288321841394467, + "grad_norm": 0.30832743644714355, + "learning_rate": 5.550733245008909e-05, + "loss": 0.7161, + "step": 20460 + }, + { + "epoch": 0.2289440278267094, + "grad_norm": 0.27060794830322266, + "learning_rate": 5.527890721366897e-05, + "loss": 0.7208, + "step": 20470 + }, + { + "epoch": 0.22905587151397208, + "grad_norm": 0.26036307215690613, + "learning_rate": 5.505048197724885e-05, + "loss": 0.7004, + "step": 20480 + }, + { + "epoch": 0.22916771520123475, + "grad_norm": 0.2758086919784546, + "learning_rate": 5.482205674082873e-05, + "loss": 0.7179, + "step": 20490 + }, + { + "epoch": 0.22927955888849744, + "grad_norm": 0.2821243107318878, + "learning_rate": 5.459363150440861e-05, + "loss": 0.7255, + "step": 20500 + }, + { + "epoch": 0.22939140257576013, + "grad_norm": 0.2782810628414154, + "learning_rate": 5.436520626798849e-05, + "loss": 0.7149, + "step": 20510 + }, + { + "epoch": 0.2295032462630228, + "grad_norm": 0.2755940854549408, + "learning_rate": 5.413678103156837e-05, + "loss": 0.7117, + "step": 20520 + }, + { + "epoch": 0.22961508995028548, + "grad_norm": 0.29176777601242065, + "learning_rate": 5.390835579514825e-05, + "loss": 0.7188, + "step": 20530 + }, + { + "epoch": 0.22972693363754818, + "grad_norm": 0.27739444375038147, + "learning_rate": 5.367993055872813e-05, + "loss": 0.7196, + "step": 20540 + }, + { + "epoch": 0.22983877732481084, + "grad_norm": 0.27187204360961914, + "learning_rate": 5.345150532230801e-05, + "loss": 0.722, + "step": 20550 + }, + { + "epoch": 0.22995062101207353, + "grad_norm": 0.2951996624469757, + "learning_rate": 5.322308008588789e-05, + "loss": 0.7325, + "step": 20560 + }, + { + "epoch": 0.2300624646993362, + "grad_norm": 0.2677932381629944, + "learning_rate": 5.299465484946777e-05, + "loss": 0.7263, + "step": 20570 + }, + { + "epoch": 0.23017430838659889, + "grad_norm": 0.29231807589530945, + "learning_rate": 5.2766229613047654e-05, + "loss": 0.7284, + "step": 20580 + }, + { + "epoch": 0.23028615207386158, + "grad_norm": 0.30211326479911804, + "learning_rate": 5.253780437662753e-05, + "loss": 0.7222, + "step": 20590 + }, + { + "epoch": 0.23039799576112424, + "grad_norm": 0.29821720719337463, + "learning_rate": 5.230937914020741e-05, + "loss": 0.7316, + "step": 20600 + }, + { + "epoch": 0.23050983944838693, + "grad_norm": 0.3019379675388336, + "learning_rate": 5.208095390378729e-05, + "loss": 0.7328, + "step": 20610 + }, + { + "epoch": 0.23062168313564962, + "grad_norm": 0.2569403052330017, + "learning_rate": 5.185252866736717e-05, + "loss": 0.7215, + "step": 20620 + }, + { + "epoch": 0.2307335268229123, + "grad_norm": 0.3151782155036926, + "learning_rate": 5.1624103430947054e-05, + "loss": 0.7326, + "step": 20630 + }, + { + "epoch": 0.23084537051017498, + "grad_norm": 0.2748591899871826, + "learning_rate": 5.139567819452693e-05, + "loss": 0.7359, + "step": 20640 + }, + { + "epoch": 0.23095721419743767, + "grad_norm": 0.27494433522224426, + "learning_rate": 5.116725295810681e-05, + "loss": 0.7351, + "step": 20650 + }, + { + "epoch": 0.23106905788470034, + "grad_norm": 0.29428452253341675, + "learning_rate": 5.093882772168669e-05, + "loss": 0.7361, + "step": 20660 + }, + { + "epoch": 0.23118090157196303, + "grad_norm": 0.2924981117248535, + "learning_rate": 5.071040248526657e-05, + "loss": 0.7539, + "step": 20670 + }, + { + "epoch": 0.23129274525922572, + "grad_norm": 0.28647035360336304, + "learning_rate": 5.0481977248846455e-05, + "loss": 0.7576, + "step": 20680 + }, + { + "epoch": 0.23140458894648838, + "grad_norm": 0.3107542097568512, + "learning_rate": 5.025355201242633e-05, + "loss": 0.7615, + "step": 20690 + }, + { + "epoch": 0.23151643263375107, + "grad_norm": 0.27186501026153564, + "learning_rate": 5.0025126776006213e-05, + "loss": 0.7641, + "step": 20700 + }, + { + "epoch": 0.23162827632101374, + "grad_norm": 0.2838156819343567, + "learning_rate": 4.9796701539586096e-05, + "loss": 0.7695, + "step": 20710 + }, + { + "epoch": 0.23174012000827643, + "grad_norm": 0.3377101421356201, + "learning_rate": 4.956827630316597e-05, + "loss": 0.7696, + "step": 20720 + }, + { + "epoch": 0.23185196369553912, + "grad_norm": 0.3177778422832489, + "learning_rate": 4.9339851066745855e-05, + "loss": 0.7677, + "step": 20730 + }, + { + "epoch": 0.23196380738280178, + "grad_norm": 0.3157583773136139, + "learning_rate": 4.911142583032573e-05, + "loss": 0.7653, + "step": 20740 + }, + { + "epoch": 0.23207565107006448, + "grad_norm": 0.3123907148838043, + "learning_rate": 4.8883000593905614e-05, + "loss": 0.7677, + "step": 20750 + }, + { + "epoch": 0.23218749475732717, + "grad_norm": 0.30460426211357117, + "learning_rate": 4.86545753574855e-05, + "loss": 0.7743, + "step": 20760 + }, + { + "epoch": 0.23229933844458983, + "grad_norm": 0.27507251501083374, + "learning_rate": 4.842615012106537e-05, + "loss": 0.767, + "step": 20770 + }, + { + "epoch": 0.23241118213185252, + "grad_norm": 0.3233499228954315, + "learning_rate": 4.8197724884645256e-05, + "loss": 0.7717, + "step": 20780 + }, + { + "epoch": 0.23252302581911521, + "grad_norm": 0.30144819617271423, + "learning_rate": 4.796929964822513e-05, + "loss": 0.7609, + "step": 20790 + }, + { + "epoch": 0.23263486950637788, + "grad_norm": 0.29588454961776733, + "learning_rate": 4.7740874411805014e-05, + "loss": 0.7682, + "step": 20800 + }, + { + "epoch": 0.23274671319364057, + "grad_norm": 0.3111203610897064, + "learning_rate": 4.75124491753849e-05, + "loss": 0.7652, + "step": 20810 + }, + { + "epoch": 0.23285855688090326, + "grad_norm": 0.28917646408081055, + "learning_rate": 4.728402393896477e-05, + "loss": 0.7584, + "step": 20820 + }, + { + "epoch": 0.23297040056816593, + "grad_norm": 0.3156343698501587, + "learning_rate": 4.7055598702544656e-05, + "loss": 0.7643, + "step": 20830 + }, + { + "epoch": 0.23308224425542862, + "grad_norm": 0.2909680902957916, + "learning_rate": 4.682717346612454e-05, + "loss": 0.7613, + "step": 20840 + }, + { + "epoch": 0.2331940879426913, + "grad_norm": 0.3006870746612549, + "learning_rate": 4.659874822970442e-05, + "loss": 0.7603, + "step": 20850 + }, + { + "epoch": 0.23330593162995397, + "grad_norm": 0.2844945192337036, + "learning_rate": 4.6370322993284304e-05, + "loss": 0.7589, + "step": 20860 + }, + { + "epoch": 0.23341777531721666, + "grad_norm": 0.26857924461364746, + "learning_rate": 4.614189775686418e-05, + "loss": 0.7401, + "step": 20870 + }, + { + "epoch": 0.23352961900447933, + "grad_norm": 0.31332314014434814, + "learning_rate": 4.591347252044406e-05, + "loss": 0.7468, + "step": 20880 + }, + { + "epoch": 0.23364146269174202, + "grad_norm": 0.28083765506744385, + "learning_rate": 4.568504728402394e-05, + "loss": 0.7451, + "step": 20890 + }, + { + "epoch": 0.2337533063790047, + "grad_norm": 0.29185009002685547, + "learning_rate": 4.545662204760382e-05, + "loss": 0.7478, + "step": 20900 + }, + { + "epoch": 0.23386515006626737, + "grad_norm": 0.30532801151275635, + "learning_rate": 4.5228196811183705e-05, + "loss": 0.7404, + "step": 20910 + }, + { + "epoch": 0.23397699375353007, + "grad_norm": 0.2724134922027588, + "learning_rate": 4.499977157476358e-05, + "loss": 0.732, + "step": 20920 + }, + { + "epoch": 0.23408883744079276, + "grad_norm": 0.29753822088241577, + "learning_rate": 4.4771346338343464e-05, + "loss": 0.7236, + "step": 20930 + }, + { + "epoch": 0.23420068112805542, + "grad_norm": 0.31980055570602417, + "learning_rate": 4.454292110192334e-05, + "loss": 0.7407, + "step": 20940 + }, + { + "epoch": 0.2343125248153181, + "grad_norm": 0.29578351974487305, + "learning_rate": 4.431449586550322e-05, + "loss": 0.7166, + "step": 20950 + }, + { + "epoch": 0.2344243685025808, + "grad_norm": 0.25261184573173523, + "learning_rate": 4.4086070629083105e-05, + "loss": 0.7195, + "step": 20960 + }, + { + "epoch": 0.23453621218984347, + "grad_norm": 0.2669534385204315, + "learning_rate": 4.385764539266298e-05, + "loss": 0.7224, + "step": 20970 + }, + { + "epoch": 0.23464805587710616, + "grad_norm": 0.2817215919494629, + "learning_rate": 4.3629220156242864e-05, + "loss": 0.7405, + "step": 20980 + }, + { + "epoch": 0.23475989956436885, + "grad_norm": 0.27033400535583496, + "learning_rate": 4.340079491982275e-05, + "loss": 0.7292, + "step": 20990 + }, + { + "epoch": 0.23487174325163152, + "grad_norm": 0.3083013594150543, + "learning_rate": 4.317236968340262e-05, + "loss": 0.7271, + "step": 21000 + }, + { + "epoch": 0.2349835869388942, + "grad_norm": 0.27074989676475525, + "learning_rate": 4.2943944446982506e-05, + "loss": 0.7346, + "step": 21010 + }, + { + "epoch": 0.23509543062615687, + "grad_norm": 0.31609755754470825, + "learning_rate": 4.271551921056238e-05, + "loss": 0.7285, + "step": 21020 + }, + { + "epoch": 0.23520727431341956, + "grad_norm": 0.27084672451019287, + "learning_rate": 4.2487093974142265e-05, + "loss": 0.7411, + "step": 21030 + }, + { + "epoch": 0.23531911800068225, + "grad_norm": 0.26669842004776, + "learning_rate": 4.225866873772215e-05, + "loss": 0.7423, + "step": 21040 + }, + { + "epoch": 0.23543096168794492, + "grad_norm": 0.2873358428478241, + "learning_rate": 4.2030243501302024e-05, + "loss": 0.7345, + "step": 21050 + }, + { + "epoch": 0.2355428053752076, + "grad_norm": 0.2831687033176422, + "learning_rate": 4.1801818264881906e-05, + "loss": 0.7537, + "step": 21060 + }, + { + "epoch": 0.2356546490624703, + "grad_norm": 0.2781788110733032, + "learning_rate": 4.157339302846178e-05, + "loss": 0.7494, + "step": 21070 + }, + { + "epoch": 0.23576649274973296, + "grad_norm": 0.27109071612358093, + "learning_rate": 4.1344967792041665e-05, + "loss": 0.7493, + "step": 21080 + }, + { + "epoch": 0.23587833643699566, + "grad_norm": 0.25398164987564087, + "learning_rate": 4.111654255562155e-05, + "loss": 0.7369, + "step": 21090 + }, + { + "epoch": 0.23599018012425835, + "grad_norm": 0.3150353729724884, + "learning_rate": 4.0888117319201424e-05, + "loss": 0.754, + "step": 21100 + }, + { + "epoch": 0.236102023811521, + "grad_norm": 0.27384257316589355, + "learning_rate": 4.065969208278131e-05, + "loss": 0.7439, + "step": 21110 + }, + { + "epoch": 0.2362138674987837, + "grad_norm": 0.2770559787750244, + "learning_rate": 4.043126684636118e-05, + "loss": 0.7391, + "step": 21120 + }, + { + "epoch": 0.2363257111860464, + "grad_norm": 0.29367002844810486, + "learning_rate": 4.0202841609941066e-05, + "loss": 0.746, + "step": 21130 + }, + { + "epoch": 0.23643755487330906, + "grad_norm": 0.2554051876068115, + "learning_rate": 3.997441637352095e-05, + "loss": 0.7386, + "step": 21140 + }, + { + "epoch": 0.23654939856057175, + "grad_norm": 0.2943428158760071, + "learning_rate": 3.9745991137100825e-05, + "loss": 0.7437, + "step": 21150 + }, + { + "epoch": 0.2366612422478344, + "grad_norm": 0.24465301632881165, + "learning_rate": 3.951756590068071e-05, + "loss": 0.7331, + "step": 21160 + }, + { + "epoch": 0.2367730859350971, + "grad_norm": 0.2545934021472931, + "learning_rate": 3.9289140664260584e-05, + "loss": 0.7361, + "step": 21170 + }, + { + "epoch": 0.2368849296223598, + "grad_norm": 0.2792121469974518, + "learning_rate": 3.9060715427840466e-05, + "loss": 0.7238, + "step": 21180 + }, + { + "epoch": 0.23699677330962246, + "grad_norm": 0.27943745255470276, + "learning_rate": 3.883229019142035e-05, + "loss": 0.726, + "step": 21190 + }, + { + "epoch": 0.23710861699688515, + "grad_norm": 0.2514471411705017, + "learning_rate": 3.8603864955000225e-05, + "loss": 0.7214, + "step": 21200 + }, + { + "epoch": 0.23722046068414784, + "grad_norm": 0.2698551416397095, + "learning_rate": 3.837543971858011e-05, + "loss": 0.7318, + "step": 21210 + }, + { + "epoch": 0.2373323043714105, + "grad_norm": 0.29603877663612366, + "learning_rate": 3.814701448215999e-05, + "loss": 0.742, + "step": 21220 + }, + { + "epoch": 0.2374441480586732, + "grad_norm": 0.26655495166778564, + "learning_rate": 3.791858924573987e-05, + "loss": 0.7331, + "step": 21230 + }, + { + "epoch": 0.2375559917459359, + "grad_norm": 0.29367104172706604, + "learning_rate": 3.769016400931975e-05, + "loss": 0.7233, + "step": 21240 + }, + { + "epoch": 0.23766783543319855, + "grad_norm": 0.2680334746837616, + "learning_rate": 3.7461738772899626e-05, + "loss": 0.732, + "step": 21250 + }, + { + "epoch": 0.23777967912046125, + "grad_norm": 0.2748298943042755, + "learning_rate": 3.723331353647951e-05, + "loss": 0.7453, + "step": 21260 + }, + { + "epoch": 0.23789152280772394, + "grad_norm": 0.28276947140693665, + "learning_rate": 3.700488830005939e-05, + "loss": 0.7524, + "step": 21270 + }, + { + "epoch": 0.2380033664949866, + "grad_norm": 0.2645372450351715, + "learning_rate": 3.677646306363927e-05, + "loss": 0.7542, + "step": 21280 + }, + { + "epoch": 0.2381152101822493, + "grad_norm": 0.2866505980491638, + "learning_rate": 3.654803782721916e-05, + "loss": 0.7447, + "step": 21290 + }, + { + "epoch": 0.23822705386951196, + "grad_norm": 0.29611489176750183, + "learning_rate": 3.631961259079903e-05, + "loss": 0.7662, + "step": 21300 + }, + { + "epoch": 0.23833889755677465, + "grad_norm": 0.29184749722480774, + "learning_rate": 3.6091187354378916e-05, + "loss": 0.7558, + "step": 21310 + }, + { + "epoch": 0.23845074124403734, + "grad_norm": 0.27304571866989136, + "learning_rate": 3.58627621179588e-05, + "loss": 0.7578, + "step": 21320 + }, + { + "epoch": 0.2385625849313, + "grad_norm": 0.2700962424278259, + "learning_rate": 3.5634336881538675e-05, + "loss": 0.7411, + "step": 21330 + }, + { + "epoch": 0.2386744286185627, + "grad_norm": 0.2845793664455414, + "learning_rate": 3.540591164511856e-05, + "loss": 0.7392, + "step": 21340 + }, + { + "epoch": 0.2387862723058254, + "grad_norm": 0.32136180996894836, + "learning_rate": 3.5177486408698433e-05, + "loss": 0.7431, + "step": 21350 + }, + { + "epoch": 0.23889811599308805, + "grad_norm": 0.26846998929977417, + "learning_rate": 3.4949061172278316e-05, + "loss": 0.737, + "step": 21360 + }, + { + "epoch": 0.23900995968035074, + "grad_norm": 0.26363828778266907, + "learning_rate": 3.47206359358582e-05, + "loss": 0.7416, + "step": 21370 + }, + { + "epoch": 0.23912180336761343, + "grad_norm": 0.2900106906890869, + "learning_rate": 3.4492210699438075e-05, + "loss": 0.7373, + "step": 21380 + }, + { + "epoch": 0.2392336470548761, + "grad_norm": 0.2762589156627655, + "learning_rate": 3.426378546301796e-05, + "loss": 0.7379, + "step": 21390 + }, + { + "epoch": 0.2393454907421388, + "grad_norm": 0.2697104513645172, + "learning_rate": 3.4035360226597834e-05, + "loss": 0.7448, + "step": 21400 + }, + { + "epoch": 0.23945733442940148, + "grad_norm": 0.2901761829853058, + "learning_rate": 3.380693499017772e-05, + "loss": 0.7394, + "step": 21410 + }, + { + "epoch": 0.23956917811666414, + "grad_norm": 0.245674267411232, + "learning_rate": 3.35785097537576e-05, + "loss": 0.7387, + "step": 21420 + }, + { + "epoch": 0.23968102180392684, + "grad_norm": 0.2713403105735779, + "learning_rate": 3.3350084517337476e-05, + "loss": 0.7604, + "step": 21430 + }, + { + "epoch": 0.2397928654911895, + "grad_norm": 0.27368244528770447, + "learning_rate": 3.312165928091736e-05, + "loss": 0.7489, + "step": 21440 + }, + { + "epoch": 0.2399047091784522, + "grad_norm": 0.3079991340637207, + "learning_rate": 3.2893234044497234e-05, + "loss": 0.7653, + "step": 21450 + }, + { + "epoch": 0.24001655286571488, + "grad_norm": 0.2920658588409424, + "learning_rate": 3.266480880807712e-05, + "loss": 0.7588, + "step": 21460 + }, + { + "epoch": 0.24012839655297755, + "grad_norm": 0.27589842677116394, + "learning_rate": 3.2436383571657e-05, + "loss": 0.7607, + "step": 21470 + }, + { + "epoch": 0.24024024024024024, + "grad_norm": 0.2592112720012665, + "learning_rate": 3.2207958335236876e-05, + "loss": 0.745, + "step": 21480 + }, + { + "epoch": 0.24035208392750293, + "grad_norm": 0.27625855803489685, + "learning_rate": 3.197953309881676e-05, + "loss": 0.7488, + "step": 21490 + }, + { + "epoch": 0.2404639276147656, + "grad_norm": 0.2769569456577301, + "learning_rate": 3.175110786239664e-05, + "loss": 0.7326, + "step": 21500 + }, + { + "epoch": 0.24057577130202829, + "grad_norm": 0.2705914080142975, + "learning_rate": 3.152268262597652e-05, + "loss": 0.7512, + "step": 21510 + }, + { + "epoch": 0.24068761498929098, + "grad_norm": 0.2655676603317261, + "learning_rate": 3.12942573895564e-05, + "loss": 0.7366, + "step": 21520 + }, + { + "epoch": 0.24079945867655364, + "grad_norm": 0.2606657147407532, + "learning_rate": 3.106583215313628e-05, + "loss": 0.7436, + "step": 21530 + }, + { + "epoch": 0.24091130236381633, + "grad_norm": 0.27843552827835083, + "learning_rate": 3.083740691671616e-05, + "loss": 0.7342, + "step": 21540 + }, + { + "epoch": 0.24102314605107902, + "grad_norm": 0.27866050601005554, + "learning_rate": 3.060898168029604e-05, + "loss": 0.7305, + "step": 21550 + }, + { + "epoch": 0.2411349897383417, + "grad_norm": 0.2803070545196533, + "learning_rate": 3.0380556443875918e-05, + "loss": 0.727, + "step": 21560 + }, + { + "epoch": 0.24124683342560438, + "grad_norm": 0.27220121026039124, + "learning_rate": 3.01521312074558e-05, + "loss": 0.7195, + "step": 21570 + }, + { + "epoch": 0.24135867711286707, + "grad_norm": 0.26060426235198975, + "learning_rate": 2.992370597103568e-05, + "loss": 0.7013, + "step": 21580 + }, + { + "epoch": 0.24147052080012973, + "grad_norm": 0.24253526329994202, + "learning_rate": 2.969528073461556e-05, + "loss": 0.6925, + "step": 21590 + }, + { + "epoch": 0.24158236448739243, + "grad_norm": 0.26293566823005676, + "learning_rate": 2.946685549819544e-05, + "loss": 0.7028, + "step": 21600 + }, + { + "epoch": 0.2416942081746551, + "grad_norm": 0.26427412033081055, + "learning_rate": 2.923843026177532e-05, + "loss": 0.6993, + "step": 21610 + }, + { + "epoch": 0.24180605186191778, + "grad_norm": 0.26823869347572327, + "learning_rate": 2.90100050253552e-05, + "loss": 0.6999, + "step": 21620 + }, + { + "epoch": 0.24191789554918047, + "grad_norm": 0.24203690886497498, + "learning_rate": 2.878157978893508e-05, + "loss": 0.6906, + "step": 21630 + }, + { + "epoch": 0.24202973923644314, + "grad_norm": 0.2612786889076233, + "learning_rate": 2.8553154552514964e-05, + "loss": 0.6952, + "step": 21640 + }, + { + "epoch": 0.24214158292370583, + "grad_norm": 0.27152737975120544, + "learning_rate": 2.8324729316094843e-05, + "loss": 0.692, + "step": 21650 + }, + { + "epoch": 0.24225342661096852, + "grad_norm": 0.2592925727367401, + "learning_rate": 2.8096304079674726e-05, + "loss": 0.6995, + "step": 21660 + }, + { + "epoch": 0.24236527029823118, + "grad_norm": 0.2419063299894333, + "learning_rate": 2.7867878843254605e-05, + "loss": 0.7067, + "step": 21670 + }, + { + "epoch": 0.24247711398549388, + "grad_norm": 0.24731135368347168, + "learning_rate": 2.7639453606834485e-05, + "loss": 0.734, + "step": 21680 + }, + { + "epoch": 0.24258895767275657, + "grad_norm": 0.25746017694473267, + "learning_rate": 2.7411028370414364e-05, + "loss": 0.7075, + "step": 21690 + }, + { + "epoch": 0.24270080136001923, + "grad_norm": 0.2521972060203552, + "learning_rate": 2.7182603133994244e-05, + "loss": 0.7137, + "step": 21700 + }, + { + "epoch": 0.24281264504728192, + "grad_norm": 0.26796218752861023, + "learning_rate": 2.6954177897574127e-05, + "loss": 0.7227, + "step": 21710 + }, + { + "epoch": 0.2429244887345446, + "grad_norm": 0.30404597520828247, + "learning_rate": 2.6725752661154006e-05, + "loss": 0.7243, + "step": 21720 + }, + { + "epoch": 0.24303633242180728, + "grad_norm": 0.29561156034469604, + "learning_rate": 2.6497327424733885e-05, + "loss": 0.7357, + "step": 21730 + }, + { + "epoch": 0.24314817610906997, + "grad_norm": 0.28066596388816833, + "learning_rate": 2.6268902188313765e-05, + "loss": 0.7224, + "step": 21740 + }, + { + "epoch": 0.24326001979633263, + "grad_norm": 0.29235216975212097, + "learning_rate": 2.6040476951893644e-05, + "loss": 0.7288, + "step": 21750 + }, + { + "epoch": 0.24337186348359532, + "grad_norm": 0.26750460267066956, + "learning_rate": 2.5812051715473527e-05, + "loss": 0.7414, + "step": 21760 + }, + { + "epoch": 0.24348370717085802, + "grad_norm": 0.2707473039627075, + "learning_rate": 2.5583626479053406e-05, + "loss": 0.7478, + "step": 21770 + }, + { + "epoch": 0.24359555085812068, + "grad_norm": 0.26526397466659546, + "learning_rate": 2.5355201242633286e-05, + "loss": 0.7513, + "step": 21780 + }, + { + "epoch": 0.24370739454538337, + "grad_norm": 0.2362915724515915, + "learning_rate": 2.5126776006213165e-05, + "loss": 0.7507, + "step": 21790 + }, + { + "epoch": 0.24381923823264606, + "grad_norm": 0.2512950599193573, + "learning_rate": 2.4898350769793048e-05, + "loss": 0.7417, + "step": 21800 + }, + { + "epoch": 0.24393108191990873, + "grad_norm": 0.2366458922624588, + "learning_rate": 2.4669925533372928e-05, + "loss": 0.7402, + "step": 21810 + }, + { + "epoch": 0.24404292560717142, + "grad_norm": 0.24888353049755096, + "learning_rate": 2.4441500296952807e-05, + "loss": 0.7456, + "step": 21820 + }, + { + "epoch": 0.2441547692944341, + "grad_norm": 0.24143491685390472, + "learning_rate": 2.4213075060532686e-05, + "loss": 0.7405, + "step": 21830 + }, + { + "epoch": 0.24426661298169677, + "grad_norm": 0.2669823169708252, + "learning_rate": 2.3984649824112566e-05, + "loss": 0.7544, + "step": 21840 + }, + { + "epoch": 0.24437845666895947, + "grad_norm": 0.24328452348709106, + "learning_rate": 2.375622458769245e-05, + "loss": 0.7347, + "step": 21850 + }, + { + "epoch": 0.24449030035622216, + "grad_norm": 0.26204219460487366, + "learning_rate": 2.3527799351272328e-05, + "loss": 0.7397, + "step": 21860 + }, + { + "epoch": 0.24460214404348482, + "grad_norm": 0.2631550431251526, + "learning_rate": 2.329937411485221e-05, + "loss": 0.7413, + "step": 21870 + }, + { + "epoch": 0.2447139877307475, + "grad_norm": 0.2729988694190979, + "learning_rate": 2.307094887843209e-05, + "loss": 0.7336, + "step": 21880 + }, + { + "epoch": 0.24482583141801018, + "grad_norm": 0.2702917754650116, + "learning_rate": 2.284252364201197e-05, + "loss": 0.7294, + "step": 21890 + }, + { + "epoch": 0.24493767510527287, + "grad_norm": 0.22882196307182312, + "learning_rate": 2.2614098405591852e-05, + "loss": 0.7164, + "step": 21900 + }, + { + "epoch": 0.24504951879253556, + "grad_norm": 0.2660382390022278, + "learning_rate": 2.2385673169171732e-05, + "loss": 0.7231, + "step": 21910 + }, + { + "epoch": 0.24516136247979822, + "grad_norm": 0.2580036222934723, + "learning_rate": 2.215724793275161e-05, + "loss": 0.7243, + "step": 21920 + }, + { + "epoch": 0.24527320616706091, + "grad_norm": 0.25490158796310425, + "learning_rate": 2.192882269633149e-05, + "loss": 0.7129, + "step": 21930 + }, + { + "epoch": 0.2453850498543236, + "grad_norm": 0.2626509368419647, + "learning_rate": 2.1700397459911374e-05, + "loss": 0.7177, + "step": 21940 + }, + { + "epoch": 0.24549689354158627, + "grad_norm": 0.2642146646976471, + "learning_rate": 2.1471972223491253e-05, + "loss": 0.7119, + "step": 21950 + }, + { + "epoch": 0.24560873722884896, + "grad_norm": 0.2683079242706299, + "learning_rate": 2.1243546987071132e-05, + "loss": 0.7226, + "step": 21960 + }, + { + "epoch": 0.24572058091611165, + "grad_norm": 0.26513761281967163, + "learning_rate": 2.1015121750651012e-05, + "loss": 0.7276, + "step": 21970 + }, + { + "epoch": 0.24583242460337432, + "grad_norm": 0.25856319069862366, + "learning_rate": 2.078669651423089e-05, + "loss": 0.7168, + "step": 21980 + }, + { + "epoch": 0.245944268290637, + "grad_norm": 0.29048866033554077, + "learning_rate": 2.0558271277810774e-05, + "loss": 0.7189, + "step": 21990 + }, + { + "epoch": 0.2460561119778997, + "grad_norm": 0.2775687575340271, + "learning_rate": 2.0329846041390653e-05, + "loss": 0.7276, + "step": 22000 + }, + { + "epoch": 0.24616795566516236, + "grad_norm": 0.30157843232154846, + "learning_rate": 2.0101420804970533e-05, + "loss": 0.7435, + "step": 22010 + }, + { + "epoch": 0.24627979935242506, + "grad_norm": 0.2602044939994812, + "learning_rate": 1.9872995568550412e-05, + "loss": 0.7365, + "step": 22020 + }, + { + "epoch": 0.24639164303968772, + "grad_norm": 0.29975757002830505, + "learning_rate": 1.9644570332130292e-05, + "loss": 0.7484, + "step": 22030 + }, + { + "epoch": 0.2465034867269504, + "grad_norm": 0.26586923003196716, + "learning_rate": 1.9416145095710175e-05, + "loss": 0.7499, + "step": 22040 + }, + { + "epoch": 0.2466153304142131, + "grad_norm": 0.25447341799736023, + "learning_rate": 1.9187719859290054e-05, + "loss": 0.7523, + "step": 22050 + }, + { + "epoch": 0.24672717410147577, + "grad_norm": 0.2876524031162262, + "learning_rate": 1.8959294622869933e-05, + "loss": 0.7532, + "step": 22060 + }, + { + "epoch": 0.24683901778873846, + "grad_norm": 0.29897189140319824, + "learning_rate": 1.8730869386449813e-05, + "loss": 0.7339, + "step": 22070 + }, + { + "epoch": 0.24695086147600115, + "grad_norm": 0.24629873037338257, + "learning_rate": 1.8502444150029696e-05, + "loss": 0.7253, + "step": 22080 + }, + { + "epoch": 0.2470627051632638, + "grad_norm": 0.2844459116458893, + "learning_rate": 1.827401891360958e-05, + "loss": 0.7247, + "step": 22090 + }, + { + "epoch": 0.2471745488505265, + "grad_norm": 0.2798469662666321, + "learning_rate": 1.8045593677189458e-05, + "loss": 0.7334, + "step": 22100 + }, + { + "epoch": 0.2472863925377892, + "grad_norm": 0.26282501220703125, + "learning_rate": 1.7817168440769337e-05, + "loss": 0.735, + "step": 22110 + }, + { + "epoch": 0.24739823622505186, + "grad_norm": 0.25192755460739136, + "learning_rate": 1.7588743204349217e-05, + "loss": 0.733, + "step": 22120 + }, + { + "epoch": 0.24751007991231455, + "grad_norm": 0.2808292508125305, + "learning_rate": 1.73603179679291e-05, + "loss": 0.7403, + "step": 22130 + }, + { + "epoch": 0.24762192359957724, + "grad_norm": 0.28252866864204407, + "learning_rate": 1.713189273150898e-05, + "loss": 0.7296, + "step": 22140 + }, + { + "epoch": 0.2477337672868399, + "grad_norm": 0.2730456590652466, + "learning_rate": 1.690346749508886e-05, + "loss": 0.7321, + "step": 22150 + }, + { + "epoch": 0.2478456109741026, + "grad_norm": 0.2562378942966461, + "learning_rate": 1.6675042258668738e-05, + "loss": 0.7195, + "step": 22160 + }, + { + "epoch": 0.2479574546613653, + "grad_norm": 0.2450082004070282, + "learning_rate": 1.6446617022248617e-05, + "loss": 0.7277, + "step": 22170 + }, + { + "epoch": 0.24806929834862795, + "grad_norm": 0.25871893763542175, + "learning_rate": 1.62181917858285e-05, + "loss": 0.7143, + "step": 22180 + }, + { + "epoch": 0.24818114203589065, + "grad_norm": 0.2587449848651886, + "learning_rate": 1.598976654940838e-05, + "loss": 0.708, + "step": 22190 + }, + { + "epoch": 0.2482929857231533, + "grad_norm": 0.25496092438697815, + "learning_rate": 1.576134131298826e-05, + "loss": 0.7123, + "step": 22200 + }, + { + "epoch": 0.248404829410416, + "grad_norm": 0.2394058257341385, + "learning_rate": 1.553291607656814e-05, + "loss": 0.714, + "step": 22210 + }, + { + "epoch": 0.2485166730976787, + "grad_norm": 0.2560165524482727, + "learning_rate": 1.530449084014802e-05, + "loss": 0.7162, + "step": 22220 + }, + { + "epoch": 0.24862851678494136, + "grad_norm": 0.24602052569389343, + "learning_rate": 1.50760656037279e-05, + "loss": 0.7408, + "step": 22230 + }, + { + "epoch": 0.24874036047220405, + "grad_norm": 0.27800559997558594, + "learning_rate": 1.484764036730778e-05, + "loss": 0.7247, + "step": 22240 + }, + { + "epoch": 0.24885220415946674, + "grad_norm": 0.24703536927700043, + "learning_rate": 1.461921513088766e-05, + "loss": 0.7352, + "step": 22250 + }, + { + "epoch": 0.2489640478467294, + "grad_norm": 0.27936097979545593, + "learning_rate": 1.439078989446754e-05, + "loss": 0.7421, + "step": 22260 + }, + { + "epoch": 0.2490758915339921, + "grad_norm": 0.265828400850296, + "learning_rate": 1.4162364658047422e-05, + "loss": 0.7234, + "step": 22270 + }, + { + "epoch": 0.24918773522125479, + "grad_norm": 0.26921194791793823, + "learning_rate": 1.3933939421627303e-05, + "loss": 0.7414, + "step": 22280 + }, + { + "epoch": 0.24929957890851745, + "grad_norm": 0.2829255163669586, + "learning_rate": 1.3705514185207182e-05, + "loss": 0.7378, + "step": 22290 + }, + { + "epoch": 0.24941142259578014, + "grad_norm": 0.25702667236328125, + "learning_rate": 1.3477088948787063e-05, + "loss": 0.7475, + "step": 22300 + }, + { + "epoch": 0.24952326628304283, + "grad_norm": 0.28925350308418274, + "learning_rate": 1.3248663712366943e-05, + "loss": 0.738, + "step": 22310 + }, + { + "epoch": 0.2496351099703055, + "grad_norm": 0.2792825698852539, + "learning_rate": 1.3020238475946822e-05, + "loss": 0.7315, + "step": 22320 + }, + { + "epoch": 0.2497469536575682, + "grad_norm": 0.246215358376503, + "learning_rate": 1.2791813239526703e-05, + "loss": 0.7391, + "step": 22330 + }, + { + "epoch": 0.24985879734483085, + "grad_norm": 0.26492443680763245, + "learning_rate": 1.2563388003106583e-05, + "loss": 0.7478, + "step": 22340 + }, + { + "epoch": 0.24997064103209354, + "grad_norm": 0.27402445673942566, + "learning_rate": 1.2334962766686464e-05, + "loss": 0.7528, + "step": 22350 + }, + { + "epoch": 0.25008248471935624, + "grad_norm": 0.2757234573364258, + "learning_rate": 1.2106537530266343e-05, + "loss": 0.7306, + "step": 22360 + }, + { + "epoch": 0.2501943284066189, + "grad_norm": 0.2723679840564728, + "learning_rate": 1.1878112293846224e-05, + "loss": 0.7472, + "step": 22370 + }, + { + "epoch": 0.2503061720938816, + "grad_norm": 0.22666431963443756, + "learning_rate": 1.1649687057426105e-05, + "loss": 0.7443, + "step": 22380 + }, + { + "epoch": 0.25041801578114425, + "grad_norm": 0.24548636376857758, + "learning_rate": 1.1421261821005985e-05, + "loss": 0.7525, + "step": 22390 + }, + { + "epoch": 0.25052985946840695, + "grad_norm": 0.26941460371017456, + "learning_rate": 1.1192836584585866e-05, + "loss": 0.7482, + "step": 22400 + }, + { + "epoch": 0.25064170315566964, + "grad_norm": 0.2741219997406006, + "learning_rate": 1.0964411348165745e-05, + "loss": 0.7404, + "step": 22410 + }, + { + "epoch": 0.25075354684293233, + "grad_norm": 0.2622029483318329, + "learning_rate": 1.0735986111745626e-05, + "loss": 0.7463, + "step": 22420 + }, + { + "epoch": 0.250865390530195, + "grad_norm": 0.25730788707733154, + "learning_rate": 1.0507560875325506e-05, + "loss": 0.7596, + "step": 22430 + }, + { + "epoch": 0.25097723421745766, + "grad_norm": 0.24054691195487976, + "learning_rate": 1.0279135638905387e-05, + "loss": 0.7397, + "step": 22440 + }, + { + "epoch": 0.25108907790472035, + "grad_norm": 0.23557224869728088, + "learning_rate": 1.0050710402485266e-05, + "loss": 0.7426, + "step": 22450 + }, + { + "epoch": 0.25120092159198304, + "grad_norm": 0.25929298996925354, + "learning_rate": 9.822285166065146e-06, + "loss": 0.7402, + "step": 22460 + }, + { + "epoch": 0.25131276527924573, + "grad_norm": 0.26300865411758423, + "learning_rate": 9.593859929645027e-06, + "loss": 0.755, + "step": 22470 + }, + { + "epoch": 0.2514246089665084, + "grad_norm": 0.25753623247146606, + "learning_rate": 9.365434693224906e-06, + "loss": 0.7536, + "step": 22480 + }, + { + "epoch": 0.2515364526537711, + "grad_norm": 0.2438272088766098, + "learning_rate": 9.13700945680479e-06, + "loss": 0.7528, + "step": 22490 + }, + { + "epoch": 0.25164829634103375, + "grad_norm": 0.2870919406414032, + "learning_rate": 8.908584220384669e-06, + "loss": 0.772, + "step": 22500 + }, + { + "epoch": 0.25176014002829644, + "grad_norm": 0.2551197111606598, + "learning_rate": 8.68015898396455e-06, + "loss": 0.7571, + "step": 22510 + }, + { + "epoch": 0.25187198371555913, + "grad_norm": 0.24423009157180786, + "learning_rate": 8.45173374754443e-06, + "loss": 0.7548, + "step": 22520 + }, + { + "epoch": 0.2519838274028218, + "grad_norm": 0.2683405578136444, + "learning_rate": 8.223308511124309e-06, + "loss": 0.7631, + "step": 22530 + }, + { + "epoch": 0.2520956710900845, + "grad_norm": 0.25919967889785767, + "learning_rate": 7.99488327470419e-06, + "loss": 0.7556, + "step": 22540 + }, + { + "epoch": 0.25220751477734715, + "grad_norm": 0.25076591968536377, + "learning_rate": 7.76645803828407e-06, + "loss": 0.7528, + "step": 22550 + }, + { + "epoch": 0.25231935846460984, + "grad_norm": 0.2598860561847687, + "learning_rate": 7.53803280186395e-06, + "loss": 0.7565, + "step": 22560 + }, + { + "epoch": 0.25243120215187254, + "grad_norm": 0.30933788418769836, + "learning_rate": 7.30960756544383e-06, + "loss": 0.7645, + "step": 22570 + }, + { + "epoch": 0.2525430458391352, + "grad_norm": 0.26472121477127075, + "learning_rate": 7.081182329023711e-06, + "loss": 0.7559, + "step": 22580 + }, + { + "epoch": 0.2526548895263979, + "grad_norm": 0.28362420201301575, + "learning_rate": 6.852757092603591e-06, + "loss": 0.7618, + "step": 22590 + }, + { + "epoch": 0.2527667332136606, + "grad_norm": 0.27758538722991943, + "learning_rate": 6.624331856183471e-06, + "loss": 0.7656, + "step": 22600 + }, + { + "epoch": 0.25287857690092325, + "grad_norm": 0.28303948044776917, + "learning_rate": 6.395906619763352e-06, + "loss": 0.7672, + "step": 22610 + }, + { + "epoch": 0.25299042058818594, + "grad_norm": 0.2938460409641266, + "learning_rate": 6.167481383343232e-06, + "loss": 0.7662, + "step": 22620 + }, + { + "epoch": 0.25310226427544863, + "grad_norm": 0.25707969069480896, + "learning_rate": 5.939056146923112e-06, + "loss": 0.7667, + "step": 22630 + }, + { + "epoch": 0.2532141079627113, + "grad_norm": 0.2813314199447632, + "learning_rate": 5.710630910502992e-06, + "loss": 0.7645, + "step": 22640 + }, + { + "epoch": 0.253325951649974, + "grad_norm": 0.2911704480648041, + "learning_rate": 5.482205674082873e-06, + "loss": 0.763, + "step": 22650 + }, + { + "epoch": 0.2534377953372367, + "grad_norm": 0.2982921600341797, + "learning_rate": 5.253780437662753e-06, + "loss": 0.7606, + "step": 22660 + }, + { + "epoch": 0.25354963902449934, + "grad_norm": 0.2803521156311035, + "learning_rate": 5.025355201242633e-06, + "loss": 0.7617, + "step": 22670 + }, + { + "epoch": 0.25366148271176203, + "grad_norm": 0.26502448320388794, + "learning_rate": 4.7969299648225135e-06, + "loss": 0.7802, + "step": 22680 + }, + { + "epoch": 0.2537733263990247, + "grad_norm": 0.27778494358062744, + "learning_rate": 4.568504728402395e-06, + "loss": 0.7776, + "step": 22690 + }, + { + "epoch": 0.2538851700862874, + "grad_norm": 0.27522069215774536, + "learning_rate": 4.340079491982275e-06, + "loss": 0.7712, + "step": 22700 + }, + { + "epoch": 0.2539970137735501, + "grad_norm": 0.2718433141708374, + "learning_rate": 4.111654255562154e-06, + "loss": 0.7696, + "step": 22710 + }, + { + "epoch": 0.25410885746081274, + "grad_norm": 0.35057663917541504, + "learning_rate": 3.883229019142035e-06, + "loss": 0.7648, + "step": 22720 + }, + { + "epoch": 0.25422070114807543, + "grad_norm": 0.274494469165802, + "learning_rate": 3.654803782721915e-06, + "loss": 0.7578, + "step": 22730 + }, + { + "epoch": 0.2543325448353381, + "grad_norm": 0.2570250928401947, + "learning_rate": 3.4263785463017955e-06, + "loss": 0.7502, + "step": 22740 + }, + { + "epoch": 0.2544443885226008, + "grad_norm": 0.290217787027359, + "learning_rate": 3.197953309881676e-06, + "loss": 0.7607, + "step": 22750 + }, + { + "epoch": 0.2545562322098635, + "grad_norm": 0.25752514600753784, + "learning_rate": 2.969528073461556e-06, + "loss": 0.7612, + "step": 22760 + }, + { + "epoch": 0.2546680758971262, + "grad_norm": 0.23857931792736053, + "learning_rate": 2.7411028370414363e-06, + "loss": 0.7495, + "step": 22770 + }, + { + "epoch": 0.25477991958438884, + "grad_norm": 0.26004472374916077, + "learning_rate": 2.5126776006213166e-06, + "loss": 0.7477, + "step": 22780 + }, + { + "epoch": 0.25489176327165153, + "grad_norm": 0.25449565052986145, + "learning_rate": 2.2842523642011973e-06, + "loss": 0.7379, + "step": 22790 + }, + { + "epoch": 0.2550036069589142, + "grad_norm": 0.2568104565143585, + "learning_rate": 2.055827127781077e-06, + "loss": 0.7407, + "step": 22800 + }, + { + "epoch": 0.2551154506461769, + "grad_norm": 0.253451406955719, + "learning_rate": 1.8274018913609574e-06, + "loss": 0.7241, + "step": 22810 + }, + { + "epoch": 0.2552272943334396, + "grad_norm": 0.25928062200546265, + "learning_rate": 1.598976654940838e-06, + "loss": 0.7502, + "step": 22820 + }, + { + "epoch": 0.2553391380207023, + "grad_norm": 0.24965140223503113, + "learning_rate": 1.3705514185207182e-06, + "loss": 0.7417, + "step": 22830 + }, + { + "epoch": 0.25545098170796493, + "grad_norm": 0.2660306394100189, + "learning_rate": 1.1421261821005987e-06, + "loss": 0.7463, + "step": 22840 + }, + { + "epoch": 0.2555628253952276, + "grad_norm": 0.25784334540367126, + "learning_rate": 9.137009456804787e-07, + "loss": 0.7379, + "step": 22850 + }, + { + "epoch": 0.2556746690824903, + "grad_norm": 0.27776214480400085, + "learning_rate": 6.852757092603591e-07, + "loss": 0.7562, + "step": 22860 + }, + { + "epoch": 0.255786512769753, + "grad_norm": 0.24403463304042816, + "learning_rate": 4.5685047284023936e-07, + "loss": 0.7427, + "step": 22870 + }, + { + "epoch": 0.2558983564570157, + "grad_norm": 0.24544622004032135, + "learning_rate": 2.2842523642011968e-07, + "loss": 0.748, + "step": 22880 + } + ], + "logging_steps": 10, + "max_steps": 22889, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.946484739580887e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}